50 files changed, 51850 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
new file mode 100644
index 0000000..590a52d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -0,0 +1,108 @@
+//===- ADCE.cpp - Code to perform dead code elimination -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Aggressive Dead Code Elimination pass.  This pass
+// optimistically assumes that all instructions are dead until proven otherwise,
+// allowing it to eliminate dead computations that other DCE passes do not
+// catch, particularly involving loop computations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "adce"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+static bool aggressiveDCE(Function& F) {
+  SmallPtrSet<Instruction*, 128> Alive;
+  SmallVector<Instruction*, 128> Worklist;
+
+  // Collect the set of "root" instructions that are known live.
+  for (Instruction &I : instructions(F)) {
+    if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() ||
+        I.mayHaveSideEffects()) {
+      Alive.insert(&I);
+      Worklist.push_back(&I);
+    }
+  }
+
+  // Propagate liveness backwards to operands.
+  while (!Worklist.empty()) {
+    Instruction *Curr = Worklist.pop_back_val();
+    for (Use &OI : Curr->operands()) {
+      if (Instruction *Inst = dyn_cast<Instruction>(OI))
+        if (Alive.insert(Inst).second)
+          Worklist.push_back(Inst);
+    }
+  }
+
+  // The inverse of the live set is the dead set.  These are those instructions
+  // which have no side effects and do not influence the control flow or return
+  // value of the function, and may therefore be deleted safely.
+  // NOTE: We reuse the Worklist vector here for memory efficiency.
+  for (Instruction &I : instructions(F)) {
+    if (!Alive.count(&I)) {
+      Worklist.push_back(&I);
+      I.dropAllReferences();
+    }
+  }
+
+  for (Instruction *&I : Worklist) {
+    ++NumRemoved;
+    I->eraseFromParent();
+  }
+
+  return !Worklist.empty();
+}
+
+PreservedAnalyses ADCEPass::run(Function &F) {
+  if (aggressiveDCE(F))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+namespace {
+struct ADCELegacyPass : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  ADCELegacyPass() : FunctionPass(ID) {
+    initializeADCELegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function& F) override {
+    if (skipOptnoneFunction(F))
+      return false;
+    return aggressiveDCE(F);
+  }
+
+  void getAnalysisUsage(AnalysisUsage& AU) const override {
+    AU.setPreservesCFG();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+};
+}
+
+char ADCELegacyPass::ID = 0;
+INITIALIZE_PASS(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination",
+                false, false)
+
+FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
new file mode 100644
index 0000000..4b721d3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -0,0 +1,428 @@
+//===----------------------- AlignmentFromAssumptions.cpp -----------------===//
+//                  Set Load/Store Alignments From Assumptions
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a ScalarEvolution-based transformation to set
+// the alignments of load, stores and memory intrinsics based on the truth
+// expressions of assume intrinsics. The primary motivation is to handle
+// complex alignment assumptions that apply to vector loads and stores that
+// appear after vectorization and unrolling.
+//
+//===----------------------------------------------------------------------===//
+
+#define AA_NAME "alignment-from-assumptions"
+#define DEBUG_TYPE AA_NAME
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumLoadAlignChanged,
+  "Number of loads changed by alignment assumptions");
+STATISTIC(NumStoreAlignChanged,
+  "Number of stores changed by alignment assumptions");
+STATISTIC(NumMemIntAlignChanged,
+  "Number of memory intrinsics changed by alignment assumptions");
+
+namespace {
+struct AlignmentFromAssumptions : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  AlignmentFromAssumptions() : FunctionPass(ID) {
+    initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+
+    AU.setPreservesCFG();
+    AU.addPreserved<AAResultsWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<ScalarEvolutionWrapperPass>();
+  }
+
+  // For memory transfers, we need a common alignment for both the source and
+  // destination. If we have a new alignment for only one operand of a transfer
+  // instruction, save it in these maps.  If we reach the other operand through
+  // another assumption later, then we may change the alignment at that point.
+  DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments;
+
+  ScalarEvolution *SE;
+  DominatorTree *DT;
+
+  bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV,
+                            const SCEV *&OffSCEV);
+  bool processAssumption(CallInst *I);
+};
+}
+
+char AlignmentFromAssumptions::ID = 0;
+static const char aip_name[] = "Alignment from assumptions";
+INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
+                      aip_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
+                    aip_name, false, false)
+
+FunctionPass *llvm::createAlignmentFromAssumptionsPass() {
+  return new AlignmentFromAssumptions();
+}
+
+// Given an expression for the (constant) alignment, AlignSCEV, and an
+// expression for the displacement between a pointer and the aligned address,
+// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced
+// to a constant. Using SCEV to compute alignment handles the case where
+// DiffSCEV is a recurrence with constant start such that the aligned offset
+// is constant. e.g. {16,+,32} % 32 -> 16.
+static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
+                                    const SCEV *AlignSCEV,
+                                    ScalarEvolution *SE) {
+  // DiffUnits = Diff % int64_t(Alignment)
+  const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV);
+  const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
+  const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
+
+  DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " <<
+                  *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+
+  if (const SCEVConstant *ConstDUSCEV =
+      dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
+    int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();
+
+    // If the displacement is an exact multiple of the alignment, then the
+    // displaced pointer has the same alignment as the aligned pointer, so
+    // return the alignment value.
+    if (!DiffUnits)
+      return (unsigned)
+        cast<SCEVConstant>(AlignSCEV)->getValue()->getSExtValue();
+
+    // If the displacement is not an exact multiple, but the remainder is a
+    // constant, then return this remainder (but only if it is a power of 2).
+    uint64_t DiffUnitsAbs = std::abs(DiffUnits);
+    if (isPowerOf2_64(DiffUnitsAbs))
+      return (unsigned) DiffUnitsAbs;
+  }
+
+  return 0;
+}
+
+// There is an address given by an offset OffSCEV from AASCEV which has an
+// alignment AlignSCEV. Use that information, if possible, to compute a new
+// alignment for Ptr.
+static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
+                                const SCEV *OffSCEV, Value *Ptr,
+                                ScalarEvolution *SE) {
+  const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+  const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV);
+
+  // On 32-bit platforms, DiffSCEV might now have type i32 -- we've always
+  // sign-extended OffSCEV to i64, so make sure they agree again.
+  DiffSCEV = SE->getNoopOrSignExtend(DiffSCEV, OffSCEV->getType());
+
+  // What we really want to know is the overall offset to the aligned
+  // address. This address is displaced by the provided offset.
+  DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
+
+  DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to " <<
+                  *AlignSCEV << " and offset " << *OffSCEV <<
+                  " using diff " << *DiffSCEV << "\n");
+
+  unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE);
+  DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
+
+  if (NewAlignment) {
+    return NewAlignment;
+  } else if (const SCEVAddRecExpr *DiffARSCEV =
+             dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
+    // The relative offset to the alignment assumption did not yield a constant,
+    // but we should try harder: if we assume that a is 32-byte aligned, then in
+    // for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads from a are
+    // 32-byte aligned, but instead alternate between 32 and 16-byte alignment.
+    // As a result, the new alignment will not be a constant, but can still
+    // be improved over the default (of 4) to 16.
+
+    const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
+    const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
+
+    DEBUG(dbgs() << "\ttrying start/inc alignment using start " <<
+                    *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+
+    // Now compute the new alignment using the displacement to the value in the
+    // first iteration, and also the alignment using the per-iteration delta.
+    // If these are the same, then use that answer. Otherwise, use the smaller
+    // one, but only if it divides the larger one.
+    NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
+    unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
+
+    DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
+    DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
+
+    if (!NewAlignment || !NewIncAlignment) {
+      return 0;
+    } else if (NewAlignment > NewIncAlignment) {
+      if (NewAlignment % NewIncAlignment == 0) {
+        DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+                        NewIncAlignment << "\n");
+        return NewIncAlignment;
+      }
+    } else if (NewIncAlignment > NewAlignment) {
+      if (NewIncAlignment % NewAlignment == 0) {
+        DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+                        NewAlignment << "\n");
+        return NewAlignment;
+      }
+    } else if (NewIncAlignment == NewAlignment) {
+      DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+                      NewAlignment << "\n");
+      return NewAlignment;
+    }
+  }
+
+  return 0;
+}
+
+bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I,
+                                 Value *&AAPtr, const SCEV *&AlignSCEV,
+                                 const SCEV *&OffSCEV) {
+  // An alignment assume must be a statement about the least-significant
+  // bits of the pointer being zero, possibly with some offset.
+  ICmpInst *ICI = dyn_cast<ICmpInst>(I->getArgOperand(0));
+  if (!ICI)
+    return false;
+
+  // This must be an expression of the form: x & m == 0.
+  if (ICI->getPredicate() != ICmpInst::ICMP_EQ)
+    return false;
+
+  // Swap things around so that the RHS is 0.
+  Value *CmpLHS = ICI->getOperand(0);
+  Value *CmpRHS = ICI->getOperand(1);
+  const SCEV *CmpLHSSCEV = SE->getSCEV(CmpLHS);
+  const SCEV *CmpRHSSCEV = SE->getSCEV(CmpRHS);
+  if (CmpLHSSCEV->isZero())
+    std::swap(CmpLHS, CmpRHS);
+  else if (!CmpRHSSCEV->isZero())
+    return false;
+
+  BinaryOperator *CmpBO = dyn_cast<BinaryOperator>(CmpLHS);
+  if (!CmpBO || CmpBO->getOpcode() != Instruction::And)
+    return false;
+
+  // Swap things around so that the right operand of the and is a constant
+  // (the mask); we cannot deal with variable masks.
+  Value *AndLHS = CmpBO->getOperand(0);
+  Value *AndRHS = CmpBO->getOperand(1);
+  const SCEV *AndLHSSCEV = SE->getSCEV(AndLHS);
+  const SCEV *AndRHSSCEV = SE->getSCEV(AndRHS);
+  if (isa<SCEVConstant>(AndLHSSCEV)) {
+    std::swap(AndLHS, AndRHS);
+    std::swap(AndLHSSCEV, AndRHSSCEV);
+  }
+
+  const SCEVConstant *MaskSCEV = dyn_cast<SCEVConstant>(AndRHSSCEV);
+  if (!MaskSCEV)
+    return false;
+
+  // The mask must have some trailing ones (otherwise the condition is
+  // trivial and tells us nothing about the alignment of the left operand).
+  unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes();
+  if (!TrailingOnes)
+    return false;
+
+  // Cap the alignment at the maximum with which LLVM can deal (and make sure
+  // we don't overflow the shift).
+  uint64_t Alignment;
+  TrailingOnes = std::min(TrailingOnes,
+    unsigned(sizeof(unsigned) * CHAR_BIT - 1));
+  Alignment = std::min(1u << TrailingOnes, +Value::MaximumAlignment);
+
+  Type *Int64Ty = Type::getInt64Ty(I->getParent()->getParent()->getContext());
+  AlignSCEV = SE->getConstant(Int64Ty, Alignment);
+
+  // The LHS might be a ptrtoint instruction, or it might be the pointer
+  // with an offset.
+  AAPtr = nullptr;
+  OffSCEV = nullptr;
+  if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) {
+    AAPtr = PToI->getPointerOperand();
+    OffSCEV = SE->getZero(Int64Ty);
+  } else if (const SCEVAddExpr* AndLHSAddSCEV =
+             dyn_cast<SCEVAddExpr>(AndLHSSCEV)) {
+    // Try to find the ptrtoint; subtract it and the rest is the offset.
+    for (SCEVAddExpr::op_iterator J = AndLHSAddSCEV->op_begin(),
+         JE = AndLHSAddSCEV->op_end(); J != JE; ++J)
+      if (const SCEVUnknown *OpUnk = dyn_cast<SCEVUnknown>(*J))
+        if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(OpUnk->getValue())) {
+          AAPtr = PToI->getPointerOperand();
+          OffSCEV = SE->getMinusSCEV(AndLHSAddSCEV, *J);
+          break;
+        }
+  }
+
+  if (!AAPtr)
+    return false;
+
+  // Sign extend the offset to 64 bits (so that it is like all of the other
+  // expressions). 
+  unsigned OffSCEVBits = OffSCEV->getType()->getPrimitiveSizeInBits();
+  if (OffSCEVBits < 64)
+    OffSCEV = SE->getSignExtendExpr(OffSCEV, Int64Ty);
+  else if (OffSCEVBits > 64)
+    return false;
+
+  AAPtr = AAPtr->stripPointerCasts();
+  return true;
+}
+
+bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) {
+  Value *AAPtr;
+  const SCEV *AlignSCEV, *OffSCEV;
+  if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV))
+    return false;
+
+  const SCEV *AASCEV = SE->getSCEV(AAPtr);
+
+  // Apply the assumption to all other users of the specified pointer.
+  SmallPtrSet<Instruction *, 32> Visited;
+  SmallVector<Instruction*, 16> WorkList;
+  for (User *J : AAPtr->users()) {
+    if (J == ACall)
+      continue;
+
+    if (Instruction *K = dyn_cast<Instruction>(J))
+      if (isValidAssumeForContext(ACall, K, DT))
+        WorkList.push_back(K);
+  }
+
+  while (!WorkList.empty()) {
+    Instruction *J = WorkList.pop_back_val();
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(J)) {
+      unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+        LI->getPointerOperand(), SE);
+
+      if (NewAlignment > LI->getAlignment()) {
+        LI->setAlignment(NewAlignment);
+        ++NumLoadAlignChanged;
+      }
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
+      unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+        SI->getPointerOperand(), SE);
+
+      if (NewAlignment > SI->getAlignment()) {
+        SI->setAlignment(NewAlignment);
+        ++NumStoreAlignChanged;
+      }
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
+      unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+        MI->getDest(), SE);
+
+      // For memory transfers, we need a common alignment for both the
+      // source and destination. If we have a new alignment for this
+      // instruction, but only for one operand, save it. If we reach the
+      // other operand through another assumption later, then we may
+      // change the alignment at that point.
+      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+        unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+          MTI->getSource(), SE);
+
+        DenseMap<MemTransferInst *, unsigned>::iterator DI =
+          NewDestAlignments.find(MTI);
+        unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ?
+                                    0 : DI->second;
+
+        DenseMap<MemTransferInst *, unsigned>::iterator SI =
+          NewSrcAlignments.find(MTI);
+        unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ?
+                                   0 : SI->second;
+
+        DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " <<
+                        AltDestAlignment << " " << NewSrcAlignment <<
+                        " " << AltSrcAlignment << "\n");
+
+        // Of these four alignments, pick the largest possible...
+        unsigned NewAlignment = 0;
+        if (NewDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+          NewAlignment = std::max(NewAlignment, NewDestAlignment);
+        if (AltDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+          NewAlignment = std::max(NewAlignment, AltDestAlignment);
+        if (NewSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+          NewAlignment = std::max(NewAlignment, NewSrcAlignment);
+        if (AltSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+          NewAlignment = std::max(NewAlignment, AltSrcAlignment);
+
+        if (NewAlignment > MI->getAlignment()) {
+          MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+            MI->getParent()->getContext()), NewAlignment));
+          ++NumMemIntAlignChanged;
+        }
+
+        NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment));
+        NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment));
+      } else if (NewDestAlignment > MI->getAlignment()) {
+        assert((!isa<MemIntrinsic>(MI) || isa<MemSetInst>(MI)) &&
+               "Unknown memory intrinsic");
+
+        MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+          MI->getParent()->getContext()), NewDestAlignment));
+        ++NumMemIntAlignChanged;
+      }
+    }
+
+    // Now that we've updated that use of the pointer, look for other uses of
+    // the pointer to update.
+    Visited.insert(J);
+    for (User *UJ : J->users()) {
+      Instruction *K = cast<Instruction>(UJ);
+      if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DT))
+        WorkList.push_back(K);
+    }
+  }
+
+  return true;
+}
+
+bool AlignmentFromAssumptions::runOnFunction(Function &F) {
+  bool Changed = false;
+  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  NewDestAlignments.clear();
+  NewSrcAlignments.clear();
+
+  for (auto &AssumeVH : AC.assumptions())
+    if (AssumeVH)
+      Changed |= processAssumption(cast<CallInst>(AssumeVH));
+
+  return Changed;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
new file mode 100644
index 0000000..cb9b8b6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -0,0 +1,102 @@
+//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Bit-Tracking Dead Code Elimination pass. Some
+// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
+// We track these dead bits and remove instructions that compute only these
+// dead bits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bdce"
+
+STATISTIC(NumRemoved, "Number of instructions removed (unused)");
+STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+
+namespace {
+struct BDCE : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  BDCE() : FunctionPass(ID) {
+    initializeBDCEPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function& F) override;
+
+  void getAnalysisUsage(AnalysisUsage& AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DemandedBits>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+};
+}
+
+char BDCE::ID = 0;
+INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DemandedBits)
+INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
+                    false, false)
+
+bool BDCE::runOnFunction(Function& F) {
+  if (skipOptnoneFunction(F))
+    return false;
+  DemandedBits &DB = getAnalysis<DemandedBits>();
+
+  SmallVector<Instruction*, 128> Worklist;
+  bool Changed = false;
+  for (Instruction &I : instructions(F)) {
+    if (I.getType()->isIntegerTy() &&
+        !DB.getDemandedBits(&I).getBoolValue()) {
+      // For live instructions that have all dead bits, first make them dead by
+      // replacing all uses with something else. Then, if they don't need to
+      // remain live (because they have side effects, etc.) we can remove them.
+      DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+      // FIXME: In theory we could substitute undef here instead of zero.
+      // This should be reconsidered once we settle on the semantics of
+      // undef, poison, etc.
+      Value *Zero = ConstantInt::get(I.getType(), 0);
+      ++NumSimplified;
+      I.replaceAllUsesWith(Zero);
+      Changed = true;
+    }
+    if (!DB.isInstructionDead(&I))
+      continue;
+
+    Worklist.push_back(&I);
+    I.dropAllReferences();
+    Changed = true;
+  }
+
+  for (Instruction *&I : Worklist) {
+    ++NumRemoved;
+    I->eraseFromParent();
+  }
+
+  return Changed;
+}
+
+FunctionPass *llvm::createBitTrackingDCEPass() {
+  return new BDCE();
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
new file mode 100644
index 0000000..84f7f5f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -0,0 +1,606 @@
+//===- ConstantHoisting.cpp - Prepare code for expensive constants --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies expensive constants to hoist and coalesces them to
+// better prepare it for SelectionDAG-based code generation. This works around
+// the limitations of the basic-block-at-a-time approach.
+//
+// First it scans all instructions for integer constants and calculates its
+// cost. If the constant can be folded into the instruction (the cost is
+// TCC_Free) or the cost is just a simple operation (TCC_BASIC), then we don't
+// consider it expensive and leave it alone. This is the default behavior and
+// the default implementation of getIntImmCost will always return TCC_Free.
+//
+// If the cost is more than TCC_BASIC, then the integer constant can't be folded
+// into the instruction and it might be beneficial to hoist the constant.
+// Similar constants are coalesced to reduce register pressure and
+// materialization code.
+//
+// When a constant is hoisted, it is also hidden behind a bitcast to force it to
+// be live-out of the basic block. Otherwise the constant would be just
+// duplicated and each basic block would have its own copy in the SelectionDAG.
+// The SelectionDAG recognizes such constants as opaque and doesn't perform
+// certain transformations on them, which would create a new expensive constant.
+//
+// This optimization is only applied to integer constants in instructions and
+// simple (this means not nested) constant cast expressions. For example:
+// %0 = load i64* inttoptr (i64 big_constant to i64*)
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "consthoist"
+
+STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
+STATISTIC(NumConstantsRebased, "Number of constants rebased");
+
+namespace {
+struct ConstantUser;
+struct RebasedConstantInfo;
+
+typedef SmallVector<ConstantUser, 8> ConstantUseListType;
+typedef SmallVector<RebasedConstantInfo, 4> RebasedConstantListType;
+
+/// \brief Keeps track of the user of a constant and the operand index where the
+/// constant is used.
+struct ConstantUser {
+  Instruction *Inst;
+  unsigned OpndIdx;
+
+  ConstantUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) { }
+};
+
+/// \brief Keeps track of a constant candidate and its uses.
+struct ConstantCandidate {
+  ConstantUseListType Uses;
+  ConstantInt *ConstInt;
+  unsigned CumulativeCost;
+
+  ConstantCandidate(ConstantInt *ConstInt)
+    : ConstInt(ConstInt), CumulativeCost(0) { }
+
+  /// \brief Add the user to the use list and update the cost.
+  void addUser(Instruction *Inst, unsigned Idx, unsigned Cost) {
+    CumulativeCost += Cost;
+    Uses.push_back(ConstantUser(Inst, Idx));
+  }
+};
+
+/// \brief This represents a constant that has been rebased with respect to a
+/// base constant. The difference to the base constant is recorded in Offset.
+struct RebasedConstantInfo {
+  ConstantUseListType Uses;
+  Constant *Offset;
+
+  RebasedConstantInfo(ConstantUseListType &&Uses, Constant *Offset)
+    : Uses(std::move(Uses)), Offset(Offset) { }
+};
+
+/// \brief A base constant and all its rebased constants.
+struct ConstantInfo {
+  ConstantInt *BaseConstant;
+  RebasedConstantListType RebasedConstants;
+};
+
+/// \brief The constant hoisting pass.
+class ConstantHoisting : public FunctionPass {
+  typedef DenseMap<ConstantInt *, unsigned> ConstCandMapType;
+  typedef std::vector<ConstantCandidate> ConstCandVecType;
+
+  const TargetTransformInfo *TTI;
+  DominatorTree *DT;
+  BasicBlock *Entry;
+
+  /// Keeps track of constant candidates found in the function.
+  ConstCandVecType ConstCandVec;
+
+  /// Keep track of cast instructions we already cloned.
+  SmallDenseMap<Instruction *, Instruction *> ClonedCastMap;
+
+  /// These are the final constants we decided to hoist.
+  SmallVector<ConstantInfo, 8> ConstantVec;
+public:
+  static char ID; // Pass identification, replacement for typeid
+  ConstantHoisting() : FunctionPass(ID), TTI(nullptr), DT(nullptr),
+                       Entry(nullptr) {
+    initializeConstantHoistingPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &Fn) override;
+
+  const char *getPassName() const override { return "Constant Hoisting"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+private:
+  /// \brief Initialize the pass.
+  void setup(Function &Fn) {
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
+    Entry = &Fn.getEntryBlock();
+  }
+
+  /// \brief Cleanup.
+  void cleanup() {
+    ConstantVec.clear();
+    ClonedCastMap.clear();
+    ConstCandVec.clear();
+
+    TTI = nullptr;
+    DT = nullptr;
+    Entry = nullptr;
+  }
+
+  Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const;
+  Instruction *findConstantInsertionPoint(const ConstantInfo &ConstInfo) const;
+  void collectConstantCandidates(ConstCandMapType &ConstCandMap,
+                                 Instruction *Inst, unsigned Idx,
+                                 ConstantInt *ConstInt);
+  void collectConstantCandidates(ConstCandMapType &ConstCandMap,
+                                 Instruction *Inst);
+  void collectConstantCandidates(Function &Fn);
+  void findAndMakeBaseConstant(ConstCandVecType::iterator S,
+                               ConstCandVecType::iterator E);
+  void findBaseConstants();
+  void emitBaseConstants(Instruction *Base, Constant *Offset,
+                         const ConstantUser &ConstUser);
+  bool emitBaseConstants();
+  void deleteDeadCastInst() const;
+  bool optimizeConstants(Function &Fn);
+};
+}
+
+char ConstantHoisting::ID = 0;
+INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting",
+                    false, false)
+
+FunctionPass *llvm::createConstantHoistingPass() {
+  return new ConstantHoisting();
+}
+
+/// \brief Perform the constant hoisting optimization for the given function.
+bool ConstantHoisting::runOnFunction(Function &Fn) {
+  if (skipOptnoneFunction(Fn))
+    return false;
+
+  DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
+  DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+
+  setup(Fn);
+
+  bool MadeChange = optimizeConstants(Fn);
+
+  if (MadeChange) {
+    DEBUG(dbgs() << "********** Function after Constant Hoisting: "
+                 << Fn.getName() << '\n');
+    DEBUG(dbgs() << Fn);
+  }
+  DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
+
+  cleanup();
+
+  return MadeChange;
+}
+
+
+/// \brief Find the constant materialization insertion point.
+Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst,
+                                               unsigned Idx) const {
+  // If the operand is a cast instruction, then we have to materialize the
+  // constant before the cast instruction.
+  if (Idx != ~0U) {
+    Value *Opnd = Inst->getOperand(Idx);
+    if (auto CastInst = dyn_cast<Instruction>(Opnd))
+      if (CastInst->isCast())
+        return CastInst;
+  }
+
+  // The simple and common case. This also includes constant expressions.
+  if (!isa<PHINode>(Inst) && !Inst->isEHPad())
+    return Inst;
+
+  // We can't insert directly before a phi node or an eh pad. Insert before
+  // the terminator of the incoming or dominating block.
+  assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!");
+  if (Idx != ~0U && isa<PHINode>(Inst))
+    return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
+
+  BasicBlock *IDom = DT->getNode(Inst->getParent())->getIDom()->getBlock();
+  return IDom->getTerminator();
+}
+
+/// \brief Find an insertion point that dominates all uses.
+Instruction *ConstantHoisting::
+findConstantInsertionPoint(const ConstantInfo &ConstInfo) const {
+  assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
+  // Collect all basic blocks.
+  SmallPtrSet<BasicBlock *, 8> BBs;
+  for (auto const &RCI : ConstInfo.RebasedConstants)
+    for (auto const &U : RCI.Uses)
+      BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
+
+  if (BBs.count(Entry))
+    return &Entry->front();
+
+  while (BBs.size() >= 2) {
+    BasicBlock *BB, *BB1, *BB2;
+    BB1 = *BBs.begin();
+    BB2 = *std::next(BBs.begin());
+    BB = DT->findNearestCommonDominator(BB1, BB2);
+    if (BB == Entry)
+      return &Entry->front();
+    BBs.erase(BB1);
+    BBs.erase(BB2);
+    BBs.insert(BB);
+  }
+  assert((BBs.size() == 1) && "Expected only one element.");
+  Instruction &FirstInst = (*BBs.begin())->front();
+  return findMatInsertPt(&FirstInst);
+}
+
+
+/// \brief Record constant integer ConstInt for instruction Inst at operand
+/// index Idx.
+///
+/// The operand at index Idx is not necessarily the constant integer itself. It
+/// could also be a cast instruction or a constant expression that uses the
+// constant integer.
+void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
+                                                 Instruction *Inst,
+                                                 unsigned Idx,
+                                                 ConstantInt *ConstInt) {
+  unsigned Cost;
+  // Ask the target about the cost of materializing the constant for the given
+  // instruction and operand index.
+  if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
+    Cost = TTI->getIntImmCost(IntrInst->getIntrinsicID(), Idx,
+                              ConstInt->getValue(), ConstInt->getType());
+  else
+    Cost = TTI->getIntImmCost(Inst->getOpcode(), Idx, ConstInt->getValue(),
+                              ConstInt->getType());
+
+  // Ignore cheap integer constants.
+  if (Cost > TargetTransformInfo::TCC_Basic) {
+    ConstCandMapType::iterator Itr;
+    bool Inserted;
+    std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(ConstInt, 0));
+    if (Inserted) {
+      ConstCandVec.push_back(ConstantCandidate(ConstInt));
+      Itr->second = ConstCandVec.size() - 1;
+    }
+    ConstCandVec[Itr->second].addUser(Inst, Idx, Cost);
+    DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx)))
+            dbgs() << "Collect constant " << *ConstInt << " from " << *Inst
+                   << " with cost " << Cost << '\n';
+          else
+          dbgs() << "Collect constant " << *ConstInt << " indirectly from "
+                 << *Inst << " via " << *Inst->getOperand(Idx) << " with cost "
+                 << Cost << '\n';
+    );
+  }
+}
+
+/// \brief Scan the instruction for expensive integer constants and record them
+/// in the constant candidate vector.
+void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
+                                                 Instruction *Inst) {
+  // Skip all cast instructions. They are visited indirectly later on.
+  if (Inst->isCast())
+    return;
+
+  // Can't handle inline asm. Skip it.
+  if (auto Call = dyn_cast<CallInst>(Inst))
+    if (isa<InlineAsm>(Call->getCalledValue()))
+      return;
+
+  // Scan all operands.
+  for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+    Value *Opnd = Inst->getOperand(Idx);
+
+    // Visit constant integers.
+    if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+      collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+      continue;
+    }
+
+    // Visit cast instructions that have constant integers.
+    if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+      // Only visit cast instructions, which have been skipped. All other
+      // instructions should have already been visited.
+      if (!CastInst->isCast())
+        continue;
+
+      if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
+        // Pretend the constant is directly used by the instruction and ignore
+        // the cast instruction.
+        collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+        continue;
+      }
+    }
+
+    // Visit constant expressions that have constant integers.
+    if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+      // Only visit constant cast expressions.
+      if (!ConstExpr->isCast())
+        continue;
+
+      if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
+        // Pretend the constant is directly used by the instruction and ignore
+        // the constant expression.
+        collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+        continue;
+      }
+    }
+  } // end of for all operands
+}
+
+/// \brief Collect all integer constants in the function that cannot be folded
+/// into an instruction itself.
+void ConstantHoisting::collectConstantCandidates(Function &Fn) {
+  ConstCandMapType ConstCandMap;
+  for (BasicBlock &BB : Fn)
+    for (Instruction &Inst : BB)
+      collectConstantCandidates(ConstCandMap, &Inst);
+}
+
+/// \brief Find the base constant within the given range and rebase all other
+/// constants with respect to the base constant.
+void ConstantHoisting::findAndMakeBaseConstant(ConstCandVecType::iterator S,
+                                               ConstCandVecType::iterator E) {
+  auto MaxCostItr = S;
+  unsigned NumUses = 0;
+  // Use the constant that has the maximum cost as base constant.
+  for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+    NumUses += ConstCand->Uses.size();
+    if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
+      MaxCostItr = ConstCand;
+  }
+
+  // Don't hoist constants that have only one use.
+  if (NumUses <= 1)
+    return;
+
+  ConstantInfo ConstInfo;
+  ConstInfo.BaseConstant = MaxCostItr->ConstInt;
+  Type *Ty = ConstInfo.BaseConstant->getType();
+
+  // Rebase the constants with respect to the base constant.
+  for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+    APInt Diff = ConstCand->ConstInt->getValue() -
+                 ConstInfo.BaseConstant->getValue();
+    Constant *Offset = Diff == 0 ? nullptr : ConstantInt::get(Ty, Diff);
+    ConstInfo.RebasedConstants.push_back(
+      RebasedConstantInfo(std::move(ConstCand->Uses), Offset));
+  }
+  ConstantVec.push_back(std::move(ConstInfo));
+}
+
+/// \brief Finds and combines constant candidates that can be easily
+/// rematerialized with an add from a common base constant.
+void ConstantHoisting::findBaseConstants() {
+  // Sort the constants by value and type. This invalidates the mapping!
+  std::sort(ConstCandVec.begin(), ConstCandVec.end(),
+            [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
+    if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
+      return LHS.ConstInt->getType()->getBitWidth() <
+             RHS.ConstInt->getType()->getBitWidth();
+    return LHS.ConstInt->getValue().ult(RHS.ConstInt->getValue());
+  });
+
+  // Simple linear scan through the sorted constant candidate vector for viable
+  // merge candidates.
+  auto MinValItr = ConstCandVec.begin();
+  for (auto CC = std::next(ConstCandVec.begin()), E = ConstCandVec.end();
+       CC != E; ++CC) {
+    if (MinValItr->ConstInt->getType() == CC->ConstInt->getType()) {
+      // Check if the constant is in range of an add with immediate.
+      APInt Diff = CC->ConstInt->getValue() - MinValItr->ConstInt->getValue();
+      if ((Diff.getBitWidth() <= 64) &&
+          TTI->isLegalAddImmediate(Diff.getSExtValue()))
+        continue;
+    }
+    // We either have now a different constant type or the constant is not in
+    // range of an add with immediate anymore.
+    findAndMakeBaseConstant(MinValItr, CC);
+    // Start a new base constant search.
+    MinValItr = CC;
+  }
+  // Finalize the last base constant search.
+  findAndMakeBaseConstant(MinValItr, ConstCandVec.end());
+}
+
+/// \brief Updates the operand at Idx in instruction Inst with the result of
+///        instruction Mat. If the instruction is a PHI node then special
+///        handling for duplicate values form the same incomming basic block is
+///        required.
+/// \return The update will always succeed, but the return value indicated if
+///         Mat was used for the update or not.
+static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
+  if (auto PHI = dyn_cast<PHINode>(Inst)) {
+    // Check if any previous operand of the PHI node has the same incoming basic
+    // block. This is a very odd case that happens when the incoming basic block
+    // has a switch statement. In this case use the same value as the previous
+    // operand(s), otherwise we will fail verification due to different values.
+    // The values are actually the same, but the variable names are different
+    // and the verifier doesn't like that.
+    BasicBlock *IncomingBB = PHI->getIncomingBlock(Idx);
+    for (unsigned i = 0; i < Idx; ++i) {
+      if (PHI->getIncomingBlock(i) == IncomingBB) {
+        Value *IncomingVal = PHI->getIncomingValue(i);
+        Inst->setOperand(Idx, IncomingVal);
+        return false;
+      }
+    }
+  }
+
+  Inst->setOperand(Idx, Mat);
+  return true;
+}
+
+/// \brief Emit materialization code for all rebased constants and update their
+/// users.
+void ConstantHoisting::emitBaseConstants(Instruction *Base, Constant *Offset,
+                                         const ConstantUser &ConstUser) {
+  Instruction *Mat = Base;
+  if (Offset) {
+    Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst,
+                                               ConstUser.OpndIdx);
+    Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
+                                 "const_mat", InsertionPt);
+
+    DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+                 << " + " << *Offset << ") in BB "
+                 << Mat->getParent()->getName() << '\n' << *Mat << '\n');
+    Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
+  }
+  Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
+
+  // Visit constant integer.
+  if (isa<ConstantInt>(Opnd)) {
+    DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+    if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
+      Mat->eraseFromParent();
+    DEBUG(dbgs() << "To    : " << *ConstUser.Inst << '\n');
+    return;
+  }
+
+  // Visit cast instruction.
+  if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+    assert(CastInst->isCast() && "Expected an cast instruction!");
+    // Check if we already have visited this cast instruction before to avoid
+    // unnecessary cloning.
+    Instruction *&ClonedCastInst = ClonedCastMap[CastInst];
+    if (!ClonedCastInst) {
+      ClonedCastInst = CastInst->clone();
+      ClonedCastInst->setOperand(0, Mat);
+      ClonedCastInst->insertAfter(CastInst);
+      // Use the same debug location as the original cast instruction.
+      ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
+      DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
+                   << "To               : " << *ClonedCastInst << '\n');
+    }
+
+    DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+    updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
+    DEBUG(dbgs() << "To    : " << *ConstUser.Inst << '\n');
+    return;
+  }
+
+  // Visit constant expression.
+  if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+    Instruction *ConstExprInst = ConstExpr->getAsInstruction();
+    ConstExprInst->setOperand(0, Mat);
+    ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst,
+                                                ConstUser.OpndIdx));
+
+    // Use the same debug location as the instruction we are about to update.
+    ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
+
+    DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
+                 << "From              : " << *ConstExpr << '\n');
+    DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+    if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
+      ConstExprInst->eraseFromParent();
+      if (Offset)
+        Mat->eraseFromParent();
+    }
+    DEBUG(dbgs() << "To    : " << *ConstUser.Inst << '\n');
+    return;
+  }
+}
+
+/// \brief Hoist and hide the base constant behind a bitcast and emit
+/// materialization code for derived constants.
+bool ConstantHoisting::emitBaseConstants() {
+  bool MadeChange = false;
+  for (auto const &ConstInfo : ConstantVec) {
+    // Hoist and hide the base constant behind a bitcast.
+    Instruction *IP = findConstantInsertionPoint(ConstInfo);
+    IntegerType *Ty = ConstInfo.BaseConstant->getType();
+    Instruction *Base =
+      new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP);
+    DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB "
+                 << IP->getParent()->getName() << '\n' << *Base << '\n');
+    NumConstantsHoisted++;
+
+    // Emit materialization code for all rebased constants.
+    for (auto const &RCI : ConstInfo.RebasedConstants) {
+      NumConstantsRebased++;
+      for (auto const &U : RCI.Uses)
+        emitBaseConstants(Base, RCI.Offset, U);
+    }
+
+    // Use the same debug location as the last user of the constant.
+    assert(!Base->use_empty() && "The use list is empty!?");
+    assert(isa<Instruction>(Base->user_back()) &&
+           "All uses should be instructions.");
+    Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc());
+
+    // Correct for base constant, which we counted above too.
+    NumConstantsRebased--;
+    MadeChange = true;
+  }
+  return MadeChange;
+}
+
+/// \brief Check all cast instructions we made a copy of and remove them if they
+/// have no more users.
+void ConstantHoisting::deleteDeadCastInst() const {
+  for (auto const &I : ClonedCastMap)
+    if (I.first->use_empty())
+      I.first->eraseFromParent();
+}
+
+/// \brief Optimize expensive integer constants in the given function.
+bool ConstantHoisting::optimizeConstants(Function &Fn) {
+  // Collect all constant candidates.
+  collectConstantCandidates(Fn);
+
+  // There are no constant candidates to worry about.
+  if (ConstCandVec.empty())
+    return false;
+
+  // Combine constants that can be easily materialized with an add from a common
+  // base constant.
+  findBaseConstants();
+
+  // There are no constants to emit.
+  if (ConstantVec.empty())
+    return false;
+
+  // Finally hoist the base constant and emit materialization code for dependent
+  // constants.
+  bool MadeChange = emitBaseConstants();
+
+  // Cleanup dead instructions.
+  deleteDeadCastInst();
+
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
new file mode 100644
index 0000000..c974ebb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -0,0 +1,98 @@
+//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements constant propagation and merging:
+//
+// Specifically, this:
+//   * Converts instructions like "add int 1, 2" into 3
+//
+// Notice that:
+//   * This pass has a habit of making definitions be dead.  It is a good idea
+//     to run a DIE pass sometime after running this pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "constprop"
+
+STATISTIC(NumInstKilled, "Number of instructions killed");
+
+namespace {
+  struct ConstantPropagation : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    ConstantPropagation() : FunctionPass(ID) {
+      initializeConstantPropagationPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+    }
+  };
+}
+
+char ConstantPropagation::ID = 0;
+INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop",
+                "Simple constant propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(ConstantPropagation, "constprop",
+                "Simple constant propagation", false, false)
+
+FunctionPass *llvm::createConstantPropagationPass() {
+  return new ConstantPropagation();
+}
+
+bool ConstantPropagation::runOnFunction(Function &F) {
+  // Initialize the worklist to all of the instructions ready to process...
+  std::set<Instruction*> WorkList;
+  for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+      WorkList.insert(&*i);
+  }
+  bool Changed = false;
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+  while (!WorkList.empty()) {
+    Instruction *I = *WorkList.begin();
+    WorkList.erase(WorkList.begin());    // Get an element from the worklist...
+
+    if (!I->use_empty())                 // Don't muck with dead instructions...
+      if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
+        // Add all of the users of this instruction to the worklist, they might
+        // be constant propagatable now...
+        for (User *U : I->users())
+          WorkList.insert(cast<Instruction>(U));
+
+        // Replace all of the uses of a variable with uses of the constant.
+        I->replaceAllUsesWith(C);
+
+        // Remove the dead instruction.
+        WorkList.erase(I);
+        I->eraseFromParent();
+
+        // We made a change to the function...
+        Changed = true;
+        ++NumInstKilled;
+      }
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
new file mode 100644
index 0000000..686bd40
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -0,0 +1,419 @@
+//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Correlated Value Propagation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "correlated-value-propagation"
+
+STATISTIC(NumPhis,      "Number of phis propagated");
+STATISTIC(NumSelects,   "Number of selects propagated");
+STATISTIC(NumMemAccess, "Number of memory access targets propagated");
+STATISTIC(NumCmps,      "Number of comparisons propagated");
+STATISTIC(NumReturns,   "Number of return values propagated");
+STATISTIC(NumDeadCases, "Number of switch cases removed");
+
+namespace {
+  class CorrelatedValuePropagation : public FunctionPass {
+    LazyValueInfo *LVI;
+
+    bool processSelect(SelectInst *SI);
+    bool processPHI(PHINode *P);
+    bool processMemAccess(Instruction *I);
+    bool processCmp(CmpInst *C);
+    bool processSwitch(SwitchInst *SI);
+    bool processCallSite(CallSite CS);
+
+    /// Return a constant value for V usable at At and everything it
+    /// dominates.  If no such Constant can be found, return nullptr.
+    Constant *getConstantAt(Value *V, Instruction *At);
+
+  public:
+    static char ID;
+    CorrelatedValuePropagation(): FunctionPass(ID) {
+     initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LazyValueInfo>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+    }
+  };
+}
+
+char CorrelatedValuePropagation::ID = 0;
+INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
+                "Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
+INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
+                "Value Propagation", false, false)
+
+// Public interface to the Value Propagation pass
+Pass *llvm::createCorrelatedValuePropagationPass() {
+  return new CorrelatedValuePropagation();
+}
+
+bool CorrelatedValuePropagation::processSelect(SelectInst *S) {
+  if (S->getType()->isVectorTy()) return false;
+  if (isa<Constant>(S->getOperand(0))) return false;
+
+  Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
+  if (!C) return false;
+
+  ConstantInt *CI = dyn_cast<ConstantInt>(C);
+  if (!CI) return false;
+
+  Value *ReplaceWith = S->getOperand(1);
+  Value *Other = S->getOperand(2);
+  if (!CI->isOne()) std::swap(ReplaceWith, Other);
+  if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());
+
+  S->replaceAllUsesWith(ReplaceWith);
+  S->eraseFromParent();
+
+  ++NumSelects;
+
+  return true;
+}
+
+bool CorrelatedValuePropagation::processPHI(PHINode *P) {
+  bool Changed = false;
+
+  BasicBlock *BB = P->getParent();
+  for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+    Value *Incoming = P->getIncomingValue(i);
+    if (isa<Constant>(Incoming)) continue;
+
+    Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
+
+    // Look if the incoming value is a select with a scalar condition for which
+    // LVI can tells us the value. In that case replace the incoming value with
+    // the appropriate value of the select. This often allows us to remove the
+    // select later.
+    if (!V) {
+      SelectInst *SI = dyn_cast<SelectInst>(Incoming);
+      if (!SI) continue;
+
+      Value *Condition = SI->getCondition();
+      if (!Condition->getType()->isVectorTy()) {
+        if (Constant *C = LVI->getConstantOnEdge(
+                Condition, P->getIncomingBlock(i), BB, P)) {
+          if (C->isOneValue()) {
+            V = SI->getTrueValue();
+          } else if (C->isZeroValue()) {
+            V = SI->getFalseValue();
+          }
+          // Once LVI learns to handle vector types, we could also add support
+          // for vector type constants that are not all zeroes or all ones.
+        }
+      }
+
+      // Look if the select has a constant but LVI tells us that the incoming
+      // value can never be that constant. In that case replace the incoming
+      // value with the other value of the select. This often allows us to
+      // remove the select later.
+      if (!V) {
+        Constant *C = dyn_cast<Constant>(SI->getFalseValue());
+        if (!C) continue;
+
+        if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
+              P->getIncomingBlock(i), BB, P) !=
+            LazyValueInfo::False)
+          continue;
+        V = SI->getTrueValue();
+      }
+
+      DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+    }
+
+    P->setIncomingValue(i, V);
+    Changed = true;
+  }
+
+  // FIXME: Provide TLI, DT, AT to SimplifyInstruction.
+  const DataLayout &DL = BB->getModule()->getDataLayout();
+  if (Value *V = SimplifyInstruction(P, DL)) {
+    P->replaceAllUsesWith(V);
+    P->eraseFromParent();
+    Changed = true;
+  }
+
+  if (Changed)
+    ++NumPhis;
+
+  return Changed;
+}
+
+bool CorrelatedValuePropagation::processMemAccess(Instruction *I) {
+  Value *Pointer = nullptr;
+  if (LoadInst *L = dyn_cast<LoadInst>(I))
+    Pointer = L->getPointerOperand();
+  else
+    Pointer = cast<StoreInst>(I)->getPointerOperand();
+
+  if (isa<Constant>(Pointer)) return false;
+
+  Constant *C = LVI->getConstant(Pointer, I->getParent(), I);
+  if (!C) return false;
+
+  ++NumMemAccess;
+  I->replaceUsesOfWith(Pointer, C);
+  return true;
+}
+
+/// processCmp - See if LazyValueInfo's ability to exploit edge conditions,
+/// or range information is sufficient to prove this comparison.  Even for
+/// local conditions, this can sometimes prove conditions instcombine can't by
+/// exploiting range information.
+bool CorrelatedValuePropagation::processCmp(CmpInst *C) {
+  Value *Op0 = C->getOperand(0);
+  Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+  if (!Op1) return false;
+
+  // As a policy choice, we choose not to waste compile time on anything where
+  // the comparison is testing local values.  While LVI can sometimes reason
+  // about such cases, it's not its primary purpose.  We do make sure to do
+  // the block local query for uses from terminator instructions, but that's
+  // handled in the code for each terminator.
+  auto *I = dyn_cast<Instruction>(Op0);
+  if (I && I->getParent() == C->getParent())
+    return false;
+
+  LazyValueInfo::Tristate Result =
+    LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C);
+  if (Result == LazyValueInfo::Unknown) return false;
+
+  ++NumCmps;
+  if (Result == LazyValueInfo::True)
+    C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext()));
+  else
+    C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext()));
+  C->eraseFromParent();
+
+  return true;
+}
+
+/// processSwitch - Simplify a switch instruction by removing cases which can
+/// never fire.  If the uselessness of a case could be determined locally then
+/// constant propagation would already have figured it out.  Instead, walk the
+/// predecessors and statically evaluate cases based on information available
+/// on that edge.  Cases that cannot fire no matter what the incoming edge can
+/// safely be removed.  If a case fires on every incoming edge then the entire
+/// switch can be removed and replaced with a branch to the case destination.
+bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) {
+  Value *Cond = SI->getCondition();
+  BasicBlock *BB = SI->getParent();
+
+  // If the condition was defined in same block as the switch then LazyValueInfo
+  // currently won't say anything useful about it, though in theory it could.
+  if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB)
+    return false;
+
+  // If the switch is unreachable then trying to improve it is a waste of time.
+  pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
+  if (PB == PE) return false;
+
+  // Analyse each switch case in turn.  This is done in reverse order so that
+  // removing a case doesn't cause trouble for the iteration.
+  bool Changed = false;
+  for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE;
+       ) {
+    ConstantInt *Case = CI.getCaseValue();
+
+    // Check to see if the switch condition is equal to/not equal to the case
+    // value on every incoming edge, equal/not equal being the same each time.
+    LazyValueInfo::Tristate State = LazyValueInfo::Unknown;
+    for (pred_iterator PI = PB; PI != PE; ++PI) {
+      // Is the switch condition equal to the case value?
+      LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ,
+                                                              Cond, Case, *PI,
+                                                              BB, SI);
+      // Give up on this case if nothing is known.
+      if (Value == LazyValueInfo::Unknown) {
+        State = LazyValueInfo::Unknown;
+        break;
+      }
+
+      // If this was the first edge to be visited, record that all other edges
+      // need to give the same result.
+      if (PI == PB) {
+        State = Value;
+        continue;
+      }
+
+      // If this case is known to fire for some edges and known not to fire for
+      // others then there is nothing we can do - give up.
+      if (Value != State) {
+        State = LazyValueInfo::Unknown;
+        break;
+      }
+    }
+
+    if (State == LazyValueInfo::False) {
+      // This case never fires - remove it.
+      CI.getCaseSuccessor()->removePredecessor(BB);
+      SI->removeCase(CI); // Does not invalidate the iterator.
+
+      // The condition can be modified by removePredecessor's PHI simplification
+      // logic.
+      Cond = SI->getCondition();
+
+      ++NumDeadCases;
+      Changed = true;
+    } else if (State == LazyValueInfo::True) {
+      // This case always fires.  Arrange for the switch to be turned into an
+      // unconditional branch by replacing the switch condition with the case
+      // value.
+      SI->setCondition(Case);
+      NumDeadCases += SI->getNumCases();
+      Changed = true;
+      break;
+    }
+  }
+
+  if (Changed)
+    // If the switch has been simplified to the point where it can be replaced
+    // by a branch then do so now.
+    ConstantFoldTerminator(BB);
+
+  return Changed;
+}
+
+/// processCallSite - Infer nonnull attributes for the arguments at the
+/// specified callsite.
+bool CorrelatedValuePropagation::processCallSite(CallSite CS) {
+  SmallVector<unsigned, 4> Indices;
+  unsigned ArgNo = 0;
+
+  for (Value *V : CS.args()) {
+    PointerType *Type = dyn_cast<PointerType>(V->getType());
+
+    if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
+        LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
+                            ConstantPointerNull::get(Type),
+                            CS.getInstruction()) == LazyValueInfo::False)
+      Indices.push_back(ArgNo + 1);
+    ArgNo++;
+  }
+
+  assert(ArgNo == CS.arg_size() && "sanity check");
+
+  if (Indices.empty())
+    return false;
+
+  AttributeSet AS = CS.getAttributes();
+  LLVMContext &Ctx = CS.getInstruction()->getContext();
+  AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull));
+  CS.setAttributes(AS);
+
+  return true;
+}
+
+Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) {
+  if (Constant *C = LVI->getConstant(V, At->getParent(), At))
+    return C;
+
+  // TODO: The following really should be sunk inside LVI's core algorithm, or
+  // at least the outer shims around such.
+  auto *C = dyn_cast<CmpInst>(V);
+  if (!C) return nullptr;
+
+  Value *Op0 = C->getOperand(0);
+  Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+  if (!Op1) return nullptr;
+  
+  LazyValueInfo::Tristate Result =
+    LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At);
+  if (Result == LazyValueInfo::Unknown)
+    return nullptr;
+  
+  return (Result == LazyValueInfo::True) ?
+    ConstantInt::getTrue(C->getContext()) :
+    ConstantInt::getFalse(C->getContext());
+}
+
+bool CorrelatedValuePropagation::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  LVI = &getAnalysis<LazyValueInfo>();
+
+  bool FnChanged = false;
+
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    bool BBChanged = false;
+    for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) {
+      Instruction *II = &*BI++;
+      switch (II->getOpcode()) {
+      case Instruction::Select:
+        BBChanged |= processSelect(cast<SelectInst>(II));
+        break;
+      case Instruction::PHI:
+        BBChanged |= processPHI(cast<PHINode>(II));
+        break;
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        BBChanged |= processCmp(cast<CmpInst>(II));
+        break;
+      case Instruction::Load:
+      case Instruction::Store:
+        BBChanged |= processMemAccess(II);
+        break;
+      case Instruction::Call:
+      case Instruction::Invoke:
+        BBChanged |= processCallSite(CallSite(II));
+        break;
+      }
+    }
+
+    Instruction *Term = FI->getTerminator();
+    switch (Term->getOpcode()) {
+    case Instruction::Switch:
+      BBChanged |= processSwitch(cast<SwitchInst>(Term));
+      break;
+    case Instruction::Ret: {
+      auto *RI = cast<ReturnInst>(Term);
+      // Try to determine the return value if we can.  This is mainly here to
+      // simplify the writing of unit tests, but also helps to enable IPO by
+      // constant folding the return values of callees.
+      auto *RetVal = RI->getReturnValue();
+      if (!RetVal) break; // handle "ret void"
+      if (isa<Constant>(RetVal)) break; // nothing to do
+      if (auto *C = getConstantAt(RetVal, RI)) {
+        ++NumReturns;
+        RI->replaceUsesOfWith(RetVal, C);
+        BBChanged = true;        
+      }
+    }
+    };
+
+    FnChanged |= BBChanged;
+  }
+
+  return FnChanged;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
new file mode 100644
index 0000000..b67c3c7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -0,0 +1,156 @@
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead inst elimination and dead code elimination.
+//
+// Dead Inst Elimination performs a single pass over the function removing
+// instructions that are obviously dead.  Dead Code Elimination is similar, but
+// it rechecks instructions that were used by removed instructions to see if
+// they are newly dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dce"
+
+STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
+STATISTIC(DCEEliminated, "Number of insts removed");
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  // DeadInstElimination pass implementation
+  //
+  struct DeadInstElimination : public BasicBlockPass {
+    static char ID; // Pass identification, replacement for typeid
+    DeadInstElimination() : BasicBlockPass(ID) {
+      initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
+    }
+    bool runOnBasicBlock(BasicBlock &BB) override {
+      if (skipOptnoneFunction(BB))
+        return false;
+      auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+      TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+      bool Changed = false;
+      for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
+        Instruction *Inst = &*DI++;
+        if (isInstructionTriviallyDead(Inst, TLI)) {
+          Inst->eraseFromParent();
+          Changed = true;
+          ++DIEEliminated;
+        }
+      }
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+    }
+  };
+}
+
+char DeadInstElimination::ID = 0;
+INITIALIZE_PASS(DeadInstElimination, "die",
+                "Dead Instruction Elimination", false, false)
+
+Pass *llvm::createDeadInstEliminationPass() {
+  return new DeadInstElimination();
+}
+
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  // DeadCodeElimination pass implementation
+  //
+  struct DCE : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    DCE() : FunctionPass(ID) {
+      initializeDCEPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+    }
+ };
+}
+
+char DCE::ID = 0;
+INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false)
+
+static bool DCEInstruction(Instruction *I,
+                           SmallSetVector<Instruction *, 16> &WorkList,
+                           const TargetLibraryInfo *TLI) {
+  if (isInstructionTriviallyDead(I, TLI)) {
+    // Null out all of the instruction's operands to see if any operand becomes
+    // dead as we go.
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      Value *OpV = I->getOperand(i);
+      I->setOperand(i, nullptr);
+
+      if (!OpV->use_empty() || I == OpV)
+        continue;
+
+      // If the operand is an instruction that became dead as we nulled out the
+      // operand, and if it is 'trivially' dead, delete it in a future loop
+      // iteration.
+      if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+        if (isInstructionTriviallyDead(OpI, TLI))
+          WorkList.insert(OpI);
+    }
+
+    I->eraseFromParent();
+    ++DCEEliminated;
+    return true;
+  }
+  return false;
+}
+
+bool DCE::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+
+  bool MadeChange = false;
+  SmallSetVector<Instruction *, 16> WorkList;
+  // Iterate over the original function, only adding insts to the worklist
+  // if they actually need to be revisited. This avoids having to pre-init
+  // the worklist with the entire function's worth of instructions.
+  for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) {
+    Instruction *I = &*FI;
+    ++FI;
+
+    // We're visiting this instruction now, so make sure it's not in the
+    // worklist from an earlier visit.
+    if (!WorkList.count(I))
+      MadeChange |= DCEInstruction(I, WorkList, TLI);
+  }
+
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.pop_back_val();
+    MadeChange |= DCEInstruction(I, WorkList, TLI);
+  }
+  return MadeChange;
+}
+
+FunctionPass *llvm::createDeadCodeEliminationPass() {
+  return new DCE();
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
new file mode 100644
index 0000000..36ad0a5
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -0,0 +1,961 @@
+//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a trivial dead store elimination that only considers
+// basic-block local redundant stores.
+//
+// FIXME: This should eventually be extended to be a post-dominator tree
+// traversal.  Doing so would be pretty trivial.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dse"
+
+STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
+STATISTIC(NumFastStores, "Number of stores deleted");
+STATISTIC(NumFastOther , "Number of other instrs removed");
+
+namespace {
+  struct DSE : public FunctionPass {
+    AliasAnalysis *AA;
+    MemoryDependenceAnalysis *MD;
+    DominatorTree *DT;
+    const TargetLibraryInfo *TLI;
+
+    static char ID; // Pass identification, replacement for typeid
+    DSE() : FunctionPass(ID), AA(nullptr), MD(nullptr), DT(nullptr) {
+      initializeDSEPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override {
+      if (skipOptnoneFunction(F))
+        return false;
+
+      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+      MD = &getAnalysis<MemoryDependenceAnalysis>();
+      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+      bool Changed = false;
+      for (BasicBlock &I : F)
+        // Only check non-dead blocks.  Dead blocks may have strange pointer
+        // cycles that will confuse alias analysis.
+        if (DT->isReachableFromEntry(&I))
+          Changed |= runOnBasicBlock(I);
+
+      AA = nullptr; MD = nullptr; DT = nullptr;
+      return Changed;
+    }
+
+    bool runOnBasicBlock(BasicBlock &BB);
+    bool MemoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI);
+    bool HandleFree(CallInst *F);
+    bool handleEndBlock(BasicBlock &BB);
+    void RemoveAccessedObjects(const MemoryLocation &LoadedLoc,
+                               SmallSetVector<Value *, 16> &DeadStackObjects,
+                               const DataLayout &DL);
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addRequired<AAResultsWrapperPass>();
+      AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+      AU.addPreserved<MemoryDependenceAnalysis>();
+    }
+  };
+}
+
+char DSE::ID = 0;
+INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false)
+
+FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+
+/// DeleteDeadInstruction - Delete this instruction.  Before we do, go through
+/// and zero out all the operands of this instruction.  If any of them become
+/// dead, delete them and the computation tree that feeds them.
+///
+/// If ValueSet is non-null, remove any deleted instructions from it as well.
+///
+static void DeleteDeadInstruction(Instruction *I,
+                               MemoryDependenceAnalysis &MD,
+                               const TargetLibraryInfo &TLI,
+                               SmallSetVector<Value*, 16> *ValueSet = nullptr) {
+  SmallVector<Instruction*, 32> NowDeadInsts;
+
+  NowDeadInsts.push_back(I);
+  --NumFastOther;
+
+  // Before we touch this instruction, remove it from memdep!
+  do {
+    Instruction *DeadInst = NowDeadInsts.pop_back_val();
+    ++NumFastOther;
+
+    // This instruction is dead, zap it, in stages.  Start by removing it from
+    // MemDep, which needs to know the operands and needs it to be in the
+    // function.
+    MD.removeInstruction(DeadInst);
+
+    for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
+      Value *Op = DeadInst->getOperand(op);
+      DeadInst->setOperand(op, nullptr);
+
+      // If this operand just became dead, add it to the NowDeadInsts list.
+      if (!Op->use_empty()) continue;
+
+      if (Instruction *OpI = dyn_cast<Instruction>(Op))
+        if (isInstructionTriviallyDead(OpI, &TLI))
+          NowDeadInsts.push_back(OpI);
+    }
+
+    DeadInst->eraseFromParent();
+
+    if (ValueSet) ValueSet->remove(DeadInst);
+  } while (!NowDeadInsts.empty());
+}
+
+
+/// hasMemoryWrite - Does this instruction write some memory?  This only returns
+/// true for things that we can analyze with other helpers below.
+static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
+  if (isa<StoreInst>(I))
+    return true;
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    default:
+      return false;
+    case Intrinsic::memset:
+    case Intrinsic::memmove:
+    case Intrinsic::memcpy:
+    case Intrinsic::init_trampoline:
+    case Intrinsic::lifetime_end:
+      return true;
+    }
+  }
+  if (auto CS = CallSite(I)) {
+    if (Function *F = CS.getCalledFunction()) {
+      if (TLI.has(LibFunc::strcpy) &&
+          F->getName() == TLI.getName(LibFunc::strcpy)) {
+        return true;
+      }
+      if (TLI.has(LibFunc::strncpy) &&
+          F->getName() == TLI.getName(LibFunc::strncpy)) {
+        return true;
+      }
+      if (TLI.has(LibFunc::strcat) &&
+          F->getName() == TLI.getName(LibFunc::strcat)) {
+        return true;
+      }
+      if (TLI.has(LibFunc::strncat) &&
+          F->getName() == TLI.getName(LibFunc::strncat)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getLocForWrite - Return a Location stored to by the specified instruction.
+/// If isRemovable returns true, this function and getLocForRead completely
+/// describe the memory operations for this instruction.
+static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+    return MemoryLocation::get(SI);
+
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) {
+    // memcpy/memmove/memset.
+    MemoryLocation Loc = MemoryLocation::getForDest(MI);
+    return Loc;
+  }
+
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
+  if (!II)
+    return MemoryLocation();
+
+  switch (II->getIntrinsicID()) {
+  default:
+    return MemoryLocation(); // Unhandled intrinsic.
+  case Intrinsic::init_trampoline:
+    // FIXME: We don't know the size of the trampoline, so we can't really
+    // handle it here.
+    return MemoryLocation(II->getArgOperand(0));
+  case Intrinsic::lifetime_end: {
+    uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+    return MemoryLocation(II->getArgOperand(1), Len);
+  }
+  }
+}
+
+/// getLocForRead - Return the location read by the specified "hasMemoryWrite"
+/// instruction if any.
+static MemoryLocation getLocForRead(Instruction *Inst,
+                                    const TargetLibraryInfo &TLI) {
+  assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case");
+
+  // The only instructions that both read and write are the mem transfer
+  // instructions (memcpy/memmove).
+  if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst))
+    return MemoryLocation::getForSource(MTI);
+  return MemoryLocation();
+}
+
+
+/// isRemovable - If the value of this instruction and the memory it writes to
+/// is unused, may we delete this instruction?
+static bool isRemovable(Instruction *I) {
+  // Don't remove volatile/atomic stores.
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isUnordered();
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
+    case Intrinsic::lifetime_end:
+      // Never remove dead lifetime_end's, e.g. because it is followed by a
+      // free.
+      return false;
+    case Intrinsic::init_trampoline:
+      // Always safe to remove init_trampoline.
+      return true;
+
+    case Intrinsic::memset:
+    case Intrinsic::memmove:
+    case Intrinsic::memcpy:
+      // Don't remove volatile memory intrinsics.
+      return !cast<MemIntrinsic>(II)->isVolatile();
+    }
+  }
+
+  if (auto CS = CallSite(I))
+    return CS.getInstruction()->use_empty();
+
+  return false;
+}
+
+
+/// isShortenable - Returns true if this instruction can be safely shortened in
+/// length.
+static bool isShortenable(Instruction *I) {
+  // Don't shorten stores for now
+  if (isa<StoreInst>(I))
+    return false;
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+      default: return false;
+      case Intrinsic::memset:
+      case Intrinsic::memcpy:
+        // Do shorten memory intrinsics.
+        return true;
+    }
+  }
+
+  // Don't shorten libcalls calls for now.
+
+  return false;
+}
+
+/// getStoredPointerOperand - Return the pointer that is being written to.
+static Value *getStoredPointerOperand(Instruction *I) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getPointerOperand();
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+    return MI->getDest();
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::init_trampoline:
+      return II->getArgOperand(0);
+    }
+  }
+
+  CallSite CS(I);
+  // All the supported functions so far happen to have dest as their first
+  // argument.
+  return CS.getArgument(0);
+}
+
+static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
+                               const TargetLibraryInfo &TLI) {
+  uint64_t Size;
+  if (getObjectSize(V, Size, DL, &TLI))
+    return Size;
+  return MemoryLocation::UnknownSize;
+}
+
+namespace {
+  enum OverwriteResult
+  {
+    OverwriteComplete,
+    OverwriteEnd,
+    OverwriteUnknown
+  };
+}
+
+/// isOverwrite - Return 'OverwriteComplete' if a store to the 'Later' location
+/// completely overwrites a store to the 'Earlier' location.
+/// 'OverwriteEnd' if the end of the 'Earlier' location is completely
+/// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined
+static OverwriteResult isOverwrite(const MemoryLocation &Later,
+                                   const MemoryLocation &Earlier,
+                                   const DataLayout &DL,
+                                   const TargetLibraryInfo &TLI,
+                                   int64_t &EarlierOff, int64_t &LaterOff) {
+  const Value *P1 = Earlier.Ptr->stripPointerCasts();
+  const Value *P2 = Later.Ptr->stripPointerCasts();
+
+  // If the start pointers are the same, we just have to compare sizes to see if
+  // the later store was larger than the earlier store.
+  if (P1 == P2) {
+    // If we don't know the sizes of either access, then we can't do a
+    // comparison.
+    if (Later.Size == MemoryLocation::UnknownSize ||
+        Earlier.Size == MemoryLocation::UnknownSize)
+      return OverwriteUnknown;
+
+    // Make sure that the Later size is >= the Earlier size.
+    if (Later.Size >= Earlier.Size)
+      return OverwriteComplete;
+  }
+
+  // Otherwise, we have to have size information, and the later store has to be
+  // larger than the earlier one.
+  if (Later.Size == MemoryLocation::UnknownSize ||
+      Earlier.Size == MemoryLocation::UnknownSize)
+    return OverwriteUnknown;
+
+  // Check to see if the later store is to the entire object (either a global,
+  // an alloca, or a byval/inalloca argument).  If so, then it clearly
+  // overwrites any other store to the same object.
+  const Value *UO1 = GetUnderlyingObject(P1, DL),
+              *UO2 = GetUnderlyingObject(P2, DL);
+
+  // If we can't resolve the same pointers to the same object, then we can't
+  // analyze them at all.
+  if (UO1 != UO2)
+    return OverwriteUnknown;
+
+  // If the "Later" store is to a recognizable object, get its size.
+  uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
+  if (ObjectSize != MemoryLocation::UnknownSize)
+    if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
+      return OverwriteComplete;
+
+  // Okay, we have stores to two completely different pointers.  Try to
+  // decompose the pointer into a "base + constant_offset" form.  If the base
+  // pointers are equal, then we can reason about the two stores.
+  EarlierOff = 0;
+  LaterOff = 0;
+  const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
+  const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);
+
+  // If the base pointers still differ, we have two completely different stores.
+  if (BP1 != BP2)
+    return OverwriteUnknown;
+
+  // The later store completely overlaps the earlier store if:
+  //
+  // 1. Both start at the same offset and the later one's size is greater than
+  //    or equal to the earlier one's, or
+  //
+  //      |--earlier--|
+  //      |--   later   --|
+  //
+  // 2. The earlier store has an offset greater than the later offset, but which
+  //    still lies completely within the later store.
+  //
+  //        |--earlier--|
+  //    |-----  later  ------|
+  //
+  // We have to be careful here as *Off is signed while *.Size is unsigned.
+  if (EarlierOff >= LaterOff &&
+      Later.Size >= Earlier.Size &&
+      uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
+    return OverwriteComplete;
+
+  // The other interesting case is if the later store overwrites the end of
+  // the earlier store
+  //
+  //      |--earlier--|
+  //                |--   later   --|
+  //
+  // In this case we may want to trim the size of earlier to avoid generating
+  // writes to addresses which will definitely be overwritten later
+  if (LaterOff > EarlierOff &&
+      LaterOff < int64_t(EarlierOff + Earlier.Size) &&
+      int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size))
+    return OverwriteEnd;
+
+  // Otherwise, they don't completely overlap.
+  return OverwriteUnknown;
+}
+
+/// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a
+/// memory region into an identical pointer) then it doesn't actually make its
+/// input dead in the traditional sense.  Consider this case:
+///
+///   memcpy(A <- B)
+///   memcpy(A <- A)
+///
+/// In this case, the second store to A does not make the first store to A dead.
+/// The usual situation isn't an explicit A<-A store like this (which can be
+/// trivially removed) but a case where two pointers may alias.
+///
+/// This function detects when it is unsafe to remove a dependent instruction
+/// because the DSE inducing instruction may be a self-read.
+static bool isPossibleSelfRead(Instruction *Inst,
+                               const MemoryLocation &InstStoreLoc,
+                               Instruction *DepWrite,
+                               const TargetLibraryInfo &TLI,
+                               AliasAnalysis &AA) {
+  // Self reads can only happen for instructions that read memory.  Get the
+  // location read.
+  MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
+  if (!InstReadLoc.Ptr) return false;  // Not a reading instruction.
+
+  // If the read and written loc obviously don't alias, it isn't a read.
+  if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false;
+
+  // Okay, 'Inst' may copy over itself.  However, we can still remove a the
+  // DepWrite instruction if we can prove that it reads from the same location
+  // as Inst.  This handles useful cases like:
+  //   memcpy(A <- B)
+  //   memcpy(A <- B)
+  // Here we don't know if A/B may alias, but we do know that B/B are must
+  // aliases, so removing the first memcpy is safe (assuming it writes <= #
+  // bytes as the second one.
+  MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
+
+  if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+    return false;
+
+  // If DepWrite doesn't read memory or if we can't prove it is a must alias,
+  // then it can't be considered dead.
+  return true;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DSE Pass
+//===----------------------------------------------------------------------===//
+
+bool DSE::runOnBasicBlock(BasicBlock &BB) {
+  const DataLayout &DL = BB.getModule()->getDataLayout();
+  bool MadeChange = false;
+
+  // Do a top-down walk on the BB.
+  for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
+    Instruction *Inst = &*BBI++;
+
+    // Handle 'free' calls specially.
+    if (CallInst *F = isFreeCall(Inst, TLI)) {
+      MadeChange |= HandleFree(F);
+      continue;
+    }
+
+    // If we find something that writes memory, get its memory dependence.
+    if (!hasMemoryWrite(Inst, *TLI))
+      continue;
+
+    // If we're storing the same value back to a pointer that we just
+    // loaded from, then the store can be removed.
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+
+      auto RemoveDeadInstAndUpdateBBI = [&](Instruction *DeadInst) {
+        // DeleteDeadInstruction can delete the current instruction.  Save BBI
+        // in case we need it.
+        WeakVH NextInst(&*BBI);
+
+        DeleteDeadInstruction(DeadInst, *MD, *TLI);
+
+        if (!NextInst) // Next instruction deleted.
+          BBI = BB.begin();
+        else if (BBI != BB.begin()) // Revisit this instruction if possible.
+          --BBI;
+        ++NumRedundantStores;
+        MadeChange = true;
+      };
+
+      if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
+        if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
+            isRemovable(SI) &&
+            MemoryIsNotModifiedBetween(DepLoad, SI)) {
+
+          DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n  "
+                       << "LOAD: " << *DepLoad << "\n  STORE: " << *SI << '\n');
+
+          RemoveDeadInstAndUpdateBBI(SI);
+          continue;
+        }
+      }
+
+      // Remove null stores into the calloc'ed objects
+      Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
+
+      if (StoredConstant && StoredConstant->isNullValue() &&
+          isRemovable(SI)) {
+        Instruction *UnderlyingPointer = dyn_cast<Instruction>(
+            GetUnderlyingObject(SI->getPointerOperand(), DL));
+
+        if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
+            MemoryIsNotModifiedBetween(UnderlyingPointer, SI)) {
+          DEBUG(dbgs()
+                << "DSE: Remove null store to the calloc'ed object:\n  DEAD: "
+                << *Inst << "\n  OBJECT: " << *UnderlyingPointer << '\n');
+
+          RemoveDeadInstAndUpdateBBI(SI);
+          continue;
+        }
+      }
+    }
+
+    MemDepResult InstDep = MD->getDependency(Inst);
+
+    // Ignore any store where we can't find a local dependence.
+    // FIXME: cross-block DSE would be fun. :)
+    if (!InstDep.isDef() && !InstDep.isClobber())
+      continue;
+
+    // Figure out what location is being stored to.
+    MemoryLocation Loc = getLocForWrite(Inst, *AA);
+
+    // If we didn't get a useful location, fail.
+    if (!Loc.Ptr)
+      continue;
+
+    while (InstDep.isDef() || InstDep.isClobber()) {
+      // Get the memory clobbered by the instruction we depend on.  MemDep will
+      // skip any instructions that 'Loc' clearly doesn't interact with.  If we
+      // end up depending on a may- or must-aliased load, then we can't optimize
+      // away the store and we bail out.  However, if we depend on on something
+      // that overwrites the memory location we *can* potentially optimize it.
+      //
+      // Find out what memory location the dependent instruction stores.
+      Instruction *DepWrite = InstDep.getInst();
+      MemoryLocation DepLoc = getLocForWrite(DepWrite, *AA);
+      // If we didn't get a useful location, or if it isn't a size, bail out.
+      if (!DepLoc.Ptr)
+        break;
+
+      // If we find a write that is a) removable (i.e., non-volatile), b) is
+      // completely obliterated by the store to 'Loc', and c) which we know that
+      // 'Inst' doesn't load from, then we can remove it.
+      if (isRemovable(DepWrite) &&
+          !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
+        int64_t InstWriteOffset, DepWriteOffset;
+        OverwriteResult OR =
+            isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset);
+        if (OR == OverwriteComplete) {
+          DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: "
+                << *DepWrite << "\n  KILLER: " << *Inst << '\n');
+
+          // Delete the store and now-dead instructions that feed it.
+          DeleteDeadInstruction(DepWrite, *MD, *TLI);
+          ++NumFastStores;
+          MadeChange = true;
+
+          // DeleteDeadInstruction can delete the current instruction in loop
+          // cases, reset BBI.
+          BBI = Inst->getIterator();
+          if (BBI != BB.begin())
+            --BBI;
+          break;
+        } else if (OR == OverwriteEnd && isShortenable(DepWrite)) {
+          // TODO: base this on the target vector size so that if the earlier
+          // store was too small to get vector writes anyway then its likely
+          // a good idea to shorten it
+          // Power of 2 vector writes are probably always a bad idea to optimize
+          // as any store/memset/memcpy is likely using vector instructions so
+          // shortening it to not vector size is likely to be slower
+          MemIntrinsic* DepIntrinsic = cast<MemIntrinsic>(DepWrite);
+          unsigned DepWriteAlign = DepIntrinsic->getAlignment();
+          if (llvm::isPowerOf2_64(InstWriteOffset) ||
+              ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) {
+
+            DEBUG(dbgs() << "DSE: Remove Dead Store:\n  OW END: "
+                  << *DepWrite << "\n  KILLER (offset "
+                  << InstWriteOffset << ", "
+                  << DepLoc.Size << ")"
+                  << *Inst << '\n');
+
+            Value* DepWriteLength = DepIntrinsic->getLength();
+            Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(),
+                                                    InstWriteOffset -
+                                                    DepWriteOffset);
+            DepIntrinsic->setLength(TrimmedLength);
+            MadeChange = true;
+          }
+        }
+      }
+
+      // If this is a may-aliased store that is clobbering the store value, we
+      // can keep searching past it for another must-aliased pointer that stores
+      // to the same location.  For example, in:
+      //   store -> P
+      //   store -> Q
+      //   store -> P
+      // we can remove the first store to P even though we don't know if P and Q
+      // alias.
+      if (DepWrite == &BB.front()) break;
+
+      // Can't look past this instruction if it might read 'Loc'.
+      if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref)
+        break;
+
+      InstDep = MD->getPointerDependencyFrom(Loc, false,
+                                             DepWrite->getIterator(), &BB);
+    }
+  }
+
+  // If this block ends in a return, unwind, or unreachable, all allocas are
+  // dead at its end, which means stores to them are also dead.
+  if (BB.getTerminator()->getNumSuccessors() == 0)
+    MadeChange |= handleEndBlock(BB);
+
+  return MadeChange;
+}
+
+/// Returns true if the memory which is accessed by the second instruction is not
+/// modified between the first and the second instruction.
+/// Precondition: Second instruction must be dominated by the first
+/// instruction.
+bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI,
+                                     Instruction *SecondI) {
+  SmallVector<BasicBlock *, 16> WorkList;
+  SmallPtrSet<BasicBlock *, 8> Visited;
+  BasicBlock::iterator FirstBBI(FirstI);
+  ++FirstBBI;
+  BasicBlock::iterator SecondBBI(SecondI);
+  BasicBlock *FirstBB = FirstI->getParent();
+  BasicBlock *SecondBB = SecondI->getParent();
+  MemoryLocation MemLoc = MemoryLocation::get(SecondI);
+
+  // Start checking the store-block.
+  WorkList.push_back(SecondBB);
+  bool isFirstBlock = true;
+
+  // Check all blocks going backward until we reach the load-block.
+  while (!WorkList.empty()) {
+    BasicBlock *B = WorkList.pop_back_val();
+
+    // Ignore instructions before LI if this is the FirstBB.
+    BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());
+
+    BasicBlock::iterator EI;
+    if (isFirstBlock) {
+      // Ignore instructions after SI if this is the first visit of SecondBB.
+      assert(B == SecondBB && "first block is not the store block");
+      EI = SecondBBI;
+      isFirstBlock = false;
+    } else {
+      // It's not SecondBB or (in case of a loop) the second visit of SecondBB.
+      // In this case we also have to look at instructions after SI.
+      EI = B->end();
+    }
+    for (; BI != EI; ++BI) {
+      Instruction *I = &*BI;
+      if (I->mayWriteToMemory() && I != SecondI) {
+        auto Res = AA->getModRefInfo(I, MemLoc);
+        if (Res != MRI_NoModRef)
+          return false;
+      }
+    }
+    if (B != FirstBB) {
+      assert(B != &FirstBB->getParent()->getEntryBlock() &&
+          "Should not hit the entry block because SI must be dominated by LI");
+      for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) {
+        if (!Visited.insert(*PredI).second)
+          continue;
+        WorkList.push_back(*PredI);
+      }
+    }
+  }
+  return true;
+}
+
+/// Find all blocks that will unconditionally lead to the block BB and append
+/// them to F.
+static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
+                                   BasicBlock *BB, DominatorTree *DT) {
+  for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+    BasicBlock *Pred = *I;
+    if (Pred == BB) continue;
+    TerminatorInst *PredTI = Pred->getTerminator();
+    if (PredTI->getNumSuccessors() != 1)
+      continue;
+
+    if (DT->isReachableFromEntry(Pred))
+      Blocks.push_back(Pred);
+  }
+}
+
+/// HandleFree - Handle frees of entire structures whose dependency is a store
+/// to a field of that structure.
+bool DSE::HandleFree(CallInst *F) {
+  bool MadeChange = false;
+
+  MemoryLocation Loc = MemoryLocation(F->getOperand(0));
+  SmallVector<BasicBlock *, 16> Blocks;
+  Blocks.push_back(F->getParent());
+  const DataLayout &DL = F->getModule()->getDataLayout();
+
+  while (!Blocks.empty()) {
+    BasicBlock *BB = Blocks.pop_back_val();
+    Instruction *InstPt = BB->getTerminator();
+    if (BB == F->getParent()) InstPt = F;
+
+    MemDepResult Dep =
+        MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
+    while (Dep.isDef() || Dep.isClobber()) {
+      Instruction *Dependency = Dep.getInst();
+      if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency))
+        break;
+
+      Value *DepPointer =
+          GetUnderlyingObject(getStoredPointerOperand(Dependency), DL);
+
+      // Check for aliasing.
+      if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
+        break;
+
+      auto Next = ++Dependency->getIterator();
+
+      // DCE instructions only used to calculate that store
+      DeleteDeadInstruction(Dependency, *MD, *TLI);
+      ++NumFastStores;
+      MadeChange = true;
+
+      // Inst's old Dependency is now deleted. Compute the next dependency,
+      // which may also be dead, as in
+      //    s[0] = 0;
+      //    s[1] = 0; // This has just been deleted.
+      //    free(s);
+      Dep = MD->getPointerDependencyFrom(Loc, false, Next, BB);
+    }
+
+    if (Dep.isNonLocal())
+      FindUnconditionalPreds(Blocks, BB, DT);
+  }
+
+  return MadeChange;
+}
+
+/// handleEndBlock - Remove dead stores to stack-allocated locations in the
+/// function end block.  Ex:
+/// %A = alloca i32
+/// ...
+/// store i32 1, i32* %A
+/// ret void
+bool DSE::handleEndBlock(BasicBlock &BB) {
+  bool MadeChange = false;
+
+  // Keep track of all of the stack objects that are dead at the end of the
+  // function.
+  SmallSetVector<Value*, 16> DeadStackObjects;
+
+  // Find all of the alloca'd pointers in the entry block.
+  BasicBlock &Entry = BB.getParent()->front();
+  for (Instruction &I : Entry) {
+    if (isa<AllocaInst>(&I))
+      DeadStackObjects.insert(&I);
+
+    // Okay, so these are dead heap objects, but if the pointer never escapes
+    // then it's leaked by this function anyways.
+    else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true))
+      DeadStackObjects.insert(&I);
+  }
+
+  // Treat byval or inalloca arguments the same, stores to them are dead at the
+  // end of the function.
+  for (Argument &AI : BB.getParent()->args())
+    if (AI.hasByValOrInAllocaAttr())
+      DeadStackObjects.insert(&AI);
+
+  const DataLayout &DL = BB.getModule()->getDataLayout();
+
+  // Scan the basic block backwards
+  for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
+    --BBI;
+
+    // If we find a store, check to see if it points into a dead stack value.
+    if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
+      // See through pointer-to-pointer bitcasts
+      SmallVector<Value *, 4> Pointers;
+      GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL);
+
+      // Stores to stack values are valid candidates for removal.
+      bool AllDead = true;
+      for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+           E = Pointers.end(); I != E; ++I)
+        if (!DeadStackObjects.count(*I)) {
+          AllDead = false;
+          break;
+        }
+
+      if (AllDead) {
+        Instruction *Dead = &*BBI++;
+
+        DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n  DEAD: "
+                     << *Dead << "\n  Objects: ";
+              for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+                   E = Pointers.end(); I != E; ++I) {
+                dbgs() << **I;
+                if (std::next(I) != E)
+                  dbgs() << ", ";
+              }
+              dbgs() << '\n');
+
+        // DCE instructions only used to calculate that store.
+        DeleteDeadInstruction(Dead, *MD, *TLI, &DeadStackObjects);
+        ++NumFastStores;
+        MadeChange = true;
+        continue;
+      }
+    }
+
+    // Remove any dead non-memory-mutating instructions.
+    if (isInstructionTriviallyDead(&*BBI, TLI)) {
+      Instruction *Inst = &*BBI++;
+      DeleteDeadInstruction(Inst, *MD, *TLI, &DeadStackObjects);
+      ++NumFastOther;
+      MadeChange = true;
+      continue;
+    }
+
+    if (isa<AllocaInst>(BBI)) {
+      // Remove allocas from the list of dead stack objects; there can't be
+      // any references before the definition.
+      DeadStackObjects.remove(&*BBI);
+      continue;
+    }
+
+    if (auto CS = CallSite(&*BBI)) {
+      // Remove allocation function calls from the list of dead stack objects; 
+      // there can't be any references before the definition.
+      if (isAllocLikeFn(&*BBI, TLI))
+        DeadStackObjects.remove(&*BBI);
+
+      // If this call does not access memory, it can't be loading any of our
+      // pointers.
+      if (AA->doesNotAccessMemory(CS))
+        continue;
+
+      // If the call might load from any of our allocas, then any store above
+      // the call is live.
+      DeadStackObjects.remove_if([&](Value *I) {
+        // See if the call site touches the value.
+        ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI));
+
+        return A == MRI_ModRef || A == MRI_Ref;
+      });
+
+      // If all of the allocas were clobbered by the call then we're not going
+      // to find anything else to process.
+      if (DeadStackObjects.empty())
+        break;
+
+      continue;
+    }
+
+    MemoryLocation LoadedLoc;
+
+    // If we encounter a use of the pointer, it is no longer considered dead
+    if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
+      if (!L->isUnordered()) // Be conservative with atomic/volatile load
+        break;
+      LoadedLoc = MemoryLocation::get(L);
+    } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
+      LoadedLoc = MemoryLocation::get(V);
+    } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) {
+      LoadedLoc = MemoryLocation::getForSource(MTI);
+    } else if (!BBI->mayReadFromMemory()) {
+      // Instruction doesn't read memory.  Note that stores that weren't removed
+      // above will hit this case.
+      continue;
+    } else {
+      // Unknown inst; assume it clobbers everything.
+      break;
+    }
+
+    // Remove any allocas from the DeadPointer set that are loaded, as this
+    // makes any stores above the access live.
+    RemoveAccessedObjects(LoadedLoc, DeadStackObjects, DL);
+
+    // If all of the allocas were clobbered by the access then we're not going
+    // to find anything else to process.
+    if (DeadStackObjects.empty())
+      break;
+  }
+
+  return MadeChange;
+}
+
+/// RemoveAccessedObjects - Check to see if the specified location may alias any
+/// of the stack objects in the DeadStackObjects set.  If so, they become live
+/// because the location is being loaded.
+void DSE::RemoveAccessedObjects(const MemoryLocation &LoadedLoc,
+                                SmallSetVector<Value *, 16> &DeadStackObjects,
+                                const DataLayout &DL) {
+  const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);
+
+  // A constant can't be in the dead pointer set.
+  if (isa<Constant>(UnderlyingPointer))
+    return;
+
+  // If the kill pointer can be easily reduced to an alloca, don't bother doing
+  // extraneous AA queries.
+  if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
+    DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer));
+    return;
+  }
+
+  // Remove objects that could alias LoadedLoc.
+  DeadStackObjects.remove_if([&](Value *I) {
+    // See if the loaded location could alias the stack location.
+    MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI));
+    return !AA->isNoAlias(StackLoc, LoadedLoc);
+  });
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
new file mode 100644
index 0000000..7ef062e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -0,0 +1,890 @@
+//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a simple dominator tree walk that eliminates trivially
+// redundant instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/RecyclingAllocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <deque>
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "early-cse"
+
+STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
+STATISTIC(NumCSE,      "Number of instructions CSE'd");
+STATISTIC(NumCSELoad,  "Number of load instructions CSE'd");
+STATISTIC(NumCSECall,  "Number of call instructions CSE'd");
+STATISTIC(NumDSE,      "Number of trivial dead stores removed");
+
+//===----------------------------------------------------------------------===//
+// SimpleValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Struct representing the available values in the scoped hash table.
+struct SimpleValue {
+  Instruction *Inst;
+
+  SimpleValue(Instruction *I) : Inst(I) {
+    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+  }
+
+  bool isSentinel() const {
+    return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+           Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+
+  static bool canHandle(Instruction *Inst) {
+    // This can only handle non-void readnone functions.
+    if (CallInst *CI = dyn_cast<CallInst>(Inst))
+      return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
+    return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) ||
+           isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) ||
+           isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+           isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+           isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
+  }
+};
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<SimpleValue> {
+  static inline SimpleValue getEmptyKey() {
+    return DenseMapInfo<Instruction *>::getEmptyKey();
+  }
+  static inline SimpleValue getTombstoneKey() {
+    return DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+  static unsigned getHashValue(SimpleValue Val);
+  static bool isEqual(SimpleValue LHS, SimpleValue RHS);
+};
+}
+
+unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
+  Instruction *Inst = Val.Inst;
+  // Hash in all of the operands as pointers.
+  if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) {
+    Value *LHS = BinOp->getOperand(0);
+    Value *RHS = BinOp->getOperand(1);
+    if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
+      std::swap(LHS, RHS);
+
+    if (isa<OverflowingBinaryOperator>(BinOp)) {
+      // Hash the overflow behavior
+      unsigned Overflow =
+          BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap |
+          BinOp->hasNoUnsignedWrap() *
+              OverflowingBinaryOperator::NoUnsignedWrap;
+      return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS);
+    }
+
+    return hash_combine(BinOp->getOpcode(), LHS, RHS);
+  }
+
+  if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
+    Value *LHS = CI->getOperand(0);
+    Value *RHS = CI->getOperand(1);
+    CmpInst::Predicate Pred = CI->getPredicate();
+    if (Inst->getOperand(0) > Inst->getOperand(1)) {
+      std::swap(LHS, RHS);
+      Pred = CI->getSwappedPredicate();
+    }
+    return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
+  }
+
+  if (CastInst *CI = dyn_cast<CastInst>(Inst))
+    return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
+
+  if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst))
+    return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
+                        hash_combine_range(EVI->idx_begin(), EVI->idx_end()));
+
+  if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst))
+    return hash_combine(IVI->getOpcode(), IVI->getOperand(0),
+                        IVI->getOperand(1),
+                        hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
+
+  assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) ||
+          isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
+          isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
+          isa<ShuffleVectorInst>(Inst)) &&
+         "Invalid/unknown instruction");
+
+  // Mix in the opcode.
+  return hash_combine(
+      Inst->getOpcode(),
+      hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+}
+
+bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
+  Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+
+  if (LHS.isSentinel() || RHS.isSentinel())
+    return LHSI == RHSI;
+
+  if (LHSI->getOpcode() != RHSI->getOpcode())
+    return false;
+  if (LHSI->isIdenticalTo(RHSI))
+    return true;
+
+  // If we're not strictly identical, we still might be a commutable instruction
+  if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
+    if (!LHSBinOp->isCommutative())
+      return false;
+
+    assert(isa<BinaryOperator>(RHSI) &&
+           "same opcode, but different instruction type?");
+    BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
+
+    // Check overflow attributes
+    if (isa<OverflowingBinaryOperator>(LHSBinOp)) {
+      assert(isa<OverflowingBinaryOperator>(RHSBinOp) &&
+             "same opcode, but different operator type?");
+      if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() ||
+          LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap())
+        return false;
+    }
+
+    // Commuted equality
+    return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
+           LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
+  }
+  if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
+    assert(isa<CmpInst>(RHSI) &&
+           "same opcode, but different instruction type?");
+    CmpInst *RHSCmp = cast<CmpInst>(RHSI);
+    // Commuted equality
+    return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
+           LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
+           LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
+  }
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// CallValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Struct representing the available call values in the scoped hash
+/// table.
+struct CallValue {
+  Instruction *Inst;
+
+  CallValue(Instruction *I) : Inst(I) {
+    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+  }
+
+  bool isSentinel() const {
+    return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+           Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+
+  static bool canHandle(Instruction *Inst) {
+    // Don't value number anything that returns void.
+    if (Inst->getType()->isVoidTy())
+      return false;
+
+    CallInst *CI = dyn_cast<CallInst>(Inst);
+    if (!CI || !CI->onlyReadsMemory())
+      return false;
+    return true;
+  }
+};
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<CallValue> {
+  static inline CallValue getEmptyKey() {
+    return DenseMapInfo<Instruction *>::getEmptyKey();
+  }
+  static inline CallValue getTombstoneKey() {
+    return DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+  static unsigned getHashValue(CallValue Val);
+  static bool isEqual(CallValue LHS, CallValue RHS);
+};
+}
+
+unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
+  Instruction *Inst = Val.Inst;
+  // Hash all of the operands as pointers and mix in the opcode.
+  return hash_combine(
+      Inst->getOpcode(),
+      hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+}
+
+bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
+  Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+  if (LHS.isSentinel() || RHS.isSentinel())
+    return LHSI == RHSI;
+  return LHSI->isIdenticalTo(RHSI);
+}
+
+//===----------------------------------------------------------------------===//
+// EarlyCSE implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSE {
+public:
+  const TargetLibraryInfo &TLI;
+  const TargetTransformInfo &TTI;
+  DominatorTree &DT;
+  AssumptionCache &AC;
+  typedef RecyclingAllocator<
+      BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
+  typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
+                          AllocatorTy> ScopedHTType;
+
+  /// \brief A scoped hash table of the current values of all of our simple
+  /// scalar expressions.
+  ///
+  /// As we walk down the domtree, we look to see if instructions are in this:
+  /// if so, we replace them with what we find, otherwise we insert them so
+  /// that dominated values can succeed in their lookup.
+  ScopedHTType AvailableValues;
+
+  /// A scoped hash table of the current values of previously encounted memory
+  /// locations.
+  ///
+  /// This allows us to get efficient access to dominating loads or stores when
+  /// we have a fully redundant load.  In addition to the most recent load, we
+  /// keep track of a generation count of the read, which is compared against
+  /// the current generation count.  The current generation count is incremented
+  /// after every possibly writing memory operation, which ensures that we only
+  /// CSE loads with other loads that have no intervening store.  Ordering
+  /// events (such as fences or atomic instructions) increment the generation
+  /// count as well; essentially, we model these as writes to all possible
+  /// locations.  Note that atomic and/or volatile loads and stores can be
+  /// present the table; it is the responsibility of the consumer to inspect
+  /// the atomicity/volatility if needed.
+  struct LoadValue {
+    Value *Data;
+    unsigned Generation;
+    int MatchingId;
+    bool IsAtomic;
+    LoadValue()
+      : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {}
+    LoadValue(Value *Data, unsigned Generation, unsigned MatchingId,
+              bool IsAtomic)
+      : Data(Data), Generation(Generation), MatchingId(MatchingId),
+        IsAtomic(IsAtomic) {}
+  };
+  typedef RecyclingAllocator<BumpPtrAllocator,
+                             ScopedHashTableVal<Value *, LoadValue>>
+      LoadMapAllocator;
+  typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
+                          LoadMapAllocator> LoadHTType;
+  LoadHTType AvailableLoads;
+
+  /// \brief A scoped hash table of the current values of read-only call
+  /// values.
+  ///
+  /// It uses the same generation count as loads.
+  typedef ScopedHashTable<CallValue, std::pair<Value *, unsigned>> CallHTType;
+  CallHTType AvailableCalls;
+
+  /// \brief This is the current generation of the memory value.
+  unsigned CurrentGeneration;
+
+  /// \brief Set up the EarlyCSE runner for a particular function.
+  EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI,
+           DominatorTree &DT, AssumptionCache &AC)
+      : TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {}
+
+  bool run();
+
+private:
+  // Almost a POD, but needs to call the constructors for the scoped hash
+  // tables so that a new scope gets pushed on. These are RAII so that the
+  // scope gets popped when the NodeScope is destroyed.
+  class NodeScope {
+  public:
+    NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+              CallHTType &AvailableCalls)
+        : Scope(AvailableValues), LoadScope(AvailableLoads),
+          CallScope(AvailableCalls) {}
+
+  private:
+    NodeScope(const NodeScope &) = delete;
+    void operator=(const NodeScope &) = delete;
+
+    ScopedHTType::ScopeTy Scope;
+    LoadHTType::ScopeTy LoadScope;
+    CallHTType::ScopeTy CallScope;
+  };
+
+  // Contains all the needed information to create a stack for doing a depth
+  // first tranversal of the tree. This includes scopes for values, loads, and
+  // calls as well as the generation. There is a child iterator so that the
+  // children do not need to be store spearately.
+  class StackNode {
+  public:
+    StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+              CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
+              DomTreeNode::iterator child, DomTreeNode::iterator end)
+        : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
+          EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls),
+          Processed(false) {}
+
+    // Accessors.
+    unsigned currentGeneration() { return CurrentGeneration; }
+    unsigned childGeneration() { return ChildGeneration; }
+    void childGeneration(unsigned generation) { ChildGeneration = generation; }
+    DomTreeNode *node() { return Node; }
+    DomTreeNode::iterator childIter() { return ChildIter; }
+    DomTreeNode *nextChild() {
+      DomTreeNode *child = *ChildIter;
+      ++ChildIter;
+      return child;
+    }
+    DomTreeNode::iterator end() { return EndIter; }
+    bool isProcessed() { return Processed; }
+    void process() { Processed = true; }
+
+  private:
+    StackNode(const StackNode &) = delete;
+    void operator=(const StackNode &) = delete;
+
+    // Members.
+    unsigned CurrentGeneration;
+    unsigned ChildGeneration;
+    DomTreeNode *Node;
+    DomTreeNode::iterator ChildIter;
+    DomTreeNode::iterator EndIter;
+    NodeScope Scopes;
+    bool Processed;
+  };
+
+  /// \brief Wrapper class to handle memory instructions, including loads,
+  /// stores and intrinsic loads and stores defined by the target.
+  class ParseMemoryInst {
+  public:
+    ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
+      : IsTargetMemInst(false), Inst(Inst) {
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+        if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1)
+          IsTargetMemInst = true;
+    }
+    bool isLoad() const {
+      if (IsTargetMemInst) return Info.ReadMem;
+      return isa<LoadInst>(Inst);
+    }
+    bool isStore() const {
+      if (IsTargetMemInst) return Info.WriteMem;
+      return isa<StoreInst>(Inst);
+    }
+    bool isAtomic() const {
+      if (IsTargetMemInst) {
+        assert(Info.IsSimple && "need to refine IsSimple in TTI");
+        return false;
+      }
+      return Inst->isAtomic();
+    }
+    bool isUnordered() const {
+      if (IsTargetMemInst) {
+        assert(Info.IsSimple && "need to refine IsSimple in TTI");
+        return true;
+      }
+      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        return LI->isUnordered();
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        return SI->isUnordered();
+      }
+      // Conservative answer
+      return !Inst->isAtomic();
+    }
+
+    bool isVolatile() const {
+      if (IsTargetMemInst) {
+        assert(Info.IsSimple && "need to refine IsSimple in TTI");
+        return false;
+      }
+      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        return LI->isVolatile();
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        return SI->isVolatile();
+      }
+      // Conservative answer
+      return true;
+    }
+
+    
+    bool isMatchingMemLoc(const ParseMemoryInst &Inst) const {
+      return (getPointerOperand() == Inst.getPointerOperand() &&
+              getMatchingId() == Inst.getMatchingId());
+    }
+    bool isValid() const { return getPointerOperand() != nullptr; }
+
+    // For regular (non-intrinsic) loads/stores, this is set to -1. For
+    // intrinsic loads/stores, the id is retrieved from the corresponding
+    // field in the MemIntrinsicInfo structure.  That field contains
+    // non-negative values only.
+    int getMatchingId() const {
+      if (IsTargetMemInst) return Info.MatchingId;
+      return -1;
+    }
+    Value *getPointerOperand() const {
+      if (IsTargetMemInst) return Info.PtrVal;
+      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        return LI->getPointerOperand();
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        return SI->getPointerOperand();
+      }
+      return nullptr;
+    }
+    bool mayReadFromMemory() const {
+      if (IsTargetMemInst) return Info.ReadMem;
+      return Inst->mayReadFromMemory();
+    }
+    bool mayWriteToMemory() const {
+      if (IsTargetMemInst) return Info.WriteMem;
+      return Inst->mayWriteToMemory();
+    }
+
+  private:
+    bool IsTargetMemInst;
+    MemIntrinsicInfo Info;
+    Instruction *Inst;
+  };
+
+  bool processNode(DomTreeNode *Node);
+
+  Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+      return LI;
+    else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+      return SI->getValueOperand();
+    assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
+    return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
+                                                 ExpectedType);
+  }
+};
+}
+
+bool EarlyCSE::processNode(DomTreeNode *Node) {
+  BasicBlock *BB = Node->getBlock();
+
+  // If this block has a single predecessor, then the predecessor is the parent
+  // of the domtree node and all of the live out memory values are still current
+  // in this block.  If this block has multiple predecessors, then they could
+  // have invalidated the live-out memory values of our parent value.  For now,
+  // just be conservative and invalidate memory if this block has multiple
+  // predecessors.
+  if (!BB->getSinglePredecessor())
+    ++CurrentGeneration;
+
+  // If this node has a single predecessor which ends in a conditional branch,
+  // we can infer the value of the branch condition given that we took this
+  // path.  We need the single predeccesor to ensure there's not another path
+  // which reaches this block where the condition might hold a different
+  // value.  Since we're adding this to the scoped hash table (like any other
+  // def), it will have been popped if we encounter a future merge block.
+  if (BasicBlock *Pred = BB->getSinglePredecessor())
+    if (auto *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
+      if (BI->isConditional())
+        if (auto *CondInst = dyn_cast<Instruction>(BI->getCondition()))
+          if (SimpleValue::canHandle(CondInst)) {
+            assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+            auto *ConditionalConstant = (BI->getSuccessor(0) == BB) ?
+              ConstantInt::getTrue(BB->getContext()) :
+              ConstantInt::getFalse(BB->getContext());
+            AvailableValues.insert(CondInst, ConditionalConstant);
+            DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+                  << CondInst->getName() << "' as " << *ConditionalConstant
+                  << " in " << BB->getName() << "\n");
+            // Replace all dominated uses with the known value
+            replaceDominatedUsesWith(CondInst, ConditionalConstant, DT,
+                                     BasicBlockEdge(Pred, BB));
+          }
+
+  /// LastStore - Keep track of the last non-volatile store that we saw... for
+  /// as long as there in no instruction that reads memory.  If we see a store
+  /// to the same location, we delete the dead store.  This zaps trivial dead
+  /// stores which can occur in bitfield code among other things.
+  Instruction *LastStore = nullptr;
+
+  bool Changed = false;
+  const DataLayout &DL = BB->getModule()->getDataLayout();
+
+  // See if any instructions in the block can be eliminated.  If so, do it.  If
+  // not, add them to AvailableValues.
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+    Instruction *Inst = &*I++;
+
+    // Dead instructions should just be removed.
+    if (isInstructionTriviallyDead(Inst, &TLI)) {
+      DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+      Inst->eraseFromParent();
+      Changed = true;
+      ++NumSimplify;
+      continue;
+    }
+
+    // Skip assume intrinsics, they don't really have side effects (although
+    // they're marked as such to ensure preservation of control dependencies),
+    // and this pass will not disturb any of the assumption's control
+    // dependencies.
+    if (match(Inst, m_Intrinsic<Intrinsic::assume>())) {
+      DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+      continue;
+    }
+
+    // If the instruction can be simplified (e.g. X+0 = X) then replace it with
+    // its simpler value.
+    if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) {
+      DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << "  to: " << *V << '\n');
+      Inst->replaceAllUsesWith(V);
+      Inst->eraseFromParent();
+      Changed = true;
+      ++NumSimplify;
+      continue;
+    }
+
+    // If this is a simple instruction that we can value number, process it.
+    if (SimpleValue::canHandle(Inst)) {
+      // See if the instruction has an available value.  If so, use it.
+      if (Value *V = AvailableValues.lookup(Inst)) {
+        DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << "  to: " << *V << '\n');
+        Inst->replaceAllUsesWith(V);
+        Inst->eraseFromParent();
+        Changed = true;
+        ++NumCSE;
+        continue;
+      }
+
+      // Otherwise, just remember that this value is available.
+      AvailableValues.insert(Inst, Inst);
+      continue;
+    }
+
+    ParseMemoryInst MemInst(Inst, TTI);
+    // If this is a non-volatile load, process it.
+    if (MemInst.isValid() && MemInst.isLoad()) {
+      // (conservatively) we can't peak past the ordering implied by this
+      // operation, but we can add this load to our set of available values
+      if (MemInst.isVolatile() || !MemInst.isUnordered()) {
+        LastStore = nullptr;
+        ++CurrentGeneration;
+      }
+
+      // If we have an available version of this load, and if it is the right
+      // generation, replace this instruction.
+      LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+      if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration &&
+          InVal.MatchingId == MemInst.getMatchingId() &&
+          // We don't yet handle removing loads with ordering of any kind.
+          !MemInst.isVolatile() && MemInst.isUnordered() &&
+          // We can't replace an atomic load with one which isn't also atomic.
+          InVal.IsAtomic >= MemInst.isAtomic()) {
+        Value *Op = getOrCreateResult(InVal.Data, Inst->getType());
+        if (Op != nullptr) {
+          DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+                       << "  to: " << *InVal.Data << '\n');
+          if (!Inst->use_empty())
+            Inst->replaceAllUsesWith(Op);
+          Inst->eraseFromParent();
+          Changed = true;
+          ++NumCSELoad;
+          continue;
+        }
+      }
+
+      // Otherwise, remember that we have this instruction.
+      AvailableLoads.insert(
+          MemInst.getPointerOperand(),
+          LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+                    MemInst.isAtomic()));
+      LastStore = nullptr;
+      continue;
+    }
+
+    // If this instruction may read from memory, forget LastStore.
+    // Load/store intrinsics will indicate both a read and a write to
+    // memory.  The target may override this (e.g. so that a store intrinsic
+    // does not read  from memory, and thus will be treated the same as a
+    // regular store for commoning purposes).
+    if (Inst->mayReadFromMemory() &&
+        !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
+      LastStore = nullptr;
+
+    // If this is a read-only call, process it.
+    if (CallValue::canHandle(Inst)) {
+      // If we have an available version of this call, and if it is the right
+      // generation, replace this instruction.
+      std::pair<Value *, unsigned> InVal = AvailableCalls.lookup(Inst);
+      if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
+        DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
+                     << "  to: " << *InVal.first << '\n');
+        if (!Inst->use_empty())
+          Inst->replaceAllUsesWith(InVal.first);
+        Inst->eraseFromParent();
+        Changed = true;
+        ++NumCSECall;
+        continue;
+      }
+
+      // Otherwise, remember that we have this instruction.
+      AvailableCalls.insert(
+          Inst, std::pair<Value *, unsigned>(Inst, CurrentGeneration));
+      continue;
+    }
+
+    // A release fence requires that all stores complete before it, but does
+    // not prevent the reordering of following loads 'before' the fence.  As a
+    // result, we don't need to consider it as writing to memory and don't need
+    // to advance the generation.  We do need to prevent DSE across the fence,
+    // but that's handled above.
+    if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+      if (FI->getOrdering() == Release) {
+        assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above");
+        continue;
+      }
+
+    // write back DSE - If we write back the same value we just loaded from
+    // the same location and haven't passed any intervening writes or ordering
+    // operations, we can remove the write.  The primary benefit is in allowing
+    // the available load table to remain valid and value forward past where
+    // the store originally was.
+    if (MemInst.isValid() && MemInst.isStore()) {
+      LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+      if (InVal.Data &&
+          InVal.Data == getOrCreateResult(Inst, InVal.Data->getType()) &&
+          InVal.Generation == CurrentGeneration &&
+          InVal.MatchingId == MemInst.getMatchingId() &&
+          // We don't yet handle removing stores with ordering of any kind.
+          !MemInst.isVolatile() && MemInst.isUnordered()) {
+        assert((!LastStore ||
+                ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
+                MemInst.getPointerOperand()) &&
+               "can't have an intervening store!");
+        DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+        Inst->eraseFromParent();
+        Changed = true;
+        ++NumDSE;
+        // We can avoid incrementing the generation count since we were able
+        // to eliminate this store.
+        continue;
+      }
+    }
+
+    // Okay, this isn't something we can CSE at all.  Check to see if it is
+    // something that could modify memory.  If so, our available memory values
+    // cannot be used so bump the generation count.
+    if (Inst->mayWriteToMemory()) {
+      ++CurrentGeneration;
+
+      if (MemInst.isValid() && MemInst.isStore()) {
+        // We do a trivial form of DSE if there are two stores to the same
+        // location with no intervening loads.  Delete the earlier store.
+        // At the moment, we don't remove ordered stores, but do remove
+        // unordered atomic stores.  There's no special requirement (for
+        // unordered atomics) about removing atomic stores only in favor of
+        // other atomic stores since we we're going to execute the non-atomic
+        // one anyway and the atomic one might never have become visible.
+        if (LastStore) {
+          ParseMemoryInst LastStoreMemInst(LastStore, TTI);
+          assert(LastStoreMemInst.isUnordered() &&
+                 !LastStoreMemInst.isVolatile() &&
+                 "Violated invariant");
+          if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
+            DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+                         << "  due to: " << *Inst << '\n');
+            LastStore->eraseFromParent();
+            Changed = true;
+            ++NumDSE;
+            LastStore = nullptr;
+          }
+          // fallthrough - we can exploit information about this store
+        }
+
+        // Okay, we just invalidated anything we knew about loaded values.  Try
+        // to salvage *something* by remembering that the stored value is a live
+        // version of the pointer.  It is safe to forward from volatile stores
+        // to non-volatile loads, so we don't have to check for volatility of
+        // the store.
+        AvailableLoads.insert(
+            MemInst.getPointerOperand(),
+            LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
+                      MemInst.isAtomic()));
+
+        // Remember that this was the last unordered store we saw for DSE. We
+        // don't yet handle DSE on ordered or volatile stores since we don't
+        // have a good way to model the ordering requirement for following
+        // passes  once the store is removed.  We could insert a fence, but
+        // since fences are slightly stronger than stores in their ordering,
+        // it's not clear this is a profitable transform. Another option would
+        // be to merge the ordering with that of the post dominating store.
+        if (MemInst.isUnordered() && !MemInst.isVolatile())
+          LastStore = Inst;
+        else
+          LastStore = nullptr;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+bool EarlyCSE::run() {
+  // Note, deque is being used here because there is significant performance
+  // gains over vector when the container becomes very large due to the
+  // specific access patterns. For more information see the mailing list
+  // discussion on this:
+  // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
+  std::deque<StackNode *> nodesToProcess;
+
+  bool Changed = false;
+
+  // Process the root node.
+  nodesToProcess.push_back(new StackNode(
+      AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
+      DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));
+
+  // Save the current generation.
+  unsigned LiveOutGeneration = CurrentGeneration;
+
+  // Process the stack.
+  while (!nodesToProcess.empty()) {
+    // Grab the first item off the stack. Set the current generation, remove
+    // the node from the stack, and process it.
+    StackNode *NodeToProcess = nodesToProcess.back();
+
+    // Initialize class members.
+    CurrentGeneration = NodeToProcess->currentGeneration();
+
+    // Check if the node needs to be processed.
+    if (!NodeToProcess->isProcessed()) {
+      // Process the node.
+      Changed |= processNode(NodeToProcess->node());
+      NodeToProcess->childGeneration(CurrentGeneration);
+      NodeToProcess->process();
+    } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
+      // Push the next child onto the stack.
+      DomTreeNode *child = NodeToProcess->nextChild();
+      nodesToProcess.push_back(
+          new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
+                        NodeToProcess->childGeneration(), child, child->begin(),
+                        child->end()));
+    } else {
+      // It has been processed, and there are no more children to process,
+      // so delete it and pop it off the stack.
+      delete NodeToProcess;
+      nodesToProcess.pop_back();
+    }
+  } // while (!nodes...)
+
+  // Reset the current generation.
+  CurrentGeneration = LiveOutGeneration;
+
+  return Changed;
+}
+
+PreservedAnalyses EarlyCSEPass::run(Function &F,
+                                    AnalysisManager<Function> *AM) {
+  auto &TLI = AM->getResult<TargetLibraryAnalysis>(F);
+  auto &TTI = AM->getResult<TargetIRAnalysis>(F);
+  auto &DT = AM->getResult<DominatorTreeAnalysis>(F);
+  auto &AC = AM->getResult<AssumptionAnalysis>(F);
+
+  EarlyCSE CSE(TLI, TTI, DT, AC);
+
+  if (!CSE.run())
+    return PreservedAnalyses::all();
+
+  // CSE preserves the dominator tree because it doesn't mutate the CFG.
+  // FIXME: Bundle this with other CFG-preservation.
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  return PA;
+}
+
+namespace {
+/// \brief A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSELegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  EarlyCSELegacyPass() : FunctionPass(ID) {
+    initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipOptnoneFunction(F))
+      return false;
+
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+    EarlyCSE CSE(TLI, TTI, DT, AC);
+
+    return CSE.run();
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.setPreservesCFG();
+  }
+};
+}
+
+char EarlyCSELegacyPass::ID = 0;
+
+FunctionPass *llvm::createEarlyCSEPass() { return new EarlyCSELegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
new file mode 100644
index 0000000..185cdbd
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -0,0 +1,80 @@
+//===- FlattenCFGPass.cpp - CFG Flatten Pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements flattening of CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "flattencfg"
+
+namespace {
+struct FlattenCFGPass : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+public:
+  FlattenCFGPass() : FunctionPass(ID) {
+    initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+  }
+
+private:
+  AliasAnalysis *AA;
+};
+}
+
+char FlattenCFGPass::ID = 0;
+INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+                    false)
+
+// Public interface to the FlattenCFG pass
+FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); }
+
+/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
+  bool Changed = false;
+  bool LocalChange = true;
+  while (LocalChange) {
+    LocalChange = false;
+
+    // Loop over all of the basic blocks and remove them if they are unneeded...
+    //
+    for (Function::iterator BBIt = F.begin(); BBIt != F.end();) {
+      if (FlattenCFG(&*BBIt++, AA)) {
+        LocalChange = true;
+      }
+    }
+    Changed |= LocalChange;
+  }
+  return Changed;
+}
+
+bool FlattenCFGPass::runOnFunction(Function &F) {
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  bool EverChanged = false;
+  // iterativelyFlattenCFG can make some blocks dead.
+  while (iterativelyFlattenCFG(F, AA)) {
+    removeUnreachableBlocks(F);
+    EverChanged = true;
+  }
+  return EverChanged;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
new file mode 100644
index 0000000..7f5d786
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -0,0 +1,543 @@
+//===- Float2Int.cpp - Demote floating point ops to work on integers ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Float2Int pass, which aims to demote floating
+// point operations to work on integers, where that is losslessly possible.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "float2int"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <deque>
+#include <functional> // For std::function
+using namespace llvm;
+
+// The algorithm is simple. Start at instructions that convert from the
+// float to the int domain: fptoui, fptosi and fcmp. Walk up the def-use
+// graph, using an equivalence datastructure to unify graphs that interfere.
+//
+// Mappable instructions are those with an integer corrollary that, given
+// integer domain inputs, produce an integer output; fadd, for example.
+//
+// If a non-mappable instruction is seen, this entire def-use graph is marked
+// as non-transformable. If we see an instruction that converts from the
+// integer domain to FP domain (uitofp,sitofp), we terminate our walk.
+
+/// The largest integer type worth dealing with.
+static cl::opt<unsigned>
+MaxIntegerBW("float2int-max-integer-bw", cl::init(64), cl::Hidden,
+             cl::desc("Max integer bitwidth to consider in float2int"
+                      "(default=64)"));
+
+namespace {
+  struct Float2Int : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    Float2Int() : FunctionPass(ID) {
+      initializeFloat2IntPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+    }
+
+    void findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots);
+    ConstantRange seen(Instruction *I, ConstantRange R);
+    ConstantRange badRange();
+    ConstantRange unknownRange();
+    ConstantRange validateRange(ConstantRange R);
+    void walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots);
+    void walkForwards();
+    bool validateAndTransform();
+    Value *convert(Instruction *I, Type *ToTy);
+    void cleanup();
+
+    MapVector<Instruction*, ConstantRange > SeenInsts;
+    SmallPtrSet<Instruction*,8> Roots;
+    EquivalenceClasses<Instruction*> ECs;
+    MapVector<Instruction*, Value*> ConvertedInsts;
+    LLVMContext *Ctx;
+  };
+}
+
+char Float2Int::ID = 0;
+INITIALIZE_PASS_BEGIN(Float2Int, "float2int", "Float to int", false, false)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(Float2Int, "float2int", "Float to int", false, false)
+
+// Given a FCmp predicate, return a matching ICmp predicate if one
+// exists, otherwise return BAD_ICMP_PREDICATE.
+static CmpInst::Predicate mapFCmpPred(CmpInst::Predicate P) {
+  switch (P) {
+  case CmpInst::FCMP_OEQ:
+  case CmpInst::FCMP_UEQ:
+    return CmpInst::ICMP_EQ;
+  case CmpInst::FCMP_OGT:
+  case CmpInst::FCMP_UGT:
+    return CmpInst::ICMP_SGT;
+  case CmpInst::FCMP_OGE:
+  case CmpInst::FCMP_UGE:
+    return CmpInst::ICMP_SGE;
+  case CmpInst::FCMP_OLT:
+  case CmpInst::FCMP_ULT:
+    return CmpInst::ICMP_SLT;
+  case CmpInst::FCMP_OLE:
+  case CmpInst::FCMP_ULE:
+    return CmpInst::ICMP_SLE;
+  case CmpInst::FCMP_ONE:
+  case CmpInst::FCMP_UNE:
+    return CmpInst::ICMP_NE;
+  default:
+    return CmpInst::BAD_ICMP_PREDICATE;
+  }
+}
+
+// Given a floating point binary operator, return the matching
+// integer version.
+static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case Instruction::FAdd: return Instruction::Add;
+  case Instruction::FSub: return Instruction::Sub;
+  case Instruction::FMul: return Instruction::Mul;
+  }
+}
+
+// Find the roots - instructions that convert from the FP domain to
+// integer domain.
+void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
+  for (auto &I : instructions(F)) {
+    if (isa<VectorType>(I.getType()))
+      continue;
+    switch (I.getOpcode()) {
+    default: break;
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+      Roots.insert(&I);
+      break;
+    case Instruction::FCmp:
+      if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=
+          CmpInst::BAD_ICMP_PREDICATE)
+        Roots.insert(&I);
+      break;
+    }
+  }
+}
+
+// Helper - mark I as having been traversed, having range R.
+ConstantRange Float2Int::seen(Instruction *I, ConstantRange R) {
+  DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
+  if (SeenInsts.find(I) != SeenInsts.end())
+    SeenInsts.find(I)->second = R;
+  else
+    SeenInsts.insert(std::make_pair(I, R));
+  return R;
+}
+
+// Helper - get a range representing a poison value.
+ConstantRange Float2Int::badRange() {
+  return ConstantRange(MaxIntegerBW + 1, true);
+}
+ConstantRange Float2Int::unknownRange() {
+  return ConstantRange(MaxIntegerBW + 1, false);
+}
+ConstantRange Float2Int::validateRange(ConstantRange R) {
+  if (R.getBitWidth() > MaxIntegerBW + 1)
+    return badRange();
+  return R;
+}
+
+// The most obvious way to structure the search is a depth-first, eager
+// search from each root. However, that require direct recursion and so
+// can only handle small instruction sequences. Instead, we split the search
+// up into two phases:
+//   - walkBackwards:  A breadth-first walk of the use-def graph starting from
+//                     the roots. Populate "SeenInsts" with interesting
+//                     instructions and poison values if they're obvious and
+//                     cheap to compute. Calculate the equivalance set structure
+//                     while we're here too.
+//   - walkForwards:  Iterate over SeenInsts in reverse order, so we visit
+//                     defs before their uses. Calculate the real range info.
+
+// Breadth-first walk of the use-def graph; determine the set of nodes
+// we care about and eagerly determine if some of them are poisonous.
+void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {
+  std::deque<Instruction*> Worklist(Roots.begin(), Roots.end());
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.back();
+    Worklist.pop_back();
+
+    if (SeenInsts.find(I) != SeenInsts.end())
+      // Seen already.
+      continue;
+
+    switch (I->getOpcode()) {
+      // FIXME: Handle select and phi nodes.
+    default:
+      // Path terminated uncleanly.
+      seen(I, badRange());
+      break;
+
+    case Instruction::UIToFP: {
+      // Path terminated cleanly.
+      unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+      APInt Min = APInt::getMinValue(BW).zextOrSelf(MaxIntegerBW+1);
+      APInt Max = APInt::getMaxValue(BW).zextOrSelf(MaxIntegerBW+1);
+      seen(I, validateRange(ConstantRange(Min, Max)));
+      continue;
+    }
+
+    case Instruction::SIToFP: {
+      // Path terminated cleanly.
+      unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+      APInt SMin = APInt::getSignedMinValue(BW).sextOrSelf(MaxIntegerBW+1);
+      APInt SMax = APInt::getSignedMaxValue(BW).sextOrSelf(MaxIntegerBW+1);
+      seen(I, validateRange(ConstantRange(SMin, SMax)));
+      continue;
+    }
+
+    case Instruction::FAdd:
+    case Instruction::FSub:
+    case Instruction::FMul:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FCmp:
+      seen(I, unknownRange());
+      break;
+    }
+
+    for (Value *O : I->operands()) {
+      if (Instruction *OI = dyn_cast<Instruction>(O)) {
+        // Unify def-use chains if they interfere.
+        ECs.unionSets(I, OI);
+        if (SeenInsts.find(I)->second != badRange())
+          Worklist.push_back(OI);
+      } else if (!isa<ConstantFP>(O)) {
+        // Not an instruction or ConstantFP? we can't do anything.
+        seen(I, badRange());
+      }
+    }
+  }
+}
+
+// Walk forwards down the list of seen instructions, so we visit defs before
+// uses.
+void Float2Int::walkForwards() {
+  for (auto &It : make_range(SeenInsts.rbegin(), SeenInsts.rend())) {
+    if (It.second != unknownRange())
+      continue;
+
+    Instruction *I = It.first;
+    std::function<ConstantRange(ArrayRef<ConstantRange>)> Op;
+    switch (I->getOpcode()) {
+      // FIXME: Handle select and phi nodes.
+    default:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+      llvm_unreachable("Should have been handled in walkForwards!");
+
+    case Instruction::FAdd:
+      Op = [](ArrayRef<ConstantRange> Ops) {
+        assert(Ops.size() == 2 && "FAdd is a binary operator!");
+        return Ops[0].add(Ops[1]);
+      };
+      break;
+
+    case Instruction::FSub:
+      Op = [](ArrayRef<ConstantRange> Ops) {
+        assert(Ops.size() == 2 && "FSub is a binary operator!");
+        return Ops[0].sub(Ops[1]);
+      };
+      break;
+
+    case Instruction::FMul:
+      Op = [](ArrayRef<ConstantRange> Ops) {
+        assert(Ops.size() == 2 && "FMul is a binary operator!");
+        return Ops[0].multiply(Ops[1]);
+      };
+      break;
+
+    //
+    // Root-only instructions - we'll only see these if they're the
+    //                          first node in a walk.
+    //
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+      Op = [](ArrayRef<ConstantRange> Ops) {
+        assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!");
+        return Ops[0];
+      };
+      break;
+
+    case Instruction::FCmp:
+      Op = [](ArrayRef<ConstantRange> Ops) {
+        assert(Ops.size() == 2 && "FCmp is a binary operator!");
+        return Ops[0].unionWith(Ops[1]);
+      };
+      break;
+    }
+
+    bool Abort = false;
+    SmallVector<ConstantRange,4> OpRanges;
+    for (Value *O : I->operands()) {
+      if (Instruction *OI = dyn_cast<Instruction>(O)) {
+        assert(SeenInsts.find(OI) != SeenInsts.end() &&
+               "def not seen before use!");
+        OpRanges.push_back(SeenInsts.find(OI)->second);
+      } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) {
+        // Work out if the floating point number can be losslessly represented
+        // as an integer.
+        // APFloat::convertToInteger(&Exact) purports to do what we want, but
+        // the exactness can be too precise. For example, negative zero can
+        // never be exactly converted to an integer.
+        //
+        // Instead, we ask APFloat to round itself to an integral value - this
+        // preserves sign-of-zero - then compare the result with the original.
+        //
+        APFloat F = CF->getValueAPF();
+
+        // First, weed out obviously incorrect values. Non-finite numbers
+        // can't be represented and neither can negative zero, unless
+        // we're in fast math mode.
+        if (!F.isFinite() ||
+            (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) &&
+             !I->hasNoSignedZeros())) {
+          seen(I, badRange());
+          Abort = true;
+          break;
+        }
+
+        APFloat NewF = F;
+        auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven);
+        if (Res != APFloat::opOK || NewF.compare(F) != APFloat::cmpEqual) {
+          seen(I, badRange());
+          Abort = true;
+          break;
+        }
+        // OK, it's representable. Now get it.
+        APSInt Int(MaxIntegerBW+1, false);
+        bool Exact;
+        CF->getValueAPF().convertToInteger(Int,
+                                           APFloat::rmNearestTiesToEven,
+                                           &Exact);
+        OpRanges.push_back(ConstantRange(Int));
+      } else {
+        llvm_unreachable("Should have already marked this as badRange!");
+      }
+    }
+
+    // Reduce the operands' ranges to a single range and return.
+    if (!Abort)
+      seen(I, Op(OpRanges));
+  }
+}
+
+// If there is a valid transform to be done, do it.
+bool Float2Int::validateAndTransform() {
+  bool MadeChange = false;
+
+  // Iterate over every disjoint partition of the def-use graph.
+  for (auto It = ECs.begin(), E = ECs.end(); It != E; ++It) {
+    ConstantRange R(MaxIntegerBW + 1, false);
+    bool Fail = false;
+    Type *ConvertedToTy = nullptr;
+
+    // For every member of the partition, union all the ranges together.
+    for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+         MI != ME; ++MI) {
+      Instruction *I = *MI;
+      auto SeenI = SeenInsts.find(I);
+      if (SeenI == SeenInsts.end())
+        continue;
+
+      R = R.unionWith(SeenI->second);
+      // We need to ensure I has no users that have not been seen.
+      // If it does, transformation would be illegal.
+      //
+      // Don't count the roots, as they terminate the graphs.
+      if (Roots.count(I) == 0) {
+        // Set the type of the conversion while we're here.
+        if (!ConvertedToTy)
+          ConvertedToTy = I->getType();
+        for (User *U : I->users()) {
+          Instruction *UI = dyn_cast<Instruction>(U);
+          if (!UI || SeenInsts.find(UI) == SeenInsts.end()) {
+            DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
+            Fail = true;
+            break;
+          }
+        }
+      }
+      if (Fail)
+        break;
+    }
+
+    // If the set was empty, or we failed, or the range is poisonous,
+    // bail out.
+    if (ECs.member_begin(It) == ECs.member_end() || Fail ||
+        R.isFullSet() || R.isSignWrappedSet())
+      continue;
+    assert(ConvertedToTy && "Must have set the convertedtoty by this point!");
+
+    // The number of bits required is the maximum of the upper and
+    // lower limits, plus one so it can be signed.
+    unsigned MinBW = std::max(R.getLower().getMinSignedBits(),
+                              R.getUpper().getMinSignedBits()) + 1;
+    DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
+
+    // If we've run off the realms of the exactly representable integers,
+    // the floating point result will differ from an integer approximation.
+
+    // Do we need more bits than are in the mantissa of the type we converted
+    // to? semanticsPrecision returns the number of mantissa bits plus one
+    // for the sign bit.
+    unsigned MaxRepresentableBits
+      = APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1;
+    if (MinBW > MaxRepresentableBits) {
+      DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
+      continue;
+    }
+    if (MinBW > 64) {
+      DEBUG(dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
+      continue;
+    }
+
+    // OK, R is known to be representable. Now pick a type for it.
+    // FIXME: Pick the smallest legal type that will fit.
+    Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
+
+    for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+         MI != ME; ++MI)
+      convert(*MI, Ty);
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+Value *Float2Int::convert(Instruction *I, Type *ToTy) {
+  if (ConvertedInsts.find(I) != ConvertedInsts.end())
+    // Already converted this instruction.
+    return ConvertedInsts[I];
+
+  SmallVector<Value*,4> NewOperands;
+  for (Value *V : I->operands()) {
+    // Don't recurse if we're an instruction that terminates the path.
+    if (I->getOpcode() == Instruction::UIToFP ||
+        I->getOpcode() == Instruction::SIToFP) {
+      NewOperands.push_back(V);
+    } else if (Instruction *VI = dyn_cast<Instruction>(V)) {
+      NewOperands.push_back(convert(VI, ToTy));
+    } else if (ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+      APSInt Val(ToTy->getPrimitiveSizeInBits(), /*IsUnsigned=*/false);
+      bool Exact;
+      CF->getValueAPF().convertToInteger(Val,
+                                         APFloat::rmNearestTiesToEven,
+                                         &Exact);
+      NewOperands.push_back(ConstantInt::get(ToTy, Val));
+    } else {
+      llvm_unreachable("Unhandled operand type?");
+    }
+  }
+
+  // Now create a new instruction.
+  IRBuilder<> IRB(I);
+  Value *NewV = nullptr;
+  switch (I->getOpcode()) {
+  default: llvm_unreachable("Unhandled instruction!");
+
+  case Instruction::FPToUI:
+    NewV = IRB.CreateZExtOrTrunc(NewOperands[0], I->getType());
+    break;
+
+  case Instruction::FPToSI:
+    NewV = IRB.CreateSExtOrTrunc(NewOperands[0], I->getType());
+    break;
+
+  case Instruction::FCmp: {
+    CmpInst::Predicate P = mapFCmpPred(cast<CmpInst>(I)->getPredicate());
+    assert(P != CmpInst::BAD_ICMP_PREDICATE && "Unhandled predicate!");
+    NewV = IRB.CreateICmp(P, NewOperands[0], NewOperands[1], I->getName());
+    break;
+  }
+
+  case Instruction::UIToFP:
+    NewV = IRB.CreateZExtOrTrunc(NewOperands[0], ToTy);
+    break;
+
+  case Instruction::SIToFP:
+    NewV = IRB.CreateSExtOrTrunc(NewOperands[0], ToTy);
+    break;
+
+  case Instruction::FAdd:
+  case Instruction::FSub:
+  case Instruction::FMul:
+    NewV = IRB.CreateBinOp(mapBinOpcode(I->getOpcode()),
+                           NewOperands[0], NewOperands[1],
+                           I->getName());
+    break;
+  }
+
+  // If we're a root instruction, RAUW.
+  if (Roots.count(I))
+    I->replaceAllUsesWith(NewV);
+
+  ConvertedInsts[I] = NewV;
+  return NewV;
+}
+
+// Perform dead code elimination on the instructions we just modified.
+void Float2Int::cleanup() {
+  for (auto &I : make_range(ConvertedInsts.rbegin(), ConvertedInsts.rend()))
+    I.first->eraseFromParent();
+}
+
+bool Float2Int::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
+  // Clear out all state.
+  ECs = EquivalenceClasses<Instruction*>();
+  SeenInsts.clear();
+  ConvertedInsts.clear();
+  Roots.clear();
+
+  Ctx = &F.getParent()->getContext();
+
+  findRoots(F, Roots);
+
+  walkBackwards(Roots);
+  walkForwards();
+
+  bool Modified = validateAndTransform();
+  if (Modified)
+    cleanup();
+  return Modified;
+}
+
+FunctionPass *llvm::createFloat2IntPass() { return new Float2Int(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
new file mode 100644
index 0000000..a028b8c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -0,0 +1,2931 @@
+//===- GVN.cpp - Eliminate redundant values and loads ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global value numbering to eliminate fully redundant
+// instructions.  It also performs simple dead load elimination.
+//
+// Note that this pass does the value numbering itself; it does not use the
+// ValueNumbering analysis passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <vector>
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "gvn"
+
+STATISTIC(NumGVNInstr,  "Number of instructions deleted");
+STATISTIC(NumGVNLoad,   "Number of loads deleted");
+STATISTIC(NumGVNPRE,    "Number of instructions PRE'd");
+STATISTIC(NumGVNBlocks, "Number of blocks merged");
+STATISTIC(NumGVNSimpl,  "Number of instructions simplified");
+STATISTIC(NumGVNEqProp, "Number of equalities propagated");
+STATISTIC(NumPRELoad,   "Number of loads PRE'd");
+
+static cl::opt<bool> EnablePRE("enable-pre",
+                               cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true));
+
+// Maximum allowed recursion depth.
+static cl::opt<uint32_t>
+MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,
+                cl::desc("Max recurse depth (default = 1000)"));
+
+//===----------------------------------------------------------------------===//
+//                         ValueTable Class
+//===----------------------------------------------------------------------===//
+
+/// This class holds the mapping between values and value numbers.  It is used
+/// as an efficient mechanism to determine the expression-wise equivalence of
+/// two values.
+namespace {
+  struct Expression {
+    uint32_t opcode;
+    Type *type;
+    SmallVector<uint32_t, 4> varargs;
+
+    Expression(uint32_t o = ~2U) : opcode(o) { }
+
+    bool operator==(const Expression &other) const {
+      if (opcode != other.opcode)
+        return false;
+      if (opcode == ~0U || opcode == ~1U)
+        return true;
+      if (type != other.type)
+        return false;
+      if (varargs != other.varargs)
+        return false;
+      return true;
+    }
+
+    friend hash_code hash_value(const Expression &Value) {
+      return hash_combine(Value.opcode, Value.type,
+                          hash_combine_range(Value.varargs.begin(),
+                                             Value.varargs.end()));
+    }
+  };
+
+  class ValueTable {
+    DenseMap<Value*, uint32_t> valueNumbering;
+    DenseMap<Expression, uint32_t> expressionNumbering;
+    AliasAnalysis *AA;
+    MemoryDependenceAnalysis *MD;
+    DominatorTree *DT;
+
+    uint32_t nextValueNumber;
+
+    Expression create_expression(Instruction* I);
+    Expression create_cmp_expression(unsigned Opcode,
+                                     CmpInst::Predicate Predicate,
+                                     Value *LHS, Value *RHS);
+    Expression create_extractvalue_expression(ExtractValueInst* EI);
+    uint32_t lookup_or_add_call(CallInst* C);
+  public:
+    ValueTable() : nextValueNumber(1) { }
+    uint32_t lookup_or_add(Value *V);
+    uint32_t lookup(Value *V) const;
+    uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred,
+                               Value *LHS, Value *RHS);
+    bool exists(Value *V) const;
+    void add(Value *V, uint32_t num);
+    void clear();
+    void erase(Value *v);
+    void setAliasAnalysis(AliasAnalysis* A) { AA = A; }
+    AliasAnalysis *getAliasAnalysis() const { return AA; }
+    void setMemDep(MemoryDependenceAnalysis* M) { MD = M; }
+    void setDomTree(DominatorTree* D) { DT = D; }
+    uint32_t getNextUnusedValueNumber() { return nextValueNumber; }
+    void verifyRemoved(const Value *) const;
+  };
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<Expression> {
+  static inline Expression getEmptyKey() {
+    return ~0U;
+  }
+
+  static inline Expression getTombstoneKey() {
+    return ~1U;
+  }
+
+  static unsigned getHashValue(const Expression e) {
+    using llvm::hash_value;
+    return static_cast<unsigned>(hash_value(e));
+  }
+  static bool isEqual(const Expression &LHS, const Expression &RHS) {
+    return LHS == RHS;
+  }
+};
+
+}
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+
+Expression ValueTable::create_expression(Instruction *I) {
+  Expression e;
+  e.type = I->getType();
+  e.opcode = I->getOpcode();
+  for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
+       OI != OE; ++OI)
+    e.varargs.push_back(lookup_or_add(*OI));
+  if (I->isCommutative()) {
+    // Ensure that commutative instructions that only differ by a permutation
+    // of their operands get the same value number by sorting the operand value
+    // numbers.  Since all commutative instructions have two operands it is more
+    // efficient to sort by hand rather than using, say, std::sort.
+    assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+    if (e.varargs[0] > e.varargs[1])
+      std::swap(e.varargs[0], e.varargs[1]);
+  }
+
+  if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+    // Sort the operand value numbers so x<y and y>x get the same value number.
+    CmpInst::Predicate Predicate = C->getPredicate();
+    if (e.varargs[0] > e.varargs[1]) {
+      std::swap(e.varargs[0], e.varargs[1]);
+      Predicate = CmpInst::getSwappedPredicate(Predicate);
+    }
+    e.opcode = (C->getOpcode() << 8) | Predicate;
+  } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) {
+    for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end();
+         II != IE; ++II)
+      e.varargs.push_back(*II);
+  }
+
+  return e;
+}
+
+Expression ValueTable::create_cmp_expression(unsigned Opcode,
+                                             CmpInst::Predicate Predicate,
+                                             Value *LHS, Value *RHS) {
+  assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+         "Not a comparison!");
+  Expression e;
+  e.type = CmpInst::makeCmpResultType(LHS->getType());
+  e.varargs.push_back(lookup_or_add(LHS));
+  e.varargs.push_back(lookup_or_add(RHS));
+
+  // Sort the operand value numbers so x<y and y>x get the same value number.
+  if (e.varargs[0] > e.varargs[1]) {
+    std::swap(e.varargs[0], e.varargs[1]);
+    Predicate = CmpInst::getSwappedPredicate(Predicate);
+  }
+  e.opcode = (Opcode << 8) | Predicate;
+  return e;
+}
+
+Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) {
+  assert(EI && "Not an ExtractValueInst?");
+  Expression e;
+  e.type = EI->getType();
+  e.opcode = 0;
+
+  IntrinsicInst *I = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
+  if (I != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0 ) {
+    // EI might be an extract from one of our recognised intrinsics. If it
+    // is we'll synthesize a semantically equivalent expression instead on
+    // an extract value expression.
+    switch (I->getIntrinsicID()) {
+      case Intrinsic::sadd_with_overflow:
+      case Intrinsic::uadd_with_overflow:
+        e.opcode = Instruction::Add;
+        break;
+      case Intrinsic::ssub_with_overflow:
+      case Intrinsic::usub_with_overflow:
+        e.opcode = Instruction::Sub;
+        break;
+      case Intrinsic::smul_with_overflow:
+      case Intrinsic::umul_with_overflow:
+        e.opcode = Instruction::Mul;
+        break;
+      default:
+        break;
+    }
+
+    if (e.opcode != 0) {
+      // Intrinsic recognized. Grab its args to finish building the expression.
+      assert(I->getNumArgOperands() == 2 &&
+             "Expect two args for recognised intrinsics.");
+      e.varargs.push_back(lookup_or_add(I->getArgOperand(0)));
+      e.varargs.push_back(lookup_or_add(I->getArgOperand(1)));
+      return e;
+    }
+  }
+
+  // Not a recognised intrinsic. Fall back to producing an extract value
+  // expression.
+  e.opcode = EI->getOpcode();
+  for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end();
+       OI != OE; ++OI)
+    e.varargs.push_back(lookup_or_add(*OI));
+
+  for (ExtractValueInst::idx_iterator II = EI->idx_begin(), IE = EI->idx_end();
+         II != IE; ++II)
+    e.varargs.push_back(*II);
+
+  return e;
+}
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+/// add - Insert a value into the table with a specified value number.
+void ValueTable::add(Value *V, uint32_t num) {
+  valueNumbering.insert(std::make_pair(V, num));
+}
+
+uint32_t ValueTable::lookup_or_add_call(CallInst *C) {
+  if (AA->doesNotAccessMemory(C)) {
+    Expression exp = create_expression(C);
+    uint32_t &e = expressionNumbering[exp];
+    if (!e) e = nextValueNumber++;
+    valueNumbering[C] = e;
+    return e;
+  } else if (AA->onlyReadsMemory(C)) {
+    Expression exp = create_expression(C);
+    uint32_t &e = expressionNumbering[exp];
+    if (!e) {
+      e = nextValueNumber++;
+      valueNumbering[C] = e;
+      return e;
+    }
+    if (!MD) {
+      e = nextValueNumber++;
+      valueNumbering[C] = e;
+      return e;
+    }
+
+    MemDepResult local_dep = MD->getDependency(C);
+
+    if (!local_dep.isDef() && !local_dep.isNonLocal()) {
+      valueNumbering[C] =  nextValueNumber;
+      return nextValueNumber++;
+    }
+
+    if (local_dep.isDef()) {
+      CallInst* local_cdep = cast<CallInst>(local_dep.getInst());
+
+      if (local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
+        valueNumbering[C] = nextValueNumber;
+        return nextValueNumber++;
+      }
+
+      for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+        uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
+        uint32_t cd_vn = lookup_or_add(local_cdep->getArgOperand(i));
+        if (c_vn != cd_vn) {
+          valueNumbering[C] = nextValueNumber;
+          return nextValueNumber++;
+        }
+      }
+
+      uint32_t v = lookup_or_add(local_cdep);
+      valueNumbering[C] = v;
+      return v;
+    }
+
+    // Non-local case.
+    const MemoryDependenceAnalysis::NonLocalDepInfo &deps =
+      MD->getNonLocalCallDependency(CallSite(C));
+    // FIXME: Move the checking logic to MemDep!
+    CallInst* cdep = nullptr;
+
+    // Check to see if we have a single dominating call instruction that is
+    // identical to C.
+    for (unsigned i = 0, e = deps.size(); i != e; ++i) {
+      const NonLocalDepEntry *I = &deps[i];
+      if (I->getResult().isNonLocal())
+        continue;
+
+      // We don't handle non-definitions.  If we already have a call, reject
+      // instruction dependencies.
+      if (!I->getResult().isDef() || cdep != nullptr) {
+        cdep = nullptr;
+        break;
+      }
+
+      CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst());
+      // FIXME: All duplicated with non-local case.
+      if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())){
+        cdep = NonLocalDepCall;
+        continue;
+      }
+
+      cdep = nullptr;
+      break;
+    }
+
+    if (!cdep) {
+      valueNumbering[C] = nextValueNumber;
+      return nextValueNumber++;
+    }
+
+    if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
+      valueNumbering[C] = nextValueNumber;
+      return nextValueNumber++;
+    }
+    for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+      uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
+      uint32_t cd_vn = lookup_or_add(cdep->getArgOperand(i));
+      if (c_vn != cd_vn) {
+        valueNumbering[C] = nextValueNumber;
+        return nextValueNumber++;
+      }
+    }
+
+    uint32_t v = lookup_or_add(cdep);
+    valueNumbering[C] = v;
+    return v;
+
+  } else {
+    valueNumbering[C] = nextValueNumber;
+    return nextValueNumber++;
+  }
+}
+
+/// Returns true if a value number exists for the specified value.
+bool ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t ValueTable::lookup_or_add(Value *V) {
+  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+  if (VI != valueNumbering.end())
+    return VI->second;
+
+  if (!isa<Instruction>(V)) {
+    valueNumbering[V] = nextValueNumber;
+    return nextValueNumber++;
+  }
+
+  Instruction* I = cast<Instruction>(V);
+  Expression exp;
+  switch (I->getOpcode()) {
+    case Instruction::Call:
+      return lookup_or_add_call(cast<CallInst>(I));
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+    case Instruction::Select:
+    case Instruction::ExtractElement:
+    case Instruction::InsertElement:
+    case Instruction::ShuffleVector:
+    case Instruction::InsertValue:
+    case Instruction::GetElementPtr:
+      exp = create_expression(I);
+      break;
+    case Instruction::ExtractValue:
+      exp = create_extractvalue_expression(cast<ExtractValueInst>(I));
+      break;
+    default:
+      valueNumbering[V] = nextValueNumber;
+      return nextValueNumber++;
+  }
+
+  uint32_t& e = expressionNumbering[exp];
+  if (!e) e = nextValueNumber++;
+  valueNumbering[V] = e;
+  return e;
+}
+
+/// Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t ValueTable::lookup(Value *V) const {
+  DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
+  assert(VI != valueNumbering.end() && "Value not numbered?");
+  return VI->second;
+}
+
+/// Returns the value number of the given comparison,
+/// assigning it a new number if it did not have one before.  Useful when
+/// we deduced the result of a comparison, but don't immediately have an
+/// instruction realizing that comparison to hand.
+uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode,
+                                       CmpInst::Predicate Predicate,
+                                       Value *LHS, Value *RHS) {
+  Expression exp = create_cmp_expression(Opcode, Predicate, LHS, RHS);
+  uint32_t& e = expressionNumbering[exp];
+  if (!e) e = nextValueNumber++;
+  return e;
+}
+
+/// Remove all entries from the ValueTable.
+void ValueTable::clear() {
+  valueNumbering.clear();
+  expressionNumbering.clear();
+  nextValueNumber = 1;
+}
+
+/// Remove a value from the value numbering.
+void ValueTable::erase(Value *V) {
+  valueNumbering.erase(V);
+}
+
+/// verifyRemoved - Verify that the value is removed from all internal data
+/// structures.
+void ValueTable::verifyRemoved(const Value *V) const {
+  for (DenseMap<Value*, uint32_t>::const_iterator
+         I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
+    assert(I->first != V && "Inst still occurs in value numbering map!");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                                GVN Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+  class GVN;
+  struct AvailableValueInBlock {
+    /// BB - The basic block in question.
+    BasicBlock *BB;
+    enum ValType {
+      SimpleVal,  // A simple offsetted value that is accessed.
+      LoadVal,    // A value produced by a load.
+      MemIntrin,  // A memory intrinsic which is loaded from.
+      UndefVal    // A UndefValue representing a value from dead block (which
+                  // is not yet physically removed from the CFG). 
+    };
+  
+    /// V - The value that is live out of the block.
+    PointerIntPair<Value *, 2, ValType> Val;
+  
+    /// Offset - The byte offset in Val that is interesting for the load query.
+    unsigned Offset;
+  
+    static AvailableValueInBlock get(BasicBlock *BB, Value *V,
+                                     unsigned Offset = 0) {
+      AvailableValueInBlock Res;
+      Res.BB = BB;
+      Res.Val.setPointer(V);
+      Res.Val.setInt(SimpleVal);
+      Res.Offset = Offset;
+      return Res;
+    }
+  
+    static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI,
+                                       unsigned Offset = 0) {
+      AvailableValueInBlock Res;
+      Res.BB = BB;
+      Res.Val.setPointer(MI);
+      Res.Val.setInt(MemIntrin);
+      Res.Offset = Offset;
+      return Res;
+    }
+  
+    static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI,
+                                         unsigned Offset = 0) {
+      AvailableValueInBlock Res;
+      Res.BB = BB;
+      Res.Val.setPointer(LI);
+      Res.Val.setInt(LoadVal);
+      Res.Offset = Offset;
+      return Res;
+    }
+
+    static AvailableValueInBlock getUndef(BasicBlock *BB) {
+      AvailableValueInBlock Res;
+      Res.BB = BB;
+      Res.Val.setPointer(nullptr);
+      Res.Val.setInt(UndefVal);
+      Res.Offset = 0;
+      return Res;
+    }
+
+    bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
+    bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
+    bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+    bool isUndefValue() const { return Val.getInt() == UndefVal; }
+  
+    Value *getSimpleValue() const {
+      assert(isSimpleValue() && "Wrong accessor");
+      return Val.getPointer();
+    }
+  
+    LoadInst *getCoercedLoadValue() const {
+      assert(isCoercedLoadValue() && "Wrong accessor");
+      return cast<LoadInst>(Val.getPointer());
+    }
+  
+    MemIntrinsic *getMemIntrinValue() const {
+      assert(isMemIntrinValue() && "Wrong accessor");
+      return cast<MemIntrinsic>(Val.getPointer());
+    }
+  
+    /// Emit code into this block to adjust the value defined here to the
+    /// specified type. This handles various coercion cases.
+    Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const;
+  };
+
+  class GVN : public FunctionPass {
+    bool NoLoads;
+    MemoryDependenceAnalysis *MD;
+    DominatorTree *DT;
+    const TargetLibraryInfo *TLI;
+    AssumptionCache *AC;
+    SetVector<BasicBlock *> DeadBlocks;
+
+    ValueTable VN;
+
+    /// A mapping from value numbers to lists of Value*'s that
+    /// have that value number.  Use findLeader to query it.
+    struct LeaderTableEntry {
+      Value *Val;
+      const BasicBlock *BB;
+      LeaderTableEntry *Next;
+    };
+    DenseMap<uint32_t, LeaderTableEntry> LeaderTable;
+    BumpPtrAllocator TableAllocator;
+
+    // Block-local map of equivalent values to their leader, does not
+    // propagate to any successors. Entries added mid-block are applied
+    // to the remaining instructions in the block.
+    SmallMapVector<llvm::Value *, llvm::Constant *, 4> ReplaceWithConstMap;
+    SmallVector<Instruction*, 8> InstrsToErase;
+
+    typedef SmallVector<NonLocalDepResult, 64> LoadDepVect;
+    typedef SmallVector<AvailableValueInBlock, 64> AvailValInBlkVect;
+    typedef SmallVector<BasicBlock*, 64> UnavailBlkVect;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit GVN(bool noloads = false)
+        : FunctionPass(ID), NoLoads(noloads), MD(nullptr) {
+      initializeGVNPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    /// This removes the specified instruction from
+    /// our various maps and marks it for deletion.
+    void markInstructionForDeletion(Instruction *I) {
+      VN.erase(I);
+      InstrsToErase.push_back(I);
+    }
+
+    DominatorTree &getDominatorTree() const { return *DT; }
+    AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
+    MemoryDependenceAnalysis &getMemDep() const { return *MD; }
+  private:
+    /// Push a new Value to the LeaderTable onto the list for its value number.
+    void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) {
+      LeaderTableEntry &Curr = LeaderTable[N];
+      if (!Curr.Val) {
+        Curr.Val = V;
+        Curr.BB = BB;
+        return;
+      }
+
+      LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>();
+      Node->Val = V;
+      Node->BB = BB;
+      Node->Next = Curr.Next;
+      Curr.Next = Node;
+    }
+
+    /// Scan the list of values corresponding to a given
+    /// value number, and remove the given instruction if encountered.
+    void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) {
+      LeaderTableEntry* Prev = nullptr;
+      LeaderTableEntry* Curr = &LeaderTable[N];
+
+      while (Curr && (Curr->Val != I || Curr->BB != BB)) {
+        Prev = Curr;
+        Curr = Curr->Next;
+      }
+
+      if (!Curr)
+        return;
+
+      if (Prev) {
+        Prev->Next = Curr->Next;
+      } else {
+        if (!Curr->Next) {
+          Curr->Val = nullptr;
+          Curr->BB = nullptr;
+        } else {
+          LeaderTableEntry* Next = Curr->Next;
+          Curr->Val = Next->Val;
+          Curr->BB = Next->BB;
+          Curr->Next = Next->Next;
+        }
+      }
+    }
+
+    // List of critical edges to be split between iterations.
+    SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit;
+
+    // This transformation requires dominator postdominator info
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      if (!NoLoads)
+        AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addRequired<AAResultsWrapperPass>();
+
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+    }
+
+
+    // Helper functions of redundant load elimination 
+    bool processLoad(LoadInst *L);
+    bool processNonLocalLoad(LoadInst *L);
+    bool processAssumeIntrinsic(IntrinsicInst *II);
+    void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, 
+                                 AvailValInBlkVect &ValuesPerBlock,
+                                 UnavailBlkVect &UnavailableBlocks);
+    bool PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, 
+                        UnavailBlkVect &UnavailableBlocks);
+
+    // Other helper routines
+    bool processInstruction(Instruction *I);
+    bool processBlock(BasicBlock *BB);
+    void dump(DenseMap<uint32_t, Value*> &d);
+    bool iterateOnFunction(Function &F);
+    bool performPRE(Function &F);
+    bool performScalarPRE(Instruction *I);
+    bool performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+                                   unsigned int ValNo);
+    Value *findLeader(const BasicBlock *BB, uint32_t num);
+    void cleanupGlobalSets();
+    void verifyRemoved(const Instruction *I) const;
+    bool splitCriticalEdges();
+    BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ);
+    bool replaceOperandsWithConsts(Instruction *I) const;
+    bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
+                           bool DominatesByEdge);
+    bool processFoldableCondBr(BranchInst *BI);
+    void addDeadBlock(BasicBlock *BB);
+    void assignValNumForDeadCode();
+  };
+
+  char GVN::ID = 0;
+}
+
+// The public interface to this file...
+FunctionPass *llvm::createGVNPass(bool NoLoads) {
+  return new GVN(NoLoads);
+}
+
+INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void GVN::dump(DenseMap<uint32_t, Value*>& d) {
+  errs() << "{\n";
+  for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
+       E = d.end(); I != E; ++I) {
+      errs() << I->first << "\n";
+      I->second->dump();
+  }
+  errs() << "}\n";
+}
+#endif
+
+/// Return true if we can prove that the value
+/// we're analyzing is fully available in the specified block.  As we go, keep
+/// track of which blocks we know are fully alive in FullyAvailableBlocks.  This
+/// map is actually a tri-state map with the following values:
+///   0) we know the block *is not* fully available.
+///   1) we know the block *is* fully available.
+///   2) we do not know whether the block is fully available or not, but we are
+///      currently speculating that it will be.
+///   3) we are speculating for this block and have used that to speculate for
+///      other blocks.
+static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
+                            DenseMap<BasicBlock*, char> &FullyAvailableBlocks,
+                            uint32_t RecurseDepth) {
+  if (RecurseDepth > MaxRecurseDepth)
+    return false;
+
+  // Optimistically assume that the block is fully available and check to see
+  // if we already know about this block in one lookup.
+  std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV =
+    FullyAvailableBlocks.insert(std::make_pair(BB, 2));
+
+  // If the entry already existed for this block, return the precomputed value.
+  if (!IV.second) {
+    // If this is a speculative "available" value, mark it as being used for
+    // speculation of other blocks.
+    if (IV.first->second == 2)
+      IV.first->second = 3;
+    return IV.first->second != 0;
+  }
+
+  // Otherwise, see if it is fully available in all predecessors.
+  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+  // If this block has no predecessors, it isn't live-in here.
+  if (PI == PE)
+    goto SpeculationFailure;
+
+  for (; PI != PE; ++PI)
+    // If the value isn't fully available in one of our predecessors, then it
+    // isn't fully available in this block either.  Undo our previous
+    // optimistic assumption and bail out.
+    if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1))
+      goto SpeculationFailure;
+
+  return true;
+
+// If we get here, we found out that this is not, after
+// all, a fully-available block.  We have a problem if we speculated on this and
+// used the speculation to mark other blocks as available.
+SpeculationFailure:
+  char &BBVal = FullyAvailableBlocks[BB];
+
+  // If we didn't speculate on this, just return with it set to false.
+  if (BBVal == 2) {
+    BBVal = 0;
+    return false;
+  }
+
+  // If we did speculate on this value, we could have blocks set to 1 that are
+  // incorrect.  Walk the (transitive) successors of this block and mark them as
+  // 0 if set to one.
+  SmallVector<BasicBlock*, 32> BBWorklist;
+  BBWorklist.push_back(BB);
+
+  do {
+    BasicBlock *Entry = BBWorklist.pop_back_val();
+    // Note that this sets blocks to 0 (unavailable) if they happen to not
+    // already be in FullyAvailableBlocks.  This is safe.
+    char &EntryVal = FullyAvailableBlocks[Entry];
+    if (EntryVal == 0) continue;  // Already unavailable.
+
+    // Mark as unavailable.
+    EntryVal = 0;
+
+    BBWorklist.append(succ_begin(Entry), succ_end(Entry));
+  } while (!BBWorklist.empty());
+
+  return false;
+}
+
+
+/// Return true if CoerceAvailableValueToLoadType will succeed.
+static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
+                                            Type *LoadTy,
+                                            const DataLayout &DL) {
+  // If the loaded or stored value is an first class array or struct, don't try
+  // to transform them.  We need to be able to bitcast to integer.
+  if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
+      StoredVal->getType()->isStructTy() ||
+      StoredVal->getType()->isArrayTy())
+    return false;
+
+  // The store has to be at least as big as the load.
+  if (DL.getTypeSizeInBits(StoredVal->getType()) <
+        DL.getTypeSizeInBits(LoadTy))
+    return false;
+
+  return true;
+}
+
+/// If we saw a store of a value to memory, and
+/// then a load from a must-aliased pointer of a different type, try to coerce
+/// the stored value.  LoadedTy is the type of the load we want to replace.
+/// IRB is IRBuilder used to insert new instructions.
+///
+/// If we can't do it, return null.
+static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
+                                             IRBuilder<> &IRB,
+                                             const DataLayout &DL) {
+  if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL))
+    return nullptr;
+
+  // If this is already the right type, just return it.
+  Type *StoredValTy = StoredVal->getType();
+
+  uint64_t StoreSize = DL.getTypeSizeInBits(StoredValTy);
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadedTy);
+
+  // If the store and reload are the same size, we can always reuse it.
+  if (StoreSize == LoadSize) {
+    // Pointer to Pointer -> use bitcast.
+    if (StoredValTy->getScalarType()->isPointerTy() &&
+        LoadedTy->getScalarType()->isPointerTy())
+      return IRB.CreateBitCast(StoredVal, LoadedTy);
+
+    // Convert source pointers to integers, which can be bitcast.
+    if (StoredValTy->getScalarType()->isPointerTy()) {
+      StoredValTy = DL.getIntPtrType(StoredValTy);
+      StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
+    }
+
+    Type *TypeToCastTo = LoadedTy;
+    if (TypeToCastTo->getScalarType()->isPointerTy())
+      TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
+
+    if (StoredValTy != TypeToCastTo)
+      StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo);
+
+    // Cast to pointer if the load needs a pointer type.
+    if (LoadedTy->getScalarType()->isPointerTy())
+      StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy);
+
+    return StoredVal;
+  }
+
+  // If the loaded value is smaller than the available value, then we can
+  // extract out a piece from it.  If the available value is too small, then we
+  // can't do anything.
+  assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail");
+
+  // Convert source pointers to integers, which can be manipulated.
+  if (StoredValTy->getScalarType()->isPointerTy()) {
+    StoredValTy = DL.getIntPtrType(StoredValTy);
+    StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
+  }
+
+  // Convert vectors and fp to integer, which can be manipulated.
+  if (!StoredValTy->isIntegerTy()) {
+    StoredValTy = IntegerType::get(StoredValTy->getContext(), StoreSize);
+    StoredVal = IRB.CreateBitCast(StoredVal, StoredValTy);
+  }
+
+  // If this is a big-endian system, we need to shift the value down to the low
+  // bits so that a truncate will work.
+  if (DL.isBigEndian()) {
+    StoredVal = IRB.CreateLShr(StoredVal, StoreSize - LoadSize, "tmp");
+  }
+
+  // Truncate the integer to the right size now.
+  Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize);
+  StoredVal  = IRB.CreateTrunc(StoredVal, NewIntTy, "trunc");
+
+  if (LoadedTy == NewIntTy)
+    return StoredVal;
+
+  // If the result is a pointer, inttoptr.
+  if (LoadedTy->getScalarType()->isPointerTy())
+    return IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr");
+
+  // Otherwise, bitcast.
+  return IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast");
+}
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering memory write (store,
+/// memset, memcpy, memmove).  This means that the write *may* provide bits used
+/// by the load but we can't be sure because the pointers don't mustalias.
+///
+/// Check this case to see if there is anything more we can do before we give
+/// up.  This returns -1 if we have to give up, or a byte number in the stored
+/// value of the piece that feeds the load.
+static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
+                                          Value *WritePtr,
+                                          uint64_t WriteSizeInBits,
+                                          const DataLayout &DL) {
+  // If the loaded or stored value is a first class array or struct, don't try
+  // to transform them.  We need to be able to bitcast to integer.
+  if (LoadTy->isStructTy() || LoadTy->isArrayTy())
+    return -1;
+
+  int64_t StoreOffset = 0, LoadOffset = 0;
+  Value *StoreBase =
+      GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
+  Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
+  if (StoreBase != LoadBase)
+    return -1;
+
+  // If the load and store are to the exact same address, they should have been
+  // a must alias.  AA must have gotten confused.
+  // FIXME: Study to see if/when this happens.  One case is forwarding a memset
+  // to a load from the base of the memset.
+#if 0
+  if (LoadOffset == StoreOffset) {
+    dbgs() << "STORE/LOAD DEP WITH COMMON POINTER MISSED:\n"
+    << "Base       = " << *StoreBase << "\n"
+    << "Store Ptr  = " << *WritePtr << "\n"
+    << "Store Offs = " << StoreOffset << "\n"
+    << "Load Ptr   = " << *LoadPtr << "\n";
+    abort();
+  }
+#endif
+
+  // If the load and store don't overlap at all, the store doesn't provide
+  // anything to the load.  In this case, they really don't alias at all, AA
+  // must have gotten confused.
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
+
+  if ((WriteSizeInBits & 7) | (LoadSize & 7))
+    return -1;
+  uint64_t StoreSize = WriteSizeInBits >> 3;  // Convert to bytes.
+  LoadSize >>= 3;
+
+
+  bool isAAFailure = false;
+  if (StoreOffset < LoadOffset)
+    isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
+  else
+    isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset;
+
+  if (isAAFailure) {
+#if 0
+    dbgs() << "STORE LOAD DEP WITH COMMON BASE:\n"
+    << "Base       = " << *StoreBase << "\n"
+    << "Store Ptr  = " << *WritePtr << "\n"
+    << "Store Offs = " << StoreOffset << "\n"
+    << "Load Ptr   = " << *LoadPtr << "\n";
+    abort();
+#endif
+    return -1;
+  }
+
+  // If the Load isn't completely contained within the stored bits, we don't
+  // have all the bits to feed it.  We could do something crazy in the future
+  // (issue a smaller load then merge the bits in) but this seems unlikely to be
+  // valuable.
+  if (StoreOffset > LoadOffset ||
+      StoreOffset+StoreSize < LoadOffset+LoadSize)
+    return -1;
+
+  // Okay, we can do this transformation.  Return the number of bytes into the
+  // store that the load is.
+  return LoadOffset-StoreOffset;
+}
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store.
+static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
+                                          StoreInst *DepSI) {
+  // Cannot handle reading from store of first-class aggregate yet.
+  if (DepSI->getValueOperand()->getType()->isStructTy() ||
+      DepSI->getValueOperand()->getType()->isArrayTy())
+    return -1;
+
+  const DataLayout &DL = DepSI->getModule()->getDataLayout();
+  Value *StorePtr = DepSI->getPointerOperand();
+  uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
+                                        StorePtr, StoreSize, DL);
+}
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being clobbered by another load.  See if
+/// the other load can feed into the second load.
+static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
+                                         LoadInst *DepLI, const DataLayout &DL){
+  // Cannot handle reading from store of first-class aggregate yet.
+  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+    return -1;
+
+  Value *DepPtr = DepLI->getPointerOperand();
+  uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
+  int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
+  if (R != -1) return R;
+
+  // If we have a load/load clobber an DepLI can be widened to cover this load,
+  // then we should widen it!
+  int64_t LoadOffs = 0;
+  const Value *LoadBase =
+      GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+
+  unsigned Size = MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize(
+      LoadBase, LoadOffs, LoadSize, DepLI);
+  if (Size == 0) return -1;
+
+  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL);
+}
+
+
+
+static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
+                                            MemIntrinsic *MI,
+                                            const DataLayout &DL) {
+  // If the mem operation is a non-constant size, we can't handle it.
+  ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
+  if (!SizeCst) return -1;
+  uint64_t MemSizeInBits = SizeCst->getZExtValue()*8;
+
+  // If this is memset, we just need to see if the offset is valid in the size
+  // of the memset..
+  if (MI->getIntrinsicID() == Intrinsic::memset)
+    return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+                                          MemSizeInBits, DL);
+
+  // If we have a memcpy/memmove, the only case we can handle is if this is a
+  // copy from constant memory.  In that case, we can read directly from the
+  // constant memory.
+  MemTransferInst *MTI = cast<MemTransferInst>(MI);
+
+  Constant *Src = dyn_cast<Constant>(MTI->getSource());
+  if (!Src) return -1;
+
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
+  if (!GV || !GV->isConstant()) return -1;
+
+  // See if the access is within the bounds of the transfer.
+  int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
+                                              MI->getDest(), MemSizeInBits, DL);
+  if (Offset == -1)
+    return Offset;
+
+  unsigned AS = Src->getType()->getPointerAddressSpace();
+  // Otherwise, see if we can constant fold a load from the constant with the
+  // offset applied as appropriate.
+  Src = ConstantExpr::getBitCast(Src,
+                                 Type::getInt8PtrTy(Src->getContext(), AS));
+  Constant *OffsetCst =
+    ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+  Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+                                       OffsetCst);
+  Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+  if (ConstantFoldLoadFromConstPtr(Src, DL))
+    return Offset;
+  return -1;
+}
+
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store.  This means
+/// that the store provides bits used by the load but we the pointers don't
+/// mustalias.  Check this case to see if there is anything more we can do
+/// before we give up.
+static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
+                                   Type *LoadTy,
+                                   Instruction *InsertPt, const DataLayout &DL){
+  LLVMContext &Ctx = SrcVal->getType()->getContext();
+
+  uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
+  uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
+
+  IRBuilder<> Builder(InsertPt);
+
+  // Compute which bits of the stored value are being used by the load.  Convert
+  // to an integer type to start with.
+  if (SrcVal->getType()->getScalarType()->isPointerTy())
+    SrcVal = Builder.CreatePtrToInt(SrcVal,
+        DL.getIntPtrType(SrcVal->getType()));
+  if (!SrcVal->getType()->isIntegerTy())
+    SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
+
+  // Shift the bits to the least significant depending on endianness.
+  unsigned ShiftAmt;
+  if (DL.isLittleEndian())
+    ShiftAmt = Offset*8;
+  else
+    ShiftAmt = (StoreSize-LoadSize-Offset)*8;
+
+  if (ShiftAmt)
+    SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt);
+
+  if (LoadSize != StoreSize)
+    SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8));
+
+  return CoerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
+}
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering load.  This means
+/// that the load *may* provide bits used by the load but we can't be sure
+/// because the pointers don't mustalias.  Check this case to see if there is
+/// anything more we can do before we give up.
+static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
+                                  Type *LoadTy, Instruction *InsertPt,
+                                  GVN &gvn) {
+  const DataLayout &DL = SrcVal->getModule()->getDataLayout();
+  // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
+  // widen SrcVal out to a larger load.
+  unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+  if (Offset+LoadSize > SrcValSize) {
+    assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
+    assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
+    // If we have a load/load clobber an DepLI can be widened to cover this
+    // load, then we should widen it to the next power of 2 size big enough!
+    unsigned NewLoadSize = Offset+LoadSize;
+    if (!isPowerOf2_32(NewLoadSize))
+      NewLoadSize = NextPowerOf2(NewLoadSize);
+
+    Value *PtrVal = SrcVal->getPointerOperand();
+
+    // Insert the new load after the old load.  This ensures that subsequent
+    // memdep queries will find the new load.  We can't easily remove the old
+    // load completely because it is already in the value numbering table.
+    IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
+    Type *DestPTy =
+      IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
+    DestPTy = PointerType::get(DestPTy,
+                               PtrVal->getType()->getPointerAddressSpace());
+    Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
+    PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
+    LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
+    NewLoad->takeName(SrcVal);
+    NewLoad->setAlignment(SrcVal->getAlignment());
+
+    DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+    DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+
+    // Replace uses of the original load with the wider load.  On a big endian
+    // system, we need to shift down to get the relevant bits.
+    Value *RV = NewLoad;
+    if (DL.isBigEndian())
+      RV = Builder.CreateLShr(RV,
+                    NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits());
+    RV = Builder.CreateTrunc(RV, SrcVal->getType());
+    SrcVal->replaceAllUsesWith(RV);
+
+    // We would like to use gvn.markInstructionForDeletion here, but we can't
+    // because the load is already memoized into the leader map table that GVN
+    // tracks.  It is potentially possible to remove the load from the table,
+    // but then there all of the operations based on it would need to be
+    // rehashed.  Just leave the dead load around.
+    gvn.getMemDep().removeInstruction(SrcVal);
+    SrcVal = NewLoad;
+  }
+
+  return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
+}
+
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering mem intrinsic.
+static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+                                     Type *LoadTy, Instruction *InsertPt,
+                                     const DataLayout &DL){
+  LLVMContext &Ctx = LoadTy->getContext();
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy)/8;
+
+  IRBuilder<> Builder(InsertPt);
+
+  // We know that this method is only called when the mem transfer fully
+  // provides the bits for the load.
+  if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
+    // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
+    // independently of what the offset is.
+    Value *Val = MSI->getValue();
+    if (LoadSize != 1)
+      Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8));
+
+    Value *OneElt = Val;
+
+    // Splat the value out to the right number of bits.
+    for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) {
+      // If we can double the number of bytes set, do it.
+      if (NumBytesSet*2 <= LoadSize) {
+        Value *ShVal = Builder.CreateShl(Val, NumBytesSet*8);
+        Val = Builder.CreateOr(Val, ShVal);
+        NumBytesSet <<= 1;
+        continue;
+      }
+
+      // Otherwise insert one byte at a time.
+      Value *ShVal = Builder.CreateShl(Val, 1*8);
+      Val = Builder.CreateOr(OneElt, ShVal);
+      ++NumBytesSet;
+    }
+
+    return CoerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
+  }
+
+  // Otherwise, this is a memcpy/memmove from a constant global.
+  MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
+  Constant *Src = cast<Constant>(MTI->getSource());
+  unsigned AS = Src->getType()->getPointerAddressSpace();
+
+  // Otherwise, see if we can constant fold a load from the constant with the
+  // offset applied as appropriate.
+  Src = ConstantExpr::getBitCast(Src,
+                                 Type::getInt8PtrTy(Src->getContext(), AS));
+  Constant *OffsetCst =
+    ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+  Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+                                       OffsetCst);
+  Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+  return ConstantFoldLoadFromConstPtr(Src, DL);
+}
+
+
+/// Given a set of loads specified by ValuesPerBlock,
+/// construct SSA form, allowing us to eliminate LI.  This returns the value
+/// that should be used at LI's definition site.
+static Value *ConstructSSAForLoadSet(LoadInst *LI,
+                         SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
+                                     GVN &gvn) {
+  // Check for the fully redundant, dominating load case.  In this case, we can
+  // just use the dominating value directly.
+  if (ValuesPerBlock.size() == 1 &&
+      gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
+                                               LI->getParent())) {
+    assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominate this block");
+    return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn);
+  }
+
+  // Otherwise, we have to construct SSA form.
+  SmallVector<PHINode*, 8> NewPHIs;
+  SSAUpdater SSAUpdate(&NewPHIs);
+  SSAUpdate.Initialize(LI->getType(), LI->getName());
+
+  for (const AvailableValueInBlock &AV : ValuesPerBlock) {
+    BasicBlock *BB = AV.BB;
+
+    if (SSAUpdate.HasValueForBlock(BB))
+      continue;
+
+    SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn));
+  }
+
+  // Perform PHI construction.
+  return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
+}
+
+Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI,
+                                                       GVN &gvn) const {
+  Value *Res;
+  Type *LoadTy = LI->getType();
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+  if (isSimpleValue()) {
+    Res = getSimpleValue();
+    if (Res->getType() != LoadTy) {
+      Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), DL);
+
+      DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << "  "
+                   << *getSimpleValue() << '\n'
+                   << *Res << '\n' << "\n\n\n");
+    }
+  } else if (isCoercedLoadValue()) {
+    LoadInst *Load = getCoercedLoadValue();
+    if (Load->getType() == LoadTy && Offset == 0) {
+      Res = Load;
+    } else {
+      Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(),
+                                gvn);
+  
+      DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << "  "
+                   << *getCoercedLoadValue() << '\n'
+                   << *Res << '\n' << "\n\n\n");
+    }
+  } else if (isMemIntrinValue()) {
+    Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+                                 BB->getTerminator(), DL);
+    DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+                 << "  " << *getMemIntrinValue() << '\n'
+                 << *Res << '\n' << "\n\n\n");
+  } else {
+    assert(isUndefValue() && "Should be UndefVal");
+    DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+    return UndefValue::get(LoadTy);
+  }
+  return Res;
+}
+
+static bool isLifetimeStart(const Instruction *Inst) {
+  if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst))
+    return II->getIntrinsicID() == Intrinsic::lifetime_start;
+  return false;
+}
+
+void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, 
+                                  AvailValInBlkVect &ValuesPerBlock,
+                                  UnavailBlkVect &UnavailableBlocks) {
+
+  // Filter out useless results (non-locals, etc).  Keep track of the blocks
+  // where we have a value available in repl, also keep track of whether we see
+  // dependencies that produce an unknown value for the load (such as a call
+  // that could potentially clobber the load).
+  unsigned NumDeps = Deps.size();
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+  for (unsigned i = 0, e = NumDeps; i != e; ++i) {
+    BasicBlock *DepBB = Deps[i].getBB();
+    MemDepResult DepInfo = Deps[i].getResult();
+
+    if (DeadBlocks.count(DepBB)) {
+      // Dead dependent mem-op disguise as a load evaluating the same value
+      // as the load in question.
+      ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
+      continue;
+    }
+
+    if (!DepInfo.isDef() && !DepInfo.isClobber()) {
+      UnavailableBlocks.push_back(DepBB);
+      continue;
+    }
+
+    if (DepInfo.isClobber()) {
+      // The address being loaded in this non-local block may not be the same as
+      // the pointer operand of the load if PHI translation occurs.  Make sure
+      // to consider the right address.
+      Value *Address = Deps[i].getAddress();
+
+      // If the dependence is to a store that writes to a superset of the bits
+      // read by the load, we can extract the bits we need for the load from the
+      // stored value.
+      if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
+        if (Address) {
+          int Offset =
+              AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI);
+          if (Offset != -1) {
+            ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+                                                       DepSI->getValueOperand(),
+                                                                Offset));
+            continue;
+          }
+        }
+      }
+
+      // Check to see if we have something like this:
+      //    load i32* P
+      //    load i8* (P+1)
+      // if we have this, replace the later with an extraction from the former.
+      if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
+        // If this is a clobber and L is the first instruction in its block, then
+        // we have the first instruction in the entry block.
+        if (DepLI != LI && Address) {
+          int Offset =
+              AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+
+          if (Offset != -1) {
+            ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI,
+                                                                    Offset));
+            continue;
+          }
+        }
+      }
+
+      // If the clobbering value is a memset/memcpy/memmove, see if we can
+      // forward a value on from it.
+      if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
+        if (Address) {
+          int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
+                                                        DepMI, DL);
+          if (Offset != -1) {
+            ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI,
+                                                                  Offset));
+            continue;
+          }
+        }
+      }
+
+      UnavailableBlocks.push_back(DepBB);
+      continue;
+    }
+
+    // DepInfo.isDef() here
+
+    Instruction *DepInst = DepInfo.getInst();
+
+    // Loading the allocation -> undef.
+    if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+        // Loading immediately after lifetime begin -> undef.
+        isLifetimeStart(DepInst)) {
+      ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+                                             UndefValue::get(LI->getType())));
+      continue;
+    }
+
+    // Loading from calloc (which zero initializes memory) -> zero
+    if (isCallocLikeFn(DepInst, TLI)) {
+      ValuesPerBlock.push_back(AvailableValueInBlock::get(
+          DepBB, Constant::getNullValue(LI->getType())));
+      continue;
+    }
+
+    if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
+      // Reject loads and stores that are to the same address but are of
+      // different types if we have to.
+      if (S->getValueOperand()->getType() != LI->getType()) {
+        // If the stored value is larger or equal to the loaded value, we can
+        // reuse it.
+        if (!CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
+                                             LI->getType(), DL)) {
+          UnavailableBlocks.push_back(DepBB);
+          continue;
+        }
+      }
+
+      ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+                                                         S->getValueOperand()));
+      continue;
+    }
+
+    if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
+      // If the types mismatch and we can't handle it, reject reuse of the load.
+      if (LD->getType() != LI->getType()) {
+        // If the stored value is larger or equal to the loaded value, we can
+        // reuse it.
+        if (!CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL)) {
+          UnavailableBlocks.push_back(DepBB);
+          continue;
+        }
+      }
+      ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD));
+      continue;
+    }
+
+    UnavailableBlocks.push_back(DepBB);
+  }
+}
+
+bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, 
+                         UnavailBlkVect &UnavailableBlocks) {
+  // Okay, we have *some* definitions of the value.  This means that the value
+  // is available in some of our (transitive) predecessors.  Lets think about
+  // doing PRE of this load.  This will involve inserting a new load into the
+  // predecessor when it's not available.  We could do this in general, but
+  // prefer to not increase code size.  As such, we only do this when we know
+  // that we only have to insert *one* load (which means we're basically moving
+  // the load, not inserting a new one).
+
+  SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(),
+                                        UnavailableBlocks.end());
+
+  // Let's find the first basic block with more than one predecessor.  Walk
+  // backwards through predecessors if needed.
+  BasicBlock *LoadBB = LI->getParent();
+  BasicBlock *TmpBB = LoadBB;
+
+  while (TmpBB->getSinglePredecessor()) {
+    TmpBB = TmpBB->getSinglePredecessor();
+    if (TmpBB == LoadBB) // Infinite (unreachable) loop.
+      return false;
+    if (Blockers.count(TmpBB))
+      return false;
+
+    // If any of these blocks has more than one successor (i.e. if the edge we
+    // just traversed was critical), then there are other paths through this
+    // block along which the load may not be anticipated.  Hoisting the load
+    // above this block would be adding the load to execution paths along
+    // which it was not previously executed.
+    if (TmpBB->getTerminator()->getNumSuccessors() != 1)
+      return false;
+  }
+
+  assert(TmpBB);
+  LoadBB = TmpBB;
+
+  // Check to see how many predecessors have the loaded value fully
+  // available.
+  MapVector<BasicBlock *, Value *> PredLoads;
+  DenseMap<BasicBlock*, char> FullyAvailableBlocks;
+  for (const AvailableValueInBlock &AV : ValuesPerBlock)
+    FullyAvailableBlocks[AV.BB] = true;
+  for (BasicBlock *UnavailableBB : UnavailableBlocks)
+    FullyAvailableBlocks[UnavailableBB] = false;
+
+  SmallVector<BasicBlock *, 4> CriticalEdgePred;
+  for (BasicBlock *Pred : predecessors(LoadBB)) {
+    // If any predecessor block is an EH pad that does not allow non-PHI
+    // instructions before the terminator, we can't PRE the load.
+    if (Pred->getTerminator()->isEHPad()) {
+      DEBUG(dbgs()
+            << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
+            << Pred->getName() << "': " << *LI << '\n');
+      return false;
+    }
+
+    if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {
+      continue;
+    }
+
+    if (Pred->getTerminator()->getNumSuccessors() != 1) {
+      if (isa<IndirectBrInst>(Pred->getTerminator())) {
+        DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
+              << Pred->getName() << "': " << *LI << '\n');
+        return false;
+      }
+
+      if (LoadBB->isEHPad()) {
+        DEBUG(dbgs()
+              << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
+              << Pred->getName() << "': " << *LI << '\n');
+        return false;
+      }
+
+      CriticalEdgePred.push_back(Pred);
+    } else {
+      // Only add the predecessors that will not be split for now.
+      PredLoads[Pred] = nullptr;
+    }
+  }
+
+  // Decide whether PRE is profitable for this load.
+  unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size();
+  assert(NumUnavailablePreds != 0 &&
+         "Fully available value should already be eliminated!");
+
+  // If this load is unavailable in multiple predecessors, reject it.
+  // FIXME: If we could restructure the CFG, we could make a common pred with
+  // all the preds that don't have an available LI and insert a new load into
+  // that one block.
+  if (NumUnavailablePreds != 1)
+      return false;
+
+  // Split critical edges, and update the unavailable predecessors accordingly.
+  for (BasicBlock *OrigPred : CriticalEdgePred) {
+    BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
+    assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!");
+    PredLoads[NewPred] = nullptr;
+    DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
+                 << LoadBB->getName() << '\n');
+  }
+
+  // Check if the load can safely be moved to all the unavailable predecessors.
+  bool CanDoPRE = true;
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+  SmallVector<Instruction*, 8> NewInsts;
+  for (auto &PredLoad : PredLoads) {
+    BasicBlock *UnavailablePred = PredLoad.first;
+
+    // Do PHI translation to get its value in the predecessor if necessary.  The
+    // returned pointer (if non-null) is guaranteed to dominate UnavailablePred.
+
+    // If all preds have a single successor, then we know it is safe to insert
+    // the load on the pred (?!?), so we can insert code to materialize the
+    // pointer if it is not available.
+    PHITransAddr Address(LI->getPointerOperand(), DL, AC);
+    Value *LoadPtr = nullptr;
+    LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
+                                                *DT, NewInsts);
+
+    // If we couldn't find or insert a computation of this phi translated value,
+    // we fail PRE.
+    if (!LoadPtr) {
+      DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
+            << *LI->getPointerOperand() << "\n");
+      CanDoPRE = false;
+      break;
+    }
+
+    PredLoad.second = LoadPtr;
+  }
+
+  if (!CanDoPRE) {
+    while (!NewInsts.empty()) {
+      Instruction *I = NewInsts.pop_back_val();
+      if (MD) MD->removeInstruction(I);
+      I->eraseFromParent();
+    }
+    // HINT: Don't revert the edge-splitting as following transformation may
+    // also need to split these critical edges.
+    return !CriticalEdgePred.empty();
+  }
+
+  // Okay, we can eliminate this load by inserting a reload in the predecessor
+  // and using PHI construction to get the value in the other predecessors, do
+  // it.
+  DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
+  DEBUG(if (!NewInsts.empty())
+          dbgs() << "INSERTED " << NewInsts.size() << " INSTS: "
+                 << *NewInsts.back() << '\n');
+
+  // Assign value numbers to the new instructions.
+  for (Instruction *I : NewInsts) {
+    // FIXME: We really _ought_ to insert these value numbers into their
+    // parent's availability map.  However, in doing so, we risk getting into
+    // ordering issues.  If a block hasn't been processed yet, we would be
+    // marking a value as AVAIL-IN, which isn't what we intend.
+    VN.lookup_or_add(I);
+  }
+
+  for (const auto &PredLoad : PredLoads) {
+    BasicBlock *UnavailablePred = PredLoad.first;
+    Value *LoadPtr = PredLoad.second;
+
+    Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false,
+                                        LI->getAlignment(),
+                                        UnavailablePred->getTerminator());
+
+    // Transfer the old load's AA tags to the new load.
+    AAMDNodes Tags;
+    LI->getAAMetadata(Tags);
+    if (Tags)
+      NewLoad->setAAMetadata(Tags);
+
+    if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load))
+      NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD);
+    if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group))
+      NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD);
+
+    // Transfer DebugLoc.
+    NewLoad->setDebugLoc(LI->getDebugLoc());
+
+    // Add the newly created load.
+    ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
+                                                        NewLoad));
+    MD->invalidateCachedPointerInfo(LoadPtr);
+    DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
+  }
+
+  // Perform PHI construction.
+  Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+  LI->replaceAllUsesWith(V);
+  if (isa<PHINode>(V))
+    V->takeName(LI);
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    I->setDebugLoc(LI->getDebugLoc());
+  if (V->getType()->getScalarType()->isPointerTy())
+    MD->invalidateCachedPointerInfo(V);
+  markInstructionForDeletion(LI);
+  ++NumPRELoad;
+  return true;
+}
+
+/// Attempt to eliminate a load whose dependencies are
+/// non-local by performing PHI construction.
+bool GVN::processNonLocalLoad(LoadInst *LI) {
+  // non-local speculations are not allowed under asan.
+  if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeAddress))
+    return false;
+
+  // Step 1: Find the non-local dependencies of the load.
+  LoadDepVect Deps;
+  MD->getNonLocalPointerDependency(LI, Deps);
+
+  // If we had to process more than one hundred blocks to find the
+  // dependencies, this load isn't worth worrying about.  Optimizing
+  // it will be too expensive.
+  unsigned NumDeps = Deps.size();
+  if (NumDeps > 100)
+    return false;
+
+  // If we had a phi translation failure, we'll have a single entry which is a
+  // clobber in the current block.  Reject this early.
+  if (NumDeps == 1 &&
+      !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
+    DEBUG(
+      dbgs() << "GVN: non-local load ";
+      LI->printAsOperand(dbgs());
+      dbgs() << " has unknown dependencies\n";
+    );
+    return false;
+  }
+
+  // If this load follows a GEP, see if we can PRE the indices before analyzing.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
+    for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
+                                        OE = GEP->idx_end();
+         OI != OE; ++OI)
+      if (Instruction *I = dyn_cast<Instruction>(OI->get()))
+        performScalarPRE(I);
+  }
+
+  // Step 2: Analyze the availability of the load
+  AvailValInBlkVect ValuesPerBlock;
+  UnavailBlkVect UnavailableBlocks;
+  AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks);
+
+  // If we have no predecessors that produce a known value for this load, exit
+  // early.
+  if (ValuesPerBlock.empty())
+    return false;
+
+  // Step 3: Eliminate fully redundancy.
+  //
+  // If all of the instructions we depend on produce a known value for this
+  // load, then it is fully redundant and we can use PHI insertion to compute
+  // its value.  Insert PHIs and remove the fully redundant value now.
+  if (UnavailableBlocks.empty()) {
+    DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
+
+    // Perform PHI construction.
+    Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+    LI->replaceAllUsesWith(V);
+
+    if (isa<PHINode>(V))
+      V->takeName(LI);
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      if (LI->getDebugLoc())
+        I->setDebugLoc(LI->getDebugLoc());
+    if (V->getType()->getScalarType()->isPointerTy())
+      MD->invalidateCachedPointerInfo(V);
+    markInstructionForDeletion(LI);
+    ++NumGVNLoad;
+    return true;
+  }
+
+  // Step 4: Eliminate partial redundancy.
+  if (!EnablePRE || !EnableLoadPRE)
+    return false;
+
+  return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks);
+}
+
+bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
+  assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume &&
+         "This function can only be called with llvm.assume intrinsic");
+  Value *V = IntrinsicI->getArgOperand(0);
+
+  if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) {
+    if (Cond->isZero()) {
+      Type *Int8Ty = Type::getInt8Ty(V->getContext());
+      // Insert a new store to null instruction before the load to indicate that
+      // this code is not reachable.  FIXME: We could insert unreachable
+      // instruction directly because we can modify the CFG.
+      new StoreInst(UndefValue::get(Int8Ty),
+                    Constant::getNullValue(Int8Ty->getPointerTo()),
+                    IntrinsicI);
+    }
+    markInstructionForDeletion(IntrinsicI);
+    return false;
+  }
+
+  Constant *True = ConstantInt::getTrue(V->getContext());
+  bool Changed = false;
+
+  for (BasicBlock *Successor : successors(IntrinsicI->getParent())) {
+    BasicBlockEdge Edge(IntrinsicI->getParent(), Successor);
+
+    // This property is only true in dominated successors, propagateEquality
+    // will check dominance for us.
+    Changed |= propagateEquality(V, True, Edge, false);
+  }
+
+  // We can replace assume value with true, which covers cases like this:
+  // call void @llvm.assume(i1 %cmp)
+  // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true
+  ReplaceWithConstMap[V] = True;
+
+  // If one of *cmp *eq operand is const, adding it to map will cover this:
+  // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen
+  // call void @llvm.assume(i1 %cmp)
+  // ret float %0 ; will change it to ret float 3.000000e+00
+  if (auto *CmpI = dyn_cast<CmpInst>(V)) {
+    if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ ||
+        CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ ||
+        (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ &&
+         CmpI->getFastMathFlags().noNaNs())) {
+      Value *CmpLHS = CmpI->getOperand(0);
+      Value *CmpRHS = CmpI->getOperand(1);
+      if (isa<Constant>(CmpLHS))
+        std::swap(CmpLHS, CmpRHS);
+      auto *RHSConst = dyn_cast<Constant>(CmpRHS);
+
+      // If only one operand is constant.
+      if (RHSConst != nullptr && !isa<Constant>(CmpLHS))
+        ReplaceWithConstMap[CmpLHS] = RHSConst;
+    }
+  }
+  return Changed;
+}
+
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+  // Patch the replacement so that it is not more restrictive than the value
+  // being replaced.
+  BinaryOperator *Op = dyn_cast<BinaryOperator>(I);
+  BinaryOperator *ReplOp = dyn_cast<BinaryOperator>(Repl);
+  if (Op && ReplOp)
+    ReplOp->andIRFlags(Op);
+
+  if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) {
+    // FIXME: If both the original and replacement value are part of the
+    // same control-flow region (meaning that the execution of one
+    // guarantees the execution of the other), then we can combine the
+    // noalias scopes here and do better than the general conservative
+    // answer used in combineMetadata().
+
+    // In general, GVN unifies expressions over different control-flow
+    // regions, and so we need a conservative combination of the noalias
+    // scopes.
+    static const unsigned KnownIDs[] = {
+        LLVMContext::MD_tbaa,           LLVMContext::MD_alias_scope,
+        LLVMContext::MD_noalias,        LLVMContext::MD_range,
+        LLVMContext::MD_fpmath,         LLVMContext::MD_invariant_load,
+        LLVMContext::MD_invariant_group};
+    combineMetadata(ReplInst, I, KnownIDs);
+  }
+}
+
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+  patchReplacementInstruction(I, Repl);
+  I->replaceAllUsesWith(Repl);
+}
+
+/// Attempt to eliminate a load, first by eliminating it
+/// locally, and then attempting non-local elimination if that fails.
+bool GVN::processLoad(LoadInst *L) {
+  if (!MD)
+    return false;
+
+  if (!L->isSimple())
+    return false;
+
+  if (L->use_empty()) {
+    markInstructionForDeletion(L);
+    return true;
+  }
+
+  // ... to a pointer that has been loaded from before...
+  MemDepResult Dep = MD->getDependency(L);
+  const DataLayout &DL = L->getModule()->getDataLayout();
+
+  // If we have a clobber and target data is around, see if this is a clobber
+  // that we can fix up through code synthesis.
+  if (Dep.isClobber()) {
+    // Check to see if we have something like this:
+    //   store i32 123, i32* %P
+    //   %A = bitcast i32* %P to i8*
+    //   %B = gep i8* %A, i32 1
+    //   %C = load i8* %B
+    //
+    // We could do that by recognizing if the clobber instructions are obviously
+    // a common base + constant offset, and if the previous store (or memset)
+    // completely covers this load.  This sort of thing can happen in bitfield
+    // access code.
+    Value *AvailVal = nullptr;
+    if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) {
+      int Offset = AnalyzeLoadFromClobberingStore(
+          L->getType(), L->getPointerOperand(), DepSI);
+      if (Offset != -1)
+        AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset,
+                                        L->getType(), L, DL);
+    }
+
+    // Check to see if we have something like this:
+    //    load i32* P
+    //    load i8* (P+1)
+    // if we have this, replace the later with an extraction from the former.
+    if (LoadInst *DepLI = dyn_cast<LoadInst>(Dep.getInst())) {
+      // If this is a clobber and L is the first instruction in its block, then
+      // we have the first instruction in the entry block.
+      if (DepLI == L)
+        return false;
+
+      int Offset = AnalyzeLoadFromClobberingLoad(
+          L->getType(), L->getPointerOperand(), DepLI, DL);
+      if (Offset != -1)
+        AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this);
+    }
+
+    // If the clobbering value is a memset/memcpy/memmove, see if we can forward
+    // a value on from it.
+    if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) {
+      int Offset = AnalyzeLoadFromClobberingMemInst(
+          L->getType(), L->getPointerOperand(), DepMI, DL);
+      if (Offset != -1)
+        AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, DL);
+    }
+
+    if (AvailVal) {
+      DEBUG(dbgs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n'
+            << *AvailVal << '\n' << *L << "\n\n\n");
+
+      // Replace the load!
+      L->replaceAllUsesWith(AvailVal);
+      if (AvailVal->getType()->getScalarType()->isPointerTy())
+        MD->invalidateCachedPointerInfo(AvailVal);
+      markInstructionForDeletion(L);
+      ++NumGVNLoad;
+      return true;
+    }
+
+    // If the value isn't available, don't do anything!
+    DEBUG(
+      // fast print dep, using operator<< on instruction is too slow.
+      dbgs() << "GVN: load ";
+      L->printAsOperand(dbgs());
+      Instruction *I = Dep.getInst();
+      dbgs() << " is clobbered by " << *I << '\n';
+    );
+    return false;
+  }
+
+  // If it is defined in another block, try harder.
+  if (Dep.isNonLocal())
+    return processNonLocalLoad(L);
+
+  if (!Dep.isDef()) {
+    DEBUG(
+      // fast print dep, using operator<< on instruction is too slow.
+      dbgs() << "GVN: load ";
+      L->printAsOperand(dbgs());
+      dbgs() << " has unknown dependence\n";
+    );
+    return false;
+  }
+
+  Instruction *DepInst = Dep.getInst();
+  if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
+    Value *StoredVal = DepSI->getValueOperand();
+
+    // The store and load are to a must-aliased pointer, but they may not
+    // actually have the same type.  See if we know how to reuse the stored
+    // value (depending on its type).
+    if (StoredVal->getType() != L->getType()) {
+      IRBuilder<> Builder(L);
+      StoredVal =
+          CoerceAvailableValueToLoadType(StoredVal, L->getType(), Builder, DL);
+      if (!StoredVal)
+        return false;
+
+      DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal
+                   << '\n' << *L << "\n\n\n");
+    }
+
+    // Remove it!
+    L->replaceAllUsesWith(StoredVal);
+    if (StoredVal->getType()->getScalarType()->isPointerTy())
+      MD->invalidateCachedPointerInfo(StoredVal);
+    markInstructionForDeletion(L);
+    ++NumGVNLoad;
+    return true;
+  }
+
+  if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+    Value *AvailableVal = DepLI;
+
+    // The loads are of a must-aliased pointer, but they may not actually have
+    // the same type.  See if we know how to reuse the previously loaded value
+    // (depending on its type).
+    if (DepLI->getType() != L->getType()) {
+      IRBuilder<> Builder(L);
+      AvailableVal =
+          CoerceAvailableValueToLoadType(DepLI, L->getType(), Builder, DL);
+      if (!AvailableVal)
+        return false;
+
+      DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal
+                   << "\n" << *L << "\n\n\n");
+    }
+
+    // Remove it!
+    patchAndReplaceAllUsesWith(L, AvailableVal);
+    if (DepLI->getType()->getScalarType()->isPointerTy())
+      MD->invalidateCachedPointerInfo(DepLI);
+    markInstructionForDeletion(L);
+    ++NumGVNLoad;
+    return true;
+  }
+
+  // If this load really doesn't depend on anything, then we must be loading an
+  // undef value.  This can happen when loading for a fresh allocation with no
+  // intervening stores, for example.
+  if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
+    L->replaceAllUsesWith(UndefValue::get(L->getType()));
+    markInstructionForDeletion(L);
+    ++NumGVNLoad;
+    return true;
+  }
+
+  // If this load occurs either right after a lifetime begin,
+  // then the loaded value is undefined.
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) {
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
+      L->replaceAllUsesWith(UndefValue::get(L->getType()));
+      markInstructionForDeletion(L);
+      ++NumGVNLoad;
+      return true;
+    }
+  }
+
+  // If this load follows a calloc (which zero initializes memory),
+  // then the loaded value is zero
+  if (isCallocLikeFn(DepInst, TLI)) {
+    L->replaceAllUsesWith(Constant::getNullValue(L->getType()));
+    markInstructionForDeletion(L);
+    ++NumGVNLoad;
+    return true;
+  }
+
+  return false;
+}
+
+// In order to find a leader for a given value number at a
+// specific basic block, we first obtain the list of all Values for that number,
+// and then scan the list to find one whose block dominates the block in
+// question.  This is fast because dominator tree queries consist of only
+// a few comparisons of DFS numbers.
+Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
+  LeaderTableEntry Vals = LeaderTable[num];
+  if (!Vals.Val) return nullptr;
+
+  Value *Val = nullptr;
+  if (DT->dominates(Vals.BB, BB)) {
+    Val = Vals.Val;
+    if (isa<Constant>(Val)) return Val;
+  }
+
+  LeaderTableEntry* Next = Vals.Next;
+  while (Next) {
+    if (DT->dominates(Next->BB, BB)) {
+      if (isa<Constant>(Next->Val)) return Next->Val;
+      if (!Val) Val = Next->Val;
+    }
+
+    Next = Next->Next;
+  }
+
+  return Val;
+}
+
+/// There is an edge from 'Src' to 'Dst'.  Return
+/// true if every path from the entry block to 'Dst' passes via this edge.  In
+/// particular 'Dst' must not be reachable via another edge from 'Src'.
+static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
+                                       DominatorTree *DT) {
+  // While in theory it is interesting to consider the case in which Dst has
+  // more than one predecessor, because Dst might be part of a loop which is
+  // only reachable from Src, in practice it is pointless since at the time
+  // GVN runs all such loops have preheaders, which means that Dst will have
+  // been changed to have only one predecessor, namely Src.
+  const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
+  const BasicBlock *Src = E.getStart();
+  assert((!Pred || Pred == Src) && "No edge between these basic blocks!");
+  (void)Src;
+  return Pred != nullptr;
+}
+
+// Tries to replace instruction with const, using information from
+// ReplaceWithConstMap.
+bool GVN::replaceOperandsWithConsts(Instruction *Instr) const {
+  bool Changed = false;
+  for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) {
+    Value *Operand = Instr->getOperand(OpNum);
+    auto it = ReplaceWithConstMap.find(Operand);
+    if (it != ReplaceWithConstMap.end()) {
+      assert(!isa<Constant>(Operand) &&
+             "Replacing constants with constants is invalid");
+      DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second
+                   << " in instruction " << *Instr << '\n');
+      Instr->setOperand(OpNum, it->second);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+/// The given values are known to be equal in every block
+/// dominated by 'Root'.  Exploit this, for example by replacing 'LHS' with
+/// 'RHS' everywhere in the scope.  Returns whether a change was made.
+/// If DominatesByEdge is false, then it means that it is dominated by Root.End.
+bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
+                            bool DominatesByEdge) {
+  SmallVector<std::pair<Value*, Value*>, 4> Worklist;
+  Worklist.push_back(std::make_pair(LHS, RHS));
+  bool Changed = false;
+  // For speed, compute a conservative fast approximation to
+  // DT->dominates(Root, Root.getEnd());
+  bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
+
+  while (!Worklist.empty()) {
+    std::pair<Value*, Value*> Item = Worklist.pop_back_val();
+    LHS = Item.first; RHS = Item.second;
+
+    if (LHS == RHS)
+      continue;
+    assert(LHS->getType() == RHS->getType() && "Equality but unequal types!");
+
+    // Don't try to propagate equalities between constants.
+    if (isa<Constant>(LHS) && isa<Constant>(RHS))
+      continue;
+
+    // Prefer a constant on the right-hand side, or an Argument if no constants.
+    if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS)))
+      std::swap(LHS, RHS);
+    assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
+
+    // If there is no obvious reason to prefer the left-hand side over the
+    // right-hand side, ensure the longest lived term is on the right-hand side,
+    // so the shortest lived term will be replaced by the longest lived.
+    // This tends to expose more simplifications.
+    uint32_t LVN = VN.lookup_or_add(LHS);
+    if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
+        (isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
+      // Move the 'oldest' value to the right-hand side, using the value number
+      // as a proxy for age.
+      uint32_t RVN = VN.lookup_or_add(RHS);
+      if (LVN < RVN) {
+        std::swap(LHS, RHS);
+        LVN = RVN;
+      }
+    }
+
+    // If value numbering later sees that an instruction in the scope is equal
+    // to 'LHS' then ensure it will be turned into 'RHS'.  In order to preserve
+    // the invariant that instructions only occur in the leader table for their
+    // own value number (this is used by removeFromLeaderTable), do not do this
+    // if RHS is an instruction (if an instruction in the scope is morphed into
+    // LHS then it will be turned into RHS by the next GVN iteration anyway, so
+    // using the leader table is about compiling faster, not optimizing better).
+    // The leader table only tracks basic blocks, not edges. Only add to if we
+    // have the simple case where the edge dominates the end.
+    if (RootDominatesEnd && !isa<Instruction>(RHS))
+      addToLeaderTable(LVN, RHS, Root.getEnd());
+
+    // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope.  As
+    // LHS always has at least one use that is not dominated by Root, this will
+    // never do anything if LHS has only one use.
+    if (!LHS->hasOneUse()) {
+      unsigned NumReplacements =
+          DominatesByEdge
+              ? replaceDominatedUsesWith(LHS, RHS, *DT, Root)
+              : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getEnd());
+
+      Changed |= NumReplacements > 0;
+      NumGVNEqProp += NumReplacements;
+    }
+
+    // Now try to deduce additional equalities from this one. For example, if
+    // the known equality was "(A != B)" == "false" then it follows that A and B
+    // are equal in the scope. Only boolean equalities with an explicit true or
+    // false RHS are currently supported.
+    if (!RHS->getType()->isIntegerTy(1))
+      // Not a boolean equality - bail out.
+      continue;
+    ConstantInt *CI = dyn_cast<ConstantInt>(RHS);
+    if (!CI)
+      // RHS neither 'true' nor 'false' - bail out.
+      continue;
+    // Whether RHS equals 'true'.  Otherwise it equals 'false'.
+    bool isKnownTrue = CI->isAllOnesValue();
+    bool isKnownFalse = !isKnownTrue;
+
+    // If "A && B" is known true then both A and B are known true.  If "A || B"
+    // is known false then both A and B are known false.
+    Value *A, *B;
+    if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) ||
+        (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) {
+      Worklist.push_back(std::make_pair(A, RHS));
+      Worklist.push_back(std::make_pair(B, RHS));
+      continue;
+    }
+
+    // If we are propagating an equality like "(A == B)" == "true" then also
+    // propagate the equality A == B.  When propagating a comparison such as
+    // "(A >= B)" == "true", replace all instances of "A < B" with "false".
+    if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) {
+      Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
+
+      // If "A == B" is known true, or "A != B" is known false, then replace
+      // A with B everywhere in the scope.
+      if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) ||
+          (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE))
+        Worklist.push_back(std::make_pair(Op0, Op1));
+
+      // Handle the floating point versions of equality comparisons too.
+      if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) ||
+          (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) {
+
+        // Floating point -0.0 and 0.0 compare equal, so we can only
+        // propagate values if we know that we have a constant and that
+        // its value is non-zero.
+        
+        // FIXME: We should do this optimization if 'no signed zeros' is
+        // applicable via an instruction-level fast-math-flag or some other
+        // indicator that relaxed FP semantics are being used.
+
+        if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero())
+          Worklist.push_back(std::make_pair(Op0, Op1));
+      }
+ 
+      // If "A >= B" is known true, replace "A < B" with false everywhere.
+      CmpInst::Predicate NotPred = Cmp->getInversePredicate();
+      Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
+      // Since we don't have the instruction "A < B" immediately to hand, work
+      // out the value number that it would have and use that to find an
+      // appropriate instruction (if any).
+      uint32_t NextNum = VN.getNextUnusedValueNumber();
+      uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1);
+      // If the number we were assigned was brand new then there is no point in
+      // looking for an instruction realizing it: there cannot be one!
+      if (Num < NextNum) {
+        Value *NotCmp = findLeader(Root.getEnd(), Num);
+        if (NotCmp && isa<Instruction>(NotCmp)) {
+          unsigned NumReplacements =
+              DominatesByEdge
+                  ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root)
+                  : replaceDominatedUsesWith(NotCmp, NotVal, *DT,
+                                             Root.getEnd());
+          Changed |= NumReplacements > 0;
+          NumGVNEqProp += NumReplacements;
+        }
+      }
+      // Ensure that any instruction in scope that gets the "A < B" value number
+      // is replaced with false.
+      // The leader table only tracks basic blocks, not edges. Only add to if we
+      // have the simple case where the edge dominates the end.
+      if (RootDominatesEnd)
+        addToLeaderTable(Num, NotVal, Root.getEnd());
+
+      continue;
+    }
+  }
+
+  return Changed;
+}
+
+/// When calculating availability, handle an instruction
+/// by inserting it into the appropriate sets
+bool GVN::processInstruction(Instruction *I) {
+  // Ignore dbg info intrinsics.
+  if (isa<DbgInfoIntrinsic>(I))
+    return false;
+
+  // If the instruction can be easily simplified then do so now in preference
+  // to value numbering it.  Value numbering often exposes redundancies, for
+  // example if it determines that %y is equal to %x then the instruction
+  // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
+  const DataLayout &DL = I->getModule()->getDataLayout();
+  if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
+    I->replaceAllUsesWith(V);
+    if (MD && V->getType()->getScalarType()->isPointerTy())
+      MD->invalidateCachedPointerInfo(V);
+    markInstructionForDeletion(I);
+    ++NumGVNSimpl;
+    return true;
+  }
+
+  if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I))
+    if (IntrinsicI->getIntrinsicID() == Intrinsic::assume)
+      return processAssumeIntrinsic(IntrinsicI);
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (processLoad(LI))
+      return true;
+
+    unsigned Num = VN.lookup_or_add(LI);
+    addToLeaderTable(Num, LI, LI->getParent());
+    return false;
+  }
+
+  // For conditional branches, we can perform simple conditional propagation on
+  // the condition value itself.
+  if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+    if (!BI->isConditional())
+      return false;
+
+    if (isa<Constant>(BI->getCondition()))
+      return processFoldableCondBr(BI);
+
+    Value *BranchCond = BI->getCondition();
+    BasicBlock *TrueSucc = BI->getSuccessor(0);
+    BasicBlock *FalseSucc = BI->getSuccessor(1);
+    // Avoid multiple edges early.
+    if (TrueSucc == FalseSucc)
+      return false;
+
+    BasicBlock *Parent = BI->getParent();
+    bool Changed = false;
+
+    Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext());
+    BasicBlockEdge TrueE(Parent, TrueSucc);
+    Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true);
+
+    Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext());
+    BasicBlockEdge FalseE(Parent, FalseSucc);
+    Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true);
+
+    return Changed;
+  }
+
+  // For switches, propagate the case values into the case destinations.
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+    Value *SwitchCond = SI->getCondition();
+    BasicBlock *Parent = SI->getParent();
+    bool Changed = false;
+
+    // Remember how many outgoing edges there are to every successor.
+    SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+    for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
+      ++SwitchEdges[SI->getSuccessor(i)];
+
+    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+         i != e; ++i) {
+      BasicBlock *Dst = i.getCaseSuccessor();
+      // If there is only a single edge, propagate the case value into it.
+      if (SwitchEdges.lookup(Dst) == 1) {
+        BasicBlockEdge E(Parent, Dst);
+        Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true);
+      }
+    }
+    return Changed;
+  }
+
+  // Instructions with void type don't return a value, so there's
+  // no point in trying to find redundancies in them.
+  if (I->getType()->isVoidTy())
+    return false;
+
+  uint32_t NextNum = VN.getNextUnusedValueNumber();
+  unsigned Num = VN.lookup_or_add(I);
+
+  // Allocations are always uniquely numbered, so we can save time and memory
+  // by fast failing them.
+  if (isa<AllocaInst>(I) || isa<TerminatorInst>(I) || isa<PHINode>(I)) {
+    addToLeaderTable(Num, I, I->getParent());
+    return false;
+  }
+
+  // If the number we were assigned was a brand new VN, then we don't
+  // need to do a lookup to see if the number already exists
+  // somewhere in the domtree: it can't!
+  if (Num >= NextNum) {
+    addToLeaderTable(Num, I, I->getParent());
+    return false;
+  }
+
+  // Perform fast-path value-number based elimination of values inherited from
+  // dominators.
+  Value *Repl = findLeader(I->getParent(), Num);
+  if (!Repl) {
+    // Failure, just remember this instance for future use.
+    addToLeaderTable(Num, I, I->getParent());
+    return false;
+  } else if (Repl == I) {
+    // If I was the result of a shortcut PRE, it might already be in the table
+    // and the best replacement for itself. Nothing to do.
+    return false;
+  }
+
+  // Remove it!
+  patchAndReplaceAllUsesWith(I, Repl);
+  if (MD && Repl->getType()->getScalarType()->isPointerTy())
+    MD->invalidateCachedPointerInfo(Repl);
+  markInstructionForDeletion(I);
+  return true;
+}
+
+/// runOnFunction - This is the main transformation entry point for a function.
+bool GVN::runOnFunction(Function& F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  if (!NoLoads)
+    MD = &getAnalysis<MemoryDependenceAnalysis>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  VN.setAliasAnalysis(&getAnalysis<AAResultsWrapperPass>().getAAResults());
+  VN.setMemDep(MD);
+  VN.setDomTree(DT);
+
+  bool Changed = false;
+  bool ShouldContinue = true;
+
+  // Merge unconditional branches, allowing PRE to catch more
+  // optimization opportunities.
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
+    BasicBlock *BB = &*FI++;
+
+    bool removedBlock =
+        MergeBlockIntoPredecessor(BB, DT, /* LoopInfo */ nullptr, MD);
+    if (removedBlock) ++NumGVNBlocks;
+
+    Changed |= removedBlock;
+  }
+
+  unsigned Iteration = 0;
+  while (ShouldContinue) {
+    DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
+    ShouldContinue = iterateOnFunction(F);
+    Changed |= ShouldContinue;
+    ++Iteration;
+  }
+
+  if (EnablePRE) {
+    // Fabricate val-num for dead-code in order to suppress assertion in
+    // performPRE().
+    assignValNumForDeadCode();
+    bool PREChanged = true;
+    while (PREChanged) {
+      PREChanged = performPRE(F);
+      Changed |= PREChanged;
+    }
+  }
+
+  // FIXME: Should perform GVN again after PRE does something.  PRE can move
+  // computations into blocks where they become fully redundant.  Note that
+  // we can't do this until PRE's critical edge splitting updates memdep.
+  // Actually, when this happens, we should just fully integrate PRE into GVN.
+
+  cleanupGlobalSets();
+  // Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each
+  // iteration. 
+  DeadBlocks.clear();
+
+  return Changed;
+}
+
+bool GVN::processBlock(BasicBlock *BB) {
+  // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function
+  // (and incrementing BI before processing an instruction).
+  assert(InstrsToErase.empty() &&
+         "We expect InstrsToErase to be empty across iterations");
+  if (DeadBlocks.count(BB))
+    return false;
+
+  // Clearing map before every BB because it can be used only for single BB.
+  ReplaceWithConstMap.clear();
+  bool ChangedFunction = false;
+
+  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+       BI != BE;) {
+    if (!ReplaceWithConstMap.empty())
+      ChangedFunction |= replaceOperandsWithConsts(&*BI);
+    ChangedFunction |= processInstruction(&*BI);
+
+    if (InstrsToErase.empty()) {
+      ++BI;
+      continue;
+    }
+
+    // If we need some instructions deleted, do it now.
+    NumGVNInstr += InstrsToErase.size();
+
+    // Avoid iterator invalidation.
+    bool AtStart = BI == BB->begin();
+    if (!AtStart)
+      --BI;
+
+    for (SmallVectorImpl<Instruction *>::iterator I = InstrsToErase.begin(),
+         E = InstrsToErase.end(); I != E; ++I) {
+      DEBUG(dbgs() << "GVN removed: " << **I << '\n');
+      if (MD) MD->removeInstruction(*I);
+      DEBUG(verifyRemoved(*I));
+      (*I)->eraseFromParent();
+    }
+    InstrsToErase.clear();
+
+    if (AtStart)
+      BI = BB->begin();
+    else
+      ++BI;
+  }
+
+  return ChangedFunction;
+}
+
+// Instantiate an expression in a predecessor that lacked it.
+bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+                                    unsigned int ValNo) {
+  // Because we are going top-down through the block, all value numbers
+  // will be available in the predecessor by the time we need them.  Any
+  // that weren't originally present will have been instantiated earlier
+  // in this loop.
+  bool success = true;
+  for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) {
+    Value *Op = Instr->getOperand(i);
+    if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
+      continue;
+    // This could be a newly inserted instruction, in which case, we won't
+    // find a value number, and should give up before we hurt ourselves.
+    // FIXME: Rewrite the infrastructure to let it easier to value number
+    // and process newly inserted instructions.
+    if (!VN.exists(Op)) {
+      success = false;
+      break;
+    }
+    if (Value *V = findLeader(Pred, VN.lookup(Op))) {
+      Instr->setOperand(i, V);
+    } else {
+      success = false;
+      break;
+    }
+  }
+
+  // Fail out if we encounter an operand that is not available in
+  // the PRE predecessor.  This is typically because of loads which
+  // are not value numbered precisely.
+  if (!success)
+    return false;
+
+  Instr->insertBefore(Pred->getTerminator());
+  Instr->setName(Instr->getName() + ".pre");
+  Instr->setDebugLoc(Instr->getDebugLoc());
+  VN.add(Instr, ValNo);
+
+  // Update the availability map to include the new instruction.
+  addToLeaderTable(ValNo, Instr, Pred);
+  return true;
+}
+
+bool GVN::performScalarPRE(Instruction *CurInst) {
+  SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap;
+
+  if (isa<AllocaInst>(CurInst) || isa<TerminatorInst>(CurInst) ||
+      isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
+      CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+      isa<DbgInfoIntrinsic>(CurInst))
+    return false;
+
+  // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
+  // sinking the compare again, and it would force the code generator to
+  // move the i1 from processor flags or predicate registers into a general
+  // purpose register.
+  if (isa<CmpInst>(CurInst))
+    return false;
+
+  // We don't currently value number ANY inline asm calls.
+  if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
+    if (CallI->isInlineAsm())
+      return false;
+
+  uint32_t ValNo = VN.lookup(CurInst);
+
+  // Look for the predecessors for PRE opportunities.  We're
+  // only trying to solve the basic diamond case, where
+  // a value is computed in the successor and one predecessor,
+  // but not the other.  We also explicitly disallow cases
+  // where the successor is its own predecessor, because they're
+  // more complicated to get right.
+  unsigned NumWith = 0;
+  unsigned NumWithout = 0;
+  BasicBlock *PREPred = nullptr;
+  BasicBlock *CurrentBlock = CurInst->getParent();
+  predMap.clear();
+
+  for (BasicBlock *P : predecessors(CurrentBlock)) {
+    // We're not interested in PRE where the block is its
+    // own predecessor, or in blocks with predecessors
+    // that are not reachable.
+    if (P == CurrentBlock) {
+      NumWithout = 2;
+      break;
+    } else if (!DT->isReachableFromEntry(P)) {
+      NumWithout = 2;
+      break;
+    }
+
+    Value *predV = findLeader(P, ValNo);
+    if (!predV) {
+      predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
+      PREPred = P;
+      ++NumWithout;
+    } else if (predV == CurInst) {
+      /* CurInst dominates this predecessor. */
+      NumWithout = 2;
+      break;
+    } else {
+      predMap.push_back(std::make_pair(predV, P));
+      ++NumWith;
+    }
+  }
+
+  // Don't do PRE when it might increase code size, i.e. when
+  // we would need to insert instructions in more than one pred.
+  if (NumWithout > 1 || NumWith == 0)
+    return false;
+
+  // We may have a case where all predecessors have the instruction,
+  // and we just need to insert a phi node. Otherwise, perform
+  // insertion.
+  Instruction *PREInstr = nullptr;
+
+  if (NumWithout != 0) {
+    // Don't do PRE across indirect branch.
+    if (isa<IndirectBrInst>(PREPred->getTerminator()))
+      return false;
+
+    // We can't do PRE safely on a critical edge, so instead we schedule
+    // the edge to be split and perform the PRE the next time we iterate
+    // on the function.
+    unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
+    if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
+      toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
+      return false;
+    }
+    // We need to insert somewhere, so let's give it a shot
+    PREInstr = CurInst->clone();
+    if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) {
+      // If we failed insertion, make sure we remove the instruction.
+      DEBUG(verifyRemoved(PREInstr));
+      delete PREInstr;
+      return false;
+    }
+  }
+
+  // Either we should have filled in the PRE instruction, or we should
+  // not have needed insertions.
+  assert (PREInstr != nullptr || NumWithout == 0);
+
+  ++NumGVNPRE;
+
+  // Create a PHI to make the value available in this block.
+  PHINode *Phi =
+      PHINode::Create(CurInst->getType(), predMap.size(),
+                      CurInst->getName() + ".pre-phi", &CurrentBlock->front());
+  for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
+    if (Value *V = predMap[i].first)
+      Phi->addIncoming(V, predMap[i].second);
+    else
+      Phi->addIncoming(PREInstr, PREPred);
+  }
+
+  VN.add(Phi, ValNo);
+  addToLeaderTable(ValNo, Phi, CurrentBlock);
+  Phi->setDebugLoc(CurInst->getDebugLoc());
+  CurInst->replaceAllUsesWith(Phi);
+  if (MD && Phi->getType()->getScalarType()->isPointerTy())
+    MD->invalidateCachedPointerInfo(Phi);
+  VN.erase(CurInst);
+  removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+
+  DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+  if (MD)
+    MD->removeInstruction(CurInst);
+  DEBUG(verifyRemoved(CurInst));
+  CurInst->eraseFromParent();
+  ++NumGVNInstr;
+  
+  return true;
+}
+
+/// Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
+bool GVN::performPRE(Function &F) {
+  bool Changed = false;
+  for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
+    // Nothing to PRE in the entry block.
+    if (CurrentBlock == &F.getEntryBlock())
+      continue;
+
+    // Don't perform PRE on an EH pad.
+    if (CurrentBlock->isEHPad())
+      continue;
+
+    for (BasicBlock::iterator BI = CurrentBlock->begin(),
+                              BE = CurrentBlock->end();
+         BI != BE;) {
+      Instruction *CurInst = &*BI++;
+      Changed |= performScalarPRE(CurInst);
+    }
+  }
+
+  if (splitCriticalEdges())
+    Changed = true;
+
+  return Changed;
+}
+
+/// Split the critical edge connecting the given two blocks, and return
+/// the block inserted to the critical edge.
+BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
+  BasicBlock *BB =
+      SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT));
+  if (MD)
+    MD->invalidateCachedPredecessors();
+  return BB;
+}
+
+/// Split critical edges found during the previous
+/// iteration that may enable further optimization.
+bool GVN::splitCriticalEdges() {
+  if (toSplit.empty())
+    return false;
+  do {
+    std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val();
+    SplitCriticalEdge(Edge.first, Edge.second,
+                      CriticalEdgeSplittingOptions(DT));
+  } while (!toSplit.empty());
+  if (MD) MD->invalidateCachedPredecessors();
+  return true;
+}
+
+/// Executes one iteration of GVN
+bool GVN::iterateOnFunction(Function &F) {
+  cleanupGlobalSets();
+
+  // Top-down walk of the dominator tree
+  bool Changed = false;
+  // Save the blocks this function have before transformation begins. GVN may
+  // split critical edge, and hence may invalidate the RPO/DT iterator.
+  //
+  std::vector<BasicBlock *> BBVect;
+  BBVect.reserve(256);
+  // Needed for value numbering with phi construction to work.
+  ReversePostOrderTraversal<Function *> RPOT(&F);
+  for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(),
+                                                           RE = RPOT.end();
+       RI != RE; ++RI)
+    BBVect.push_back(*RI);
+
+  for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
+       I != E; I++)
+    Changed |= processBlock(*I);
+
+  return Changed;
+}
+
+void GVN::cleanupGlobalSets() {
+  VN.clear();
+  LeaderTable.clear();
+  TableAllocator.Reset();
+}
+
+/// Verify that the specified instruction does not occur in our
+/// internal data structures.
+void GVN::verifyRemoved(const Instruction *Inst) const {
+  VN.verifyRemoved(Inst);
+
+  // Walk through the value number scope to make sure the instruction isn't
+  // ferreted away in it.
+  for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator
+       I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) {
+    const LeaderTableEntry *Node = &I->second;
+    assert(Node->Val != Inst && "Inst still in value numbering scope!");
+
+    while (Node->Next) {
+      Node = Node->Next;
+      assert(Node->Val != Inst && "Inst still in value numbering scope!");
+    }
+  }
+}
+
+/// BB is declared dead, which implied other blocks become dead as well. This
+/// function is to add all these blocks to "DeadBlocks". For the dead blocks'
+/// live successors, update their phi nodes by replacing the operands
+/// corresponding to dead blocks with UndefVal.
+void GVN::addDeadBlock(BasicBlock *BB) {
+  SmallVector<BasicBlock *, 4> NewDead;
+  SmallSetVector<BasicBlock *, 4> DF;
+
+  NewDead.push_back(BB);
+  while (!NewDead.empty()) {
+    BasicBlock *D = NewDead.pop_back_val();
+    if (DeadBlocks.count(D))
+      continue;
+
+    // All blocks dominated by D are dead.
+    SmallVector<BasicBlock *, 8> Dom;
+    DT->getDescendants(D, Dom);
+    DeadBlocks.insert(Dom.begin(), Dom.end());
+    
+    // Figure out the dominance-frontier(D).
+    for (BasicBlock *B : Dom) {
+      for (BasicBlock *S : successors(B)) {
+        if (DeadBlocks.count(S))
+          continue;
+
+        bool AllPredDead = true;
+        for (BasicBlock *P : predecessors(S))
+          if (!DeadBlocks.count(P)) {
+            AllPredDead = false;
+            break;
+          }
+
+        if (!AllPredDead) {
+          // S could be proved dead later on. That is why we don't update phi
+          // operands at this moment.
+          DF.insert(S);
+        } else {
+          // While S is not dominated by D, it is dead by now. This could take
+          // place if S already have a dead predecessor before D is declared
+          // dead.
+          NewDead.push_back(S);
+        }
+      }
+    }
+  }
+
+  // For the dead blocks' live successors, update their phi nodes by replacing
+  // the operands corresponding to dead blocks with UndefVal.
+  for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
+        I != E; I++) {
+    BasicBlock *B = *I;
+    if (DeadBlocks.count(B))
+      continue;
+
+    SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B));
+    for (BasicBlock *P : Preds) {
+      if (!DeadBlocks.count(P))
+        continue;
+
+      if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) {
+        if (BasicBlock *S = splitCriticalEdges(P, B))
+          DeadBlocks.insert(P = S);
+      }
+
+      for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) {
+        PHINode &Phi = cast<PHINode>(*II);
+        Phi.setIncomingValue(Phi.getBasicBlockIndex(P),
+                             UndefValue::get(Phi.getType()));
+      }
+    }
+  }
+}
+
+// If the given branch is recognized as a foldable branch (i.e. conditional
+// branch with constant condition), it will perform following analyses and
+// transformation.
+//  1) If the dead out-coming edge is a critical-edge, split it. Let 
+//     R be the target of the dead out-coming edge.
+//  1) Identify the set of dead blocks implied by the branch's dead outcoming
+//     edge. The result of this step will be {X| X is dominated by R}
+//  2) Identify those blocks which haves at least one dead predecessor. The
+//     result of this step will be dominance-frontier(R).
+//  3) Update the PHIs in DF(R) by replacing the operands corresponding to 
+//     dead blocks with "UndefVal" in an hope these PHIs will optimized away.
+//
+// Return true iff *NEW* dead code are found.
+bool GVN::processFoldableCondBr(BranchInst *BI) {
+  if (!BI || BI->isUnconditional())
+    return false;
+
+  // If a branch has two identical successors, we cannot declare either dead.
+  if (BI->getSuccessor(0) == BI->getSuccessor(1))
+    return false;
+
+  ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+  if (!Cond)
+    return false;
+
+  BasicBlock *DeadRoot = Cond->getZExtValue() ? 
+                         BI->getSuccessor(1) : BI->getSuccessor(0);
+  if (DeadBlocks.count(DeadRoot))
+    return false;
+
+  if (!DeadRoot->getSinglePredecessor())
+    DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
+
+  addDeadBlock(DeadRoot);
+  return true;
+}
+
+// performPRE() will trigger assert if it comes across an instruction without
+// associated val-num. As it normally has far more live instructions than dead
+// instructions, it makes more sense just to "fabricate" a val-number for the
+// dead code than checking if instruction involved is dead or not.
+void GVN::assignValNumForDeadCode() {
+  for (BasicBlock *BB : DeadBlocks) {
+    for (Instruction &Inst : *BB) {
+      unsigned ValNum = VN.lookup_or_add(&Inst);
+      addToLeaderTable(ValNum, &Inst, BB);
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
new file mode 100644
index 0000000..ec5e15f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -0,0 +1,2199 @@
+//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into simpler forms suitable for subsequent
+// analysis and transformation.
+//
+// If the trip count of a loop is computable, this pass also makes the following
+// changes:
+//   1. The exit condition for the loop is canonicalized to compare the
+//      induction value against the exit value.  This turns loops like:
+//        'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
+//   2. Any use outside of the loop of an expression derived from the indvar
+//      is changed to compute the derived value outside of the loop, eliminating
+//      the dependence on the exit value of the induction variable.  If the only
+//      purpose of the loop is to compute the exit value of some derived
+//      expression, this transformation will make the loop dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "indvars"
+
+STATISTIC(NumWidened     , "Number of indvars widened");
+STATISTIC(NumReplaced    , "Number of exit values replaced");
+STATISTIC(NumLFTR        , "Number of loop exit tests replaced");
+STATISTIC(NumElimExt     , "Number of IV sign/zero extends eliminated");
+STATISTIC(NumElimIV      , "Number of congruent IVs eliminated");
+
+// Trip count verification can be enabled by default under NDEBUG if we
+// implement a strong expression equivalence checker in SCEV. Until then, we
+// use the verify-indvars flag, which may assert in some cases.
+static cl::opt<bool> VerifyIndvars(
+  "verify-indvars", cl::Hidden,
+  cl::desc("Verify the ScalarEvolution result after running indvars"));
+
+static cl::opt<bool> ReduceLiveIVs("liv-reduce", cl::Hidden,
+  cl::desc("Reduce live induction variables."));
+
+enum ReplaceExitVal { NeverRepl, OnlyCheapRepl, AlwaysRepl };
+
+static cl::opt<ReplaceExitVal> ReplaceExitValue(
+    "replexitval", cl::Hidden, cl::init(OnlyCheapRepl),
+    cl::desc("Choose the strategy to replace exit value in IndVarSimplify"),
+    cl::values(clEnumValN(NeverRepl, "never", "never replace exit value"),
+               clEnumValN(OnlyCheapRepl, "cheap",
+                          "only replace exit value when the cost is cheap"),
+               clEnumValN(AlwaysRepl, "always",
+                          "always replace exit value whenever possible"),
+               clEnumValEnd));
+
+namespace {
+struct RewritePhi;
+
+class IndVarSimplify : public LoopPass {
+  LoopInfo                  *LI;
+  ScalarEvolution           *SE;
+  DominatorTree             *DT;
+  TargetLibraryInfo         *TLI;
+  const TargetTransformInfo *TTI;
+
+  SmallVector<WeakVH, 16> DeadInsts;
+  bool Changed;
+public:
+
+  static char ID; // Pass identification, replacement for typeid
+  IndVarSimplify()
+    : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) {
+    initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<ScalarEvolutionWrapperPass>();
+    AU.addPreservedID(LoopSimplifyID);
+    AU.addPreservedID(LCSSAID);
+    AU.setPreservesCFG();
+  }
+
+private:
+  void releaseMemory() override {
+    DeadInsts.clear();
+  }
+
+  bool isValidRewrite(Value *FromVal, Value *ToVal);
+
+  void handleFloatingPointIV(Loop *L, PHINode *PH);
+  void rewriteNonIntegerIVs(Loop *L);
+
+  void simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
+
+  bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet);
+  void rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
+
+  Value *linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
+                                   PHINode *IndVar, SCEVExpander &Rewriter);
+
+  void sinkUnusedInvariants(Loop *L);
+
+  Value *expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L,
+                            Instruction *InsertPt, Type *Ty);
+};
+}
+
+char IndVarSimplify::ID = 0;
+INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",
+                "Induction Variable Simplification", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(IndVarSimplify, "indvars",
+                "Induction Variable Simplification", false, false)
+
+Pass *llvm::createIndVarSimplifyPass() {
+  return new IndVarSimplify();
+}
+
+/// Return true if the SCEV expansion generated by the rewriter can replace the
+/// original value. SCEV guarantees that it produces the same value, but the way
+/// it is produced may be illegal IR.  Ideally, this function will only be
+/// called for verification.
+bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
+  // If an SCEV expression subsumed multiple pointers, its expansion could
+  // reassociate the GEP changing the base pointer. This is illegal because the
+  // final address produced by a GEP chain must be inbounds relative to its
+  // underlying object. Otherwise basic alias analysis, among other things,
+  // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
+  // producing an expression involving multiple pointers. Until then, we must
+  // bail out here.
+  //
+  // Retrieve the pointer operand of the GEP. Don't use GetUnderlyingObject
+  // because it understands lcssa phis while SCEV does not.
+  Value *FromPtr = FromVal;
+  Value *ToPtr = ToVal;
+  if (auto *GEP = dyn_cast<GEPOperator>(FromVal)) {
+    FromPtr = GEP->getPointerOperand();
+  }
+  if (auto *GEP = dyn_cast<GEPOperator>(ToVal)) {
+    ToPtr = GEP->getPointerOperand();
+  }
+  if (FromPtr != FromVal || ToPtr != ToVal) {
+    // Quickly check the common case
+    if (FromPtr == ToPtr)
+      return true;
+
+    // SCEV may have rewritten an expression that produces the GEP's pointer
+    // operand. That's ok as long as the pointer operand has the same base
+    // pointer. Unlike GetUnderlyingObject(), getPointerBase() will find the
+    // base of a recurrence. This handles the case in which SCEV expansion
+    // converts a pointer type recurrence into a nonrecurrent pointer base
+    // indexed by an integer recurrence.
+
+    // If the GEP base pointer is a vector of pointers, abort.
+    if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
+      return false;
+
+    const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
+    const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
+    if (FromBase == ToBase)
+      return true;
+
+    DEBUG(dbgs() << "INDVARS: GEP rewrite bail out "
+          << *FromBase << " != " << *ToBase << "\n");
+
+    return false;
+  }
+  return true;
+}
+
+/// Determine the insertion point for this user. By default, insert immediately
+/// before the user. SCEVExpander or LICM will hoist loop invariants out of the
+/// loop. For PHI nodes, there may be multiple uses, so compute the nearest
+/// common dominator for the incoming blocks.
+static Instruction *getInsertPointForUses(Instruction *User, Value *Def,
+                                          DominatorTree *DT, LoopInfo *LI) {
+  PHINode *PHI = dyn_cast<PHINode>(User);
+  if (!PHI)
+    return User;
+
+  Instruction *InsertPt = nullptr;
+  for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) {
+    if (PHI->getIncomingValue(i) != Def)
+      continue;
+
+    BasicBlock *InsertBB = PHI->getIncomingBlock(i);
+    if (!InsertPt) {
+      InsertPt = InsertBB->getTerminator();
+      continue;
+    }
+    InsertBB = DT->findNearestCommonDominator(InsertPt->getParent(), InsertBB);
+    InsertPt = InsertBB->getTerminator();
+  }
+  assert(InsertPt && "Missing phi operand");
+
+  auto *DefI = dyn_cast<Instruction>(Def);
+  if (!DefI)
+    return InsertPt;
+
+  assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses");
+
+  auto *L = LI->getLoopFor(DefI->getParent());
+  assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent())));
+
+  for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom())
+    if (LI->getLoopFor(DTN->getBlock()) == L)
+      return DTN->getBlock()->getTerminator();
+
+  llvm_unreachable("DefI dominates InsertPt!");
+}
+
+//===----------------------------------------------------------------------===//
+// rewriteNonIntegerIVs and helpers. Prefer integer IVs.
+//===----------------------------------------------------------------------===//
+
+/// Convert APF to an integer, if possible.
+static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
+  bool isExact = false;
+  // See if we can convert this to an int64_t
+  uint64_t UIntVal;
+  if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero,
+                           &isExact) != APFloat::opOK || !isExact)
+    return false;
+  IntVal = UIntVal;
+  return true;
+}
+
+/// If the loop has floating induction variable then insert corresponding
+/// integer induction variable if possible.
+/// For example,
+/// for(double i = 0; i < 10000; ++i)
+///   bar(i)
+/// is converted into
+/// for(int i = 0; i < 10000; ++i)
+///   bar((double)i);
+///
+void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
+  unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
+  unsigned BackEdge     = IncomingEdge^1;
+
+  // Check incoming value.
+  auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));
+
+  int64_t InitValue;
+  if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
+    return;
+
+  // Check IV increment. Reject this PN if increment operation is not
+  // an add or increment value can not be represented by an integer.
+  auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
+  if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return;
+
+  // If this is not an add of the PHI with a constantfp, or if the constant fp
+  // is not an integer, bail out.
+  ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1));
+  int64_t IncValue;
+  if (IncValueVal == nullptr || Incr->getOperand(0) != PN ||
+      !ConvertToSInt(IncValueVal->getValueAPF(), IncValue))
+    return;
+
+  // Check Incr uses. One user is PN and the other user is an exit condition
+  // used by the conditional terminator.
+  Value::user_iterator IncrUse = Incr->user_begin();
+  Instruction *U1 = cast<Instruction>(*IncrUse++);
+  if (IncrUse == Incr->user_end()) return;
+  Instruction *U2 = cast<Instruction>(*IncrUse++);
+  if (IncrUse != Incr->user_end()) return;
+
+  // Find exit condition, which is an fcmp.  If it doesn't exist, or if it isn't
+  // only used by a branch, we can't transform it.
+  FCmpInst *Compare = dyn_cast<FCmpInst>(U1);
+  if (!Compare)
+    Compare = dyn_cast<FCmpInst>(U2);
+  if (!Compare || !Compare->hasOneUse() ||
+      !isa<BranchInst>(Compare->user_back()))
+    return;
+
+  BranchInst *TheBr = cast<BranchInst>(Compare->user_back());
+
+  // We need to verify that the branch actually controls the iteration count
+  // of the loop.  If not, the new IV can overflow and no one will notice.
+  // The branch block must be in the loop and one of the successors must be out
+  // of the loop.
+  assert(TheBr->isConditional() && "Can't use fcmp if not conditional");
+  if (!L->contains(TheBr->getParent()) ||
+      (L->contains(TheBr->getSuccessor(0)) &&
+       L->contains(TheBr->getSuccessor(1))))
+    return;
+
+
+  // If it isn't a comparison with an integer-as-fp (the exit value), we can't
+  // transform it.
+  ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
+  int64_t ExitValue;
+  if (ExitValueVal == nullptr ||
+      !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
+    return;
+
+  // Find new predicate for integer comparison.
+  CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
+  switch (Compare->getPredicate()) {
+  default: return;  // Unknown comparison.
+  case CmpInst::FCMP_OEQ:
+  case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break;
+  case CmpInst::FCMP_ONE:
+  case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break;
+  case CmpInst::FCMP_OGT:
+  case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break;
+  case CmpInst::FCMP_OGE:
+  case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break;
+  case CmpInst::FCMP_OLT:
+  case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break;
+  case CmpInst::FCMP_OLE:
+  case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break;
+  }
+
+  // We convert the floating point induction variable to a signed i32 value if
+  // we can.  This is only safe if the comparison will not overflow in a way
+  // that won't be trapped by the integer equivalent operations.  Check for this
+  // now.
+  // TODO: We could use i64 if it is native and the range requires it.
+
+  // The start/stride/exit values must all fit in signed i32.
+  if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue))
+    return;
+
+  // If not actually striding (add x, 0.0), avoid touching the code.
+  if (IncValue == 0)
+    return;
+
+  // Positive and negative strides have different safety conditions.
+  if (IncValue > 0) {
+    // If we have a positive stride, we require the init to be less than the
+    // exit value.
+    if (InitValue >= ExitValue)
+      return;
+
+    uint32_t Range = uint32_t(ExitValue-InitValue);
+    // Check for infinite loop, either:
+    // while (i <= Exit) or until (i > Exit)
+    if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
+      if (++Range == 0) return;  // Range overflows.
+    }
+
+    unsigned Leftover = Range % uint32_t(IncValue);
+
+    // If this is an equality comparison, we require that the strided value
+    // exactly land on the exit value, otherwise the IV condition will wrap
+    // around and do things the fp IV wouldn't.
+    if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+        Leftover != 0)
+      return;
+
+    // If the stride would wrap around the i32 before exiting, we can't
+    // transform the IV.
+    if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
+      return;
+
+  } else {
+    // If we have a negative stride, we require the init to be greater than the
+    // exit value.
+    if (InitValue <= ExitValue)
+      return;
+
+    uint32_t Range = uint32_t(InitValue-ExitValue);
+    // Check for infinite loop, either:
+    // while (i >= Exit) or until (i < Exit)
+    if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
+      if (++Range == 0) return;  // Range overflows.
+    }
+
+    unsigned Leftover = Range % uint32_t(-IncValue);
+
+    // If this is an equality comparison, we require that the strided value
+    // exactly land on the exit value, otherwise the IV condition will wrap
+    // around and do things the fp IV wouldn't.
+    if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+        Leftover != 0)
+      return;
+
+    // If the stride would wrap around the i32 before exiting, we can't
+    // transform the IV.
+    if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue)
+      return;
+  }
+
+  IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
+
+  // Insert new integer induction variable.
+  PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
+  NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
+                      PN->getIncomingBlock(IncomingEdge));
+
+  Value *NewAdd =
+    BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
+                              Incr->getName()+".int", Incr);
+  NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));
+
+  ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd,
+                                      ConstantInt::get(Int32Ty, ExitValue),
+                                      Compare->getName());
+
+  // In the following deletions, PN may become dead and may be deleted.
+  // Use a WeakVH to observe whether this happens.
+  WeakVH WeakPH = PN;
+
+  // Delete the old floating point exit comparison.  The branch starts using the
+  // new comparison.
+  NewCompare->takeName(Compare);
+  Compare->replaceAllUsesWith(NewCompare);
+  RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI);
+
+  // Delete the old floating point increment.
+  Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
+  RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI);
+
+  // If the FP induction variable still has uses, this is because something else
+  // in the loop uses its value.  In order to canonicalize the induction
+  // variable, we chose to eliminate the IV and rewrite it in terms of an
+  // int->fp cast.
+  //
+  // We give preference to sitofp over uitofp because it is faster on most
+  // platforms.
+  if (WeakPH) {
+    Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
+                                 &*PN->getParent()->getFirstInsertionPt());
+    PN->replaceAllUsesWith(Conv);
+    RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);
+  }
+  Changed = true;
+}
+
+void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
+  // First step.  Check to see if there are any floating-point recurrences.
+  // If there are, change them into integer recurrences, permitting analysis by
+  // the SCEV routines.
+  //
+  BasicBlock *Header = L->getHeader();
+
+  SmallVector<WeakVH, 8> PHIs;
+  for (BasicBlock::iterator I = Header->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I)
+    PHIs.push_back(PN);
+
+  for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+    if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i]))
+      handleFloatingPointIV(L, PN);
+
+  // If the loop previously had floating-point IV, ScalarEvolution
+  // may not have been able to compute a trip count. Now that we've done some
+  // re-writing, the trip count may be computable.
+  if (Changed)
+    SE->forgetLoop(L);
+}
+
+namespace {
+// Collect information about PHI nodes which can be transformed in
+// rewriteLoopExitValues.
+struct RewritePhi {
+  PHINode *PN;
+  unsigned Ith;  // Ith incoming value.
+  Value *Val;    // Exit value after expansion.
+  bool HighCost; // High Cost when expansion.
+  bool SafePhi;  // LCSSASafePhiForRAUW.
+
+  RewritePhi(PHINode *P, unsigned I, Value *V, bool H, bool S)
+      : PN(P), Ith(I), Val(V), HighCost(H), SafePhi(S) {}
+};
+}
+
+Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S,
+                                          Loop *L, Instruction *InsertPt,
+                                          Type *ResultTy) {
+  // Before expanding S into an expensive LLVM expression, see if we can use an
+  // already existing value as the expansion for S.
+  if (Value *ExistingValue = Rewriter.findExistingExpansion(S, InsertPt, L))
+    if (ExistingValue->getType() == ResultTy)
+      return ExistingValue;
+
+  // We didn't find anything, fall back to using SCEVExpander.
+  return Rewriter.expandCodeFor(S, ResultTy, InsertPt);
+}
+
+//===----------------------------------------------------------------------===//
+// rewriteLoopExitValues - Optimize IV users outside the loop.
+// As a side effect, reduces the amount of IV processing within the loop.
+//===----------------------------------------------------------------------===//
+
+/// Check to see if this loop has a computable loop-invariant execution count.
+/// If so, this means that we can compute the final value of any expressions
+/// that are recurrent in the loop, and substitute the exit values from the loop
+/// into any instructions outside of the loop that use the final values of the
+/// current expressions.
+///
+/// This is mostly redundant with the regular IndVarSimplify activities that
+/// happen later, except that it's more powerful in some cases, because it's
+/// able to brute-force evaluate arbitrary instructions as long as they have
+/// constant operands at the beginning of the loop.
+void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
+  // Check a pre-condition.
+  assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!");
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+
+  SmallVector<RewritePhi, 8> RewritePhiSet;
+  // Find all values that are computed inside the loop, but used outside of it.
+  // Because of LCSSA, these values will only occur in LCSSA PHI Nodes.  Scan
+  // the exit blocks of the loop to find them.
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *ExitBB = ExitBlocks[i];
+
+    // If there are no PHI nodes in this exit block, then no values defined
+    // inside the loop are used on this path, skip it.
+    PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
+    if (!PN) continue;
+
+    unsigned NumPreds = PN->getNumIncomingValues();
+
+    // We would like to be able to RAUW single-incoming value PHI nodes. We
+    // have to be certain this is safe even when this is an LCSSA PHI node.
+    // While the computed exit value is no longer varying in *this* loop, the
+    // exit block may be an exit block for an outer containing loop as well,
+    // the exit value may be varying in the outer loop, and thus it may still
+    // require an LCSSA PHI node. The safe case is when this is
+    // single-predecessor PHI node (LCSSA) and the exit block containing it is
+    // part of the enclosing loop, or this is the outer most loop of the nest.
+    // In either case the exit value could (at most) be varying in the same
+    // loop body as the phi node itself. Thus if it is in turn used outside of
+    // an enclosing loop it will only be via a separate LCSSA node.
+    bool LCSSASafePhiForRAUW =
+        NumPreds == 1 &&
+        (!L->getParentLoop() || L->getParentLoop() == LI->getLoopFor(ExitBB));
+
+    // Iterate over all of the PHI nodes.
+    BasicBlock::iterator BBI = ExitBB->begin();
+    while ((PN = dyn_cast<PHINode>(BBI++))) {
+      if (PN->use_empty())
+        continue; // dead use, don't replace it
+
+      // SCEV only supports integer expressions for now.
+      if (!PN->getType()->isIntegerTy() && !PN->getType()->isPointerTy())
+        continue;
+
+      // It's necessary to tell ScalarEvolution about this explicitly so that
+      // it can walk the def-use list and forget all SCEVs, as it may not be
+      // watching the PHI itself. Once the new exit value is in place, there
+      // may not be a def-use connection between the loop and every instruction
+      // which got a SCEVAddRecExpr for that loop.
+      SE->forgetValue(PN);
+
+      // Iterate over all of the values in all the PHI nodes.
+      for (unsigned i = 0; i != NumPreds; ++i) {
+        // If the value being merged in is not integer or is not defined
+        // in the loop, skip it.
+        Value *InVal = PN->getIncomingValue(i);
+        if (!isa<Instruction>(InVal))
+          continue;
+
+        // If this pred is for a subloop, not L itself, skip it.
+        if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
+          continue; // The Block is in a subloop, skip it.
+
+        // Check that InVal is defined in the loop.
+        Instruction *Inst = cast<Instruction>(InVal);
+        if (!L->contains(Inst))
+          continue;
+
+        // Okay, this instruction has a user outside of the current loop
+        // and varies predictably *inside* the loop.  Evaluate the value it
+        // contains when the loop exits, if possible.
+        const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+        if (!SE->isLoopInvariant(ExitValue, L) ||
+            !isSafeToExpand(ExitValue, *SE))
+          continue;
+
+        // Computing the value outside of the loop brings no benefit if :
+        //  - it is definitely used inside the loop in a way which can not be
+        //    optimized away.
+        //  - no use outside of the loop can take advantage of hoisting the
+        //    computation out of the loop
+        if (ExitValue->getSCEVType()>=scMulExpr) {
+          unsigned NumHardInternalUses = 0;
+          unsigned NumSoftExternalUses = 0;
+          unsigned NumUses = 0;
+          for (auto IB = Inst->user_begin(), IE = Inst->user_end();
+               IB != IE && NumUses <= 6; ++IB) {
+            Instruction *UseInstr = cast<Instruction>(*IB);
+            unsigned Opc = UseInstr->getOpcode();
+            NumUses++;
+            if (L->contains(UseInstr)) {
+              if (Opc == Instruction::Call || Opc == Instruction::Ret)
+                NumHardInternalUses++;
+            } else {
+              if (Opc == Instruction::PHI) {
+                // Do not count the Phi as a use. LCSSA may have inserted
+                // plenty of trivial ones.
+                NumUses--;
+                for (auto PB = UseInstr->user_begin(),
+                          PE = UseInstr->user_end();
+                     PB != PE && NumUses <= 6; ++PB, ++NumUses) {
+                  unsigned PhiOpc = cast<Instruction>(*PB)->getOpcode();
+                  if (PhiOpc != Instruction::Call && PhiOpc != Instruction::Ret)
+                    NumSoftExternalUses++;
+                }
+                continue;
+              }
+              if (Opc != Instruction::Call && Opc != Instruction::Ret)
+                NumSoftExternalUses++;
+            }
+          }
+          if (NumUses <= 6 && NumHardInternalUses && !NumSoftExternalUses)
+            continue;
+        }
+
+        bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst);
+        Value *ExitVal =
+            expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType());
+
+        DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
+                     << "  LoopVal = " << *Inst << "\n");
+
+        if (!isValidRewrite(Inst, ExitVal)) {
+          DeadInsts.push_back(ExitVal);
+          continue;
+        }
+
+        // Collect all the candidate PHINodes to be rewritten.
+        RewritePhiSet.push_back(
+            RewritePhi(PN, i, ExitVal, HighCost, LCSSASafePhiForRAUW));
+      }
+    }
+  }
+
+  bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
+
+  // Transformation.
+  for (const RewritePhi &Phi : RewritePhiSet) {
+    PHINode *PN = Phi.PN;
+    Value *ExitVal = Phi.Val;
+
+    // Only do the rewrite when the ExitValue can be expanded cheaply.
+    // If LoopCanBeDel is true, rewrite exit value aggressively.
+    if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) {
+      DeadInsts.push_back(ExitVal);
+      continue;
+    }
+
+    Changed = true;
+    ++NumReplaced;
+    Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith));
+    PN->setIncomingValue(Phi.Ith, ExitVal);
+
+    // If this instruction is dead now, delete it. Don't do it now to avoid
+    // invalidating iterators.
+    if (isInstructionTriviallyDead(Inst, TLI))
+      DeadInsts.push_back(Inst);
+
+    // If we determined that this PHI is safe to replace even if an LCSSA
+    // PHI, do so.
+    if (Phi.SafePhi) {
+      PN->replaceAllUsesWith(ExitVal);
+      PN->eraseFromParent();
+    }
+  }
+
+  // The insertion point instruction may have been deleted; clear it out
+  // so that the rewriter doesn't trip over it later.
+  Rewriter.clearInsertPoint();
+}
+
+/// Check whether it is possible to delete the loop after rewriting exit
+/// value. If it is possible, ignore ReplaceExitValue and do rewriting
+/// aggressively.
+bool IndVarSimplify::canLoopBeDeleted(
+    Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  // If there is no preheader, the loop will not be deleted.
+  if (!Preheader)
+    return false;
+
+  // In LoopDeletion pass Loop can be deleted when ExitingBlocks.size() > 1.
+  // We obviate multiple ExitingBlocks case for simplicity.
+  // TODO: If we see testcase with multiple ExitingBlocks can be deleted
+  // after exit value rewriting, we can enhance the logic here.
+  SmallVector<BasicBlock *, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+  SmallVector<BasicBlock *, 8> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+  if (ExitBlocks.size() > 1 || ExitingBlocks.size() > 1)
+    return false;
+
+  BasicBlock *ExitBlock = ExitBlocks[0];
+  BasicBlock::iterator BI = ExitBlock->begin();
+  while (PHINode *P = dyn_cast<PHINode>(BI)) {
+    Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
+
+    // If the Incoming value of P is found in RewritePhiSet, we know it
+    // could be rewritten to use a loop invariant value in transformation
+    // phase later. Skip it in the loop invariant check below.
+    bool found = false;
+    for (const RewritePhi &Phi : RewritePhiSet) {
+      unsigned i = Phi.Ith;
+      if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) {
+        found = true;
+        break;
+      }
+    }
+
+    Instruction *I;
+    if (!found && (I = dyn_cast<Instruction>(Incoming)))
+      if (!L->hasLoopInvariantOperands(I))
+        return false;
+
+    ++BI;
+  }
+
+  for (auto *BB : L->blocks())
+    if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+      return false;
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//  IV Widening - Extend the width of an IV to cover its widest uses.
+//===----------------------------------------------------------------------===//
+
+namespace {
+// Collect information about induction variables that are used by sign/zero
+// extend operations. This information is recorded by CollectExtend and provides
+// the input to WidenIV.
+struct WideIVInfo {
+  PHINode *NarrowIV = nullptr;
+  Type *WidestNativeType = nullptr; // Widest integer type created [sz]ext
+  bool IsSigned = false;            // Was a sext user seen before a zext?
+};
+}
+
+/// Update information about the induction variable that is extended by this
+/// sign or zero extend operation. This is used to determine the final width of
+/// the IV before actually widening it.
+static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,
+                        const TargetTransformInfo *TTI) {
+  bool IsSigned = Cast->getOpcode() == Instruction::SExt;
+  if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
+    return;
+
+  Type *Ty = Cast->getType();
+  uint64_t Width = SE->getTypeSizeInBits(Ty);
+  if (!Cast->getModule()->getDataLayout().isLegalInteger(Width))
+    return;
+
+  // Cast is either an sext or zext up to this point.
+  // We should not widen an indvar if arithmetics on the wider indvar are more
+  // expensive than those on the narrower indvar. We check only the cost of ADD
+  // because at least an ADD is required to increment the induction variable. We
+  // could compute more comprehensively the cost of all instructions on the
+  // induction variable when necessary.
+  if (TTI &&
+      TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
+          TTI->getArithmeticInstrCost(Instruction::Add,
+                                      Cast->getOperand(0)->getType())) {
+    return;
+  }
+
+  if (!WI.WidestNativeType) {
+    WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+    WI.IsSigned = IsSigned;
+    return;
+  }
+
+  // We extend the IV to satisfy the sign of its first user, arbitrarily.
+  if (WI.IsSigned != IsSigned)
+    return;
+
+  if (Width > SE->getTypeSizeInBits(WI.WidestNativeType))
+    WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+}
+
+namespace {
+
+/// Record a link in the Narrow IV def-use chain along with the WideIV that
+/// computes the same value as the Narrow IV def.  This avoids caching Use*
+/// pointers.
+struct NarrowIVDefUse {
+  Instruction *NarrowDef = nullptr;
+  Instruction *NarrowUse = nullptr;
+  Instruction *WideDef = nullptr;
+
+  // True if the narrow def is never negative.  Tracking this information lets
+  // us use a sign extension instead of a zero extension or vice versa, when
+  // profitable and legal.
+  bool NeverNegative = false;
+
+  NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD,
+                 bool NeverNegative)
+      : NarrowDef(ND), NarrowUse(NU), WideDef(WD),
+        NeverNegative(NeverNegative) {}
+};
+
+/// The goal of this transform is to remove sign and zero extends without
+/// creating any new induction variables. To do this, it creates a new phi of
+/// the wider type and redirects all users, either removing extends or inserting
+/// truncs whenever we stop propagating the type.
+///
+class WidenIV {
+  // Parameters
+  PHINode *OrigPhi;
+  Type *WideType;
+  bool IsSigned;
+
+  // Context
+  LoopInfo        *LI;
+  Loop            *L;
+  ScalarEvolution *SE;
+  DominatorTree   *DT;
+
+  // Result
+  PHINode *WidePhi;
+  Instruction *WideInc;
+  const SCEV *WideIncExpr;
+  SmallVectorImpl<WeakVH> &DeadInsts;
+
+  SmallPtrSet<Instruction*,16> Widened;
+  SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
+
+public:
+  WidenIV(const WideIVInfo &WI, LoopInfo *LInfo,
+          ScalarEvolution *SEv, DominatorTree *DTree,
+          SmallVectorImpl<WeakVH> &DI) :
+    OrigPhi(WI.NarrowIV),
+    WideType(WI.WidestNativeType),
+    IsSigned(WI.IsSigned),
+    LI(LInfo),
+    L(LI->getLoopFor(OrigPhi->getParent())),
+    SE(SEv),
+    DT(DTree),
+    WidePhi(nullptr),
+    WideInc(nullptr),
+    WideIncExpr(nullptr),
+    DeadInsts(DI) {
+    assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
+  }
+
+  PHINode *createWideIV(SCEVExpander &Rewriter);
+
+protected:
+  Value *createExtendInst(Value *NarrowOper, Type *WideType, bool IsSigned,
+                          Instruction *Use);
+
+  Instruction *cloneIVUser(NarrowIVDefUse DU, const SCEVAddRecExpr *WideAR);
+  Instruction *cloneArithmeticIVUser(NarrowIVDefUse DU,
+                                     const SCEVAddRecExpr *WideAR);
+  Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU);
+
+  const SCEVAddRecExpr *getWideRecurrence(Instruction *NarrowUse);
+
+  const SCEVAddRecExpr* getExtendedOperandRecurrence(NarrowIVDefUse DU);
+
+  const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+                              unsigned OpCode) const;
+
+  Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
+
+  bool widenLoopCompare(NarrowIVDefUse DU);
+
+  void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
+};
+} // anonymous namespace
+
+/// Perform a quick domtree based check for loop invariance assuming that V is
+/// used within the loop. LoopInfo::isLoopInvariant() seems gratuitous for this
+/// purpose.
+static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) {
+  Instruction *Inst = dyn_cast<Instruction>(V);
+  if (!Inst)
+    return true;
+
+  return DT->properlyDominates(Inst->getParent(), L->getHeader());
+}
+
+Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType,
+                                 bool IsSigned, Instruction *Use) {
+  // Set the debug location and conservative insertion point.
+  IRBuilder<> Builder(Use);
+  // Hoist the insertion point into loop preheaders as far as possible.
+  for (const Loop *L = LI->getLoopFor(Use->getParent());
+       L && L->getLoopPreheader() && isLoopInvariant(NarrowOper, L, DT);
+       L = L->getParentLoop())
+    Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+
+  return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) :
+                    Builder.CreateZExt(NarrowOper, WideType);
+}
+
+/// Instantiate a wide operation to replace a narrow operation. This only needs
+/// to handle operations that can evaluation to SCEVAddRec. It can safely return
+/// 0 for any operation we decide not to clone.
+Instruction *WidenIV::cloneIVUser(NarrowIVDefUse DU,
+                                  const SCEVAddRecExpr *WideAR) {
+  unsigned Opcode = DU.NarrowUse->getOpcode();
+  switch (Opcode) {
+  default:
+    return nullptr;
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::Sub:
+    return cloneArithmeticIVUser(DU, WideAR);
+
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return cloneBitwiseIVUser(DU);
+  }
+}
+
+Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) {
+  Instruction *NarrowUse = DU.NarrowUse;
+  Instruction *NarrowDef = DU.NarrowDef;
+  Instruction *WideDef = DU.WideDef;
+
+  DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n");
+
+  // Replace NarrowDef operands with WideDef. Otherwise, we don't know anything
+  // about the narrow operand yet so must insert a [sz]ext. It is probably loop
+  // invariant and will be folded or hoisted. If it actually comes from a
+  // widened IV, it should be removed during a future call to widenIVUse.
+  Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
+                   ? WideDef
+                   : createExtendInst(NarrowUse->getOperand(0), WideType,
+                                      IsSigned, NarrowUse);
+  Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
+                   ? WideDef
+                   : createExtendInst(NarrowUse->getOperand(1), WideType,
+                                      IsSigned, NarrowUse);
+
+  auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
+  auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
+                                        NarrowBO->getName());
+  IRBuilder<> Builder(NarrowUse);
+  Builder.Insert(WideBO);
+  WideBO->copyIRFlags(NarrowBO);
+  return WideBO;
+}
+
+Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU,
+                                            const SCEVAddRecExpr *WideAR) {
+  Instruction *NarrowUse = DU.NarrowUse;
+  Instruction *NarrowDef = DU.NarrowDef;
+  Instruction *WideDef = DU.WideDef;
+
+  DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
+
+  unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1;
+
+  // We're trying to find X such that
+  //
+  //  Widen(NarrowDef `op` NonIVNarrowDef) == WideAR == WideDef `op.wide` X
+  //
+  // We guess two solutions to X, sext(NonIVNarrowDef) and zext(NonIVNarrowDef),
+  // and check using SCEV if any of them are correct.
+
+  // Returns true if extending NonIVNarrowDef according to `SignExt` is a
+  // correct solution to X.
+  auto GuessNonIVOperand = [&](bool SignExt) {
+    const SCEV *WideLHS;
+    const SCEV *WideRHS;
+
+    auto GetExtend = [this, SignExt](const SCEV *S, Type *Ty) {
+      if (SignExt)
+        return SE->getSignExtendExpr(S, Ty);
+      return SE->getZeroExtendExpr(S, Ty);
+    };
+
+    if (IVOpIdx == 0) {
+      WideLHS = SE->getSCEV(WideDef);
+      const SCEV *NarrowRHS = SE->getSCEV(NarrowUse->getOperand(1));
+      WideRHS = GetExtend(NarrowRHS, WideType);
+    } else {
+      const SCEV *NarrowLHS = SE->getSCEV(NarrowUse->getOperand(0));
+      WideLHS = GetExtend(NarrowLHS, WideType);
+      WideRHS = SE->getSCEV(WideDef);
+    }
+
+    // WideUse is "WideDef `op.wide` X" as described in the comment.
+    const SCEV *WideUse = nullptr;
+
+    switch (NarrowUse->getOpcode()) {
+    default:
+      llvm_unreachable("No other possibility!");
+
+    case Instruction::Add:
+      WideUse = SE->getAddExpr(WideLHS, WideRHS);
+      break;
+
+    case Instruction::Mul:
+      WideUse = SE->getMulExpr(WideLHS, WideRHS);
+      break;
+
+    case Instruction::UDiv:
+      WideUse = SE->getUDivExpr(WideLHS, WideRHS);
+      break;
+
+    case Instruction::Sub:
+      WideUse = SE->getMinusSCEV(WideLHS, WideRHS);
+      break;
+    }
+
+    return WideUse == WideAR;
+  };
+
+  bool SignExtend = IsSigned;
+  if (!GuessNonIVOperand(SignExtend)) {
+    SignExtend = !SignExtend;
+    if (!GuessNonIVOperand(SignExtend))
+      return nullptr;
+  }
+
+  Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
+                   ? WideDef
+                   : createExtendInst(NarrowUse->getOperand(0), WideType,
+                                      SignExtend, NarrowUse);
+  Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
+                   ? WideDef
+                   : createExtendInst(NarrowUse->getOperand(1), WideType,
+                                      SignExtend, NarrowUse);
+
+  auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
+  auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
+                                        NarrowBO->getName());
+
+  IRBuilder<> Builder(NarrowUse);
+  Builder.Insert(WideBO);
+  WideBO->copyIRFlags(NarrowBO);
+  return WideBO;
+}
+
+const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+                                     unsigned OpCode) const {
+  if (OpCode == Instruction::Add)
+    return SE->getAddExpr(LHS, RHS);
+  if (OpCode == Instruction::Sub)
+    return SE->getMinusSCEV(LHS, RHS);
+  if (OpCode == Instruction::Mul)
+    return SE->getMulExpr(LHS, RHS);
+
+  llvm_unreachable("Unsupported opcode.");
+}
+
+/// No-wrap operations can transfer sign extension of their result to their
+/// operands. Generate the SCEV value for the widened operation without
+/// actually modifying the IR yet. If the expression after extending the
+/// operands is an AddRec for this loop, return it.
+const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
+
+  // Handle the common case of add<nsw/nuw>
+  const unsigned OpCode = DU.NarrowUse->getOpcode();
+  // Only Add/Sub/Mul instructions supported yet.
+  if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
+      OpCode != Instruction::Mul)
+    return nullptr;
+
+  // One operand (NarrowDef) has already been extended to WideDef. Now determine
+  // if extending the other will lead to a recurrence.
+  const unsigned ExtendOperIdx =
+      DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
+  assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
+
+  const SCEV *ExtendOperExpr = nullptr;
+  const OverflowingBinaryOperator *OBO =
+    cast<OverflowingBinaryOperator>(DU.NarrowUse);
+  if (IsSigned && OBO->hasNoSignedWrap())
+    ExtendOperExpr = SE->getSignExtendExpr(
+      SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+  else if(!IsSigned && OBO->hasNoUnsignedWrap())
+    ExtendOperExpr = SE->getZeroExtendExpr(
+      SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+  else
+    return nullptr;
+
+  // When creating this SCEV expr, don't apply the current operations NSW or NUW
+  // flags. This instruction may be guarded by control flow that the no-wrap
+  // behavior depends on. Non-control-equivalent instructions can be mapped to
+  // the same SCEV expression, and it would be incorrect to transfer NSW/NUW
+  // semantics to those operations.
+  const SCEV *lhs = SE->getSCEV(DU.WideDef);
+  const SCEV *rhs = ExtendOperExpr;
+
+  // Let's swap operands to the initial order for the case of non-commutative
+  // operations, like SUB. See PR21014.
+  if (ExtendOperIdx == 0)
+    std::swap(lhs, rhs);
+  const SCEVAddRecExpr *AddRec =
+      dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode));
+
+  if (!AddRec || AddRec->getLoop() != L)
+    return nullptr;
+  return AddRec;
+}
+
+/// Is this instruction potentially interesting for further simplification after
+/// widening it's type? In other words, can the extend be safely hoisted out of
+/// the loop with SCEV reducing the value to a recurrence on the same loop. If
+/// so, return the sign or zero extended recurrence. Otherwise return NULL.
+const SCEVAddRecExpr *WidenIV::getWideRecurrence(Instruction *NarrowUse) {
+  if (!SE->isSCEVable(NarrowUse->getType()))
+    return nullptr;
+
+  const SCEV *NarrowExpr = SE->getSCEV(NarrowUse);
+  if (SE->getTypeSizeInBits(NarrowExpr->getType())
+      >= SE->getTypeSizeInBits(WideType)) {
+    // NarrowUse implicitly widens its operand. e.g. a gep with a narrow
+    // index. So don't follow this use.
+    return nullptr;
+  }
+
+  const SCEV *WideExpr = IsSigned ?
+    SE->getSignExtendExpr(NarrowExpr, WideType) :
+    SE->getZeroExtendExpr(NarrowExpr, WideType);
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
+  if (!AddRec || AddRec->getLoop() != L)
+    return nullptr;
+  return AddRec;
+}
+
+/// This IV user cannot be widen. Replace this use of the original narrow IV
+/// with a truncation of the new wide IV to isolate and eliminate the narrow IV.
+static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) {
+  DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef
+        << " for user " << *DU.NarrowUse << "\n");
+  IRBuilder<> Builder(
+      getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI));
+  Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
+  DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
+}
+
+/// If the narrow use is a compare instruction, then widen the compare
+//  (and possibly the other operand).  The extend operation is hoisted into the
+// loop preheader as far as possible.
+bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {
+  ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse);
+  if (!Cmp)
+    return false;
+
+  // We can legally widen the comparison in the following two cases:
+  //
+  //  - The signedness of the IV extension and comparison match
+  //
+  //  - The narrow IV is always positive (and thus its sign extension is equal
+  //    to its zero extension).  For instance, let's say we're zero extending
+  //    %narrow for the following use
+  //
+  //      icmp slt i32 %narrow, %val   ... (A)
+  //
+  //    and %narrow is always positive.  Then
+  //
+  //      (A) == icmp slt i32 sext(%narrow), sext(%val)
+  //          == icmp slt i32 zext(%narrow), sext(%val)
+
+  if (!(DU.NeverNegative || IsSigned == Cmp->isSigned()))
+    return false;
+
+  Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0);
+  unsigned CastWidth = SE->getTypeSizeInBits(Op->getType());
+  unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+  assert (CastWidth <= IVWidth && "Unexpected width while widening compare.");
+
+  // Widen the compare instruction.
+  IRBuilder<> Builder(
+      getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI));
+  DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+
+  // Widen the other operand of the compare, if necessary.
+  if (CastWidth < IVWidth) {
+    Value *ExtOp = createExtendInst(Op, WideType, Cmp->isSigned(), Cmp);
+    DU.NarrowUse->replaceUsesOfWith(Op, ExtOp);
+  }
+  return true;
+}
+
+/// Determine whether an individual user of the narrow IV can be widened. If so,
+/// return the wide clone of the user.
+Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
+
+  // Stop traversing the def-use chain at inner-loop phis or post-loop phis.
+  if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) {
+    if (LI->getLoopFor(UsePhi->getParent()) != L) {
+      // For LCSSA phis, sink the truncate outside the loop.
+      // After SimplifyCFG most loop exit targets have a single predecessor.
+      // Otherwise fall back to a truncate within the loop.
+      if (UsePhi->getNumOperands() != 1)
+        truncateIVUse(DU, DT, LI);
+      else {
+        PHINode *WidePhi =
+          PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide",
+                          UsePhi);
+        WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0));
+        IRBuilder<> Builder(&*WidePhi->getParent()->getFirstInsertionPt());
+        Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());
+        UsePhi->replaceAllUsesWith(Trunc);
+        DeadInsts.emplace_back(UsePhi);
+        DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi
+              << " to " << *WidePhi << "\n");
+      }
+      return nullptr;
+    }
+  }
+  // Our raison d'etre! Eliminate sign and zero extension.
+  if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) {
+    Value *NewDef = DU.WideDef;
+    if (DU.NarrowUse->getType() != WideType) {
+      unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType());
+      unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+      if (CastWidth < IVWidth) {
+        // The cast isn't as wide as the IV, so insert a Trunc.
+        IRBuilder<> Builder(DU.NarrowUse);
+        NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType());
+      }
+      else {
+        // A wider extend was hidden behind a narrower one. This may induce
+        // another round of IV widening in which the intermediate IV becomes
+        // dead. It should be very rare.
+        DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
+              << " not wide enough to subsume " << *DU.NarrowUse << "\n");
+        DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+        NewDef = DU.NarrowUse;
+      }
+    }
+    if (NewDef != DU.NarrowUse) {
+      DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
+            << " replaced by " << *DU.WideDef << "\n");
+      ++NumElimExt;
+      DU.NarrowUse->replaceAllUsesWith(NewDef);
+      DeadInsts.emplace_back(DU.NarrowUse);
+    }
+    // Now that the extend is gone, we want to expose it's uses for potential
+    // further simplification. We don't need to directly inform SimplifyIVUsers
+    // of the new users, because their parent IV will be processed later as a
+    // new loop phi. If we preserved IVUsers analysis, we would also want to
+    // push the uses of WideDef here.
+
+    // No further widening is needed. The deceased [sz]ext had done it for us.
+    return nullptr;
+  }
+
+  // Does this user itself evaluate to a recurrence after widening?
+  const SCEVAddRecExpr *WideAddRec = getWideRecurrence(DU.NarrowUse);
+  if (!WideAddRec)
+    WideAddRec = getExtendedOperandRecurrence(DU);
+
+  if (!WideAddRec) {
+    // If use is a loop condition, try to promote the condition instead of
+    // truncating the IV first.
+    if (widenLoopCompare(DU))
+      return nullptr;
+
+    // This user does not evaluate to a recurence after widening, so don't
+    // follow it. Instead insert a Trunc to kill off the original use,
+    // eventually isolating the original narrow IV so it can be removed.
+    truncateIVUse(DU, DT, LI);
+    return nullptr;
+  }
+  // Assume block terminators cannot evaluate to a recurrence. We can't to
+  // insert a Trunc after a terminator if there happens to be a critical edge.
+  assert(DU.NarrowUse != DU.NarrowUse->getParent()->getTerminator() &&
+         "SCEV is not expected to evaluate a block terminator");
+
+  // Reuse the IV increment that SCEVExpander created as long as it dominates
+  // NarrowUse.
+  Instruction *WideUse = nullptr;
+  if (WideAddRec == WideIncExpr
+      && Rewriter.hoistIVInc(WideInc, DU.NarrowUse))
+    WideUse = WideInc;
+  else {
+    WideUse = cloneIVUser(DU, WideAddRec);
+    if (!WideUse)
+      return nullptr;
+  }
+  // Evaluation of WideAddRec ensured that the narrow expression could be
+  // extended outside the loop without overflow. This suggests that the wide use
+  // evaluates to the same expression as the extended narrow use, but doesn't
+  // absolutely guarantee it. Hence the following failsafe check. In rare cases
+  // where it fails, we simply throw away the newly created wide use.
+  if (WideAddRec != SE->getSCEV(WideUse)) {
+    DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse
+          << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n");
+    DeadInsts.emplace_back(WideUse);
+    return nullptr;
+  }
+
+  // Returning WideUse pushes it on the worklist.
+  return WideUse;
+}
+
+/// Add eligible users of NarrowDef to NarrowIVUsers.
+///
+void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
+  const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef);
+  bool NeverNegative =
+      SE->isKnownPredicate(ICmpInst::ICMP_SGE, NarrowSCEV,
+                           SE->getConstant(NarrowSCEV->getType(), 0));
+  for (User *U : NarrowDef->users()) {
+    Instruction *NarrowUser = cast<Instruction>(U);
+
+    // Handle data flow merges and bizarre phi cycles.
+    if (!Widened.insert(NarrowUser).second)
+      continue;
+
+    NarrowIVUsers.push_back(
+        NarrowIVDefUse(NarrowDef, NarrowUser, WideDef, NeverNegative));
+  }
+}
+
+/// Process a single induction variable. First use the SCEVExpander to create a
+/// wide induction variable that evaluates to the same recurrence as the
+/// original narrow IV. Then use a worklist to forward traverse the narrow IV's
+/// def-use chain. After widenIVUse has processed all interesting IV users, the
+/// narrow IV will be isolated for removal by DeleteDeadPHIs.
+///
+/// It would be simpler to delete uses as they are processed, but we must avoid
+/// invalidating SCEV expressions.
+///
+PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
+  // Is this phi an induction variable?
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi));
+  if (!AddRec)
+    return nullptr;
+
+  // Widen the induction variable expression.
+  const SCEV *WideIVExpr = IsSigned ?
+    SE->getSignExtendExpr(AddRec, WideType) :
+    SE->getZeroExtendExpr(AddRec, WideType);
+
+  assert(SE->getEffectiveSCEVType(WideIVExpr->getType()) == WideType &&
+         "Expect the new IV expression to preserve its type");
+
+  // Can the IV be extended outside the loop without overflow?
+  AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr);
+  if (!AddRec || AddRec->getLoop() != L)
+    return nullptr;
+
+  // An AddRec must have loop-invariant operands. Since this AddRec is
+  // materialized by a loop header phi, the expression cannot have any post-loop
+  // operands, so they must dominate the loop header.
+  assert(SE->properlyDominates(AddRec->getStart(), L->getHeader()) &&
+         SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader())
+         && "Loop header phi recurrence inputs do not dominate the loop");
+
+  // The rewriter provides a value for the desired IV expression. This may
+  // either find an existing phi or materialize a new one. Either way, we
+  // expect a well-formed cyclic phi-with-increments. i.e. any operand not part
+  // of the phi-SCC dominates the loop entry.
+  Instruction *InsertPt = &L->getHeader()->front();
+  WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt));
+
+  // Remembering the WideIV increment generated by SCEVExpander allows
+  // widenIVUse to reuse it when widening the narrow IV's increment. We don't
+  // employ a general reuse mechanism because the call above is the only call to
+  // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses.
+  if (BasicBlock *LatchBlock = L->getLoopLatch()) {
+    WideInc =
+      cast<Instruction>(WidePhi->getIncomingValueForBlock(LatchBlock));
+    WideIncExpr = SE->getSCEV(WideInc);
+  }
+
+  DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
+  ++NumWidened;
+
+  // Traverse the def-use chain using a worklist starting at the original IV.
+  assert(Widened.empty() && NarrowIVUsers.empty() && "expect initial state" );
+
+  Widened.insert(OrigPhi);
+  pushNarrowIVUsers(OrigPhi, WidePhi);
+
+  while (!NarrowIVUsers.empty()) {
+    NarrowIVDefUse DU = NarrowIVUsers.pop_back_val();
+
+    // Process a def-use edge. This may replace the use, so don't hold a
+    // use_iterator across it.
+    Instruction *WideUse = widenIVUse(DU, Rewriter);
+
+    // Follow all def-use edges from the previous narrow use.
+    if (WideUse)
+      pushNarrowIVUsers(DU.NarrowUse, WideUse);
+
+    // widenIVUse may have removed the def-use edge.
+    if (DU.NarrowDef->use_empty())
+      DeadInsts.emplace_back(DU.NarrowDef);
+  }
+  return WidePhi;
+}
+
+//===----------------------------------------------------------------------===//
+//  Live IV Reduction - Minimize IVs live across the loop.
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+//  Simplification of IV users based on SCEV evaluation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+class IndVarSimplifyVisitor : public IVVisitor {
+  ScalarEvolution *SE;
+  const TargetTransformInfo *TTI;
+  PHINode *IVPhi;
+
+public:
+  WideIVInfo WI;
+
+  IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
+                        const TargetTransformInfo *TTI,
+                        const DominatorTree *DTree)
+    : SE(SCEV), TTI(TTI), IVPhi(IV) {
+    DT = DTree;
+    WI.NarrowIV = IVPhi;
+    if (ReduceLiveIVs)
+      setSplitOverflowIntrinsics();
+  }
+
+  // Implement the interface used by simplifyUsersOfIV.
+  void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
+};
+}
+
+/// Iteratively perform simplification on a worklist of IV users. Each
+/// successive simplification may push more users which may themselves be
+/// candidates for simplification.
+///
+/// Sign/Zero extend elimination is interleaved with IV simplification.
+///
+void IndVarSimplify::simplifyAndExtend(Loop *L,
+                                       SCEVExpander &Rewriter,
+                                       LoopInfo *LI) {
+  SmallVector<WideIVInfo, 8> WideIVs;
+
+  SmallVector<PHINode*, 8> LoopPhis;
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+    LoopPhis.push_back(cast<PHINode>(I));
+  }
+  // Each round of simplification iterates through the SimplifyIVUsers worklist
+  // for all current phis, then determines whether any IVs can be
+  // widened. Widening adds new phis to LoopPhis, inducing another round of
+  // simplification on the wide IVs.
+  while (!LoopPhis.empty()) {
+    // Evaluate as many IV expressions as possible before widening any IVs. This
+    // forces SCEV to set no-wrap flags before evaluating sign/zero
+    // extension. The first time SCEV attempts to normalize sign/zero extension,
+    // the result becomes final. So for the most predictable results, we delay
+    // evaluation of sign/zero extend evaluation until needed, and avoid running
+    // other SCEV based analysis prior to simplifyAndExtend.
+    do {
+      PHINode *CurrIV = LoopPhis.pop_back_val();
+
+      // Information about sign/zero extensions of CurrIV.
+      IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
+
+      Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, &Visitor);
+
+      if (Visitor.WI.WidestNativeType) {
+        WideIVs.push_back(Visitor.WI);
+      }
+    } while(!LoopPhis.empty());
+
+    for (; !WideIVs.empty(); WideIVs.pop_back()) {
+      WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts);
+      if (PHINode *WidePhi = Widener.createWideIV(Rewriter)) {
+        Changed = true;
+        LoopPhis.push_back(WidePhi);
+      }
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  linearFunctionTestReplace and its kin. Rewrite the loop exit condition.
+//===----------------------------------------------------------------------===//
+
+/// Return true if this loop's backedge taken count expression can be safely and
+/// cheaply expanded into an instruction sequence that can be used by
+/// linearFunctionTestReplace.
+///
+/// TODO: This fails for pointer-type loop counters with greater than one byte
+/// strides, consequently preventing LFTR from running. For the purpose of LFTR
+/// we could skip this check in the case that the LFTR loop counter (chosen by
+/// FindLoopCounter) is also pointer type. Instead, we could directly convert
+/// the loop test to an inequality test by checking the target data's alignment
+/// of element types (given that the initial pointer value originates from or is
+/// used by ABI constrained operation, as opposed to inttoptr/ptrtoint).
+/// However, we don't yet have a strong motivation for converting loop tests
+/// into inequality tests.
+static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE,
+                                        SCEVExpander &Rewriter) {
+  const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
+      BackedgeTakenCount->isZero())
+    return false;
+
+  if (!L->getExitingBlock())
+    return false;
+
+  // Can't rewrite non-branch yet.
+  if (!isa<BranchInst>(L->getExitingBlock()->getTerminator()))
+    return false;
+
+  if (Rewriter.isHighCostExpansion(BackedgeTakenCount, L))
+    return false;
+
+  return true;
+}
+
+/// Return the loop header phi IFF IncV adds a loop invariant value to the phi.
+static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {
+  Instruction *IncI = dyn_cast<Instruction>(IncV);
+  if (!IncI)
+    return nullptr;
+
+  switch (IncI->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+    break;
+  case Instruction::GetElementPtr:
+    // An IV counter must preserve its type.
+    if (IncI->getNumOperands() == 2)
+      break;
+  default:
+    return nullptr;
+  }
+
+  PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
+  if (Phi && Phi->getParent() == L->getHeader()) {
+    if (isLoopInvariant(IncI->getOperand(1), L, DT))
+      return Phi;
+    return nullptr;
+  }
+  if (IncI->getOpcode() == Instruction::GetElementPtr)
+    return nullptr;
+
+  // Allow add/sub to be commuted.
+  Phi = dyn_cast<PHINode>(IncI->getOperand(1));
+  if (Phi && Phi->getParent() == L->getHeader()) {
+    if (isLoopInvariant(IncI->getOperand(0), L, DT))
+      return Phi;
+  }
+  return nullptr;
+}
+
+/// Return the compare guarding the loop latch, or NULL for unrecognized tests.
+static ICmpInst *getLoopTest(Loop *L) {
+  assert(L->getExitingBlock() && "expected loop exit");
+
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  // Don't bother with LFTR if the loop is not properly simplified.
+  if (!LatchBlock)
+    return nullptr;
+
+  BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
+  assert(BI && "expected exit branch");
+
+  return dyn_cast<ICmpInst>(BI->getCondition());
+}
+
+/// linearFunctionTestReplace policy. Return true unless we can show that the
+/// current exit test is already sufficiently canonical.
+static bool needsLFTR(Loop *L, DominatorTree *DT) {
+  // Do LFTR to simplify the exit condition to an ICMP.
+  ICmpInst *Cond = getLoopTest(L);
+  if (!Cond)
+    return true;
+
+  // Do LFTR to simplify the exit ICMP to EQ/NE
+  ICmpInst::Predicate Pred = Cond->getPredicate();
+  if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ)
+    return true;
+
+  // Look for a loop invariant RHS
+  Value *LHS = Cond->getOperand(0);
+  Value *RHS = Cond->getOperand(1);
+  if (!isLoopInvariant(RHS, L, DT)) {
+    if (!isLoopInvariant(LHS, L, DT))
+      return true;
+    std::swap(LHS, RHS);
+  }
+  // Look for a simple IV counter LHS
+  PHINode *Phi = dyn_cast<PHINode>(LHS);
+  if (!Phi)
+    Phi = getLoopPhiForCounter(LHS, L, DT);
+
+  if (!Phi)
+    return true;
+
+  // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
+  int Idx = Phi->getBasicBlockIndex(L->getLoopLatch());
+  if (Idx < 0)
+    return true;
+
+  // Do LFTR if the exit condition's IV is *not* a simple counter.
+  Value *IncV = Phi->getIncomingValue(Idx);
+  return Phi != getLoopPhiForCounter(IncV, L, DT);
+}
+
+/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
+/// down to checking that all operands are constant and listing instructions
+/// that may hide undef.
+static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
+                               unsigned Depth) {
+  if (isa<Constant>(V))
+    return !isa<UndefValue>(V);
+
+  if (Depth >= 6)
+    return false;
+
+  // Conservatively handle non-constant non-instructions. For example, Arguments
+  // may be undef.
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  // Load and return values may be undef.
+  if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
+    return false;
+
+  // Optimistically handle other instructions.
+  for (Value *Op : I->operands()) {
+    if (!Visited.insert(Op).second)
+      continue;
+    if (!hasConcreteDefImpl(Op, Visited, Depth+1))
+      return false;
+  }
+  return true;
+}
+
+/// Return true if the given value is concrete. We must prove that undef can
+/// never reach it.
+///
+/// TODO: If we decide that this is a good approach to checking for undef, we
+/// may factor it into a common location.
+static bool hasConcreteDef(Value *V) {
+  SmallPtrSet<Value*, 8> Visited;
+  Visited.insert(V);
+  return hasConcreteDefImpl(V, Visited, 0);
+}
+
+/// Return true if this IV has any uses other than the (soon to be rewritten)
+/// loop exit test.
+static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
+  int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+  Value *IncV = Phi->getIncomingValue(LatchIdx);
+
+  for (User *U : Phi->users())
+    if (U != Cond && U != IncV) return false;
+
+  for (User *U : IncV->users())
+    if (U != Cond && U != Phi) return false;
+  return true;
+}
+
+/// Find an affine IV in canonical form.
+///
+/// BECount may be an i8* pointer type. The pointer difference is already
+/// valid count without scaling the address stride, so it remains a pointer
+/// expression as far as SCEV is concerned.
+///
+/// Currently only valid for LFTR. See the comments on hasConcreteDef below.
+///
+/// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount
+///
+/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
+/// This is difficult in general for SCEV because of potential overflow. But we
+/// could at least handle constant BECounts.
+static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount,
+                                ScalarEvolution *SE, DominatorTree *DT) {
+  uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
+
+  Value *Cond =
+    cast<BranchInst>(L->getExitingBlock()->getTerminator())->getCondition();
+
+  // Loop over all of the PHI nodes, looking for a simple counter.
+  PHINode *BestPhi = nullptr;
+  const SCEV *BestInit = nullptr;
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  assert(LatchBlock && "needsLFTR should guarantee a loop latch");
+
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+    PHINode *Phi = cast<PHINode>(I);
+    if (!SE->isSCEVable(Phi->getType()))
+      continue;
+
+    // Avoid comparing an integer IV against a pointer Limit.
+    if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
+      continue;
+
+    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+    if (!AR || AR->getLoop() != L || !AR->isAffine())
+      continue;
+
+    // AR may be a pointer type, while BECount is an integer type.
+    // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
+    // AR may not be a narrower type, or we may never exit.
+    uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
+    if (PhiWidth < BCWidth ||
+        !L->getHeader()->getModule()->getDataLayout().isLegalInteger(PhiWidth))
+      continue;
+
+    const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+    if (!Step || !Step->isOne())
+      continue;
+
+    int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+    Value *IncV = Phi->getIncomingValue(LatchIdx);
+    if (getLoopPhiForCounter(IncV, L, DT) != Phi)
+      continue;
+
+    // Avoid reusing a potentially undef value to compute other values that may
+    // have originally had a concrete definition.
+    if (!hasConcreteDef(Phi)) {
+      // We explicitly allow unknown phis as long as they are already used by
+      // the loop test. In this case we assume that performing LFTR could not
+      // increase the number of undef users.
+      if (ICmpInst *Cond = getLoopTest(L)) {
+        if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT)
+            && Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) {
+          continue;
+        }
+      }
+    }
+    const SCEV *Init = AR->getStart();
+
+    if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
+      // Don't force a live loop counter if another IV can be used.
+      if (AlmostDeadIV(Phi, LatchBlock, Cond))
+        continue;
+
+      // Prefer to count-from-zero. This is a more "canonical" counter form. It
+      // also prefers integer to pointer IVs.
+      if (BestInit->isZero() != Init->isZero()) {
+        if (BestInit->isZero())
+          continue;
+      }
+      // If two IVs both count from zero or both count from nonzero then the
+      // narrower is likely a dead phi that has been widened. Use the wider phi
+      // to allow the other to be eliminated.
+      else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
+        continue;
+    }
+    BestPhi = Phi;
+    BestInit = Init;
+  }
+  return BestPhi;
+}
+
+/// Help linearFunctionTestReplace by generating a value that holds the RHS of
+/// the new loop test.
+static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
+                           SCEVExpander &Rewriter, ScalarEvolution *SE) {
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+  assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter");
+  const SCEV *IVInit = AR->getStart();
+
+  // IVInit may be a pointer while IVCount is an integer when FindLoopCounter
+  // finds a valid pointer IV. Sign extend BECount in order to materialize a
+  // GEP. Avoid running SCEVExpander on a new pointer value, instead reusing
+  // the existing GEPs whenever possible.
+  if (IndVar->getType()->isPointerTy()
+      && !IVCount->getType()->isPointerTy()) {
+
+    // IVOffset will be the new GEP offset that is interpreted by GEP as a
+    // signed value. IVCount on the other hand represents the loop trip count,
+    // which is an unsigned value. FindLoopCounter only allows induction
+    // variables that have a positive unit stride of one. This means we don't
+    // have to handle the case of negative offsets (yet) and just need to zero
+    // extend IVCount.
+    Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
+    const SCEV *IVOffset = SE->getTruncateOrZeroExtend(IVCount, OfsTy);
+
+    // Expand the code for the iteration count.
+    assert(SE->isLoopInvariant(IVOffset, L) &&
+           "Computed iteration count is not loop invariant!");
+    BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+    Value *GEPOffset = Rewriter.expandCodeFor(IVOffset, OfsTy, BI);
+
+    Value *GEPBase = IndVar->getIncomingValueForBlock(L->getLoopPreheader());
+    assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter");
+    // We could handle pointer IVs other than i8*, but we need to compensate for
+    // gep index scaling. See canExpandBackedgeTakenCount comments.
+    assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
+             cast<PointerType>(GEPBase->getType())->getElementType())->isOne()
+           && "unit stride pointer IV must be i8*");
+
+    IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+    return Builder.CreateGEP(nullptr, GEPBase, GEPOffset, "lftr.limit");
+  }
+  else {
+    // In any other case, convert both IVInit and IVCount to integers before
+    // comparing. This may result in SCEV expension of pointers, but in practice
+    // SCEV will fold the pointer arithmetic away as such:
+    // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
+    //
+    // Valid Cases: (1) both integers is most common; (2) both may be pointers
+    // for simple memset-style loops.
+    //
+    // IVInit integer and IVCount pointer would only occur if a canonical IV
+    // were generated on top of case #2, which is not expected.
+
+    const SCEV *IVLimit = nullptr;
+    // For unit stride, IVCount = Start + BECount with 2's complement overflow.
+    // For non-zero Start, compute IVCount here.
+    if (AR->getStart()->isZero())
+      IVLimit = IVCount;
+    else {
+      assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
+      const SCEV *IVInit = AR->getStart();
+
+      // For integer IVs, truncate the IV before computing IVInit + BECount.
+      if (SE->getTypeSizeInBits(IVInit->getType())
+          > SE->getTypeSizeInBits(IVCount->getType()))
+        IVInit = SE->getTruncateExpr(IVInit, IVCount->getType());
+
+      IVLimit = SE->getAddExpr(IVInit, IVCount);
+    }
+    // Expand the code for the iteration count.
+    BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+    IRBuilder<> Builder(BI);
+    assert(SE->isLoopInvariant(IVLimit, L) &&
+           "Computed iteration count is not loop invariant!");
+    // Ensure that we generate the same type as IndVar, or a smaller integer
+    // type. In the presence of null pointer values, we have an integer type
+    // SCEV expression (IVInit) for a pointer type IV value (IndVar).
+    Type *LimitTy = IVCount->getType()->isPointerTy() ?
+      IndVar->getType() : IVCount->getType();
+    return Rewriter.expandCodeFor(IVLimit, LimitTy, BI);
+  }
+}
+
+/// This method rewrites the exit condition of the loop to be a canonical !=
+/// comparison against the incremented loop induction variable.  This pass is
+/// able to rewrite the exit tests of any loop where the SCEV analysis can
+/// determine a loop-invariant trip count of the loop, which is actually a much
+/// broader range than just linear tests.
+Value *IndVarSimplify::
+linearFunctionTestReplace(Loop *L,
+                          const SCEV *BackedgeTakenCount,
+                          PHINode *IndVar,
+                          SCEVExpander &Rewriter) {
+  assert(canExpandBackedgeTakenCount(L, SE, Rewriter) && "precondition");
+
+  // Initialize CmpIndVar and IVCount to their preincremented values.
+  Value *CmpIndVar = IndVar;
+  const SCEV *IVCount = BackedgeTakenCount;
+
+  // If the exiting block is the same as the backedge block, we prefer to
+  // compare against the post-incremented value, otherwise we must compare
+  // against the preincremented value.
+  if (L->getExitingBlock() == L->getLoopLatch()) {
+    // Add one to the "backedge-taken" count to get the trip count.
+    // This addition may overflow, which is valid as long as the comparison is
+    // truncated to BackedgeTakenCount->getType().
+    IVCount = SE->getAddExpr(BackedgeTakenCount,
+                             SE->getOne(BackedgeTakenCount->getType()));
+    // The BackedgeTaken expression contains the number of times that the
+    // backedge branches to the loop header.  This is one less than the
+    // number of times the loop executes, so use the incremented indvar.
+    CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock());
+  }
+
+  Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE);
+  assert(ExitCnt->getType()->isPointerTy() == IndVar->getType()->isPointerTy()
+         && "genLoopLimit missed a cast");
+
+  // Insert a new icmp_ne or icmp_eq instruction before the branch.
+  BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+  ICmpInst::Predicate P;
+  if (L->contains(BI->getSuccessor(0)))
+    P = ICmpInst::ICMP_NE;
+  else
+    P = ICmpInst::ICMP_EQ;
+
+  DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
+               << "      LHS:" << *CmpIndVar << '\n'
+               << "       op:\t"
+               << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
+               << "      RHS:\t" << *ExitCnt << "\n"
+               << "  IVCount:\t" << *IVCount << "\n");
+
+  IRBuilder<> Builder(BI);
+
+  // LFTR can ignore IV overflow and truncate to the width of
+  // BECount. This avoids materializing the add(zext(add)) expression.
+  unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
+  unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType());
+  if (CmpIndVarSize > ExitCntSize) {
+    const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+    const SCEV *ARStart = AR->getStart();
+    const SCEV *ARStep = AR->getStepRecurrence(*SE);
+    // For constant IVCount, avoid truncation.
+    if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) {
+      const APInt &Start = cast<SCEVConstant>(ARStart)->getAPInt();
+      APInt Count = cast<SCEVConstant>(IVCount)->getAPInt();
+      // Note that the post-inc value of BackedgeTakenCount may have overflowed
+      // above such that IVCount is now zero.
+      if (IVCount != BackedgeTakenCount && Count == 0) {
+        Count = APInt::getMaxValue(Count.getBitWidth()).zext(CmpIndVarSize);
+        ++Count;
+      }
+      else
+        Count = Count.zext(CmpIndVarSize);
+      APInt NewLimit;
+      if (cast<SCEVConstant>(ARStep)->getValue()->isNegative())
+        NewLimit = Start - Count;
+      else
+        NewLimit = Start + Count;
+      ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit);
+
+      DEBUG(dbgs() << "  Widen RHS:\t" << *ExitCnt << "\n");
+    } else {
+      CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
+                                      "lftr.wideiv");
+    }
+  }
+  Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
+  Value *OrigCond = BI->getCondition();
+  // It's tempting to use replaceAllUsesWith here to fully replace the old
+  // comparison, but that's not immediately safe, since users of the old
+  // comparison may not be dominated by the new comparison. Instead, just
+  // update the branch to use the new comparison; in the common case this
+  // will make old comparison dead.
+  BI->setCondition(Cond);
+  DeadInsts.push_back(OrigCond);
+
+  ++NumLFTR;
+  Changed = true;
+  return Cond;
+}
+
+//===----------------------------------------------------------------------===//
+//  sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
+//===----------------------------------------------------------------------===//
+
+/// If there's a single exit block, sink any loop-invariant values that
+/// were defined in the preheader but not used inside the loop into the
+/// exit block to reduce register pressure in the loop.
+void IndVarSimplify::sinkUnusedInvariants(Loop *L) {
+  BasicBlock *ExitBlock = L->getExitBlock();
+  if (!ExitBlock) return;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader) return;
+
+  Instruction *InsertPt = &*ExitBlock->getFirstInsertionPt();
+  BasicBlock::iterator I(Preheader->getTerminator());
+  while (I != Preheader->begin()) {
+    --I;
+    // New instructions were inserted at the end of the preheader.
+    if (isa<PHINode>(I))
+      break;
+
+    // Don't move instructions which might have side effects, since the side
+    // effects need to complete before instructions inside the loop.  Also don't
+    // move instructions which might read memory, since the loop may modify
+    // memory. Note that it's okay if the instruction might have undefined
+    // behavior: LoopSimplify guarantees that the preheader dominates the exit
+    // block.
+    if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+      continue;
+
+    // Skip debug info intrinsics.
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    // Skip eh pad instructions.
+    if (I->isEHPad())
+      continue;
+
+    // Don't sink alloca: we never want to sink static alloca's out of the
+    // entry block, and correctly sinking dynamic alloca's requires
+    // checks for stacksave/stackrestore intrinsics.
+    // FIXME: Refactor this check somehow?
+    if (isa<AllocaInst>(I))
+      continue;
+
+    // Determine if there is a use in or before the loop (direct or
+    // otherwise).
+    bool UsedInLoop = false;
+    for (Use &U : I->uses()) {
+      Instruction *User = cast<Instruction>(U.getUser());
+      BasicBlock *UseBB = User->getParent();
+      if (PHINode *P = dyn_cast<PHINode>(User)) {
+        unsigned i =
+          PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+        UseBB = P->getIncomingBlock(i);
+      }
+      if (UseBB == Preheader || L->contains(UseBB)) {
+        UsedInLoop = true;
+        break;
+      }
+    }
+
+    // If there is, the def must remain in the preheader.
+    if (UsedInLoop)
+      continue;
+
+    // Otherwise, sink it to the exit block.
+    Instruction *ToMove = &*I;
+    bool Done = false;
+
+    if (I != Preheader->begin()) {
+      // Skip debug info intrinsics.
+      do {
+        --I;
+      } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin());
+
+      if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin())
+        Done = true;
+    } else {
+      Done = true;
+    }
+
+    ToMove->moveBefore(InsertPt);
+    if (Done) break;
+    InsertPt = ToMove;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  IndVarSimplify driver. Manage several subpasses of IV simplification.
+//===----------------------------------------------------------------------===//
+
+bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  // If LoopSimplify form is not available, stay out of trouble. Some notes:
+  //  - LSR currently only supports LoopSimplify-form loops. Indvars'
+  //    canonicalization can be a pessimization without LSR to "clean up"
+  //    afterwards.
+  //  - We depend on having a preheader; in particular,
+  //    Loop::getCanonicalInductionVariable only supports loops with preheaders,
+  //    and we're in trouble if we can't find the induction variable even when
+  //    we've manually inserted one.
+  if (!L->isLoopSimplifyForm())
+    return false;
+
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  TLI = TLIP ? &TLIP->getTLI() : nullptr;
+  auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+  TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
+  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+  DeadInsts.clear();
+  Changed = false;
+
+  // If there are any floating-point recurrences, attempt to
+  // transform them to use integer recurrences.
+  rewriteNonIntegerIVs(L);
+
+  const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+
+  // Create a rewriter object which we'll use to transform the code with.
+  SCEVExpander Rewriter(*SE, DL, "indvars");
+#ifndef NDEBUG
+  Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+
+  // Eliminate redundant IV users.
+  //
+  // Simplification works best when run before other consumers of SCEV. We
+  // attempt to avoid evaluating SCEVs for sign/zero extend operations until
+  // other expressions involving loop IVs have been evaluated. This helps SCEV
+  // set no-wrap flags before normalizing sign/zero extension.
+  Rewriter.disableCanonicalMode();
+  simplifyAndExtend(L, Rewriter, LI);
+
+  // Check to see if this loop has a computable loop-invariant execution count.
+  // If so, this means that we can compute the final value of any expressions
+  // that are recurrent in the loop, and substitute the exit values from the
+  // loop into any instructions outside of the loop that use the final values of
+  // the current expressions.
+  //
+  if (ReplaceExitValue != NeverRepl &&
+      !isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    rewriteLoopExitValues(L, Rewriter);
+
+  // Eliminate redundant IV cycles.
+  NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
+
+  // If we have a trip count expression, rewrite the loop's exit condition
+  // using it.  We can currently only handle loops with a single exit.
+  if (canExpandBackedgeTakenCount(L, SE, Rewriter) && needsLFTR(L, DT)) {
+    PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT);
+    if (IndVar) {
+      // Check preconditions for proper SCEVExpander operation. SCEV does not
+      // express SCEVExpander's dependencies, such as LoopSimplify. Instead any
+      // pass that uses the SCEVExpander must do it. This does not work well for
+      // loop passes because SCEVExpander makes assumptions about all loops,
+      // while LoopPassManager only forces the current loop to be simplified.
+      //
+      // FIXME: SCEV expansion has no way to bail out, so the caller must
+      // explicitly check any assumptions made by SCEV. Brittle.
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount);
+      if (!AR || AR->getLoop()->getLoopPreheader())
+        (void)linearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
+                                        Rewriter);
+    }
+  }
+  // Clear the rewriter cache, because values that are in the rewriter's cache
+  // can be deleted in the loop below, causing the AssertingVH in the cache to
+  // trigger.
+  Rewriter.clear();
+
+  // Now that we're done iterating through lists, clean up any instructions
+  // which are now dead.
+  while (!DeadInsts.empty())
+    if (Instruction *Inst =
+            dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()))
+      RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
+
+  // The Rewriter may not be used from this point on.
+
+  // Loop-invariant instructions in the preheader that aren't used in the
+  // loop may be sunk below the loop to reduce register pressure.
+  sinkUnusedInvariants(L);
+
+  // Clean up dead instructions.
+  Changed |= DeleteDeadPHIs(L->getHeader(), TLI);
+
+  // Check a post-condition.
+  assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!");
+
+  // Verify that LFTR, and any other change have not interfered with SCEV's
+  // ability to compute trip count.
+#ifndef NDEBUG
+  if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+    SE->forgetLoop(L);
+    const SCEV *NewBECount = SE->getBackedgeTakenCount(L);
+    if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) <
+        SE->getTypeSizeInBits(NewBECount->getType()))
+      NewBECount = SE->getTruncateOrNoop(NewBECount,
+                                         BackedgeTakenCount->getType());
+    else
+      BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount,
+                                                 NewBECount->getType());
+    assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV");
+  }
+#endif
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
new file mode 100644
index 0000000..dea61f6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -0,0 +1,1503 @@
+//===-- InductiveRangeCheckElimination.cpp - ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The InductiveRangeCheckElimination pass splits a loop's iteration space into
+// three disjoint ranges.  It does that in a way such that the loop running in
+// the middle loop provably does not need range checks. As an example, it will
+// convert
+//
+//   len = < known positive >
+//   for (i = 0; i < n; i++) {
+//     if (0 <= i && i < len) {
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//
+// to
+//
+//   len = < known positive >
+//   limit = smin(n, len)
+//   // no first segment
+//   for (i = 0; i < limit; i++) {
+//     if (0 <= i && i < len) { // this check is fully redundant
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//   for (i = limit; i < n; i++) {
+//     if (0 <= i && i < len) {
+//       do_something();
+//     } else {
+//       throw_out_of_bounds();
+//     }
+//   }
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <array>
+
+using namespace llvm;
+
+static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
+                                        cl::init(64));
+
+static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
+                                       cl::init(false));
+
+static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
+                                      cl::init(false));
+
+static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
+                                          cl::Hidden, cl::init(10));
+
+#define DEBUG_TYPE "irce"
+
+namespace {
+
+/// An inductive range check is conditional branch in a loop with
+///
+///  1. a very cold successor (i.e. the branch jumps to that successor very
+///     rarely)
+///
+///  and
+///
+///  2. a condition that is provably true for some contiguous range of values
+///     taken by the containing loop's induction variable.
+///
+class InductiveRangeCheck {
+  // Classifies a range check
+  enum RangeCheckKind : unsigned {
+    // Range check of the form "0 <= I".
+    RANGE_CHECK_LOWER = 1,
+
+    // Range check of the form "I < L" where L is known positive.
+    RANGE_CHECK_UPPER = 2,
+
+    // The logical and of the RANGE_CHECK_LOWER and RANGE_CHECK_UPPER
+    // conditions.
+    RANGE_CHECK_BOTH = RANGE_CHECK_LOWER | RANGE_CHECK_UPPER,
+
+    // Unrecognized range check condition.
+    RANGE_CHECK_UNKNOWN = (unsigned)-1
+  };
+
+  static const char *rangeCheckKindToStr(RangeCheckKind);
+
+  const SCEV *Offset;
+  const SCEV *Scale;
+  Value *Length;
+  BranchInst *Branch;
+  RangeCheckKind Kind;
+
+  static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+                                            ScalarEvolution &SE, Value *&Index,
+                                            Value *&Length);
+
+  static InductiveRangeCheck::RangeCheckKind
+  parseRangeCheck(Loop *L, ScalarEvolution &SE, Value *Condition,
+                  const SCEV *&Index, Value *&UpperLimit);
+
+  InductiveRangeCheck() :
+    Offset(nullptr), Scale(nullptr), Length(nullptr), Branch(nullptr) { }
+
+public:
+  const SCEV *getOffset() const { return Offset; }
+  const SCEV *getScale() const { return Scale; }
+  Value *getLength() const { return Length; }
+
+  void print(raw_ostream &OS) const {
+    OS << "InductiveRangeCheck:\n";
+    OS << "  Kind: " << rangeCheckKindToStr(Kind) << "\n";
+    OS << "  Offset: ";
+    Offset->print(OS);
+    OS << "  Scale: ";
+    Scale->print(OS);
+    OS << "  Length: ";
+    if (Length)
+      Length->print(OS);
+    else
+      OS << "(null)";
+    OS << "\n  Branch: ";
+    getBranch()->print(OS);
+    OS << "\n";
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void dump() {
+    print(dbgs());
+  }
+#endif
+
+  BranchInst *getBranch() const { return Branch; }
+
+  /// Represents an signed integer range [Range.getBegin(), Range.getEnd()).  If
+  /// R.getEnd() sle R.getBegin(), then R denotes the empty range.
+
+  class Range {
+    const SCEV *Begin;
+    const SCEV *End;
+
+  public:
+    Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
+      assert(Begin->getType() == End->getType() && "ill-typed range!");
+    }
+
+    Type *getType() const { return Begin->getType(); }
+    const SCEV *getBegin() const { return Begin; }
+    const SCEV *getEnd() const { return End; }
+  };
+
+  typedef SpecificBumpPtrAllocator<InductiveRangeCheck> AllocatorTy;
+
+  /// This is the value the condition of the branch needs to evaluate to for the
+  /// branch to take the hot successor (see (1) above).
+  bool getPassingDirection() { return true; }
+
+  /// Computes a range for the induction variable (IndVar) in which the range
+  /// check is redundant and can be constant-folded away.  The induction
+  /// variable is not required to be the canonical {0,+,1} induction variable.
+  Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
+                                            const SCEVAddRecExpr *IndVar,
+                                            IRBuilder<> &B) const;
+
+  /// Create an inductive range check out of BI if possible, else return
+  /// nullptr.
+  static InductiveRangeCheck *create(AllocatorTy &Alloc, BranchInst *BI,
+                                     Loop *L, ScalarEvolution &SE,
+                                     BranchProbabilityInfo &BPI);
+};
+
+class InductiveRangeCheckElimination : public LoopPass {
+  InductiveRangeCheck::AllocatorTy Allocator;
+
+public:
+  static char ID;
+  InductiveRangeCheckElimination() : LoopPass(ID) {
+    initializeInductiveRangeCheckEliminationPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<BranchProbabilityInfoWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+};
+
+char InductiveRangeCheckElimination::ID = 0;
+}
+
+INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce",
+                      "Inductive range check elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce",
+                    "Inductive range check elimination", false, false)
+
+const char *InductiveRangeCheck::rangeCheckKindToStr(
+    InductiveRangeCheck::RangeCheckKind RCK) {
+  switch (RCK) {
+  case InductiveRangeCheck::RANGE_CHECK_UNKNOWN:
+    return "RANGE_CHECK_UNKNOWN";
+
+  case InductiveRangeCheck::RANGE_CHECK_UPPER:
+    return "RANGE_CHECK_UPPER";
+
+  case InductiveRangeCheck::RANGE_CHECK_LOWER:
+    return "RANGE_CHECK_LOWER";
+
+  case InductiveRangeCheck::RANGE_CHECK_BOTH:
+    return "RANGE_CHECK_BOTH";
+  }
+
+  llvm_unreachable("unknown range check type!");
+}
+
+/// Parse a single ICmp instruction, `ICI`, into a range check.  If `ICI`
+/// cannot
+/// be interpreted as a range check, return `RANGE_CHECK_UNKNOWN` and set
+/// `Index` and `Length` to `nullptr`.  Otherwise set `Index` to the value
+/// being
+/// range checked, and set `Length` to the upper limit `Index` is being range
+/// checked with if (and only if) the range check type is stronger or equal to
+/// RANGE_CHECK_UPPER.
+///
+InductiveRangeCheck::RangeCheckKind
+InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+                                         ScalarEvolution &SE, Value *&Index,
+                                         Value *&Length) {
+
+  auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) {
+    const SCEV *S = SE.getSCEV(V);
+    if (isa<SCEVCouldNotCompute>(S))
+      return false;
+
+    return SE.getLoopDisposition(S, L) == ScalarEvolution::LoopInvariant &&
+           SE.isKnownNonNegative(S);
+  };
+
+  using namespace llvm::PatternMatch;
+
+  ICmpInst::Predicate Pred = ICI->getPredicate();
+  Value *LHS = ICI->getOperand(0);
+  Value *RHS = ICI->getOperand(1);
+
+  switch (Pred) {
+  default:
+    return RANGE_CHECK_UNKNOWN;
+
+  case ICmpInst::ICMP_SLE:
+    std::swap(LHS, RHS);
+  // fallthrough
+  case ICmpInst::ICMP_SGE:
+    if (match(RHS, m_ConstantInt<0>())) {
+      Index = LHS;
+      return RANGE_CHECK_LOWER;
+    }
+    return RANGE_CHECK_UNKNOWN;
+
+  case ICmpInst::ICMP_SLT:
+    std::swap(LHS, RHS);
+  // fallthrough
+  case ICmpInst::ICMP_SGT:
+    if (match(RHS, m_ConstantInt<-1>())) {
+      Index = LHS;
+      return RANGE_CHECK_LOWER;
+    }
+
+    if (IsNonNegativeAndNotLoopVarying(LHS)) {
+      Index = RHS;
+      Length = LHS;
+      return RANGE_CHECK_UPPER;
+    }
+    return RANGE_CHECK_UNKNOWN;
+
+  case ICmpInst::ICMP_ULT:
+    std::swap(LHS, RHS);
+  // fallthrough
+  case ICmpInst::ICMP_UGT:
+    if (IsNonNegativeAndNotLoopVarying(LHS)) {
+      Index = RHS;
+      Length = LHS;
+      return RANGE_CHECK_BOTH;
+    }
+    return RANGE_CHECK_UNKNOWN;
+  }
+
+  llvm_unreachable("default clause returns!");
+}
+
+/// Parses an arbitrary condition into a range check.  `Length` is set only if
+/// the range check is recognized to be `RANGE_CHECK_UPPER` or stronger.
+InductiveRangeCheck::RangeCheckKind
+InductiveRangeCheck::parseRangeCheck(Loop *L, ScalarEvolution &SE,
+                                     Value *Condition, const SCEV *&Index,
+                                     Value *&Length) {
+  using namespace llvm::PatternMatch;
+
+  Value *A = nullptr;
+  Value *B = nullptr;
+
+  if (match(Condition, m_And(m_Value(A), m_Value(B)))) {
+    Value *IndexA = nullptr, *IndexB = nullptr;
+    Value *LengthA = nullptr, *LengthB = nullptr;
+    ICmpInst *ICmpA = dyn_cast<ICmpInst>(A), *ICmpB = dyn_cast<ICmpInst>(B);
+
+    if (!ICmpA || !ICmpB)
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    auto RCKindA = parseRangeCheckICmp(L, ICmpA, SE, IndexA, LengthA);
+    auto RCKindB = parseRangeCheckICmp(L, ICmpB, SE, IndexB, LengthB);
+
+    if (RCKindA == InductiveRangeCheck::RANGE_CHECK_UNKNOWN ||
+        RCKindB == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    if (IndexA != IndexB)
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    if (LengthA != nullptr && LengthB != nullptr && LengthA != LengthB)
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    Index = SE.getSCEV(IndexA);
+    if (isa<SCEVCouldNotCompute>(Index))
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    Length = LengthA == nullptr ? LengthB : LengthA;
+
+    return (InductiveRangeCheck::RangeCheckKind)(RCKindA | RCKindB);
+  }
+
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
+    Value *IndexVal = nullptr;
+
+    auto RCKind = parseRangeCheckICmp(L, ICI, SE, IndexVal, Length);
+
+    if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    Index = SE.getSCEV(IndexVal);
+    if (isa<SCEVCouldNotCompute>(Index))
+      return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+    return RCKind;
+  }
+
+  return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+}
+
+
+InductiveRangeCheck *
+InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI,
+                            Loop *L, ScalarEvolution &SE,
+                            BranchProbabilityInfo &BPI) {
+
+  if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
+    return nullptr;
+
+  BranchProbability LikelyTaken(15, 16);
+
+  if (BPI.getEdgeProbability(BI->getParent(), (unsigned) 0) < LikelyTaken)
+    return nullptr;
+
+  Value *Length = nullptr;
+  const SCEV *IndexSCEV = nullptr;
+
+  auto RCKind = InductiveRangeCheck::parseRangeCheck(L, SE, BI->getCondition(),
+                                                     IndexSCEV, Length);
+
+  if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
+    return nullptr;
+
+  assert(IndexSCEV && "contract with SplitRangeCheckCondition!");
+  assert((!(RCKind & InductiveRangeCheck::RANGE_CHECK_UPPER) || Length) &&
+         "contract with SplitRangeCheckCondition!");
+
+  const SCEVAddRecExpr *IndexAddRec = dyn_cast<SCEVAddRecExpr>(IndexSCEV);
+  bool IsAffineIndex =
+      IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine();
+
+  if (!IsAffineIndex)
+    return nullptr;
+
+  InductiveRangeCheck *IRC = new (A.Allocate()) InductiveRangeCheck;
+  IRC->Length = Length;
+  IRC->Offset = IndexAddRec->getStart();
+  IRC->Scale = IndexAddRec->getStepRecurrence(SE);
+  IRC->Branch = BI;
+  IRC->Kind = RCKind;
+  return IRC;
+}
+
+namespace {
+
+// Keeps track of the structure of a loop.  This is similar to llvm::Loop,
+// except that it is more lightweight and can track the state of a loop through
+// changing and potentially invalid IR.  This structure also formalizes the
+// kinds of loops we can deal with -- ones that have a single latch that is also
+// an exiting block *and* have a canonical induction variable.
+struct LoopStructure {
+  const char *Tag;
+
+  BasicBlock *Header;
+  BasicBlock *Latch;
+
+  // `Latch's terminator instruction is `LatchBr', and it's `LatchBrExitIdx'th
+  // successor is `LatchExit', the exit block of the loop.
+  BranchInst *LatchBr;
+  BasicBlock *LatchExit;
+  unsigned LatchBrExitIdx;
+
+  Value *IndVarNext;
+  Value *IndVarStart;
+  Value *LoopExitAt;
+  bool IndVarIncreasing;
+
+  LoopStructure()
+      : Tag(""), Header(nullptr), Latch(nullptr), LatchBr(nullptr),
+        LatchExit(nullptr), LatchBrExitIdx(-1), IndVarNext(nullptr),
+        IndVarStart(nullptr), LoopExitAt(nullptr), IndVarIncreasing(false) {}
+
+  template <typename M> LoopStructure map(M Map) const {
+    LoopStructure Result;
+    Result.Tag = Tag;
+    Result.Header = cast<BasicBlock>(Map(Header));
+    Result.Latch = cast<BasicBlock>(Map(Latch));
+    Result.LatchBr = cast<BranchInst>(Map(LatchBr));
+    Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
+    Result.LatchBrExitIdx = LatchBrExitIdx;
+    Result.IndVarNext = Map(IndVarNext);
+    Result.IndVarStart = Map(IndVarStart);
+    Result.LoopExitAt = Map(LoopExitAt);
+    Result.IndVarIncreasing = IndVarIncreasing;
+    return Result;
+  }
+
+  static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &,
+                                                    BranchProbabilityInfo &BPI,
+                                                    Loop &,
+                                                    const char *&);
+};
+
+/// This class is used to constrain loops to run within a given iteration space.
+/// The algorithm this class implements is given a Loop and a range [Begin,
+/// End).  The algorithm then tries to break out a "main loop" out of the loop
+/// it is given in a way that the "main loop" runs with the induction variable
+/// in a subset of [Begin, End).  The algorithm emits appropriate pre and post
+/// loops to run any remaining iterations.  The pre loop runs any iterations in
+/// which the induction variable is < Begin, and the post loop runs any
+/// iterations in which the induction variable is >= End.
+///
+class LoopConstrainer {
+  // The representation of a clone of the original loop we started out with.
+  struct ClonedLoop {
+    // The cloned blocks
+    std::vector<BasicBlock *> Blocks;
+
+    // `Map` maps values in the clonee into values in the cloned version
+    ValueToValueMapTy Map;
+
+    // An instance of `LoopStructure` for the cloned loop
+    LoopStructure Structure;
+  };
+
+  // Result of rewriting the range of a loop.  See changeIterationSpaceEnd for
+  // more details on what these fields mean.
+  struct RewrittenRangeInfo {
+    BasicBlock *PseudoExit;
+    BasicBlock *ExitSelector;
+    std::vector<PHINode *> PHIValuesAtPseudoExit;
+    PHINode *IndVarEnd;
+
+    RewrittenRangeInfo()
+        : PseudoExit(nullptr), ExitSelector(nullptr), IndVarEnd(nullptr) {}
+  };
+
+  // Calculated subranges we restrict the iteration space of the main loop to.
+  // See the implementation of `calculateSubRanges' for more details on how
+  // these fields are computed.  `LowLimit` is None if there is no restriction
+  // on low end of the restricted iteration space of the main loop.  `HighLimit`
+  // is None if there is no restriction on high end of the restricted iteration
+  // space of the main loop.
+
+  struct SubRanges {
+    Optional<const SCEV *> LowLimit;
+    Optional<const SCEV *> HighLimit;
+  };
+
+  // A utility function that does a `replaceUsesOfWith' on the incoming block
+  // set of a `PHINode' -- replaces instances of `Block' in the `PHINode's
+  // incoming block list with `ReplaceBy'.
+  static void replacePHIBlock(PHINode *PN, BasicBlock *Block,
+                              BasicBlock *ReplaceBy);
+
+  // Compute a safe set of limits for the main loop to run in -- effectively the
+  // intersection of `Range' and the iteration space of the original loop.
+  // Return None if unable to compute the set of subranges.
+  //
+  Optional<SubRanges> calculateSubRanges() const;
+
+  // Clone `OriginalLoop' and return the result in CLResult.  The IR after
+  // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
+  // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
+  // but there is no such edge.
+  //
+  void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
+
+  // Rewrite the iteration space of the loop denoted by (LS, Preheader). The
+  // iteration space of the rewritten loop ends at ExitLoopAt.  The start of the
+  // iteration space is not changed.  `ExitLoopAt' is assumed to be slt
+  // `OriginalHeaderCount'.
+  //
+  // If there are iterations left to execute, control is made to jump to
+  // `ContinuationBlock', otherwise they take the normal loop exit.  The
+  // returned `RewrittenRangeInfo' object is populated as follows:
+  //
+  //  .PseudoExit is a basic block that unconditionally branches to
+  //      `ContinuationBlock'.
+  //
+  //  .ExitSelector is a basic block that decides, on exit from the loop,
+  //      whether to branch to the "true" exit or to `PseudoExit'.
+  //
+  //  .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value
+  //      for each PHINode in the loop header on taking the pseudo exit.
+  //
+  // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
+  // preheader because it is made to branch to the loop header only
+  // conditionally.
+  //
+  RewrittenRangeInfo
+  changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
+                          Value *ExitLoopAt,
+                          BasicBlock *ContinuationBlock) const;
+
+  // The loop denoted by `LS' has `OldPreheader' as its preheader.  This
+  // function creates a new preheader for `LS' and returns it.
+  //
+  BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
+                              const char *Tag) const;
+
+  // `ContinuationBlockAndPreheader' was the continuation block for some call to
+  // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'.
+  // This function rewrites the PHI nodes in `LS.Header' to start with the
+  // correct value.
+  void rewriteIncomingValuesForPHIs(
+      LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
+      const LoopConstrainer::RewrittenRangeInfo &RRI) const;
+
+  // Even though we do not preserve any passes at this time, we at least need to
+  // keep the parent loop structure consistent.  The `LPPassManager' seems to
+  // verify this after running a loop pass.  This function adds the list of
+  // blocks denoted by BBs to this loops parent loop if required.
+  void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
+
+  // Some global state.
+  Function &F;
+  LLVMContext &Ctx;
+  ScalarEvolution &SE;
+
+  // Information about the original loop we started out with.
+  Loop &OriginalLoop;
+  LoopInfo &OriginalLoopInfo;
+  const SCEV *LatchTakenCount;
+  BasicBlock *OriginalPreheader;
+
+  // The preheader of the main loop.  This may or may not be different from
+  // `OriginalPreheader'.
+  BasicBlock *MainLoopPreheader;
+
+  // The range we need to run the main loop in.
+  InductiveRangeCheck::Range Range;
+
+  // The structure of the main loop (see comment at the beginning of this class
+  // for a definition)
+  LoopStructure MainLoopStructure;
+
+public:
+  LoopConstrainer(Loop &L, LoopInfo &LI, const LoopStructure &LS,
+                  ScalarEvolution &SE, InductiveRangeCheck::Range R)
+      : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
+        SE(SE), OriginalLoop(L), OriginalLoopInfo(LI), LatchTakenCount(nullptr),
+        OriginalPreheader(nullptr), MainLoopPreheader(nullptr), Range(R),
+        MainLoopStructure(LS) {}
+
+  // Entry point for the algorithm.  Returns true on success.
+  bool run();
+};
+
+}
+
+void LoopConstrainer::replacePHIBlock(PHINode *PN, BasicBlock *Block,
+                                      BasicBlock *ReplaceBy) {
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingBlock(i) == Block)
+      PN->setIncomingBlock(i, ReplaceBy);
+}
+
+static bool CanBeSMax(ScalarEvolution &SE, const SCEV *S) {
+  APInt SMax =
+      APInt::getSignedMaxValue(cast<IntegerType>(S->getType())->getBitWidth());
+  return SE.getSignedRange(S).contains(SMax) &&
+         SE.getUnsignedRange(S).contains(SMax);
+}
+
+static bool CanBeSMin(ScalarEvolution &SE, const SCEV *S) {
+  APInt SMin =
+      APInt::getSignedMinValue(cast<IntegerType>(S->getType())->getBitWidth());
+  return SE.getSignedRange(S).contains(SMin) &&
+         SE.getUnsignedRange(S).contains(SMin);
+}
+
+Optional<LoopStructure>
+LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BPI,
+                                  Loop &L, const char *&FailureReason) {
+  assert(L.isLoopSimplifyForm() && "should follow from addRequired<>");
+
+  BasicBlock *Latch = L.getLoopLatch();
+  if (!L.isLoopExiting(Latch)) {
+    FailureReason = "no loop latch";
+    return None;
+  }
+
+  BasicBlock *Header = L.getHeader();
+  BasicBlock *Preheader = L.getLoopPreheader();
+  if (!Preheader) {
+    FailureReason = "no preheader";
+    return None;
+  }
+
+  BranchInst *LatchBr = dyn_cast<BranchInst>(&*Latch->rbegin());
+  if (!LatchBr || LatchBr->isUnconditional()) {
+    FailureReason = "latch terminator not conditional branch";
+    return None;
+  }
+
+  unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
+
+  BranchProbability ExitProbability =
+    BPI.getEdgeProbability(LatchBr->getParent(), LatchBrExitIdx);
+
+  if (ExitProbability > BranchProbability(1, MaxExitProbReciprocal)) {
+    FailureReason = "short running loop, not profitable";
+    return None;
+  }
+
+  ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition());
+  if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) {
+    FailureReason = "latch terminator branch not conditional on integral icmp";
+    return None;
+  }
+
+  const SCEV *LatchCount = SE.getExitCount(&L, Latch);
+  if (isa<SCEVCouldNotCompute>(LatchCount)) {
+    FailureReason = "could not compute latch count";
+    return None;
+  }
+
+  ICmpInst::Predicate Pred = ICI->getPredicate();
+  Value *LeftValue = ICI->getOperand(0);
+  const SCEV *LeftSCEV = SE.getSCEV(LeftValue);
+  IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType());
+
+  Value *RightValue = ICI->getOperand(1);
+  const SCEV *RightSCEV = SE.getSCEV(RightValue);
+
+  // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence.
+  if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
+    if (isa<SCEVAddRecExpr>(RightSCEV)) {
+      std::swap(LeftSCEV, RightSCEV);
+      std::swap(LeftValue, RightValue);
+      Pred = ICmpInst::getSwappedPredicate(Pred);
+    } else {
+      FailureReason = "no add recurrences in the icmp";
+      return None;
+    }
+  }
+
+  auto HasNoSignedWrap = [&](const SCEVAddRecExpr *AR) {
+    if (AR->getNoWrapFlags(SCEV::FlagNSW))
+      return true;
+
+    IntegerType *Ty = cast<IntegerType>(AR->getType());
+    IntegerType *WideTy =
+        IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2);
+
+    const SCEVAddRecExpr *ExtendAfterOp =
+        dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+    if (ExtendAfterOp) {
+      const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
+      const SCEV *ExtendedStep =
+          SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
+
+      bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
+                          ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
+
+      if (NoSignedWrap)
+        return true;
+    }
+
+    // We may have proved this when computing the sign extension above.
+    return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
+  };
+
+  auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing) {
+    if (!AR->isAffine())
+      return false;
+
+    // Currently we only work with induction variables that have been proved to
+    // not wrap.  This restriction can potentially be lifted in the future.
+
+    if (!HasNoSignedWrap(AR))
+      return false;
+
+    if (const SCEVConstant *StepExpr =
+            dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE))) {
+      ConstantInt *StepCI = StepExpr->getValue();
+      if (StepCI->isOne() || StepCI->isMinusOne()) {
+        IsIncreasing = StepCI->isOne();
+        return true;
+      }
+    }
+
+    return false;
+  };
+
+  // `ICI` is interpreted as taking the backedge if the *next* value of the
+  // induction variable satisfies some constraint.
+
+  const SCEVAddRecExpr *IndVarNext = cast<SCEVAddRecExpr>(LeftSCEV);
+  bool IsIncreasing = false;
+  if (!IsInductionVar(IndVarNext, IsIncreasing)) {
+    FailureReason = "LHS in icmp not induction variable";
+    return None;
+  }
+
+  ConstantInt *One = ConstantInt::get(IndVarTy, 1);
+  // TODO: generalize the predicates here to also match their unsigned variants.
+  if (IsIncreasing) {
+    bool FoundExpectedPred =
+        (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) ||
+        (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0);
+
+    if (!FoundExpectedPred) {
+      FailureReason = "expected icmp slt semantically, found something else";
+      return None;
+    }
+
+    if (LatchBrExitIdx == 0) {
+      if (CanBeSMax(SE, RightSCEV)) {
+        // TODO: this restriction is easily removable -- we just have to
+        // remember that the icmp was an slt and not an sle.
+        FailureReason = "limit may overflow when coercing sle to slt";
+        return None;
+      }
+
+      IRBuilder<> B(&*Preheader->rbegin());
+      RightValue = B.CreateAdd(RightValue, One);
+    }
+
+  } else {
+    bool FoundExpectedPred =
+        (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
+        (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0);
+
+    if (!FoundExpectedPred) {
+      FailureReason = "expected icmp sgt semantically, found something else";
+      return None;
+    }
+
+    if (LatchBrExitIdx == 0) {
+      if (CanBeSMin(SE, RightSCEV)) {
+        // TODO: this restriction is easily removable -- we just have to
+        // remember that the icmp was an sgt and not an sge.
+        FailureReason = "limit may overflow when coercing sge to sgt";
+        return None;
+      }
+
+      IRBuilder<> B(&*Preheader->rbegin());
+      RightValue = B.CreateSub(RightValue, One);
+    }
+  }
+
+  const SCEV *StartNext = IndVarNext->getStart();
+  const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+  const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+
+  BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
+
+  assert(SE.getLoopDisposition(LatchCount, &L) ==
+             ScalarEvolution::LoopInvariant &&
+         "loop variant exit count doesn't make sense!");
+
+  assert(!L.contains(LatchExit) && "expected an exit block!");
+  const DataLayout &DL = Preheader->getModule()->getDataLayout();
+  Value *IndVarStartV =
+      SCEVExpander(SE, DL, "irce")
+          .expandCodeFor(IndVarStart, IndVarTy, &*Preheader->rbegin());
+  IndVarStartV->setName("indvar.start");
+
+  LoopStructure Result;
+
+  Result.Tag = "main";
+  Result.Header = Header;
+  Result.Latch = Latch;
+  Result.LatchBr = LatchBr;
+  Result.LatchExit = LatchExit;
+  Result.LatchBrExitIdx = LatchBrExitIdx;
+  Result.IndVarStart = IndVarStartV;
+  Result.IndVarNext = LeftValue;
+  Result.IndVarIncreasing = IsIncreasing;
+  Result.LoopExitAt = RightValue;
+
+  FailureReason = nullptr;
+
+  return Result;
+}
+
+Optional<LoopConstrainer::SubRanges>
+LoopConstrainer::calculateSubRanges() const {
+  IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
+
+  if (Range.getType() != Ty)
+    return None;
+
+  LoopConstrainer::SubRanges Result;
+
+  // I think we can be more aggressive here and make this nuw / nsw if the
+  // addition that feeds into the icmp for the latch's terminating branch is nuw
+  // / nsw.  In any case, a wrapping 2's complement addition is safe.
+  ConstantInt *One = ConstantInt::get(Ty, 1);
+  const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart);
+  const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt);
+
+  bool Increasing = MainLoopStructure.IndVarIncreasing;
+
+  // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the
+  // range of values the induction variable takes.
+
+  const SCEV *Smallest = nullptr, *Greatest = nullptr;
+
+  if (Increasing) {
+    Smallest = Start;
+    Greatest = End;
+  } else {
+    // These two computations may sign-overflow.  Here is why that is okay:
+    //
+    // We know that the induction variable does not sign-overflow on any
+    // iteration except the last one, and it starts at `Start` and ends at
+    // `End`, decrementing by one every time.
+    //
+    //  * if `Smallest` sign-overflows we know `End` is `INT_SMAX`. Since the
+    //    induction variable is decreasing we know that that the smallest value
+    //    the loop body is actually executed with is `INT_SMIN` == `Smallest`.
+    //
+    //  * if `Greatest` sign-overflows, we know it can only be `INT_SMIN`.  In
+    //    that case, `Clamp` will always return `Smallest` and
+    //    [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`)
+    //    will be an empty range.  Returning an empty range is always safe.
+    //
+
+    Smallest = SE.getAddExpr(End, SE.getSCEV(One));
+    Greatest = SE.getAddExpr(Start, SE.getSCEV(One));
+  }
+
+  auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
+    return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S));
+  };
+
+  // In some cases we can prove that we don't need a pre or post loop
+
+  bool ProvablyNoPreloop =
+      SE.isKnownPredicate(ICmpInst::ICMP_SLE, Range.getBegin(), Smallest);
+  if (!ProvablyNoPreloop)
+    Result.LowLimit = Clamp(Range.getBegin());
+
+  bool ProvablyNoPostLoop =
+      SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd());
+  if (!ProvablyNoPostLoop)
+    Result.HighLimit = Clamp(Range.getEnd());
+
+  return Result;
+}
+
+void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
+                                const char *Tag) const {
+  for (BasicBlock *BB : OriginalLoop.getBlocks()) {
+    BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F);
+    Result.Blocks.push_back(Clone);
+    Result.Map[BB] = Clone;
+  }
+
+  auto GetClonedValue = [&Result](Value *V) {
+    assert(V && "null values not in domain!");
+    auto It = Result.Map.find(V);
+    if (It == Result.Map.end())
+      return V;
+    return static_cast<Value *>(It->second);
+  };
+
+  Result.Structure = MainLoopStructure.map(GetClonedValue);
+  Result.Structure.Tag = Tag;
+
+  for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) {
+    BasicBlock *ClonedBB = Result.Blocks[i];
+    BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i];
+
+    assert(Result.Map[OriginalBB] == ClonedBB && "invariant!");
+
+    for (Instruction &I : *ClonedBB)
+      RemapInstruction(&I, Result.Map,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+    // Exit blocks will now have one more predecessor and their PHI nodes need
+    // to be edited to reflect that.  No phi nodes need to be introduced because
+    // the loop is in LCSSA.
+
+    for (auto SBBI = succ_begin(OriginalBB), SBBE = succ_end(OriginalBB);
+         SBBI != SBBE; ++SBBI) {
+
+      if (OriginalLoop.contains(*SBBI))
+        continue; // not an exit block
+
+      for (Instruction &I : **SBBI) {
+        if (!isa<PHINode>(&I))
+          break;
+
+        PHINode *PN = cast<PHINode>(&I);
+        Value *OldIncoming = PN->getIncomingValueForBlock(OriginalBB);
+        PN->addIncoming(GetClonedValue(OldIncoming), ClonedBB);
+      }
+    }
+  }
+}
+
+LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
+    const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt,
+    BasicBlock *ContinuationBlock) const {
+
+  // We start with a loop with a single latch:
+  //
+  //    +--------------------+
+  //    |                    |
+  //    |     preheader      |
+  //    |                    |
+  //    +--------+-----------+
+  //             |      ----------------\
+  //             |     /                |
+  //    +--------v----v------+          |
+  //    |                    |          |
+  //    |      header        |          |
+  //    |                    |          |
+  //    +--------------------+          |
+  //                                    |
+  //            .....                   |
+  //                                    |
+  //    +--------------------+          |
+  //    |                    |          |
+  //    |       latch        >----------/
+  //    |                    |
+  //    +-------v------------+
+  //            |
+  //            |
+  //            |   +--------------------+
+  //            |   |                    |
+  //            +--->   original exit    |
+  //                |                    |
+  //                +--------------------+
+  //
+  // We change the control flow to look like
+  //
+  //
+  //    +--------------------+
+  //    |                    |
+  //    |     preheader      >-------------------------+
+  //    |                    |                         |
+  //    +--------v-----------+                         |
+  //             |    /-------------+                  |
+  //             |   /              |                  |
+  //    +--------v--v--------+      |                  |
+  //    |                    |      |                  |
+  //    |      header        |      |   +--------+     |
+  //    |                    |      |   |        |     |
+  //    +--------------------+      |   |  +-----v-----v-----------+
+  //                                |   |  |                       |
+  //                                |   |  |     .pseudo.exit      |
+  //                                |   |  |                       |
+  //                                |   |  +-----------v-----------+
+  //                                |   |              |
+  //            .....               |   |              |
+  //                                |   |     +--------v-------------+
+  //    +--------------------+      |   |     |                      |
+  //    |                    |      |   |     |   ContinuationBlock  |
+  //    |       latch        >------+   |     |                      |
+  //    |                    |          |     +----------------------+
+  //    +---------v----------+          |
+  //              |                     |
+  //              |                     |
+  //              |     +---------------^-----+
+  //              |     |                     |
+  //              +----->    .exit.selector   |
+  //                    |                     |
+  //                    +----------v----------+
+  //                               |
+  //     +--------------------+    |
+  //     |                    |    |
+  //     |   original exit    <----+
+  //     |                    |
+  //     +--------------------+
+  //
+
+  RewrittenRangeInfo RRI;
+
+  auto BBInsertLocation = std::next(Function::iterator(LS.Latch));
+  RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector",
+                                        &F, &*BBInsertLocation);
+  RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
+                                      &*BBInsertLocation);
+
+  BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin());
+  bool Increasing = LS.IndVarIncreasing;
+
+  IRBuilder<> B(PreheaderJump);
+
+  // EnterLoopCond - is it okay to start executing this `LS'?
+  Value *EnterLoopCond = Increasing
+                             ? B.CreateICmpSLT(LS.IndVarStart, ExitSubloopAt)
+                             : B.CreateICmpSGT(LS.IndVarStart, ExitSubloopAt);
+
+  B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit);
+  PreheaderJump->eraseFromParent();
+
+  LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector);
+  B.SetInsertPoint(LS.LatchBr);
+  Value *TakeBackedgeLoopCond =
+      Increasing ? B.CreateICmpSLT(LS.IndVarNext, ExitSubloopAt)
+                 : B.CreateICmpSGT(LS.IndVarNext, ExitSubloopAt);
+  Value *CondForBranch = LS.LatchBrExitIdx == 1
+                             ? TakeBackedgeLoopCond
+                             : B.CreateNot(TakeBackedgeLoopCond);
+
+  LS.LatchBr->setCondition(CondForBranch);
+
+  B.SetInsertPoint(RRI.ExitSelector);
+
+  // IterationsLeft - are there any more iterations left, given the original
+  // upper bound on the induction variable?  If not, we branch to the "real"
+  // exit.
+  Value *IterationsLeft = Increasing
+                              ? B.CreateICmpSLT(LS.IndVarNext, LS.LoopExitAt)
+                              : B.CreateICmpSGT(LS.IndVarNext, LS.LoopExitAt);
+  B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
+
+  BranchInst *BranchToContinuation =
+      BranchInst::Create(ContinuationBlock, RRI.PseudoExit);
+
+  // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
+  // each of the PHI nodes in the loop header.  This feeds into the initial
+  // value of the same PHI nodes if/when we continue execution.
+  for (Instruction &I : *LS.Header) {
+    if (!isa<PHINode>(&I))
+      break;
+
+    PHINode *PN = cast<PHINode>(&I);
+
+    PHINode *NewPHI = PHINode::Create(PN->getType(), 2, PN->getName() + ".copy",
+                                      BranchToContinuation);
+
+    NewPHI->addIncoming(PN->getIncomingValueForBlock(Preheader), Preheader);
+    NewPHI->addIncoming(PN->getIncomingValueForBlock(LS.Latch),
+                        RRI.ExitSelector);
+    RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
+  }
+
+  RRI.IndVarEnd = PHINode::Create(LS.IndVarNext->getType(), 2, "indvar.end",
+                                  BranchToContinuation);
+  RRI.IndVarEnd->addIncoming(LS.IndVarStart, Preheader);
+  RRI.IndVarEnd->addIncoming(LS.IndVarNext, RRI.ExitSelector);
+
+  // The latch exit now has a branch from `RRI.ExitSelector' instead of
+  // `LS.Latch'.  The PHI nodes need to be updated to reflect that.
+  for (Instruction &I : *LS.LatchExit) {
+    if (PHINode *PN = dyn_cast<PHINode>(&I))
+      replacePHIBlock(PN, LS.Latch, RRI.ExitSelector);
+    else
+      break;
+  }
+
+  return RRI;
+}
+
+void LoopConstrainer::rewriteIncomingValuesForPHIs(
+    LoopStructure &LS, BasicBlock *ContinuationBlock,
+    const LoopConstrainer::RewrittenRangeInfo &RRI) const {
+
+  unsigned PHIIndex = 0;
+  for (Instruction &I : *LS.Header) {
+    if (!isa<PHINode>(&I))
+      break;
+
+    PHINode *PN = cast<PHINode>(&I);
+
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
+      if (PN->getIncomingBlock(i) == ContinuationBlock)
+        PN->setIncomingValue(i, RRI.PHIValuesAtPseudoExit[PHIIndex++]);
+  }
+
+  LS.IndVarStart = RRI.IndVarEnd;
+}
+
+BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
+                                             BasicBlock *OldPreheader,
+                                             const char *Tag) const {
+
+  BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
+  BranchInst::Create(LS.Header, Preheader);
+
+  for (Instruction &I : *LS.Header) {
+    if (!isa<PHINode>(&I))
+      break;
+
+    PHINode *PN = cast<PHINode>(&I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i)
+      replacePHIBlock(PN, OldPreheader, Preheader);
+  }
+
+  return Preheader;
+}
+
+void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
+  Loop *ParentLoop = OriginalLoop.getParentLoop();
+  if (!ParentLoop)
+    return;
+
+  for (BasicBlock *BB : BBs)
+    ParentLoop->addBasicBlockToLoop(BB, OriginalLoopInfo);
+}
+
+bool LoopConstrainer::run() {
+  BasicBlock *Preheader = nullptr;
+  LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
+  Preheader = OriginalLoop.getLoopPreheader();
+  assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
+         "preconditions!");
+
+  OriginalPreheader = Preheader;
+  MainLoopPreheader = Preheader;
+
+  Optional<SubRanges> MaybeSR = calculateSubRanges();
+  if (!MaybeSR.hasValue()) {
+    DEBUG(dbgs() << "irce: could not compute subranges\n");
+    return false;
+  }
+
+  SubRanges SR = MaybeSR.getValue();
+  bool Increasing = MainLoopStructure.IndVarIncreasing;
+  IntegerType *IVTy =
+      cast<IntegerType>(MainLoopStructure.IndVarNext->getType());
+
+  SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce");
+  Instruction *InsertPt = OriginalPreheader->getTerminator();
+
+  // It would have been better to make `PreLoop' and `PostLoop'
+  // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy
+  // constructor.
+  ClonedLoop PreLoop, PostLoop;
+  bool NeedsPreLoop =
+      Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue();
+  bool NeedsPostLoop =
+      Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue();
+
+  Value *ExitPreLoopAt = nullptr;
+  Value *ExitMainLoopAt = nullptr;
+  const SCEVConstant *MinusOneS =
+      cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */));
+
+  if (NeedsPreLoop) {
+    const SCEV *ExitPreLoopAtSCEV = nullptr;
+
+    if (Increasing)
+      ExitPreLoopAtSCEV = *SR.LowLimit;
+    else {
+      if (CanBeSMin(SE, *SR.HighLimit)) {
+        DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+                     << "preloop exit limit.  HighLimit = " << *(*SR.HighLimit)
+                     << "\n");
+        return false;
+      }
+      ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+    }
+
+    ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt);
+    ExitPreLoopAt->setName("exit.preloop.at");
+  }
+
+  if (NeedsPostLoop) {
+    const SCEV *ExitMainLoopAtSCEV = nullptr;
+
+    if (Increasing)
+      ExitMainLoopAtSCEV = *SR.HighLimit;
+    else {
+      if (CanBeSMin(SE, *SR.LowLimit)) {
+        DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+                     << "mainloop exit limit.  LowLimit = " << *(*SR.LowLimit)
+                     << "\n");
+        return false;
+      }
+      ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+    }
+
+    ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt);
+    ExitMainLoopAt->setName("exit.mainloop.at");
+  }
+
+  // We clone these ahead of time so that we don't have to deal with changing
+  // and temporarily invalid IR as we transform the loops.
+  if (NeedsPreLoop)
+    cloneLoop(PreLoop, "preloop");
+  if (NeedsPostLoop)
+    cloneLoop(PostLoop, "postloop");
+
+  RewrittenRangeInfo PreLoopRRI;
+
+  if (NeedsPreLoop) {
+    Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header,
+                                                  PreLoop.Structure.Header);
+
+    MainLoopPreheader =
+        createPreheader(MainLoopStructure, Preheader, "mainloop");
+    PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader,
+                                         ExitPreLoopAt, MainLoopPreheader);
+    rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader,
+                                 PreLoopRRI);
+  }
+
+  BasicBlock *PostLoopPreheader = nullptr;
+  RewrittenRangeInfo PostLoopRRI;
+
+  if (NeedsPostLoop) {
+    PostLoopPreheader =
+        createPreheader(PostLoop.Structure, Preheader, "postloop");
+    PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader,
+                                          ExitMainLoopAt, PostLoopPreheader);
+    rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader,
+                                 PostLoopRRI);
+  }
+
+  BasicBlock *NewMainLoopPreheader =
+      MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr;
+  BasicBlock *NewBlocks[] = {PostLoopPreheader,        PreLoopRRI.PseudoExit,
+                             PreLoopRRI.ExitSelector,  PostLoopRRI.PseudoExit,
+                             PostLoopRRI.ExitSelector, NewMainLoopPreheader};
+
+  // Some of the above may be nullptr, filter them out before passing to
+  // addToParentLoopIfNeeded.
+  auto NewBlocksEnd =
+      std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr);
+
+  addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd));
+  addToParentLoopIfNeeded(PreLoop.Blocks);
+  addToParentLoopIfNeeded(PostLoop.Blocks);
+
+  return true;
+}
+
+/// Computes and returns a range of values for the induction variable (IndVar)
+/// in which the range check can be safely elided.  If it cannot compute such a
+/// range, returns None.
+Optional<InductiveRangeCheck::Range>
+InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE,
+                                               const SCEVAddRecExpr *IndVar,
+                                               IRBuilder<> &) const {
+  // IndVar is of the form "A + B * I" (where "I" is the canonical induction
+  // variable, that may or may not exist as a real llvm::Value in the loop) and
+  // this inductive range check is a range check on the "C + D * I" ("C" is
+  // getOffset() and "D" is getScale()).  We rewrite the value being range
+  // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA".
+  // Currently we support this only for "B" = "D" = { 1 or -1 }, but the code
+  // can be generalized as needed.
+  //
+  // The actual inequalities we solve are of the form
+  //
+  //   0 <= M + 1 * IndVar < L given L >= 0  (i.e. N == 1)
+  //
+  // The inequality is satisfied by -M <= IndVar < (L - M) [^1].  All additions
+  // and subtractions are twos-complement wrapping and comparisons are signed.
+  //
+  // Proof:
+  //
+  //   If there exists IndVar such that -M <= IndVar < (L - M) then it follows
+  //   that -M <= (-M + L) [== Eq. 1].  Since L >= 0, if (-M + L) sign-overflows
+  //   then (-M + L) < (-M).  Hence by [Eq. 1], (-M + L) could not have
+  //   overflown.
+  //
+  //   This means IndVar = t + (-M) for t in [0, L).  Hence (IndVar + M) = t.
+  //   Hence 0 <= (IndVar + M) < L
+
+  // [^1]: Note that the solution does _not_ apply if L < 0; consider values M =
+  // 127, IndVar = 126 and L = -2 in an i8 world.
+
+  if (!IndVar->isAffine())
+    return None;
+
+  const SCEV *A = IndVar->getStart();
+  const SCEVConstant *B = dyn_cast<SCEVConstant>(IndVar->getStepRecurrence(SE));
+  if (!B)
+    return None;
+
+  const SCEV *C = getOffset();
+  const SCEVConstant *D = dyn_cast<SCEVConstant>(getScale());
+  if (D != B)
+    return None;
+
+  ConstantInt *ConstD = D->getValue();
+  if (!(ConstD->isMinusOne() || ConstD->isOne()))
+    return None;
+
+  const SCEV *M = SE.getMinusSCEV(C, A);
+
+  const SCEV *Begin = SE.getNegativeSCEV(M);
+  const SCEV *UpperLimit = nullptr;
+
+  // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
+  // We can potentially do much better here.
+  if (Value *V = getLength()) {
+    UpperLimit = SE.getSCEV(V);
+  } else {
+    assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
+    unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth();
+    UpperLimit = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+  }
+
+  const SCEV *End = SE.getMinusSCEV(UpperLimit, M);
+  return InductiveRangeCheck::Range(Begin, End);
+}
+
+static Optional<InductiveRangeCheck::Range>
+IntersectRange(ScalarEvolution &SE,
+               const Optional<InductiveRangeCheck::Range> &R1,
+               const InductiveRangeCheck::Range &R2, IRBuilder<> &B) {
+  if (!R1.hasValue())
+    return R2;
+  auto &R1Value = R1.getValue();
+
+  // TODO: we could widen the smaller range and have this work; but for now we
+  // bail out to keep things simple.
+  if (R1Value.getType() != R2.getType())
+    return None;
+
+  const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin());
+  const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd());
+
+  return InductiveRangeCheck::Range(NewBegin, NewEnd);
+}
+
+bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (L->getBlocks().size() >= LoopSizeCutoff) {
+    DEBUG(dbgs() << "irce: giving up constraining loop, too large\n";);
+    return false;
+  }
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader) {
+    DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+    return false;
+  }
+
+  LLVMContext &Context = Preheader->getContext();
+  InductiveRangeCheck::AllocatorTy IRCAlloc;
+  SmallVector<InductiveRangeCheck *, 16> RangeChecks;
+  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  BranchProbabilityInfo &BPI =
+      getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+
+  for (auto BBI : L->getBlocks())
+    if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
+      if (InductiveRangeCheck *IRC =
+          InductiveRangeCheck::create(IRCAlloc, TBI, L, SE, BPI))
+        RangeChecks.push_back(IRC);
+
+  if (RangeChecks.empty())
+    return false;
+
+  auto PrintRecognizedRangeChecks = [&](raw_ostream &OS) {
+    OS << "irce: looking at loop "; L->print(OS);
+    OS << "irce: loop has " << RangeChecks.size()
+       << " inductive range checks: \n";
+    for (InductiveRangeCheck *IRC : RangeChecks)
+      IRC->print(OS);
+  };
+
+  DEBUG(PrintRecognizedRangeChecks(dbgs()));
+
+  if (PrintRangeChecks)
+    PrintRecognizedRangeChecks(errs());
+
+  const char *FailureReason = nullptr;
+  Optional<LoopStructure> MaybeLoopStructure =
+      LoopStructure::parseLoopStructure(SE, BPI, *L, FailureReason);
+  if (!MaybeLoopStructure.hasValue()) {
+    DEBUG(dbgs() << "irce: could not parse loop structure: " << FailureReason
+                 << "\n";);
+    return false;
+  }
+  LoopStructure LS = MaybeLoopStructure.getValue();
+  bool Increasing = LS.IndVarIncreasing;
+  const SCEV *MinusOne =
+      SE.getConstant(LS.IndVarNext->getType(), Increasing ? -1 : 1, true);
+  const SCEVAddRecExpr *IndVar =
+      cast<SCEVAddRecExpr>(SE.getAddExpr(SE.getSCEV(LS.IndVarNext), MinusOne));
+
+  Optional<InductiveRangeCheck::Range> SafeIterRange;
+  Instruction *ExprInsertPt = Preheader->getTerminator();
+
+  SmallVector<InductiveRangeCheck *, 4> RangeChecksToEliminate;
+
+  IRBuilder<> B(ExprInsertPt);
+  for (InductiveRangeCheck *IRC : RangeChecks) {
+    auto Result = IRC->computeSafeIterationSpace(SE, IndVar, B);
+    if (Result.hasValue()) {
+      auto MaybeSafeIterRange =
+        IntersectRange(SE, SafeIterRange, Result.getValue(), B);
+      if (MaybeSafeIterRange.hasValue()) {
+        RangeChecksToEliminate.push_back(IRC);
+        SafeIterRange = MaybeSafeIterRange.getValue();
+      }
+    }
+  }
+
+  if (!SafeIterRange.hasValue())
+    return false;
+
+  LoopConstrainer LC(*L, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), LS,
+                     SE, SafeIterRange.getValue());
+  bool Changed = LC.run();
+
+  if (Changed) {
+    auto PrintConstrainedLoopInfo = [L]() {
+      dbgs() << "irce: in function ";
+      dbgs() << L->getHeader()->getParent()->getName() << ": ";
+      dbgs() << "constrained ";
+      L->print(dbgs());
+    };
+
+    DEBUG(PrintConstrainedLoopInfo());
+
+    if (PrintChangedLoops)
+      PrintConstrainedLoopInfo();
+
+    // Optimize away the now-redundant range checks.
+
+    for (InductiveRangeCheck *IRC : RangeChecksToEliminate) {
+      ConstantInt *FoldedRangeCheck = IRC->getPassingDirection()
+                                          ? ConstantInt::getTrue(Context)
+                                          : ConstantInt::getFalse(Context);
+      IRC->getBranch()->setCondition(FoldedRangeCheck);
+    }
+  }
+
+  return Changed;
+}
+
+Pass *llvm::createInductiveRangeCheckEliminationPass() {
+  return new InductiveRangeCheckElimination;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
new file mode 100644
index 0000000..dcdcfed
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -0,0 +1,1955 @@
+//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Jump Threading pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <memory>
+using namespace llvm;
+
+#define DEBUG_TYPE "jump-threading"
+
+STATISTIC(NumThreads, "Number of jumps threaded");
+STATISTIC(NumFolds,   "Number of terminators folded");
+STATISTIC(NumDupes,   "Number of branch blocks duplicated to eliminate phi");
+
+static cl::opt<unsigned>
+BBDuplicateThreshold("jump-threading-threshold",
+          cl::desc("Max block size to duplicate for jump threading"),
+          cl::init(6), cl::Hidden);
+
+static cl::opt<unsigned>
+ImplicationSearchThreshold(
+  "jump-threading-implication-search-threshold",
+  cl::desc("The number of predecessors to search for a stronger "
+           "condition to use to thread over a weaker condition"),
+  cl::init(3), cl::Hidden);
+
+namespace {
+  // These are at global scope so static functions can use them too.
+  typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo;
+  typedef SmallVector<std::pair<Constant*, BasicBlock*>, 8> PredValueInfoTy;
+
+  // This is used to keep track of what kind of constant we're currently hoping
+  // to find.
+  enum ConstantPreference {
+    WantInteger,
+    WantBlockAddress
+  };
+
+  /// This pass performs 'jump threading', which looks at blocks that have
+  /// multiple predecessors and multiple successors.  If one or more of the
+  /// predecessors of the block can be proven to always jump to one of the
+  /// successors, we forward the edge from the predecessor to the successor by
+  /// duplicating the contents of this block.
+  ///
+  /// An example of when this can occur is code like this:
+  ///
+  ///   if () { ...
+  ///     X = 4;
+  ///   }
+  ///   if (X < 3) {
+  ///
+  /// In this case, the unconditional branch at the end of the first if can be
+  /// revectored to the false side of the second if.
+  ///
+  class JumpThreading : public FunctionPass {
+    TargetLibraryInfo *TLI;
+    LazyValueInfo *LVI;
+    std::unique_ptr<BlockFrequencyInfo> BFI;
+    std::unique_ptr<BranchProbabilityInfo> BPI;
+    bool HasProfileData;
+#ifdef NDEBUG
+    SmallPtrSet<const BasicBlock *, 16> LoopHeaders;
+#else
+    SmallSet<AssertingVH<const BasicBlock>, 16> LoopHeaders;
+#endif
+    DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet;
+
+    unsigned BBDupThreshold;
+
+    // RAII helper for updating the recursion stack.
+    struct RecursionSetRemover {
+      DenseSet<std::pair<Value*, BasicBlock*> > &TheSet;
+      std::pair<Value*, BasicBlock*> ThePair;
+
+      RecursionSetRemover(DenseSet<std::pair<Value*, BasicBlock*> > &S,
+                          std::pair<Value*, BasicBlock*> P)
+        : TheSet(S), ThePair(P) { }
+
+      ~RecursionSetRemover() {
+        TheSet.erase(ThePair);
+      }
+    };
+  public:
+    static char ID; // Pass identification
+    JumpThreading(int T = -1) : FunctionPass(ID) {
+      BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
+      initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<LazyValueInfo>();
+      AU.addPreserved<LazyValueInfo>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+    }
+
+    void releaseMemory() override {
+      BFI.reset();
+      BPI.reset();
+    }
+
+    void FindLoopHeaders(Function &F);
+    bool ProcessBlock(BasicBlock *BB);
+    bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs,
+                    BasicBlock *SuccBB);
+    bool DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
+                                  const SmallVectorImpl<BasicBlock *> &PredBBs);
+
+    bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,
+                                         PredValueInfo &Result,
+                                         ConstantPreference Preference,
+                                         Instruction *CxtI = nullptr);
+    bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
+                                ConstantPreference Preference,
+                                Instruction *CxtI = nullptr);
+
+    bool ProcessBranchOnPHI(PHINode *PN);
+    bool ProcessBranchOnXOR(BinaryOperator *BO);
+    bool ProcessImpliedCondition(BasicBlock *BB);
+
+    bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
+    bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB);
+    bool TryToUnfoldSelectInCurrBB(BasicBlock *BB);
+
+  private:
+    BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
+                                const char *Suffix);
+    void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB,
+                                      BasicBlock *NewBB, BasicBlock *SuccBB);
+  };
+}
+
+char JumpThreading::ID = 0;
+INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
+                "Jump Threading", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(JumpThreading, "jump-threading",
+                "Jump Threading", false, false)
+
+// Public interface to the Jump Threading pass
+FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); }
+
+/// runOnFunction - Top level algorithm.
+///
+bool JumpThreading::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  LVI = &getAnalysis<LazyValueInfo>();
+  BFI.reset();
+  BPI.reset();
+  // When profile data is available, we need to update edge weights after
+  // successful jump threading, which requires both BPI and BFI being available.
+  HasProfileData = F.getEntryCount().hasValue();
+  if (HasProfileData) {
+    LoopInfo LI{DominatorTree(F)};
+    BPI.reset(new BranchProbabilityInfo(F, LI));
+    BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+  }
+
+  // Remove unreachable blocks from function as they may result in infinite
+  // loop. We do threading if we found something profitable. Jump threading a
+  // branch can create other opportunities. If these opportunities form a cycle
+  // i.e. if any jump threading is undoing previous threading in the path, then
+  // we will loop forever. We take care of this issue by not jump threading for
+  // back edges. This works for normal cases but not for unreachable blocks as
+  // they may have cycle with no back edge.
+  bool EverChanged = false;
+  EverChanged |= removeUnreachableBlocks(F, LVI);
+
+  FindLoopHeaders(F);
+
+  bool Changed;
+  do {
+    Changed = false;
+    for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
+      BasicBlock *BB = &*I;
+      // Thread all of the branches we can over this block.
+      while (ProcessBlock(BB))
+        Changed = true;
+
+      ++I;
+
+      // If the block is trivially dead, zap it.  This eliminates the successor
+      // edges which simplifies the CFG.
+      if (pred_empty(BB) &&
+          BB != &BB->getParent()->getEntryBlock()) {
+        DEBUG(dbgs() << "  JT: Deleting dead block '" << BB->getName()
+              << "' with terminator: " << *BB->getTerminator() << '\n');
+        LoopHeaders.erase(BB);
+        LVI->eraseBlock(BB);
+        DeleteDeadBlock(BB);
+        Changed = true;
+        continue;
+      }
+
+      BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+
+      // Can't thread an unconditional jump, but if the block is "almost
+      // empty", we can replace uses of it with uses of the successor and make
+      // this dead.
+      if (BI && BI->isUnconditional() &&
+          BB != &BB->getParent()->getEntryBlock() &&
+          // If the terminator is the only non-phi instruction, try to nuke it.
+          BB->getFirstNonPHIOrDbg()->isTerminator()) {
+        // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
+        // block, we have to make sure it isn't in the LoopHeaders set.  We
+        // reinsert afterward if needed.
+        bool ErasedFromLoopHeaders = LoopHeaders.erase(BB);
+        BasicBlock *Succ = BI->getSuccessor(0);
+
+        // FIXME: It is always conservatively correct to drop the info
+        // for a block even if it doesn't get erased.  This isn't totally
+        // awesome, but it allows us to use AssertingVH to prevent nasty
+        // dangling pointer issues within LazyValueInfo.
+        LVI->eraseBlock(BB);
+        if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) {
+          Changed = true;
+          // If we deleted BB and BB was the header of a loop, then the
+          // successor is now the header of the loop.
+          BB = Succ;
+        }
+
+        if (ErasedFromLoopHeaders)
+          LoopHeaders.insert(BB);
+      }
+    }
+    EverChanged |= Changed;
+  } while (Changed);
+
+  LoopHeaders.clear();
+  return EverChanged;
+}
+
+/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
+/// thread across it. Stop scanning the block when passing the threshold.
+static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+                                             unsigned Threshold) {
+  /// Ignore PHI nodes, these will be flattened when duplication happens.
+  BasicBlock::const_iterator I(BB->getFirstNonPHI());
+
+  // FIXME: THREADING will delete values that are just used to compute the
+  // branch, so they shouldn't count against the duplication cost.
+
+  unsigned Bonus = 0;
+  const TerminatorInst *BBTerm = BB->getTerminator();
+  // Threading through a switch statement is particularly profitable.  If this
+  // block ends in a switch, decrease its cost to make it more likely to happen.
+  if (isa<SwitchInst>(BBTerm))
+    Bonus = 6;
+
+  // The same holds for indirect branches, but slightly more so.
+  if (isa<IndirectBrInst>(BBTerm))
+    Bonus = 8;
+
+  // Bump the threshold up so the early exit from the loop doesn't skip the
+  // terminator-based Size adjustment at the end.
+  Threshold += Bonus;
+
+  // Sum up the cost of each instruction until we get to the terminator.  Don't
+  // include the terminator because the copy won't include it.
+  unsigned Size = 0;
+  for (; !isa<TerminatorInst>(I); ++I) {
+
+    // Stop scanning the block if we've reached the threshold.
+    if (Size > Threshold)
+      return Size;
+
+    // Debugger intrinsics don't incur code size.
+    if (isa<DbgInfoIntrinsic>(I)) continue;
+
+    // If this is a pointer->pointer bitcast, it is free.
+    if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
+      continue;
+
+    // Bail out if this instruction gives back a token type, it is not possible
+    // to duplicate it if it is used outside this BB.
+    if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
+      return ~0U;
+
+    // All other instructions count for at least one unit.
+    ++Size;
+
+    // Calls are more expensive.  If they are non-intrinsic calls, we model them
+    // as having cost of 4.  If they are a non-vector intrinsic, we model them
+    // as having cost of 2 total, and if they are a vector intrinsic, we model
+    // them as having cost 1.
+    if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+      if (CI->cannotDuplicate() || CI->isConvergent())
+        // Blocks with NoDuplicate are modelled as having infinite cost, so they
+        // are never duplicated.
+        return ~0U;
+      else if (!isa<IntrinsicInst>(CI))
+        Size += 3;
+      else if (!CI->getType()->isVectorTy())
+        Size += 1;
+    }
+  }
+
+  return Size > Bonus ? Size - Bonus : 0;
+}
+
+/// FindLoopHeaders - We do not want jump threading to turn proper loop
+/// structures into irreducible loops.  Doing this breaks up the loop nesting
+/// hierarchy and pessimizes later transformations.  To prevent this from
+/// happening, we first have to find the loop headers.  Here we approximate this
+/// by finding targets of backedges in the CFG.
+///
+/// Note that there definitely are cases when we want to allow threading of
+/// edges across a loop header.  For example, threading a jump from outside the
+/// loop (the preheader) to an exit block of the loop is definitely profitable.
+/// It is also almost always profitable to thread backedges from within the loop
+/// to exit blocks, and is often profitable to thread backedges to other blocks
+/// within the loop (forming a nested loop).  This simple analysis is not rich
+/// enough to track all of these properties and keep it up-to-date as the CFG
+/// mutates, so we don't allow any of these transformations.
+///
+void JumpThreading::FindLoopHeaders(Function &F) {
+  SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
+  FindFunctionBackedges(F, Edges);
+
+  for (const auto &Edge : Edges)
+    LoopHeaders.insert(Edge.second);
+}
+
+/// getKnownConstant - Helper method to determine if we can thread over a
+/// terminator with the given value as its condition, and if so what value to
+/// use for that. What kind of value this is depends on whether we want an
+/// integer or a block address, but an undef is always accepted.
+/// Returns null if Val is null or not an appropriate constant.
+static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
+  if (!Val)
+    return nullptr;
+
+  // Undef is "known" enough.
+  if (UndefValue *U = dyn_cast<UndefValue>(Val))
+    return U;
+
+  if (Preference == WantBlockAddress)
+    return dyn_cast<BlockAddress>(Val->stripPointerCasts());
+
+  return dyn_cast<ConstantInt>(Val);
+}
+
+/// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see
+/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
+/// in any of our predecessors.  If so, return the known list of value and pred
+/// BB in the result vector.
+///
+/// This returns true if there were any known values.
+///
+bool JumpThreading::
+ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
+                                ConstantPreference Preference,
+                                Instruction *CxtI) {
+  // This method walks up use-def chains recursively.  Because of this, we could
+  // get into an infinite loop going around loops in the use-def chain.  To
+  // prevent this, keep track of what (value, block) pairs we've already visited
+  // and terminate the search if we loop back to them
+  if (!RecursionSet.insert(std::make_pair(V, BB)).second)
+    return false;
+
+  // An RAII help to remove this pair from the recursion set once the recursion
+  // stack pops back out again.
+  RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB));
+
+  // If V is a constant, then it is known in all predecessors.
+  if (Constant *KC = getKnownConstant(V, Preference)) {
+    for (BasicBlock *Pred : predecessors(BB))
+      Result.push_back(std::make_pair(KC, Pred));
+
+    return true;
+  }
+
+  // If V is a non-instruction value, or an instruction in a different block,
+  // then it can't be derived from a PHI.
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I || I->getParent() != BB) {
+
+    // Okay, if this is a live-in value, see if it has a known value at the end
+    // of any of our predecessors.
+    //
+    // FIXME: This should be an edge property, not a block end property.
+    /// TODO: Per PR2563, we could infer value range information about a
+    /// predecessor based on its terminator.
+    //
+    // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
+    // "I" is a non-local compare-with-a-constant instruction.  This would be
+    // able to handle value inequalities better, for example if the compare is
+    // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
+    // Perhaps getConstantOnEdge should be smart enough to do this?
+
+    for (BasicBlock *P : predecessors(BB)) {
+      // If the value is known by LazyValueInfo to be a constant in a
+      // predecessor, use that information to try to thread this block.
+      Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
+      if (Constant *KC = getKnownConstant(PredCst, Preference))
+        Result.push_back(std::make_pair(KC, P));
+    }
+
+    return !Result.empty();
+  }
+
+  /// If I is a PHI node, then we know the incoming values for any constants.
+  if (PHINode *PN = dyn_cast<PHINode>(I)) {
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      Value *InVal = PN->getIncomingValue(i);
+      if (Constant *KC = getKnownConstant(InVal, Preference)) {
+        Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+      } else {
+        Constant *CI = LVI->getConstantOnEdge(InVal,
+                                              PN->getIncomingBlock(i),
+                                              BB, CxtI);
+        if (Constant *KC = getKnownConstant(CI, Preference))
+          Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+      }
+    }
+
+    return !Result.empty();
+  }
+
+  PredValueInfoTy LHSVals, RHSVals;
+
+  // Handle some boolean conditions.
+  if (I->getType()->getPrimitiveSizeInBits() == 1) {
+    assert(Preference == WantInteger && "One-bit non-integer type?");
+    // X | true -> true
+    // X & false -> false
+    if (I->getOpcode() == Instruction::Or ||
+        I->getOpcode() == Instruction::And) {
+      ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+                                      WantInteger, CxtI);
+      ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals,
+                                      WantInteger, CxtI);
+
+      if (LHSVals.empty() && RHSVals.empty())
+        return false;
+
+      ConstantInt *InterestingVal;
+      if (I->getOpcode() == Instruction::Or)
+        InterestingVal = ConstantInt::getTrue(I->getContext());
+      else
+        InterestingVal = ConstantInt::getFalse(I->getContext());
+
+      SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
+
+      // Scan for the sentinel.  If we find an undef, force it to the
+      // interesting value: x|undef -> true and x&undef -> false.
+      for (const auto &LHSVal : LHSVals)
+        if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) {
+          Result.emplace_back(InterestingVal, LHSVal.second);
+          LHSKnownBBs.insert(LHSVal.second);
+        }
+      for (const auto &RHSVal : RHSVals)
+        if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) {
+          // If we already inferred a value for this block on the LHS, don't
+          // re-add it.
+          if (!LHSKnownBBs.count(RHSVal.second))
+            Result.emplace_back(InterestingVal, RHSVal.second);
+        }
+
+      return !Result.empty();
+    }
+
+    // Handle the NOT form of XOR.
+    if (I->getOpcode() == Instruction::Xor &&
+        isa<ConstantInt>(I->getOperand(1)) &&
+        cast<ConstantInt>(I->getOperand(1))->isOne()) {
+      ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result,
+                                      WantInteger, CxtI);
+      if (Result.empty())
+        return false;
+
+      // Invert the known values.
+      for (auto &R : Result)
+        R.first = ConstantExpr::getNot(R.first);
+
+      return true;
+    }
+
+  // Try to simplify some other binary operator values.
+  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+    assert(Preference != WantBlockAddress
+            && "A binary operator creating a block address?");
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+      PredValueInfoTy LHSVals;
+      ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals,
+                                      WantInteger, CxtI);
+
+      // Try to use constant folding to simplify the binary operator.
+      for (const auto &LHSVal : LHSVals) {
+        Constant *V = LHSVal.first;
+        Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
+
+        if (Constant *KC = getKnownConstant(Folded, WantInteger))
+          Result.push_back(std::make_pair(KC, LHSVal.second));
+      }
+    }
+
+    return !Result.empty();
+  }
+
+  // Handle compare with phi operand, where the PHI is defined in this block.
+  if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+    assert(Preference == WantInteger && "Compares only produce integers");
+    PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0));
+    if (PN && PN->getParent() == BB) {
+      const DataLayout &DL = PN->getModule()->getDataLayout();
+      // We can do this simplification if any comparisons fold to true or false.
+      // See if any do.
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        BasicBlock *PredBB = PN->getIncomingBlock(i);
+        Value *LHS = PN->getIncomingValue(i);
+        Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB);
+
+        Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, DL);
+        if (!Res) {
+          if (!isa<Constant>(RHS))
+            continue;
+
+          LazyValueInfo::Tristate
+            ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS,
+                                           cast<Constant>(RHS), PredBB, BB,
+                                           CxtI ? CxtI : Cmp);
+          if (ResT == LazyValueInfo::Unknown)
+            continue;
+          Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
+        }
+
+        if (Constant *KC = getKnownConstant(Res, WantInteger))
+          Result.push_back(std::make_pair(KC, PredBB));
+      }
+
+      return !Result.empty();
+    }
+
+    // If comparing a live-in value against a constant, see if we know the
+    // live-in value on any predecessors.
+    if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) {
+      if (!isa<Instruction>(Cmp->getOperand(0)) ||
+          cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) {
+        Constant *RHSCst = cast<Constant>(Cmp->getOperand(1));
+
+        for (BasicBlock *P : predecessors(BB)) {
+          // If the value is known by LazyValueInfo to be a constant in a
+          // predecessor, use that information to try to thread this block.
+          LazyValueInfo::Tristate Res =
+            LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0),
+                                    RHSCst, P, BB, CxtI ? CxtI : Cmp);
+          if (Res == LazyValueInfo::Unknown)
+            continue;
+
+          Constant *ResC = ConstantInt::get(Cmp->getType(), Res);
+          Result.push_back(std::make_pair(ResC, P));
+        }
+
+        return !Result.empty();
+      }
+
+      // Try to find a constant value for the LHS of a comparison,
+      // and evaluate it statically if we can.
+      if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) {
+        PredValueInfoTy LHSVals;
+        ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+                                        WantInteger, CxtI);
+
+        for (const auto &LHSVal : LHSVals) {
+          Constant *V = LHSVal.first;
+          Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(),
+                                                      V, CmpConst);
+          if (Constant *KC = getKnownConstant(Folded, WantInteger))
+            Result.push_back(std::make_pair(KC, LHSVal.second));
+        }
+
+        return !Result.empty();
+      }
+    }
+  }
+
+  if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+    // Handle select instructions where at least one operand is a known constant
+    // and we can figure out the condition value for any predecessor block.
+    Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
+    Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
+    PredValueInfoTy Conds;
+    if ((TrueVal || FalseVal) &&
+        ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds,
+                                        WantInteger, CxtI)) {
+      for (auto &C : Conds) {
+        Constant *Cond = C.first;
+
+        // Figure out what value to use for the condition.
+        bool KnownCond;
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
+          // A known boolean.
+          KnownCond = CI->isOne();
+        } else {
+          assert(isa<UndefValue>(Cond) && "Unexpected condition value");
+          // Either operand will do, so be sure to pick the one that's a known
+          // constant.
+          // FIXME: Do this more cleverly if both values are known constants?
+          KnownCond = (TrueVal != nullptr);
+        }
+
+        // See if the select has a known constant value for this predecessor.
+        if (Constant *Val = KnownCond ? TrueVal : FalseVal)
+          Result.push_back(std::make_pair(Val, C.second));
+      }
+
+      return !Result.empty();
+    }
+  }
+
+  // If all else fails, see if LVI can figure out a constant value for us.
+  Constant *CI = LVI->getConstant(V, BB, CxtI);
+  if (Constant *KC = getKnownConstant(CI, Preference)) {
+    for (BasicBlock *Pred : predecessors(BB))
+      Result.push_back(std::make_pair(KC, Pred));
+  }
+
+  return !Result.empty();
+}
+
+
+
+/// GetBestDestForBranchOnUndef - If we determine that the specified block ends
+/// in an undefined jump, decide which block is best to revector to.
+///
+/// Since we can pick an arbitrary destination, we pick the successor with the
+/// fewest predecessors.  This should reduce the in-degree of the others.
+///
+static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
+  TerminatorInst *BBTerm = BB->getTerminator();
+  unsigned MinSucc = 0;
+  BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
+  // Compute the successor with the minimum number of predecessors.
+  unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+  for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+    TestBB = BBTerm->getSuccessor(i);
+    unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+    if (NumPreds < MinNumPreds) {
+      MinSucc = i;
+      MinNumPreds = NumPreds;
+    }
+  }
+
+  return MinSucc;
+}
+
+static bool hasAddressTakenAndUsed(BasicBlock *BB) {
+  if (!BB->hasAddressTaken()) return false;
+
+  // If the block has its address taken, it may be a tree of dead constants
+  // hanging off of it.  These shouldn't keep the block alive.
+  BlockAddress *BA = BlockAddress::get(BB);
+  BA->removeDeadConstantUsers();
+  return !BA->use_empty();
+}
+
+/// ProcessBlock - If there are any predecessors whose control can be threaded
+/// through to a successor, transform them now.
+bool JumpThreading::ProcessBlock(BasicBlock *BB) {
+  // If the block is trivially dead, just return and let the caller nuke it.
+  // This simplifies other transformations.
+  if (pred_empty(BB) &&
+      BB != &BB->getParent()->getEntryBlock())
+    return false;
+
+  // If this block has a single predecessor, and if that pred has a single
+  // successor, merge the blocks.  This encourages recursive jump threading
+  // because now the condition in this block can be threaded through
+  // predecessors of our predecessor block.
+  if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
+    const TerminatorInst *TI = SinglePred->getTerminator();
+    if (!TI->isExceptional() && TI->getNumSuccessors() == 1 &&
+        SinglePred != BB && !hasAddressTakenAndUsed(BB)) {
+      // If SinglePred was a loop header, BB becomes one.
+      if (LoopHeaders.erase(SinglePred))
+        LoopHeaders.insert(BB);
+
+      LVI->eraseBlock(SinglePred);
+      MergeBasicBlockIntoOnlyPred(BB);
+
+      return true;
+    }
+  }
+
+  if (TryToUnfoldSelectInCurrBB(BB))
+    return true;
+
+  // What kind of constant we're looking for.
+  ConstantPreference Preference = WantInteger;
+
+  // Look to see if the terminator is a conditional branch, switch or indirect
+  // branch, if not we can't thread it.
+  Value *Condition;
+  Instruction *Terminator = BB->getTerminator();
+  if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
+    // Can't thread an unconditional jump.
+    if (BI->isUnconditional()) return false;
+    Condition = BI->getCondition();
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
+    Condition = SI->getCondition();
+  } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
+    // Can't thread indirect branch with no successors.
+    if (IB->getNumSuccessors() == 0) return false;
+    Condition = IB->getAddress()->stripPointerCasts();
+    Preference = WantBlockAddress;
+  } else {
+    return false; // Must be an invoke.
+  }
+
+  // Run constant folding to see if we can reduce the condition to a simple
+  // constant.
+  if (Instruction *I = dyn_cast<Instruction>(Condition)) {
+    Value *SimpleVal =
+        ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
+    if (SimpleVal) {
+      I->replaceAllUsesWith(SimpleVal);
+      I->eraseFromParent();
+      Condition = SimpleVal;
+    }
+  }
+
+  // If the terminator is branching on an undef, we can pick any of the
+  // successors to branch to.  Let GetBestDestForJumpOnUndef decide.
+  if (isa<UndefValue>(Condition)) {
+    unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
+
+    // Fold the branch/switch.
+    TerminatorInst *BBTerm = BB->getTerminator();
+    for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+      if (i == BestSucc) continue;
+      BBTerm->getSuccessor(i)->removePredecessor(BB, true);
+    }
+
+    DEBUG(dbgs() << "  In block '" << BB->getName()
+          << "' folding undef terminator: " << *BBTerm << '\n');
+    BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
+    BBTerm->eraseFromParent();
+    return true;
+  }
+
+  // If the terminator of this block is branching on a constant, simplify the
+  // terminator to an unconditional branch.  This can occur due to threading in
+  // other blocks.
+  if (getKnownConstant(Condition, Preference)) {
+    DEBUG(dbgs() << "  In block '" << BB->getName()
+          << "' folding terminator: " << *BB->getTerminator() << '\n');
+    ++NumFolds;
+    ConstantFoldTerminator(BB, true);
+    return true;
+  }
+
+  Instruction *CondInst = dyn_cast<Instruction>(Condition);
+
+  // All the rest of our checks depend on the condition being an instruction.
+  if (!CondInst) {
+    // FIXME: Unify this with code below.
+    if (ProcessThreadableEdges(Condition, BB, Preference, Terminator))
+      return true;
+    return false;
+  }
+
+
+  if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
+    // If we're branching on a conditional, LVI might be able to determine
+    // it's value at the branch instruction.  We only handle comparisons
+    // against a constant at this time.
+    // TODO: This should be extended to handle switches as well.
+    BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+    Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
+    if (CondBr && CondConst && CondBr->isConditional()) {
+      LazyValueInfo::Tristate Ret =
+        LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
+                            CondConst, CondBr);
+      if (Ret != LazyValueInfo::Unknown) {
+        unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
+        unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
+        CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
+        BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
+        CondBr->eraseFromParent();
+        if (CondCmp->use_empty())
+          CondCmp->eraseFromParent();
+        else if (CondCmp->getParent() == BB) {
+          // If the fact we just learned is true for all uses of the
+          // condition, replace it with a constant value
+          auto *CI = Ret == LazyValueInfo::True ?
+            ConstantInt::getTrue(CondCmp->getType()) :
+            ConstantInt::getFalse(CondCmp->getType());
+          CondCmp->replaceAllUsesWith(CI);
+          CondCmp->eraseFromParent();
+        }
+        return true;
+      }
+    }
+
+    if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB))
+      return true;
+  }
+
+  // Check for some cases that are worth simplifying.  Right now we want to look
+  // for loads that are used by a switch or by the condition for the branch.  If
+  // we see one, check to see if it's partially redundant.  If so, insert a PHI
+  // which can then be used to thread the values.
+  //
+  Value *SimplifyValue = CondInst;
+  if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
+    if (isa<Constant>(CondCmp->getOperand(1)))
+      SimplifyValue = CondCmp->getOperand(0);
+
+  // TODO: There are other places where load PRE would be profitable, such as
+  // more complex comparisons.
+  if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
+    if (SimplifyPartiallyRedundantLoad(LI))
+      return true;
+
+
+  // Handle a variety of cases where we are branching on something derived from
+  // a PHI node in the current block.  If we can prove that any predecessors
+  // compute a predictable value based on a PHI node, thread those predecessors.
+  //
+  if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator))
+    return true;
+
+  // If this is an otherwise-unfoldable branch on a phi node in the current
+  // block, see if we can simplify.
+  if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+    if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+      return ProcessBranchOnPHI(PN);
+
+
+  // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
+  if (CondInst->getOpcode() == Instruction::Xor &&
+      CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+    return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));
+
+  // Search for a stronger dominating condition that can be used to simplify a
+  // conditional branch leaving BB.
+  if (ProcessImpliedCondition(BB))
+    return true;
+
+  return false;
+}
+
+bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) {
+  auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!BI || !BI->isConditional())
+    return false;
+
+  Value *Cond = BI->getCondition();
+  BasicBlock *CurrentBB = BB;
+  BasicBlock *CurrentPred = BB->getSinglePredecessor();
+  unsigned Iter = 0;
+
+  auto &DL = BB->getModule()->getDataLayout();
+
+  while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
+    auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
+    if (!PBI || !PBI->isConditional() || PBI->getSuccessor(0) != CurrentBB)
+      return false;
+
+    if (isImpliedCondition(PBI->getCondition(), Cond, DL)) {
+      BI->getSuccessor(1)->removePredecessor(BB);
+      BranchInst::Create(BI->getSuccessor(0), BI);
+      BI->eraseFromParent();
+      return true;
+    }
+    CurrentBB = CurrentPred;
+    CurrentPred = CurrentBB->getSinglePredecessor();
+  }
+
+  return false;
+}
+
+/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
+/// load instruction, eliminate it by replacing it with a PHI node.  This is an
+/// important optimization that encourages jump threading, and needs to be run
+/// interlaced with other jump threading tasks.
+bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+  // Don't hack volatile/atomic loads.
+  if (!LI->isSimple()) return false;
+
+  // If the load is defined in a block with exactly one predecessor, it can't be
+  // partially redundant.
+  BasicBlock *LoadBB = LI->getParent();
+  if (LoadBB->getSinglePredecessor())
+    return false;
+
+  // If the load is defined in an EH pad, it can't be partially redundant,
+  // because the edges between the invoke and the EH pad cannot have other
+  // instructions between them.
+  if (LoadBB->isEHPad())
+    return false;
+
+  Value *LoadedPtr = LI->getOperand(0);
+
+  // If the loaded operand is defined in the LoadBB, it can't be available.
+  // TODO: Could do simple PHI translation, that would be fun :)
+  if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
+    if (PtrOp->getParent() == LoadBB)
+      return false;
+
+  // Scan a few instructions up from the load, to see if it is obviously live at
+  // the entry to its block.
+  BasicBlock::iterator BBIt(LI);
+
+  if (Value *AvailableVal =
+        FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, DefMaxInstsToScan)) {
+    // If the value of the load is locally available within the block, just use
+    // it.  This frequently occurs for reg2mem'd allocas.
+    //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
+
+    // If the returned value is the load itself, replace with an undef. This can
+    // only happen in dead loops.
+    if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
+    if (AvailableVal->getType() != LI->getType())
+      AvailableVal =
+          CastInst::CreateBitOrPointerCast(AvailableVal, LI->getType(), "", LI);
+    LI->replaceAllUsesWith(AvailableVal);
+    LI->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, if we scanned the whole block and got to the top of the block,
+  // we know the block is locally transparent to the load.  If not, something
+  // might clobber its value.
+  if (BBIt != LoadBB->begin())
+    return false;
+
+  // If all of the loads and stores that feed the value have the same AA tags,
+  // then we can propagate them onto any newly inserted loads.
+  AAMDNodes AATags;
+  LI->getAAMetadata(AATags);
+
+  SmallPtrSet<BasicBlock*, 8> PredsScanned;
+  typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
+  AvailablePredsTy AvailablePreds;
+  BasicBlock *OneUnavailablePred = nullptr;
+
+  // If we got here, the loaded value is transparent through to the start of the
+  // block.  Check to see if it is available in any of the predecessor blocks.
+  for (BasicBlock *PredBB : predecessors(LoadBB)) {
+    // If we already scanned this predecessor, skip it.
+    if (!PredsScanned.insert(PredBB).second)
+      continue;
+
+    // Scan the predecessor to see if the value is available in the pred.
+    BBIt = PredBB->end();
+    AAMDNodes ThisAATags;
+    Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt,
+                                                    DefMaxInstsToScan,
+                                                    nullptr, &ThisAATags);
+    if (!PredAvailable) {
+      OneUnavailablePred = PredBB;
+      continue;
+    }
+
+    // If AA tags disagree or are not present, forget about them.
+    if (AATags != ThisAATags) AATags = AAMDNodes();
+
+    // If so, this load is partially redundant.  Remember this info so that we
+    // can create a PHI node.
+    AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
+  }
+
+  // If the loaded value isn't available in any predecessor, it isn't partially
+  // redundant.
+  if (AvailablePreds.empty()) return false;
+
+  // Okay, the loaded value is available in at least one (and maybe all!)
+  // predecessors.  If the value is unavailable in more than one unique
+  // predecessor, we want to insert a merge block for those common predecessors.
+  // This ensures that we only have to insert one reload, thus not increasing
+  // code size.
+  BasicBlock *UnavailablePred = nullptr;
+
+  // If there is exactly one predecessor where the value is unavailable, the
+  // already computed 'OneUnavailablePred' block is it.  If it ends in an
+  // unconditional branch, we know that it isn't a critical edge.
+  if (PredsScanned.size() == AvailablePreds.size()+1 &&
+      OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+    UnavailablePred = OneUnavailablePred;
+  } else if (PredsScanned.size() != AvailablePreds.size()) {
+    // Otherwise, we had multiple unavailable predecessors or we had a critical
+    // edge from the one.
+    SmallVector<BasicBlock*, 8> PredsToSplit;
+    SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
+
+    for (const auto &AvailablePred : AvailablePreds)
+      AvailablePredSet.insert(AvailablePred.first);
+
+    // Add all the unavailable predecessors to the PredsToSplit list.
+    for (BasicBlock *P : predecessors(LoadBB)) {
+      // If the predecessor is an indirect goto, we can't split the edge.
+      if (isa<IndirectBrInst>(P->getTerminator()))
+        return false;
+
+      if (!AvailablePredSet.count(P))
+        PredsToSplit.push_back(P);
+    }
+
+    // Split them out to their own block.
+    UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
+  }
+
+  // If the value isn't available in all predecessors, then there will be
+  // exactly one where it isn't available.  Insert a load on that edge and add
+  // it to the AvailablePreds list.
+  if (UnavailablePred) {
+    assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
+           "Can't handle critical edge here!");
+    LoadInst *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false,
+                                 LI->getAlignment(),
+                                 UnavailablePred->getTerminator());
+    NewVal->setDebugLoc(LI->getDebugLoc());
+    if (AATags)
+      NewVal->setAAMetadata(AATags);
+
+    AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
+  }
+
+  // Now we know that each predecessor of this block has a value in
+  // AvailablePreds, sort them for efficient access as we're walking the preds.
+  array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
+
+  // Create a PHI node at the start of the block for the PRE'd load value.
+  pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
+  PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
+                                &LoadBB->front());
+  PN->takeName(LI);
+  PN->setDebugLoc(LI->getDebugLoc());
+
+  // Insert new entries into the PHI for each predecessor.  A single block may
+  // have multiple entries here.
+  for (pred_iterator PI = PB; PI != PE; ++PI) {
+    BasicBlock *P = *PI;
+    AvailablePredsTy::iterator I =
+      std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
+                       std::make_pair(P, (Value*)nullptr));
+
+    assert(I != AvailablePreds.end() && I->first == P &&
+           "Didn't find entry for predecessor!");
+
+    // If we have an available predecessor but it requires casting, insert the
+    // cast in the predecessor and use the cast. Note that we have to update the
+    // AvailablePreds vector as we go so that all of the PHI entries for this
+    // predecessor use the same bitcast.
+    Value *&PredV = I->second;
+    if (PredV->getType() != LI->getType())
+      PredV = CastInst::CreateBitOrPointerCast(PredV, LI->getType(), "",
+                                               P->getTerminator());
+
+    PN->addIncoming(PredV, I->first);
+  }
+
+  //cerr << "PRE: " << *LI << *PN << "\n";
+
+  LI->replaceAllUsesWith(PN);
+  LI->eraseFromParent();
+
+  return true;
+}
+
+/// FindMostPopularDest - The specified list contains multiple possible
+/// threadable destinations.  Pick the one that occurs the most frequently in
+/// the list.
+static BasicBlock *
+FindMostPopularDest(BasicBlock *BB,
+                    const SmallVectorImpl<std::pair<BasicBlock*,
+                                  BasicBlock*> > &PredToDestList) {
+  assert(!PredToDestList.empty());
+
+  // Determine popularity.  If there are multiple possible destinations, we
+  // explicitly choose to ignore 'undef' destinations.  We prefer to thread
+  // blocks with known and real destinations to threading undef.  We'll handle
+  // them later if interesting.
+  DenseMap<BasicBlock*, unsigned> DestPopularity;
+  for (const auto &PredToDest : PredToDestList)
+    if (PredToDest.second)
+      DestPopularity[PredToDest.second]++;
+
+  // Find the most popular dest.
+  DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin();
+  BasicBlock *MostPopularDest = DPI->first;
+  unsigned Popularity = DPI->second;
+  SmallVector<BasicBlock*, 4> SamePopularity;
+
+  for (++DPI; DPI != DestPopularity.end(); ++DPI) {
+    // If the popularity of this entry isn't higher than the popularity we've
+    // seen so far, ignore it.
+    if (DPI->second < Popularity)
+      ; // ignore.
+    else if (DPI->second == Popularity) {
+      // If it is the same as what we've seen so far, keep track of it.
+      SamePopularity.push_back(DPI->first);
+    } else {
+      // If it is more popular, remember it.
+      SamePopularity.clear();
+      MostPopularDest = DPI->first;
+      Popularity = DPI->second;
+    }
+  }
+
+  // Okay, now we know the most popular destination.  If there is more than one
+  // destination, we need to determine one.  This is arbitrary, but we need
+  // to make a deterministic decision.  Pick the first one that appears in the
+  // successor list.
+  if (!SamePopularity.empty()) {
+    SamePopularity.push_back(MostPopularDest);
+    TerminatorInst *TI = BB->getTerminator();
+    for (unsigned i = 0; ; ++i) {
+      assert(i != TI->getNumSuccessors() && "Didn't find any successor!");
+
+      if (std::find(SamePopularity.begin(), SamePopularity.end(),
+                    TI->getSuccessor(i)) == SamePopularity.end())
+        continue;
+
+      MostPopularDest = TI->getSuccessor(i);
+      break;
+    }
+  }
+
+  // Okay, we have finally picked the most popular destination.
+  return MostPopularDest;
+}
+
+bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
+                                           ConstantPreference Preference,
+                                           Instruction *CxtI) {
+  // If threading this would thread across a loop header, don't even try to
+  // thread the edge.
+  if (LoopHeaders.count(BB))
+    return false;
+
+  PredValueInfoTy PredValues;
+  if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI))
+    return false;
+
+  assert(!PredValues.empty() &&
+         "ComputeValueKnownInPredecessors returned true with no values");
+
+  DEBUG(dbgs() << "IN BB: " << *BB;
+        for (const auto &PredValue : PredValues) {
+          dbgs() << "  BB '" << BB->getName() << "': FOUND condition = "
+            << *PredValue.first
+            << " for pred '" << PredValue.second->getName() << "'.\n";
+        });
+
+  // Decide what we want to thread through.  Convert our list of known values to
+  // a list of known destinations for each pred.  This also discards duplicate
+  // predecessors and keeps track of the undefined inputs (which are represented
+  // as a null dest in the PredToDestList).
+  SmallPtrSet<BasicBlock*, 16> SeenPreds;
+  SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
+
+  BasicBlock *OnlyDest = nullptr;
+  BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
+
+  for (const auto &PredValue : PredValues) {
+    BasicBlock *Pred = PredValue.second;
+    if (!SeenPreds.insert(Pred).second)
+      continue;  // Duplicate predecessor entry.
+
+    // If the predecessor ends with an indirect goto, we can't change its
+    // destination.
+    if (isa<IndirectBrInst>(Pred->getTerminator()))
+      continue;
+
+    Constant *Val = PredValue.first;
+
+    BasicBlock *DestBB;
+    if (isa<UndefValue>(Val))
+      DestBB = nullptr;
+    else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+      DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
+    else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+      DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor();
+    } else {
+      assert(isa<IndirectBrInst>(BB->getTerminator())
+              && "Unexpected terminator");
+      DestBB = cast<BlockAddress>(Val)->getBasicBlock();
+    }
+
+    // If we have exactly one destination, remember it for efficiency below.
+    if (PredToDestList.empty())
+      OnlyDest = DestBB;
+    else if (OnlyDest != DestBB)
+      OnlyDest = MultipleDestSentinel;
+
+    PredToDestList.push_back(std::make_pair(Pred, DestBB));
+  }
+
+  // If all edges were unthreadable, we fail.
+  if (PredToDestList.empty())
+    return false;
+
+  // Determine which is the most common successor.  If we have many inputs and
+  // this block is a switch, we want to start by threading the batch that goes
+  // to the most popular destination first.  If we only know about one
+  // threadable destination (the common case) we can avoid this.
+  BasicBlock *MostPopularDest = OnlyDest;
+
+  if (MostPopularDest == MultipleDestSentinel)
+    MostPopularDest = FindMostPopularDest(BB, PredToDestList);
+
+  // Now that we know what the most popular destination is, factor all
+  // predecessors that will jump to it into a single predecessor.
+  SmallVector<BasicBlock*, 16> PredsToFactor;
+  for (const auto &PredToDest : PredToDestList)
+    if (PredToDest.second == MostPopularDest) {
+      BasicBlock *Pred = PredToDest.first;
+
+      // This predecessor may be a switch or something else that has multiple
+      // edges to the block.  Factor each of these edges by listing them
+      // according to # occurrences in PredsToFactor.
+      for (BasicBlock *Succ : successors(Pred))
+        if (Succ == BB)
+          PredsToFactor.push_back(Pred);
+    }
+
+  // If the threadable edges are branching on an undefined value, we get to pick
+  // the destination that these predecessors should get to.
+  if (!MostPopularDest)
+    MostPopularDest = BB->getTerminator()->
+                            getSuccessor(GetBestDestForJumpOnUndef(BB));
+
+  // Ok, try to thread it!
+  return ThreadEdge(BB, PredsToFactor, MostPopularDest);
+}
+
+/// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on
+/// a PHI node in the current block.  See if there are any simplifications we
+/// can do based on inputs to the phi node.
+///
+bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) {
+  BasicBlock *BB = PN->getParent();
+
+  // TODO: We could make use of this to do it once for blocks with common PHI
+  // values.
+  SmallVector<BasicBlock*, 1> PredBBs;
+  PredBBs.resize(1);
+
+  // If any of the predecessor blocks end in an unconditional branch, we can
+  // *duplicate* the conditional branch into that block in order to further
+  // encourage jump threading and to eliminate cases where we have branch on a
+  // phi of an icmp (branch on icmp is much better).
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *PredBB = PN->getIncomingBlock(i);
+    if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
+      if (PredBr->isUnconditional()) {
+        PredBBs[0] = PredBB;
+        // Try to duplicate BB into PredBB.
+        if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs))
+          return true;
+      }
+  }
+
+  return false;
+}
+
+/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on
+/// a xor instruction in the current block.  See if there are any
+/// simplifications we can do based on inputs to the xor.
+///
+bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) {
+  BasicBlock *BB = BO->getParent();
+
+  // If either the LHS or RHS of the xor is a constant, don't do this
+  // optimization.
+  if (isa<ConstantInt>(BO->getOperand(0)) ||
+      isa<ConstantInt>(BO->getOperand(1)))
+    return false;
+
+  // If the first instruction in BB isn't a phi, we won't be able to infer
+  // anything special about any particular predecessor.
+  if (!isa<PHINode>(BB->front()))
+    return false;
+
+  // If we have a xor as the branch input to this block, and we know that the
+  // LHS or RHS of the xor in any predecessor is true/false, then we can clone
+  // the condition into the predecessor and fix that value to true, saving some
+  // logical ops on that path and encouraging other paths to simplify.
+  //
+  // This copies something like this:
+  //
+  //  BB:
+  //    %X = phi i1 [1],  [%X']
+  //    %Y = icmp eq i32 %A, %B
+  //    %Z = xor i1 %X, %Y
+  //    br i1 %Z, ...
+  //
+  // Into:
+  //  BB':
+  //    %Y = icmp ne i32 %A, %B
+  //    br i1 %Y, ...
+
+  PredValueInfoTy XorOpValues;
+  bool isLHS = true;
+  if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
+                                       WantInteger, BO)) {
+    assert(XorOpValues.empty());
+    if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
+                                         WantInteger, BO))
+      return false;
+    isLHS = false;
+  }
+
+  assert(!XorOpValues.empty() &&
+         "ComputeValueKnownInPredecessors returned true with no values");
+
+  // Scan the information to see which is most popular: true or false.  The
+  // predecessors can be of the set true, false, or undef.
+  unsigned NumTrue = 0, NumFalse = 0;
+  for (const auto &XorOpValue : XorOpValues) {
+    if (isa<UndefValue>(XorOpValue.first))
+      // Ignore undefs for the count.
+      continue;
+    if (cast<ConstantInt>(XorOpValue.first)->isZero())
+      ++NumFalse;
+    else
+      ++NumTrue;
+  }
+
+  // Determine which value to split on, true, false, or undef if neither.
+  ConstantInt *SplitVal = nullptr;
+  if (NumTrue > NumFalse)
+    SplitVal = ConstantInt::getTrue(BB->getContext());
+  else if (NumTrue != 0 || NumFalse != 0)
+    SplitVal = ConstantInt::getFalse(BB->getContext());
+
+  // Collect all of the blocks that this can be folded into so that we can
+  // factor this once and clone it once.
+  SmallVector<BasicBlock*, 8> BlocksToFoldInto;
+  for (const auto &XorOpValue : XorOpValues) {
+    if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first))
+      continue;
+
+    BlocksToFoldInto.push_back(XorOpValue.second);
+  }
+
+  // If we inferred a value for all of the predecessors, then duplication won't
+  // help us.  However, we can just replace the LHS or RHS with the constant.
+  if (BlocksToFoldInto.size() ==
+      cast<PHINode>(BB->front()).getNumIncomingValues()) {
+    if (!SplitVal) {
+      // If all preds provide undef, just nuke the xor, because it is undef too.
+      BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
+      BO->eraseFromParent();
+    } else if (SplitVal->isZero()) {
+      // If all preds provide 0, replace the xor with the other input.
+      BO->replaceAllUsesWith(BO->getOperand(isLHS));
+      BO->eraseFromParent();
+    } else {
+      // If all preds provide 1, set the computed value to 1.
+      BO->setOperand(!isLHS, SplitVal);
+    }
+
+    return true;
+  }
+
+  // Try to duplicate BB into PredBB.
+  return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
+}
+
+
+/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
+/// predecessor to the PHIBB block.  If it has PHI nodes, add entries for
+/// NewPred using the entries from OldPred (suitably mapped).
+static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
+                                            BasicBlock *OldPred,
+                                            BasicBlock *NewPred,
+                                     DenseMap<Instruction*, Value*> &ValueMap) {
+  for (BasicBlock::iterator PNI = PHIBB->begin();
+       PHINode *PN = dyn_cast<PHINode>(PNI); ++PNI) {
+    // Ok, we have a PHI node.  Figure out what the incoming value was for the
+    // DestBlock.
+    Value *IV = PN->getIncomingValueForBlock(OldPred);
+
+    // Remap the value if necessary.
+    if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
+      DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
+      if (I != ValueMap.end())
+        IV = I->second;
+    }
+
+    PN->addIncoming(IV, NewPred);
+  }
+}
+
+/// ThreadEdge - We have decided that it is safe and profitable to factor the
+/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
+/// across BB.  Transform the IR to reflect this change.
+bool JumpThreading::ThreadEdge(BasicBlock *BB,
+                               const SmallVectorImpl<BasicBlock*> &PredBBs,
+                               BasicBlock *SuccBB) {
+  // If threading to the same block as we come from, we would infinite loop.
+  if (SuccBB == BB) {
+    DEBUG(dbgs() << "  Not threading across BB '" << BB->getName()
+          << "' - would thread to self!\n");
+    return false;
+  }
+
+  // If threading this would thread across a loop header, don't thread the edge.
+  // See the comments above FindLoopHeaders for justifications and caveats.
+  if (LoopHeaders.count(BB)) {
+    DEBUG(dbgs() << "  Not threading across loop header BB '" << BB->getName()
+          << "' to dest BB '" << SuccBB->getName()
+          << "' - it might create an irreducible loop!\n");
+    return false;
+  }
+
+  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+  if (JumpThreadCost > BBDupThreshold) {
+    DEBUG(dbgs() << "  Not threading BB '" << BB->getName()
+          << "' - Cost is too high: " << JumpThreadCost << "\n");
+    return false;
+  }
+
+  // And finally, do it!  Start by factoring the predecessors if needed.
+  BasicBlock *PredBB;
+  if (PredBBs.size() == 1)
+    PredBB = PredBBs[0];
+  else {
+    DEBUG(dbgs() << "  Factoring out " << PredBBs.size()
+          << " common predecessors.\n");
+    PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
+  }
+
+  // And finally, do it!
+  DEBUG(dbgs() << "  Threading edge from '" << PredBB->getName() << "' to '"
+        << SuccBB->getName() << "' with cost: " << JumpThreadCost
+        << ", across block:\n    "
+        << *BB << "\n");
+
+  LVI->threadEdge(PredBB, BB, SuccBB);
+
+  // We are going to have to map operands from the original BB block to the new
+  // copy of the block 'NewBB'.  If there are PHI nodes in BB, evaluate them to
+  // account for entry from PredBB.
+  DenseMap<Instruction*, Value*> ValueMapping;
+
+  BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
+                                         BB->getName()+".thread",
+                                         BB->getParent(), BB);
+  NewBB->moveAfter(PredBB);
+
+  // Set the block frequency of NewBB.
+  if (HasProfileData) {
+    auto NewBBFreq =
+        BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
+    BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+  }
+
+  BasicBlock::iterator BI = BB->begin();
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+  // Clone the non-phi instructions of BB into NewBB, keeping track of the
+  // mapping and using it to remap operands in the cloned instructions.
+  for (; !isa<TerminatorInst>(BI); ++BI) {
+    Instruction *New = BI->clone();
+    New->setName(BI->getName());
+    NewBB->getInstList().push_back(New);
+    ValueMapping[&*BI] = New;
+
+    // Remap operands to patch up intra-block references.
+    for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+      if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+        DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+        if (I != ValueMapping.end())
+          New->setOperand(i, I->second);
+      }
+  }
+
+  // We didn't copy the terminator from BB over to NewBB, because there is now
+  // an unconditional jump to SuccBB.  Insert the unconditional jump.
+  BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
+  NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
+
+  // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
+  // PHI nodes for NewBB now.
+  AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
+
+  // If there were values defined in BB that are used outside the block, then we
+  // now have to update all uses of the value to use either the original value,
+  // the cloned value, or some PHI derived value.  This can require arbitrary
+  // PHI insertion, of which we are prepared to do, clean these up now.
+  SSAUpdater SSAUpdate;
+  SmallVector<Use*, 16> UsesToRename;
+  for (Instruction &I : *BB) {
+    // Scan all uses of this instruction to see if it is used outside of its
+    // block, and if so, record them in UsesToRename.
+    for (Use &U : I.uses()) {
+      Instruction *User = cast<Instruction>(U.getUser());
+      if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+        if (UserPN->getIncomingBlock(U) == BB)
+          continue;
+      } else if (User->getParent() == BB)
+        continue;
+
+      UsesToRename.push_back(&U);
+    }
+
+    // If there are no uses outside the block, we're done with this instruction.
+    if (UsesToRename.empty())
+      continue;
+
+    DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+
+    // We found a use of I outside of BB.  Rename all uses of I that are outside
+    // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks
+    // with the two values we know.
+    SSAUpdate.Initialize(I.getType(), I.getName());
+    SSAUpdate.AddAvailableValue(BB, &I);
+    SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]);
+
+    while (!UsesToRename.empty())
+      SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+    DEBUG(dbgs() << "\n");
+  }
+
+
+  // Ok, NewBB is good to go.  Update the terminator of PredBB to jump to
+  // NewBB instead of BB.  This eliminates predecessors from BB, which requires
+  // us to simplify any PHI nodes in BB.
+  TerminatorInst *PredTerm = PredBB->getTerminator();
+  for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+    if (PredTerm->getSuccessor(i) == BB) {
+      BB->removePredecessor(PredBB, true);
+      PredTerm->setSuccessor(i, NewBB);
+    }
+
+  // At this point, the IR is fully up to date and consistent.  Do a quick scan
+  // over the new instructions and zap any that are constants or dead.  This
+  // frequently happens because of phi translation.
+  SimplifyInstructionsInBlock(NewBB, TLI);
+
+  // Update the edge weight from BB to SuccBB, which should be less than before.
+  UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
+
+  // Threaded an edge!
+  ++NumThreads;
+  return true;
+}
+
+/// Create a new basic block that will be the predecessor of BB and successor of
+/// all blocks in Preds. When profile data is availble, update the frequency of
+/// this new block.
+BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB,
+                                           ArrayRef<BasicBlock *> Preds,
+                                           const char *Suffix) {
+  // Collect the frequencies of all predecessors of BB, which will be used to
+  // update the edge weight on BB->SuccBB.
+  BlockFrequency PredBBFreq(0);
+  if (HasProfileData)
+    for (auto Pred : Preds)
+      PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB);
+
+  BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix);
+
+  // Set the block frequency of the newly created PredBB, which is the sum of
+  // frequencies of Preds.
+  if (HasProfileData)
+    BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency());
+  return PredBB;
+}
+
+/// Update the block frequency of BB and branch weight and the metadata on the
+/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
+/// Freq(PredBB->BB) / Freq(BB->SuccBB).
+void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
+                                                 BasicBlock *BB,
+                                                 BasicBlock *NewBB,
+                                                 BasicBlock *SuccBB) {
+  if (!HasProfileData)
+    return;
+
+  assert(BFI && BPI && "BFI & BPI should have been created here");
+
+  // As the edge from PredBB to BB is deleted, we have to update the block
+  // frequency of BB.
+  auto BBOrigFreq = BFI->getBlockFreq(BB);
+  auto NewBBFreq = BFI->getBlockFreq(NewBB);
+  auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
+  auto BBNewFreq = BBOrigFreq - NewBBFreq;
+  BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
+
+  // Collect updated outgoing edges' frequencies from BB and use them to update
+  // edge probabilities.
+  SmallVector<uint64_t, 4> BBSuccFreq;
+  for (BasicBlock *Succ : successors(BB)) {
+    auto SuccFreq = (Succ == SuccBB)
+                        ? BB2SuccBBFreq - NewBBFreq
+                        : BBOrigFreq * BPI->getEdgeProbability(BB, Succ);
+    BBSuccFreq.push_back(SuccFreq.getFrequency());
+  }
+
+  uint64_t MaxBBSuccFreq =
+      *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());
+
+  SmallVector<BranchProbability, 4> BBSuccProbs;
+  if (MaxBBSuccFreq == 0)
+    BBSuccProbs.assign(BBSuccFreq.size(),
+                       {1, static_cast<unsigned>(BBSuccFreq.size())});
+  else {
+    for (uint64_t Freq : BBSuccFreq)
+      BBSuccProbs.push_back(
+          BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
+    // Normalize edge probabilities so that they sum up to one.
+    BranchProbability::normalizeProbabilities(BBSuccProbs.begin(),
+                                              BBSuccProbs.end());
+  }
+
+  // Update edge probabilities in BPI.
+  for (int I = 0, E = BBSuccProbs.size(); I < E; I++)
+    BPI->setEdgeProbability(BB, I, BBSuccProbs[I]);
+
+  if (BBSuccProbs.size() >= 2) {
+    SmallVector<uint32_t, 4> Weights;
+    for (auto Prob : BBSuccProbs)
+      Weights.push_back(Prob.getNumerator());
+
+    auto TI = BB->getTerminator();
+    TI->setMetadata(
+        LLVMContext::MD_prof,
+        MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights));
+  }
+}
+
+/// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
+/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
+/// If we can duplicate the contents of BB up into PredBB do so now, this
+/// improves the odds that the branch will be on an analyzable instruction like
+/// a compare.
+bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
+                                 const SmallVectorImpl<BasicBlock *> &PredBBs) {
+  assert(!PredBBs.empty() && "Can't handle an empty set");
+
+  // If BB is a loop header, then duplicating this block outside the loop would
+  // cause us to transform this into an irreducible loop, don't do this.
+  // See the comments above FindLoopHeaders for justifications and caveats.
+  if (LoopHeaders.count(BB)) {
+    DEBUG(dbgs() << "  Not duplicating loop header '" << BB->getName()
+          << "' into predecessor block '" << PredBBs[0]->getName()
+          << "' - it might create an irreducible loop!\n");
+    return false;
+  }
+
+  unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+  if (DuplicationCost > BBDupThreshold) {
+    DEBUG(dbgs() << "  Not duplicating BB '" << BB->getName()
+          << "' - Cost is too high: " << DuplicationCost << "\n");
+    return false;
+  }
+
+  // And finally, do it!  Start by factoring the predecessors if needed.
+  BasicBlock *PredBB;
+  if (PredBBs.size() == 1)
+    PredBB = PredBBs[0];
+  else {
+    DEBUG(dbgs() << "  Factoring out " << PredBBs.size()
+          << " common predecessors.\n");
+    PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
+  }
+
+  // Okay, we decided to do this!  Clone all the instructions in BB onto the end
+  // of PredBB.
+  DEBUG(dbgs() << "  Duplicating block '" << BB->getName() << "' into end of '"
+        << PredBB->getName() << "' to eliminate branch on phi.  Cost: "
+        << DuplicationCost << " block is:" << *BB << "\n");
+
+  // Unless PredBB ends with an unconditional branch, split the edge so that we
+  // can just clone the bits from BB into the end of the new PredBB.
+  BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+
+  if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
+    PredBB = SplitEdge(PredBB, BB);
+    OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
+  }
+
+  // We are going to have to map operands from the original BB block into the
+  // PredBB block.  Evaluate PHI nodes in BB.
+  DenseMap<Instruction*, Value*> ValueMapping;
+
+  BasicBlock::iterator BI = BB->begin();
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+  // Clone the non-phi instructions of BB into PredBB, keeping track of the
+  // mapping and using it to remap operands in the cloned instructions.
+  for (; BI != BB->end(); ++BI) {
+    Instruction *New = BI->clone();
+
+    // Remap operands to patch up intra-block references.
+    for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+      if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+        DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+        if (I != ValueMapping.end())
+          New->setOperand(i, I->second);
+      }
+
+    // If this instruction can be simplified after the operands are updated,
+    // just use the simplified value instead.  This frequently happens due to
+    // phi translation.
+    if (Value *IV =
+            SimplifyInstruction(New, BB->getModule()->getDataLayout())) {
+      delete New;
+      ValueMapping[&*BI] = IV;
+    } else {
+      // Otherwise, insert the new instruction into the block.
+      New->setName(BI->getName());
+      PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
+      ValueMapping[&*BI] = New;
+    }
+  }
+
+  // Check to see if the targets of the branch had PHI nodes. If so, we need to
+  // add entries to the PHI nodes for branch from PredBB now.
+  BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
+  AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
+                                  ValueMapping);
+  AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
+                                  ValueMapping);
+
+  // If there were values defined in BB that are used outside the block, then we
+  // now have to update all uses of the value to use either the original value,
+  // the cloned value, or some PHI derived value.  This can require arbitrary
+  // PHI insertion, of which we are prepared to do, clean these up now.
+  SSAUpdater SSAUpdate;
+  SmallVector<Use*, 16> UsesToRename;
+  for (Instruction &I : *BB) {
+    // Scan all uses of this instruction to see if it is used outside of its
+    // block, and if so, record them in UsesToRename.
+    for (Use &U : I.uses()) {
+      Instruction *User = cast<Instruction>(U.getUser());
+      if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+        if (UserPN->getIncomingBlock(U) == BB)
+          continue;
+      } else if (User->getParent() == BB)
+        continue;
+
+      UsesToRename.push_back(&U);
+    }
+
+    // If there are no uses outside the block, we're done with this instruction.
+    if (UsesToRename.empty())
+      continue;
+
+    DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+
+    // We found a use of I outside of BB.  Rename all uses of I that are outside
+    // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks
+    // with the two values we know.
+    SSAUpdate.Initialize(I.getType(), I.getName());
+    SSAUpdate.AddAvailableValue(BB, &I);
+    SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&I]);
+
+    while (!UsesToRename.empty())
+      SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+    DEBUG(dbgs() << "\n");
+  }
+
+  // PredBB no longer jumps to BB, remove entries in the PHI node for the edge
+  // that we nuked.
+  BB->removePredecessor(PredBB, true);
+
+  // Remove the unconditional branch at the end of the PredBB block.
+  OldPredBranch->eraseFromParent();
+
+  ++NumDupes;
+  return true;
+}
+
+/// TryToUnfoldSelect - Look for blocks of the form
+/// bb1:
+///   %a = select
+///   br bb
+///
+/// bb2:
+///   %p = phi [%a, %bb] ...
+///   %c = icmp %p
+///   br i1 %c
+///
+/// And expand the select into a branch structure if one of its arms allows %c
+/// to be folded. This later enables threading from bb1 over bb2.
+bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
+  BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+  PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
+  Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
+
+  if (!CondBr || !CondBr->isConditional() || !CondLHS ||
+      CondLHS->getParent() != BB)
+    return false;
+
+  for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) {
+    BasicBlock *Pred = CondLHS->getIncomingBlock(I);
+    SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I));
+
+    // Look if one of the incoming values is a select in the corresponding
+    // predecessor.
+    if (!SI || SI->getParent() != Pred || !SI->hasOneUse())
+      continue;
+
+    BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
+    if (!PredTerm || !PredTerm->isUnconditional())
+      continue;
+
+    // Now check if one of the select values would allow us to constant fold the
+    // terminator in BB. We don't do the transform if both sides fold, those
+    // cases will be threaded in any case.
+    LazyValueInfo::Tristate LHSFolds =
+        LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
+                                CondRHS, Pred, BB, CondCmp);
+    LazyValueInfo::Tristate RHSFolds =
+        LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
+                                CondRHS, Pred, BB, CondCmp);
+    if ((LHSFolds != LazyValueInfo::Unknown ||
+         RHSFolds != LazyValueInfo::Unknown) &&
+        LHSFolds != RHSFolds) {
+      // Expand the select.
+      //
+      // Pred --
+      //  |    v
+      //  |  NewBB
+      //  |    |
+      //  |-----
+      //  v
+      // BB
+      BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
+                                             BB->getParent(), BB);
+      // Move the unconditional branch to NewBB.
+      PredTerm->removeFromParent();
+      NewBB->getInstList().insert(NewBB->end(), PredTerm);
+      // Create a conditional branch and update PHI nodes.
+      BranchInst::Create(NewBB, BB, SI->getCondition(), Pred);
+      CondLHS->setIncomingValue(I, SI->getFalseValue());
+      CondLHS->addIncoming(SI->getTrueValue(), NewBB);
+      // The select is now dead.
+      SI->eraseFromParent();
+
+      // Update any other PHI nodes in BB.
+      for (BasicBlock::iterator BI = BB->begin();
+           PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
+        if (Phi != CondLHS)
+          Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB);
+      return true;
+    }
+  }
+  return false;
+}
+
+/// TryToUnfoldSelectInCurrBB - Look for PHI/Select in the same BB of the form
+/// bb:
+///   %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
+///   %s = select p, trueval, falseval
+///
+/// And expand the select into a branch structure. This later enables
+/// jump-threading over bb in this pass.
+///
+/// Using the similar approach of SimplifyCFG::FoldCondBranchOnPHI(), unfold
+/// select if the associated PHI has at least one constant.  If the unfolded
+/// select is not jump-threaded, it will be folded again in the later
+/// optimizations.
+bool JumpThreading::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
+  // If threading this would thread across a loop header, don't thread the edge.
+  // See the comments above FindLoopHeaders for justifications and caveats.
+  if (LoopHeaders.count(BB))
+    return false;
+
+  // Look for a Phi/Select pair in the same basic block.  The Phi feeds the
+  // condition of the Select and at least one of the incoming values is a
+  // constant.
+  for (BasicBlock::iterator BI = BB->begin();
+       PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
+    unsigned NumPHIValues = PN->getNumIncomingValues();
+    if (NumPHIValues == 0 || !PN->hasOneUse())
+      continue;
+
+    SelectInst *SI = dyn_cast<SelectInst>(PN->user_back());
+    if (!SI || SI->getParent() != BB)
+      continue;
+
+    Value *Cond = SI->getCondition();
+    if (!Cond || Cond != PN || !Cond->getType()->isIntegerTy(1))
+      continue;
+
+    bool HasConst = false;
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      if (PN->getIncomingBlock(i) == BB)
+        return false;
+      if (isa<ConstantInt>(PN->getIncomingValue(i)))
+        HasConst = true;
+    }
+
+    if (HasConst) {
+      // Expand the select.
+      TerminatorInst *Term =
+          SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
+      PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
+      NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
+      NewPN->addIncoming(SI->getFalseValue(), BB);
+      SI->replaceAllUsesWith(NewPN);
+      SI->eraseFromParent();
+      return true;
+    }
+  }
+  
+  return false;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
new file mode 100644
index 0000000..8923ff7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -0,0 +1,1107 @@
+//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion, attempting to remove as much
+// code from the body of a loop as possible.  It does this by either hoisting
+// code into the preheader block, or by sinking code to the exit blocks if it is
+// safe.  This pass also promotes must-aliased memory locations in the loop to
+// live in registers, thus hoisting and sinking "invariant" loads and stores.
+//
+// This pass uses alias analysis for two purposes:
+//
+//  1. Moving loop invariant loads and calls out of loops.  If we can determine
+//     that a load or call inside of a loop never aliases anything stored to,
+//     we can hoist it or sink it like any other instruction.
+//  2. Scalar Promotion of Memory - If there is a store instruction inside of
+//     the loop, we try to move the store to happen AFTER the loop instead of
+//     inside of the loop.  This can only happen if a few conditions are true:
+//       A. The pointer stored through is loop invariant
+//       B. There are no stores or loads in the loop which _may_ alias the
+//          pointer.  There are no calls in the loop which mod/ref the pointer.
+//     If these conditions are true, we can promote the loads and stores in the
+//     loop of the pointer to use a temporary alloca'd variable.  We then use
+//     the SSAUpdater to construct the appropriate SSA form for the value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "licm"
+
+STATISTIC(NumSunk      , "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted   , "Number of instructions hoisted out of loop");
+STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
+STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
+STATISTIC(NumPromoted  , "Number of memory locations promoted to registers");
+
+static cl::opt<bool>
+DisablePromotion("disable-licm-promotion", cl::Hidden,
+                 cl::desc("Disable memory promotion in LICM pass"));
+
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
+static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
+                            const LICMSafetyInfo *SafetyInfo);
+static bool hoist(Instruction &I, BasicBlock *Preheader);
+static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
+                 const Loop *CurLoop, AliasSetTracker *CurAST,
+                 const LICMSafetyInfo *SafetyInfo);
+static bool isGuaranteedToExecute(const Instruction &Inst,
+                                  const DominatorTree *DT,
+                                  const Loop *CurLoop,
+                                  const LICMSafetyInfo *SafetyInfo);
+static bool isSafeToExecuteUnconditionally(const Instruction &Inst,
+                                           const DominatorTree *DT,
+                                           const TargetLibraryInfo *TLI,
+                                           const Loop *CurLoop,
+                                           const LICMSafetyInfo *SafetyInfo,
+                                           const Instruction *CtxI = nullptr);
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+                                     const AAMDNodes &AAInfo, 
+                                     AliasSetTracker *CurAST);
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+                            const LoopInfo *LI,
+                            const LICMSafetyInfo *SafetyInfo);
+static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
+                               DominatorTree *DT, TargetLibraryInfo *TLI,
+                               Loop *CurLoop, AliasSetTracker *CurAST,
+                               LICMSafetyInfo *SafetyInfo);
+
+namespace {
+  struct LICM : public LoopPass {
+    static char ID; // Pass identification, replacement for typeid
+    LICM() : LoopPass(ID) {
+      initializeLICMPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG...
+    ///
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      AU.addPreservedID(LCSSAID);
+      AU.addRequired<AAResultsWrapperPass>();
+      AU.addPreserved<AAResultsWrapperPass>();
+      AU.addPreserved<BasicAAWrapperPass>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+      AU.addPreserved<ScalarEvolutionWrapperPass>();
+      AU.addPreserved<SCEVAAWrapperPass>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+    }
+
+    using llvm::Pass::doFinalization;
+
+    bool doFinalization() override {
+      assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets");
+      return false;
+    }
+
+  private:
+    AliasAnalysis *AA;       // Current AliasAnalysis information
+    LoopInfo      *LI;       // Current LoopInfo
+    DominatorTree *DT;       // Dominator Tree for the current Loop.
+
+    TargetLibraryInfo *TLI;  // TargetLibraryInfo for constant folding.
+
+    // State that is updated as we process loops.
+    bool Changed;            // Set to true when we change anything.
+    BasicBlock *Preheader;   // The preheader block of the current loop...
+    Loop *CurLoop;           // The current loop we are working on...
+    AliasSetTracker *CurAST; // AliasSet information for the current loop...
+    DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;
+
+    /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+    void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
+                                 Loop *L) override;
+
+    /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+    /// set.
+    void deleteAnalysisValue(Value *V, Loop *L) override;
+
+    /// Simple Analysis hook. Delete loop L from alias set map.
+    void deleteAnalysisLoop(Loop *L) override;
+  };
+}
+
+char LICM::ID = 0;
+INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false)
+
+Pass *llvm::createLICMPass() { return new LICM(); }
+
+/// Hoist expressions out of the specified loop. Note, alias info for inner
+/// loop is not preserved so it is not a good idea to run LICM multiple
+/// times on one loop.
+///
+bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  Changed = false;
+
+  // Get our Loop and Alias Analysis information...
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+  assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
+
+  CurAST = new AliasSetTracker(*AA);
+  // Collect Alias info from subloops.
+  for (Loop *InnerL : L->getSubLoops()) {
+    AliasSetTracker *InnerAST = LoopToAliasSetMap[InnerL];
+    assert(InnerAST && "Where is my AST?");
+
+    // What if InnerLoop was modified by other passes ?
+    CurAST->add(*InnerAST);
+
+    // Once we've incorporated the inner loop's AST into ours, we don't need the
+    // subloop's anymore.
+    delete InnerAST;
+    LoopToAliasSetMap.erase(InnerL);
+  }
+
+  CurLoop = L;
+
+  // Get the preheader block to move instructions into...
+  Preheader = L->getLoopPreheader();
+
+  // Loop over the body of this loop, looking for calls, invokes, and stores.
+  // Because subloops have already been incorporated into AST, we skip blocks in
+  // subloops.
+  //
+  for (BasicBlock *BB : L->blocks()) {
+    if (LI->getLoopFor(BB) == L)        // Ignore blocks in subloops.
+      CurAST->add(*BB);                 // Incorporate the specified basic block
+  }
+
+  // Compute loop safety information.
+  LICMSafetyInfo SafetyInfo;
+  computeLICMSafetyInfo(&SafetyInfo, CurLoop);
+
+  // We want to visit all of the instructions in this loop... that are not parts
+  // of our subloops (they have already had their invariants hoisted out of
+  // their loop, into this loop, so there is no need to process the BODIES of
+  // the subloops).
+  //
+  // Traverse the body of the loop in depth first order on the dominator tree so
+  // that we are guaranteed to see definitions before we see uses.  This allows
+  // us to sink instructions in one pass, without iteration.  After sinking
+  // instructions, we perform another pass to hoist them out of the loop.
+  //
+  if (L->hasDedicatedExits())
+    Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, CurLoop,
+                          CurAST, &SafetyInfo);
+  if (Preheader)
+    Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI,
+                           CurLoop, CurAST, &SafetyInfo);
+
+  // Now that all loop invariants have been removed from the loop, promote any
+  // memory references to scalars that we can.
+  if (!DisablePromotion && (Preheader || L->hasDedicatedExits())) {
+    SmallVector<BasicBlock *, 8> ExitBlocks;
+    SmallVector<Instruction *, 8> InsertPts;
+    PredIteratorCache PIC;
+
+    // Loop over all of the alias sets in the tracker object.
+    for (AliasSet &AS : *CurAST)
+      Changed |= promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts,
+                                              PIC, LI, DT, CurLoop, 
+                                              CurAST, &SafetyInfo);
+
+    // Once we have promoted values across the loop body we have to recursively
+    // reform LCSSA as any nested loop may now have values defined within the
+    // loop used in the outer loop.
+    // FIXME: This is really heavy handed. It would be a bit better to use an
+    // SSAUpdater strategy during promotion that was LCSSA aware and reformed
+    // it as it went.
+    if (Changed) {
+      auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+      formLCSSARecursively(*L, *DT, LI, SEWP ? &SEWP->getSE() : nullptr);
+    }
+  }
+
+  // Check that neither this loop nor its parent have had LCSSA broken. LICM is
+  // specifically moving instructions across the loop boundary and so it is
+  // especially in need of sanity checking here.
+  assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!");
+  assert((!L->getParentLoop() || L->getParentLoop()->isLCSSAForm(*DT)) &&
+         "Parent loop not left in LCSSA form after LICM!");
+
+  // Clear out loops state information for the next iteration
+  CurLoop = nullptr;
+  Preheader = nullptr;
+
+  // If this loop is nested inside of another one, save the alias information
+  // for when we process the outer loop.
+  if (L->getParentLoop())
+    LoopToAliasSetMap[L] = CurAST;
+  else
+    delete CurAST;
+  return Changed;
+}
+
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in reverse depth 
+/// first order w.r.t the DominatorTree.  This allows us to visit uses before
+/// definitions, allowing us to sink a loop body in one pass without iteration.
+///
+bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+                      DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+                      AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+
+  // Verify inputs.
+  assert(N != nullptr && AA != nullptr && LI != nullptr && 
+         DT != nullptr && CurLoop != nullptr && CurAST != nullptr && 
+         SafetyInfo != nullptr && "Unexpected input to sinkRegion");
+
+  // Set changed as false.
+  bool Changed = false;
+  // Get basic block
+  BasicBlock *BB = N->getBlock();
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB)) return Changed;
+
+  // We are processing blocks in reverse dfo, so process children first.
+  const std::vector<DomTreeNode*> &Children = N->getChildren();
+  for (DomTreeNode *Child : Children)
+    Changed |= sinkRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
+
+  // Only need to process the contents of this block if it is not part of a
+  // subloop (which would already have been processed).
+  if (inSubLoop(BB,CurLoop,LI)) return Changed;
+
+  for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) {
+    Instruction &I = *--II;
+
+    // If the instruction is dead, we would try to sink it because it isn't used
+    // in the loop, instead, just delete it.
+    if (isInstructionTriviallyDead(&I, TLI)) {
+      DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+      ++II;
+      CurAST->deleteValue(&I);
+      I.eraseFromParent();
+      Changed = true;
+      continue;
+    }
+
+    // Check to see if we can sink this instruction to the exit blocks
+    // of the loop.  We can do this if the all users of the instruction are
+    // outside of the loop.  In this case, it doesn't even matter if the
+    // operands of the instruction are loop invariant.
+    //
+    if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
+        canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo)) {
+      ++II;
+      Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo);
+    }
+  }
+  return Changed;
+}
+
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in depth first
+/// order w.r.t the DominatorTree.  This allows us to visit definitions before
+/// uses, allowing us to hoist a loop body in one pass without iteration.
+///
+bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+                       DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+                       AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+  // Verify inputs.
+  assert(N != nullptr && AA != nullptr && LI != nullptr && 
+         DT != nullptr && CurLoop != nullptr && CurAST != nullptr && 
+         SafetyInfo != nullptr && "Unexpected input to hoistRegion");
+  // Set changed as false.
+  bool Changed = false;
+  // Get basic block
+  BasicBlock *BB = N->getBlock();
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB)) return Changed;
+  // Only need to process the contents of this block if it is not part of a
+  // subloop (which would already have been processed).
+  if (!inSubLoop(BB, CurLoop, LI))
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) {
+      Instruction &I = *II++;
+      // Try constant folding this instruction.  If all the operands are
+      // constants, it is technically hoistable, but it would be better to just
+      // fold it.
+      if (Constant *C = ConstantFoldInstruction(
+              &I, I.getModule()->getDataLayout(), TLI)) {
+        DEBUG(dbgs() << "LICM folding inst: " << I << "  --> " << *C << '\n');
+        CurAST->copyValue(&I, C);
+        CurAST->deleteValue(&I);
+        I.replaceAllUsesWith(C);
+        I.eraseFromParent();
+        continue;
+      }
+
+      // Try hoisting the instruction out to the preheader.  We can only do this
+      // if all of the operands of the instruction are loop invariant and if it
+      // is safe to hoist the instruction.
+      //
+      if (CurLoop->hasLoopInvariantOperands(&I) &&
+          canSinkOrHoistInst(I, AA, DT, TLI, CurLoop, CurAST, SafetyInfo) &&
+          isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo,
+                                 CurLoop->getLoopPreheader()->getTerminator()))
+        Changed |= hoist(I, CurLoop->getLoopPreheader());
+    }
+
+  const std::vector<DomTreeNode*> &Children = N->getChildren();
+  for (DomTreeNode *Child : Children)
+    Changed |= hoistRegion(Child, AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
+  return Changed;
+}
+
+/// Computes loop safety information, checks loop body & header
+/// for the possibility of may throw exception.
+///
+void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
+  assert(CurLoop != nullptr && "CurLoop cant be null");
+  BasicBlock *Header = CurLoop->getHeader();
+  // Setting default safety values.
+  SafetyInfo->MayThrow = false;
+  SafetyInfo->HeaderMayThrow = false;
+  // Iterate over header and compute safety info.
+  for (BasicBlock::iterator I = Header->begin(), E = Header->end();
+       (I != E) && !SafetyInfo->HeaderMayThrow; ++I)
+    SafetyInfo->HeaderMayThrow |= I->mayThrow();
+  
+  SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
+  // Iterate over loop instructions and compute safety info. 
+  for (Loop::block_iterator BB = CurLoop->block_begin(), 
+       BBE = CurLoop->block_end(); (BB != BBE) && !SafetyInfo->MayThrow ; ++BB)
+    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
+         (I != E) && !SafetyInfo->MayThrow; ++I)
+      SafetyInfo->MayThrow |= I->mayThrow();
+
+  // Compute funclet colors if we might sink/hoist in a function with a funclet
+  // personality routine.
+  Function *Fn = CurLoop->getHeader()->getParent();
+  if (Fn->hasPersonalityFn())
+    if (Constant *PersonalityFn = Fn->getPersonalityFn())
+      if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
+        SafetyInfo->BlockColors = colorEHFunclets(*Fn);
+}
+
+/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
+/// instruction.
+///
+bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
+                        TargetLibraryInfo *TLI, Loop *CurLoop,
+                        AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+  // Loads have extra constraints we have to verify before we can hoist them.
+  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+    if (!LI->isUnordered())
+      return false;        // Don't hoist volatile/atomic loads!
+
+    // Loads from constant memory are always safe to move, even if they end up
+    // in the same alias set as something that ends up being modified.
+    if (AA->pointsToConstantMemory(LI->getOperand(0)))
+      return true;
+    if (LI->getMetadata(LLVMContext::MD_invariant_load))
+      return true;
+
+    // Don't hoist loads which have may-aliased stores in loop.
+    uint64_t Size = 0;
+    if (LI->getType()->isSized())
+      Size = I.getModule()->getDataLayout().getTypeStoreSize(LI->getType());
+
+    AAMDNodes AAInfo;
+    LI->getAAMetadata(AAInfo);
+
+    return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo, CurAST);
+  } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+    // Don't sink or hoist dbg info; it's legal, but not useful.
+    if (isa<DbgInfoIntrinsic>(I))
+      return false;
+
+    // Don't sink calls which can throw.
+    if (CI->mayThrow())
+      return false;
+
+    // Handle simple cases by querying alias analysis.
+    FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI);
+    if (Behavior == FMRB_DoesNotAccessMemory)
+      return true;
+    if (AliasAnalysis::onlyReadsMemory(Behavior)) {
+      // A readonly argmemonly function only reads from memory pointed to by
+      // it's arguments with arbitrary offsets.  If we can prove there are no
+      // writes to this memory in the loop, we can hoist or sink.
+      if (AliasAnalysis::onlyAccessesArgPointees(Behavior)) {
+        for (Value *Op : CI->arg_operands())
+          if (Op->getType()->isPointerTy() &&
+              pointerInvalidatedByLoop(Op, MemoryLocation::UnknownSize,
+                                       AAMDNodes(), CurAST))
+            return false;
+        return true;
+      }
+      // If this call only reads from memory and there are no writes to memory
+      // in the loop, we can hoist or sink the call as appropriate.
+      bool FoundMod = false;
+      for (AliasSet &AS : *CurAST) {
+        if (!AS.isForwardingAliasSet() && AS.isMod()) {
+          FoundMod = true;
+          break;
+        }
+      }
+      if (!FoundMod) return true;
+    }
+
+    // FIXME: This should use mod/ref information to see if we can hoist or
+    // sink the call.
+
+    return false;
+  }
+
+  // Only these instructions are hoistable/sinkable.
+  if (!isa<BinaryOperator>(I) && !isa<CastInst>(I) && !isa<SelectInst>(I) &&
+      !isa<GetElementPtrInst>(I) && !isa<CmpInst>(I) &&
+      !isa<InsertElementInst>(I) && !isa<ExtractElementInst>(I) &&
+      !isa<ShuffleVectorInst>(I) && !isa<ExtractValueInst>(I) &&
+      !isa<InsertValueInst>(I))
+    return false;
+
+  // TODO: Plumb the context instruction through to make hoisting and sinking
+  // more powerful. Hoisting of loads already works due to the special casing
+  // above. 
+  return isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo,
+                                        nullptr);
+}
+
+/// Returns true if a PHINode is a trivially replaceable with an
+/// Instruction.
+/// This is true when all incoming values are that instruction.
+/// This pattern occurs most often with LCSSA PHI nodes.
+///
+static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
+  for (const Value *IncValue : PN.incoming_values())
+    if (IncValue != &I)
+      return false;
+
+  return true;
+}
+
+/// Return true if the only users of this instruction are outside of
+/// the loop. If this is true, we can sink the instruction to the exit
+/// blocks of the loop.
+///
+static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
+                            const LICMSafetyInfo *SafetyInfo) {
+  const auto &BlockColors = SafetyInfo->BlockColors;
+  for (const User *U : I.users()) {
+    const Instruction *UI = cast<Instruction>(U);
+    if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
+      const BasicBlock *BB = PN->getParent();
+      // We cannot sink uses in catchswitches.
+      if (isa<CatchSwitchInst>(BB->getTerminator()))
+        return false;
+
+      // We need to sink a callsite to a unique funclet.  Avoid sinking if the
+      // phi use is too muddled.
+      if (isa<CallInst>(I))
+        if (!BlockColors.empty() &&
+            BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
+          return false;
+
+      // A PHI node where all of the incoming values are this instruction are
+      // special -- they can just be RAUW'ed with the instruction and thus
+      // don't require a use in the predecessor. This is a particular important
+      // special case because it is the pattern found in LCSSA form.
+      if (isTriviallyReplacablePHI(*PN, I)) {
+        if (CurLoop->contains(PN))
+          return false;
+        else
+          continue;
+      }
+
+      // Otherwise, PHI node uses occur in predecessor blocks if the incoming
+      // values. Check for such a use being inside the loop.
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (PN->getIncomingValue(i) == &I)
+          if (CurLoop->contains(PN->getIncomingBlock(i)))
+            return false;
+
+      continue;
+    }
+
+    if (CurLoop->contains(UI))
+      return false;
+  }
+  return true;
+}
+
+static Instruction *
+CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
+                            const LoopInfo *LI,
+                            const LICMSafetyInfo *SafetyInfo) {
+  Instruction *New;
+  if (auto *CI = dyn_cast<CallInst>(&I)) {
+    const auto &BlockColors = SafetyInfo->BlockColors;
+
+    // Sinking call-sites need to be handled differently from other
+    // instructions.  The cloned call-site needs a funclet bundle operand
+    // appropriate for it's location in the CFG.
+    SmallVector<OperandBundleDef, 1> OpBundles;
+    for (unsigned BundleIdx = 0, BundleEnd = CI->getNumOperandBundles();
+         BundleIdx != BundleEnd; ++BundleIdx) {
+      OperandBundleUse Bundle = CI->getOperandBundleAt(BundleIdx);
+      if (Bundle.getTagID() == LLVMContext::OB_funclet)
+        continue;
+
+      OpBundles.emplace_back(Bundle);
+    }
+
+    if (!BlockColors.empty()) {
+      const ColorVector &CV = BlockColors.find(&ExitBlock)->second;
+      assert(CV.size() == 1 && "non-unique color for exit block!");
+      BasicBlock *BBColor = CV.front();
+      Instruction *EHPad = BBColor->getFirstNonPHI();
+      if (EHPad->isEHPad())
+        OpBundles.emplace_back("funclet", EHPad);
+    }
+
+    New = CallInst::Create(CI, OpBundles);
+  } else {
+    New = I.clone();
+  }
+
+  ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
+  if (!I.getName().empty()) New->setName(I.getName() + ".le");
+
+  // Build LCSSA PHI nodes for any in-loop operands. Note that this is
+  // particularly cheap because we can rip off the PHI node that we're
+  // replacing for the number and blocks of the predecessors.
+  // OPT: If this shows up in a profile, we can instead finish sinking all
+  // invariant instructions, and then walk their operands to re-establish
+  // LCSSA. That will eliminate creating PHI nodes just to nuke them when
+  // sinking bottom-up.
+  for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE;
+       ++OI)
+    if (Instruction *OInst = dyn_cast<Instruction>(*OI))
+      if (Loop *OLoop = LI->getLoopFor(OInst->getParent()))
+        if (!OLoop->contains(&PN)) {
+          PHINode *OpPN =
+              PHINode::Create(OInst->getType(), PN.getNumIncomingValues(),
+                              OInst->getName() + ".lcssa", &ExitBlock.front());
+          for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+            OpPN->addIncoming(OInst, PN.getIncomingBlock(i));
+          *OI = OpPN;
+        }
+  return New;
+}
+
+/// When an instruction is found to only be used outside of the loop, this
+/// function moves it to the exit blocks and patches up SSA form as needed.
+/// This method is guaranteed to remove the original instruction from its
+/// position, and may either delete it or move it to outside of the loop.
+///
+static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
+                 const Loop *CurLoop, AliasSetTracker *CurAST,
+                 const LICMSafetyInfo *SafetyInfo) {
+  DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+  bool Changed = false;
+  if (isa<LoadInst>(I)) ++NumMovedLoads;
+  else if (isa<CallInst>(I)) ++NumMovedCalls;
+  ++NumSunk;
+  Changed = true;
+
+#ifndef NDEBUG
+  SmallVector<BasicBlock *, 32> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), 
+                                             ExitBlocks.end());
+#endif
+
+  // Clones of this instruction. Don't create more than one per exit block!
+  SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+
+  // If this instruction is only used outside of the loop, then all users are
+  // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
+  // the instruction.
+  while (!I.use_empty()) {
+    Value::user_iterator UI = I.user_begin();
+    auto *User = cast<Instruction>(*UI);
+    if (!DT->isReachableFromEntry(User->getParent())) {
+      User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
+      continue;
+    }
+    // The user must be a PHI node.
+    PHINode *PN = cast<PHINode>(User);
+
+    // Surprisingly, instructions can be used outside of loops without any
+    // exits.  This can only happen in PHI nodes if the incoming block is
+    // unreachable.
+    Use &U = UI.getUse();
+    BasicBlock *BB = PN->getIncomingBlock(U);
+    if (!DT->isReachableFromEntry(BB)) {
+      U = UndefValue::get(I.getType());
+      continue;
+    }
+
+    BasicBlock *ExitBlock = PN->getParent();
+    assert(ExitBlockSet.count(ExitBlock) &&
+           "The LCSSA PHI is not in an exit block!");
+
+    Instruction *New;
+    auto It = SunkCopies.find(ExitBlock);
+    if (It != SunkCopies.end())
+      New = It->second;
+    else
+      New = SunkCopies[ExitBlock] =
+          CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);
+
+    PN->replaceAllUsesWith(New);
+    PN->eraseFromParent();
+  }
+
+  CurAST->deleteValue(&I);
+  I.eraseFromParent();
+  return Changed;
+}
+
+/// When an instruction is found to only use loop invariant operands that
+/// is safe to hoist, this instruction is called to do the dirty work.
+///
+static bool hoist(Instruction &I, BasicBlock *Preheader) {
+  DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": "
+        << I << "\n");
+  // Move the new node to the Preheader, before its terminator.
+  I.moveBefore(Preheader->getTerminator());
+
+  // Metadata can be dependent on the condition we are hoisting above.
+  // Conservatively strip all metadata on the instruction.
+  I.dropUnknownNonDebugMetadata();
+
+  if (isa<LoadInst>(I)) ++NumMovedLoads;
+  else if (isa<CallInst>(I)) ++NumMovedCalls;
+  ++NumHoisted;
+  return true;
+}
+
+/// Only sink or hoist an instruction if it is not a trapping instruction,
+/// or if the instruction is known not to trap when moved to the preheader.
+/// or if it is a trapping instruction and is guaranteed to execute.
+static bool isSafeToExecuteUnconditionally(const Instruction &Inst, 
+                                           const DominatorTree *DT,
+                                           const TargetLibraryInfo *TLI,
+                                           const Loop *CurLoop,
+                                           const LICMSafetyInfo *SafetyInfo,
+                                           const Instruction *CtxI) {
+  if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI))
+    return true;
+
+  return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
+}
+
+static bool isGuaranteedToExecute(const Instruction &Inst,
+                                  const DominatorTree *DT,
+                                  const Loop *CurLoop,
+                                  const LICMSafetyInfo * SafetyInfo) {
+
+  // We have to check to make sure that the instruction dominates all
+  // of the exit blocks.  If it doesn't, then there is a path out of the loop
+  // which does not execute this instruction, so we can't hoist it.
+
+  // If the instruction is in the header block for the loop (which is very
+  // common), it is always guaranteed to dominate the exit blocks.  Since this
+  // is a common case, and can save some work, check it now.
+  if (Inst.getParent() == CurLoop->getHeader())
+    // If there's a throw in the header block, we can't guarantee we'll reach
+    // Inst.
+    return !SafetyInfo->HeaderMayThrow;
+
+  // Somewhere in this loop there is an instruction which may throw and make us
+  // exit the loop.
+  if (SafetyInfo->MayThrow)
+    return false;
+
+  // Get the exit blocks for the current loop.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  CurLoop->getExitBlocks(ExitBlocks);
+
+  // Verify that the block dominates each of the exit blocks of the loop.
+  for (BasicBlock *ExitBlock : ExitBlocks)
+    if (!DT->dominates(Inst.getParent(), ExitBlock))
+      return false;
+
+  // As a degenerate case, if the loop is statically infinite then we haven't
+  // proven anything since there are no exit blocks.
+  if (ExitBlocks.empty())
+    return false;
+
+  return true;
+}
+
+namespace {
+  class LoopPromoter : public LoadAndStorePromoter {
+    Value *SomePtr;  // Designated pointer to store to.
+    SmallPtrSetImpl<Value*> &PointerMustAliases;
+    SmallVectorImpl<BasicBlock*> &LoopExitBlocks;
+    SmallVectorImpl<Instruction*> &LoopInsertPts;
+    PredIteratorCache &PredCache;
+    AliasSetTracker &AST;
+    LoopInfo &LI;
+    DebugLoc DL;
+    int Alignment;
+    AAMDNodes AATags;
+
+    Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        if (Loop *L = LI.getLoopFor(I->getParent()))
+          if (!L->contains(BB)) {
+            // We need to create an LCSSA PHI node for the incoming value and
+            // store that.
+            PHINode *PN =
+                PHINode::Create(I->getType(), PredCache.size(BB),
+                                I->getName() + ".lcssa", &BB->front());
+            for (BasicBlock *Pred : PredCache.get(BB))
+              PN->addIncoming(I, Pred);
+            return PN;
+          }
+      return V;
+    }
+
+  public:
+    LoopPromoter(Value *SP,
+                 ArrayRef<const Instruction *> Insts,
+                 SSAUpdater &S, SmallPtrSetImpl<Value *> &PMA,
+                 SmallVectorImpl<BasicBlock *> &LEB,
+                 SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
+                 AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
+                 const AAMDNodes &AATags)
+        : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
+          LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
+          LI(li), DL(dl), Alignment(alignment), AATags(AATags) {}
+
+    bool isInstInList(Instruction *I,
+                      const SmallVectorImpl<Instruction*> &) const override {
+      Value *Ptr;
+      if (LoadInst *LI = dyn_cast<LoadInst>(I))
+        Ptr = LI->getOperand(0);
+      else
+        Ptr = cast<StoreInst>(I)->getPointerOperand();
+      return PointerMustAliases.count(Ptr);
+    }
+
+    void doExtraRewritesBeforeFinalDeletion() const override {
+      // Insert stores after in the loop exit blocks.  Each exit block gets a
+      // store of the live-out values that feed them.  Since we've already told
+      // the SSA updater about the defs in the loop and the preheader
+      // definition, it is all set and we can start using it.
+      for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
+        BasicBlock *ExitBlock = LoopExitBlocks[i];
+        Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+        LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
+        Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
+        Instruction *InsertPos = LoopInsertPts[i];
+        StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+        NewSI->setAlignment(Alignment);
+        NewSI->setDebugLoc(DL);
+        if (AATags) NewSI->setAAMetadata(AATags);
+      }
+    }
+
+    void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
+      // Update alias analysis.
+      AST.copyValue(LI, V);
+    }
+    void instructionDeleted(Instruction *I) const override {
+      AST.deleteValue(I);
+    }
+  };
+} // end anon namespace
+
+/// Try to promote memory values to scalars by sinking stores out of the
+/// loop and moving loads to before the loop.  We do this by looping over
+/// the stores in the loop, looking for stores to Must pointers which are
+/// loop invariant.
+///
+bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
+                                        SmallVectorImpl<BasicBlock*>&ExitBlocks,
+                                        SmallVectorImpl<Instruction*>&InsertPts,
+                                        PredIteratorCache &PIC, LoopInfo *LI, 
+                                        DominatorTree *DT, Loop *CurLoop, 
+                                        AliasSetTracker *CurAST, 
+                                        LICMSafetyInfo * SafetyInfo) { 
+  // Verify inputs.
+  assert(LI != nullptr && DT != nullptr && 
+         CurLoop != nullptr && CurAST != nullptr && 
+         SafetyInfo != nullptr && 
+         "Unexpected Input to promoteLoopAccessesToScalars");
+  // Initially set Changed status to false.
+  bool Changed = false;
+  // We can promote this alias set if it has a store, if it is a "Must" alias
+  // set, if the pointer is loop invariant, and if we are not eliminating any
+  // volatile loads or stores.
+  if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+      AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
+    return Changed;
+
+  assert(!AS.empty() &&
+         "Must alias set should have at least one pointer element in it!");
+
+  Value *SomePtr = AS.begin()->getValue();
+  BasicBlock * Preheader = CurLoop->getLoopPreheader();
+
+  // It isn't safe to promote a load/store from the loop if the load/store is
+  // conditional.  For example, turning:
+  //
+  //    for () { if (c) *P += 1; }
+  //
+  // into:
+  //
+  //    tmp = *P;  for () { if (c) tmp +=1; } *P = tmp;
+  //
+  // is not safe, because *P may only be valid to access if 'c' is true.
+  //
+  // It is safe to promote P if all uses are direct load/stores and if at
+  // least one is guaranteed to be executed.
+  bool GuaranteedToExecute = false;
+
+  SmallVector<Instruction*, 64> LoopUses;
+  SmallPtrSet<Value*, 4> PointerMustAliases;
+
+  // We start with an alignment of one and try to find instructions that allow
+  // us to prove better alignment.
+  unsigned Alignment = 1;
+  AAMDNodes AATags;
+  bool HasDedicatedExits = CurLoop->hasDedicatedExits();
+
+  // Check that all of the pointers in the alias set have the same type.  We
+  // cannot (yet) promote a memory location that is loaded and stored in
+  // different sizes.  While we are at it, collect alignment and AA info.
+  for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
+    Value *ASIV = ASI->getValue();
+    PointerMustAliases.insert(ASIV);
+
+    // Check that all of the pointers in the alias set have the same type.  We
+    // cannot (yet) promote a memory location that is loaded and stored in
+    // different sizes.
+    if (SomePtr->getType() != ASIV->getType())
+      return Changed;
+
+    for (User *U : ASIV->users()) {
+      // Ignore instructions that are outside the loop.
+      Instruction *UI = dyn_cast<Instruction>(U);
+      if (!UI || !CurLoop->contains(UI))
+        continue;
+
+      // If there is an non-load/store instruction in the loop, we can't promote
+      // it.
+      if (const LoadInst *Load = dyn_cast<LoadInst>(UI)) {
+        assert(!Load->isVolatile() && "AST broken");
+        if (!Load->isSimple())
+          return Changed;
+      } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
+        // Stores *of* the pointer are not interesting, only stores *to* the
+        // pointer.
+        if (UI->getOperand(1) != ASIV)
+          continue;
+        assert(!Store->isVolatile() && "AST broken");
+        if (!Store->isSimple())
+          return Changed;
+        // Don't sink stores from loops without dedicated block exits. Exits
+        // containing indirect branches are not transformed by loop simplify,
+        // make sure we catch that. An additional load may be generated in the
+        // preheader for SSA updater, so also avoid sinking when no preheader
+        // is available.
+        if (!HasDedicatedExits || !Preheader)
+          return Changed;
+
+        // Note that we only check GuaranteedToExecute inside the store case
+        // so that we do not introduce stores where they did not exist before
+        // (which would break the LLVM concurrency model).
+
+        // If the alignment of this instruction allows us to specify a more
+        // restrictive (and performant) alignment and if we are sure this
+        // instruction will be executed, update the alignment.
+        // Larger is better, with the exception of 0 being the best alignment.
+        unsigned InstAlignment = Store->getAlignment();
+        if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0)
+          if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
+            GuaranteedToExecute = true;
+            Alignment = InstAlignment;
+          }
+
+        if (!GuaranteedToExecute)
+          GuaranteedToExecute = isGuaranteedToExecute(*UI, DT, 
+                                                      CurLoop, SafetyInfo);
+
+      } else
+        return Changed; // Not a load or store.
+
+      // Merge the AA tags.
+      if (LoopUses.empty()) {
+        // On the first load/store, just take its AA tags.
+        UI->getAAMetadata(AATags);
+      } else if (AATags) {
+        UI->getAAMetadata(AATags, /* Merge = */ true);
+      }
+
+      LoopUses.push_back(UI);
+    }
+  }
+
+  // If there isn't a guaranteed-to-execute instruction, we can't promote.
+  if (!GuaranteedToExecute)
+    return Changed;
+
+  // Figure out the loop exits and their insertion points, if this is the
+  // first promotion.
+  if (ExitBlocks.empty()) {
+    CurLoop->getUniqueExitBlocks(ExitBlocks);
+    InsertPts.clear();
+    InsertPts.reserve(ExitBlocks.size());
+    for (BasicBlock *ExitBlock : ExitBlocks)
+      InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+  }
+
+  // Can't insert into a catchswitch.
+  for (BasicBlock *ExitBlock : ExitBlocks)
+    if (isa<CatchSwitchInst>(ExitBlock->getTerminator()))
+      return Changed;
+
+  // Otherwise, this is safe to promote, lets do it!
+  DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n');
+  Changed = true;
+  ++NumPromoted;
+
+  // Grab a debug location for the inserted loads/stores; given that the
+  // inserted loads/stores have little relation to the original loads/stores,
+  // this code just arbitrarily picks a location from one, since any debug
+  // location is better than none.
+  DebugLoc DL = LoopUses[0]->getDebugLoc();
+
+  // We use the SSAUpdater interface to insert phi nodes as required.
+  SmallVector<PHINode*, 16> NewPHIs;
+  SSAUpdater SSA(&NewPHIs);
+  LoopPromoter Promoter(SomePtr, LoopUses, SSA,
+                        PointerMustAliases, ExitBlocks,
+                        InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags);
+
+  // Set up the preheader to have a definition of the value.  It is the live-out
+  // value from the preheader that uses in the loop will use.
+  LoadInst *PreheaderLoad =
+    new LoadInst(SomePtr, SomePtr->getName()+".promoted",
+                 Preheader->getTerminator());
+  PreheaderLoad->setAlignment(Alignment);
+  PreheaderLoad->setDebugLoc(DL);
+  if (AATags) PreheaderLoad->setAAMetadata(AATags);
+  SSA.AddAvailableValue(Preheader, PreheaderLoad);
+
+  // Rewrite all the loads in the loop and remember all the definitions from
+  // stores in the loop.
+  Promoter.run(LoopUses);
+
+  // If the SSAUpdater didn't use the load in the preheader, just zap it now.
+  if (PreheaderLoad->use_empty())
+    PreheaderLoad->eraseFromParent();
+
+  return Changed;
+}
+
+/// Simple analysis hook. Clone alias set info.
+///
+void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
+  AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+  if (!AST)
+    return;
+
+  AST->copyValue(From, To);
+}
+
+/// Simple Analysis hook. Delete value V from alias set
+///
+void LICM::deleteAnalysisValue(Value *V, Loop *L) {
+  AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+  if (!AST)
+    return;
+
+  AST->deleteValue(V);
+}
+
+/// Simple Analysis hook. Delete value L from alias set map.
+///
+void LICM::deleteAnalysisLoop(Loop *L) {
+  AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+  if (!AST)
+    return;
+
+  delete AST;
+  LoopToAliasSetMap.erase(L);
+}
+
+
+/// Return true if the body of this loop may store into the memory
+/// location pointed to by V.
+///
+static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+                                     const AAMDNodes &AAInfo, 
+                                     AliasSetTracker *CurAST) {
+  // Check to see if any of the basic blocks in CurLoop invalidate *V.
+  return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod();
+}
+
+/// Little predicate that returns true if the specified basic block is in
+/// a subloop of the current one, not the current one itself.
+///
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) {
+  assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+  return LI->getLoopFor(BB) != CurLoop;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
new file mode 100644
index 0000000..1648878
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
@@ -0,0 +1,282 @@
+//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation combines adjacent loads.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "load-combine"
+
+STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining");
+STATISTIC(NumLoadsCombined, "Number of loads combined");
+
+namespace {
+struct PointerOffsetPair {
+  Value *Pointer;
+  uint64_t Offset;
+};
+
+struct LoadPOPPair {
+  LoadPOPPair() = default;
+  LoadPOPPair(LoadInst *L, PointerOffsetPair P, unsigned O)
+      : Load(L), POP(P), InsertOrder(O) {}
+  LoadInst *Load;
+  PointerOffsetPair POP;
+  /// \brief The new load needs to be created before the first load in IR order.
+  unsigned InsertOrder;
+};
+
+class LoadCombine : public BasicBlockPass {
+  LLVMContext *C;
+  AliasAnalysis *AA;
+
+public:
+  LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) {
+    initializeLoadCombinePass(*PassRegistry::getPassRegistry());
+  }
+  
+  using llvm::Pass::doInitialization;
+  bool doInitialization(Function &) override;
+  bool runOnBasicBlock(BasicBlock &BB) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  const char *getPassName() const override { return "LoadCombine"; }
+  static char ID;
+
+  typedef IRBuilder<true, TargetFolder> BuilderTy;
+
+private:
+  BuilderTy *Builder;
+
+  PointerOffsetPair getPointerOffsetPair(LoadInst &);
+  bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &);
+  bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &);
+  bool combineLoads(SmallVectorImpl<LoadPOPPair> &);
+};
+}
+
+bool LoadCombine::doInitialization(Function &F) {
+  DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n");
+  C = &F.getContext();
+  return true;
+}
+
+PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) {
+  PointerOffsetPair POP;
+  POP.Pointer = LI.getPointerOperand();
+  POP.Offset = 0;
+  while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) {
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) {
+      auto &DL = LI.getModule()->getDataLayout();
+      unsigned BitWidth = DL.getPointerTypeSizeInBits(GEP->getType());
+      APInt Offset(BitWidth, 0);
+      if (GEP->accumulateConstantOffset(DL, Offset))
+        POP.Offset += Offset.getZExtValue();
+      else
+        // Can't handle GEPs with variable indices.
+        return POP;
+      POP.Pointer = GEP->getPointerOperand();
+    } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer))
+      POP.Pointer = BC->getOperand(0);
+  }
+  return POP;
+}
+
+bool LoadCombine::combineLoads(
+    DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) {
+  bool Combined = false;
+  for (auto &Loads : LoadMap) {
+    if (Loads.second.size() < 2)
+      continue;
+    std::sort(Loads.second.begin(), Loads.second.end(),
+              [](const LoadPOPPair &A, const LoadPOPPair &B) {
+      return A.POP.Offset < B.POP.Offset;
+    });
+    if (aggregateLoads(Loads.second))
+      Combined = true;
+  }
+  return Combined;
+}
+
+/// \brief Try to aggregate loads from a sorted list of loads to be combined.
+///
+/// It is guaranteed that no writes occur between any of the loads. All loads
+/// have the same base pointer. There are at least two loads.
+bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
+  assert(Loads.size() >= 2 && "Insufficient loads!");
+  LoadInst *BaseLoad = nullptr;
+  SmallVector<LoadPOPPair, 8> AggregateLoads;
+  bool Combined = false;
+  uint64_t PrevOffset = -1ull;
+  uint64_t PrevSize = 0;
+  for (auto &L : Loads) {
+    if (PrevOffset == -1ull) {
+      BaseLoad = L.Load;
+      PrevOffset = L.POP.Offset;
+      PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
+          L.Load->getType());
+      AggregateLoads.push_back(L);
+      continue;
+    }
+    if (L.Load->getAlignment() > BaseLoad->getAlignment())
+      continue;
+    if (L.POP.Offset > PrevOffset + PrevSize) {
+      // No other load will be combinable
+      if (combineLoads(AggregateLoads))
+        Combined = true;
+      AggregateLoads.clear();
+      PrevOffset = -1;
+      continue;
+    }
+    if (L.POP.Offset != PrevOffset + PrevSize)
+      // This load is offset less than the size of the last load.
+      // FIXME: We may want to handle this case.
+      continue;
+    PrevOffset = L.POP.Offset;
+    PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
+        L.Load->getType());
+    AggregateLoads.push_back(L);
+  }
+  if (combineLoads(AggregateLoads))
+    Combined = true;
+  return Combined;
+}
+
+/// \brief Given a list of combinable load. Combine the maximum number of them.
+bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
+  // Remove loads from the end while the size is not a power of 2.
+  unsigned TotalSize = 0;
+  for (const auto &L : Loads)
+    TotalSize += L.Load->getType()->getPrimitiveSizeInBits();
+  while (TotalSize != 0 && !isPowerOf2_32(TotalSize))
+    TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits();
+  if (Loads.size() < 2)
+    return false;
+
+  DEBUG({
+    dbgs() << "***** Combining Loads ******\n";
+    for (const auto &L : Loads) {
+      dbgs() << L.POP.Offset << ": " << *L.Load << "\n";
+    }
+  });
+
+  // Find first load. This is where we put the new load.
+  LoadPOPPair FirstLP;
+  FirstLP.InsertOrder = -1u;
+  for (const auto &L : Loads)
+    if (L.InsertOrder < FirstLP.InsertOrder)
+      FirstLP = L;
+
+  unsigned AddressSpace =
+      FirstLP.POP.Pointer->getType()->getPointerAddressSpace();
+
+  Builder->SetInsertPoint(FirstLP.Load);
+  Value *Ptr = Builder->CreateConstGEP1_64(
+      Builder->CreatePointerCast(Loads[0].POP.Pointer,
+                                 Builder->getInt8PtrTy(AddressSpace)),
+      Loads[0].POP.Offset);
+  LoadInst *NewLoad = new LoadInst(
+      Builder->CreatePointerCast(
+          Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize),
+                                Ptr->getType()->getPointerAddressSpace())),
+      Twine(Loads[0].Load->getName()) + ".combined", false,
+      Loads[0].Load->getAlignment(), FirstLP.Load);
+
+  for (const auto &L : Loads) {
+    Builder->SetInsertPoint(L.Load);
+    Value *V = Builder->CreateExtractInteger(
+        L.Load->getModule()->getDataLayout(), NewLoad,
+        cast<IntegerType>(L.Load->getType()),
+        L.POP.Offset - Loads[0].POP.Offset, "combine.extract");
+    L.Load->replaceAllUsesWith(V);
+  }
+
+  NumLoadsCombined = NumLoadsCombined + Loads.size();
+  return true;
+}
+
+bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
+  if (skipOptnoneFunction(BB))
+    return false;
+
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  IRBuilder<true, TargetFolder> TheBuilder(
+      BB.getContext(), TargetFolder(BB.getModule()->getDataLayout()));
+  Builder = &TheBuilder;
+
+  DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap;
+  AliasSetTracker AST(*AA);
+
+  bool Combined = false;
+  unsigned Index = 0;
+  for (auto &I : BB) {
+    if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) {
+      if (combineLoads(LoadMap))
+        Combined = true;
+      LoadMap.clear();
+      AST.clear();
+      continue;
+    }
+    LoadInst *LI = dyn_cast<LoadInst>(&I);
+    if (!LI)
+      continue;
+    ++NumLoadsAnalyzed;
+    if (!LI->isSimple() || !LI->getType()->isIntegerTy())
+      continue;
+    auto POP = getPointerOffsetPair(*LI);
+    if (!POP.Pointer)
+      continue;
+    LoadMap[POP.Pointer].push_back(LoadPOPPair(LI, POP, Index++));
+    AST.add(LI);
+  }
+  if (combineLoads(LoadMap))
+    Combined = true;
+  return Combined;
+}
+
+void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+
+  AU.addRequired<AAResultsWrapperPass>();
+  AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+char LoadCombine::ID = 0;
+
+BasicBlockPass *llvm::createLoadCombinePass() {
+  return new LoadCombine();
+}
+
+INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads",
+                    false, false)
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
new file mode 100644
index 0000000..7b1940b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -0,0 +1,254 @@
+//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dead Loop Deletion Pass. This pass is responsible
+// for eliminating loops with non-infinite computable trip counts that have no
+// side effects or volatile instructions, and do not contribute to the
+// computation of the function's return value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/Dominators.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-delete"
+
+STATISTIC(NumDeleted, "Number of loops deleted");
+
+namespace {
+  class LoopDeletion : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopDeletion() : LoopPass(ID) {
+      initializeLoopDeletionPass(*PassRegistry::getPassRegistry());
+    }
+
+    // Possibly eliminate loop L if it is dead.
+    bool runOnLoop(Loop *L, LPPassManager &) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+
+      AU.addPreserved<ScalarEvolutionWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addPreservedID(LCSSAID);
+    }
+
+  private:
+    bool isLoopDead(Loop *L, SmallVectorImpl<BasicBlock *> &exitingBlocks,
+                    SmallVectorImpl<BasicBlock *> &exitBlocks,
+                    bool &Changed, BasicBlock *Preheader);
+
+  };
+}
+
+char LoopDeletion::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion",
+                "Delete dead loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(LoopDeletion, "loop-deletion",
+                "Delete dead loops", false, false)
+
+Pass *llvm::createLoopDeletionPass() {
+  return new LoopDeletion();
+}
+
+/// isLoopDead - Determined if a loop is dead.  This assumes that we've already
+/// checked for unique exit and exiting blocks, and that the code is in LCSSA
+/// form.
+bool LoopDeletion::isLoopDead(Loop *L,
+                              SmallVectorImpl<BasicBlock *> &exitingBlocks,
+                              SmallVectorImpl<BasicBlock *> &exitBlocks,
+                              bool &Changed, BasicBlock *Preheader) {
+  BasicBlock *exitBlock = exitBlocks[0];
+
+  // Make sure that all PHI entries coming from the loop are loop invariant.
+  // Because the code is in LCSSA form, any values used outside of the loop
+  // must pass through a PHI in the exit block, meaning that this check is
+  // sufficient to guarantee that no loop-variant values are used outside
+  // of the loop.
+  BasicBlock::iterator BI = exitBlock->begin();
+  while (PHINode *P = dyn_cast<PHINode>(BI)) {
+    Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
+
+    // Make sure all exiting blocks produce the same incoming value for the exit
+    // block.  If there are different incoming values for different exiting
+    // blocks, then it is impossible to statically determine which value should
+    // be used.
+    for (unsigned i = 1, e = exitingBlocks.size(); i < e; ++i) {
+      if (incoming != P->getIncomingValueForBlock(exitingBlocks[i]))
+        return false;
+    }
+
+    if (Instruction *I = dyn_cast<Instruction>(incoming))
+      if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator()))
+        return false;
+
+    ++BI;
+  }
+
+  // Make sure that no instructions in the block have potential side-effects.
+  // This includes instructions that could write to memory, and loads that are
+  // marked volatile.  This could be made more aggressive by using aliasing
+  // information to identify readonly and readnone calls.
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI) {
+    for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
+         BI != BE; ++BI) {
+      if (BI->mayHaveSideEffects())
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// runOnLoop - Remove dead loops, by which we mean loops that do not impact the
+/// observable behavior of the program other than finite running time.  Note
+/// we do ensure that this never remove a loop that might be infinite, as doing
+/// so could change the halting/non-halting nature of a program.
+/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA
+/// in order to make various safety checks work.
+bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  // We can only remove the loop if there is a preheader that we can
+  // branch from after removing it.
+  BasicBlock *preheader = L->getLoopPreheader();
+  if (!preheader)
+    return false;
+
+  // If LoopSimplify form is not available, stay out of trouble.
+  if (!L->hasDedicatedExits())
+    return false;
+
+  // We can't remove loops that contain subloops.  If the subloops were dead,
+  // they would already have been removed in earlier executions of this pass.
+  if (L->begin() != L->end())
+    return false;
+
+  SmallVector<BasicBlock*, 4> exitingBlocks;
+  L->getExitingBlocks(exitingBlocks);
+
+  SmallVector<BasicBlock*, 4> exitBlocks;
+  L->getUniqueExitBlocks(exitBlocks);
+
+  // We require that the loop only have a single exit block.  Otherwise, we'd
+  // be in the situation of needing to be able to solve statically which exit
+  // block will be branched to, or trying to preserve the branching logic in
+  // a loop invariant manner.
+  if (exitBlocks.size() != 1)
+    return false;
+
+  // Finally, we have to check that the loop really is dead.
+  bool Changed = false;
+  if (!isLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader))
+    return Changed;
+
+  // Don't remove loops for which we can't solve the trip count.
+  // They could be infinite, in which case we'd be changing program behavior.
+  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  const SCEV *S = SE.getMaxBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(S))
+    return Changed;
+
+  // Now that we know the removal is safe, remove the loop by changing the
+  // branch from the preheader to go to the single exit block.
+  BasicBlock *exitBlock = exitBlocks[0];
+
+  // Because we're deleting a large chunk of code at once, the sequence in which
+  // we remove things is very important to avoid invalidation issues.  Don't
+  // mess with this unless you have good reason and know what you're doing.
+
+  // Tell ScalarEvolution that the loop is deleted. Do this before
+  // deleting the loop so that ScalarEvolution can look at the loop
+  // to determine what it needs to clean up.
+  SE.forgetLoop(L);
+
+  // Connect the preheader directly to the exit block.
+  TerminatorInst *TI = preheader->getTerminator();
+  TI->replaceUsesOfWith(L->getHeader(), exitBlock);
+
+  // Rewrite phis in the exit block to get their inputs from
+  // the preheader instead of the exiting block.
+  BasicBlock *exitingBlock = exitingBlocks[0];
+  BasicBlock::iterator BI = exitBlock->begin();
+  while (PHINode *P = dyn_cast<PHINode>(BI)) {
+    int j = P->getBasicBlockIndex(exitingBlock);
+    assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
+    P->setIncomingBlock(j, preheader);
+    for (unsigned i = 1; i < exitingBlocks.size(); ++i)
+      P->removeIncomingValue(exitingBlocks[i]);
+    ++BI;
+  }
+
+  // Update the dominator tree and remove the instructions and blocks that will
+  // be deleted from the reference counting scheme.
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  SmallVector<DomTreeNode*, 8> ChildNodes;
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI) {
+    // Move all of the block's children to be children of the preheader, which
+    // allows us to remove the domtree entry for the block.
+    ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
+    for (SmallVectorImpl<DomTreeNode *>::iterator DI = ChildNodes.begin(),
+         DE = ChildNodes.end(); DI != DE; ++DI) {
+      DT.changeImmediateDominator(*DI, DT[preheader]);
+    }
+
+    ChildNodes.clear();
+    DT.eraseNode(*LI);
+
+    // Remove the block from the reference counting scheme, so that we can
+    // delete it freely later.
+    (*LI)->dropAllReferences();
+  }
+
+  // Erase the instructions and the blocks without having to worry
+  // about ordering because we already dropped the references.
+  // NOTE: This iteration is safe because erasing the block does not remove its
+  // entry from the loop's block list.  We do that in the next section.
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI)
+    (*LI)->eraseFromParent();
+
+  // Finally, the blocks from loopinfo.  This has to happen late because
+  // otherwise our loop iterators won't work.
+  LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SmallPtrSet<BasicBlock*, 8> blocks;
+  blocks.insert(L->block_begin(), L->block_end());
+  for (BasicBlock *BB : blocks)
+    loopInfo.removeBlock(BB);
+
+  // The last step is to update LoopInfo now that we've eliminated this loop.
+  loopInfo.markAsRemoved(L);
+  Changed = true;
+
+  ++NumDeleted;
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
new file mode 100644
index 0000000..3d3cf3e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -0,0 +1,836 @@
+//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop Distribution Pass.  Its main focus is to
+// distribute loops that cannot be vectorized due to dependence cycles.  It
+// tries to isolate the offending dependences into a new loop allowing
+// vectorization of the remaining parts.
+//
+// For dependence analysis, the pass uses the LoopVectorizer's
+// LoopAccessAnalysis.  Because this analysis presumes no change in the order of
+// memory operations, special care is taken to preserve the lexical order of
+// these operations.
+//
+// Similarly to the Vectorizer, the pass also supports loop versioning to
+// run-time disambiguate potentially overlapping arrays.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <list>
+
+#define LDIST_NAME "loop-distribute"
+#define DEBUG_TYPE LDIST_NAME
+
+using namespace llvm;
+
+static cl::opt<bool>
+    LDistVerify("loop-distribute-verify", cl::Hidden,
+                cl::desc("Turn on DominatorTree and LoopInfo verification "
+                         "after Loop Distribution"),
+                cl::init(false));
+
+static cl::opt<bool> DistributeNonIfConvertible(
+    "loop-distribute-non-if-convertible", cl::Hidden,
+    cl::desc("Whether to distribute into a loop that may not be "
+             "if-convertible by the loop vectorizer"),
+    cl::init(false));
+
+static cl::opt<unsigned> DistributeSCEVCheckThreshold(
+    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
+    cl::desc("The maximum number of SCEV checks allowed for Loop "
+             "Distribution"));
+
+STATISTIC(NumLoopsDistributed, "Number of loops distributed");
+
+namespace {
+/// \brief Maintains the set of instructions of the loop for a partition before
+/// cloning.  After cloning, it hosts the new loop.
+class InstPartition {
+  typedef SmallPtrSet<Instruction *, 8> InstructionSet;
+
+public:
+  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
+      : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
+    Set.insert(I);
+  }
+
+  /// \brief Returns whether this partition contains a dependence cycle.
+  bool hasDepCycle() const { return DepCycle; }
+
+  /// \brief Adds an instruction to this partition.
+  void add(Instruction *I) { Set.insert(I); }
+
+  /// \brief Collection accessors.
+  InstructionSet::iterator begin() { return Set.begin(); }
+  InstructionSet::iterator end() { return Set.end(); }
+  InstructionSet::const_iterator begin() const { return Set.begin(); }
+  InstructionSet::const_iterator end() const { return Set.end(); }
+  bool empty() const { return Set.empty(); }
+
+  /// \brief Moves this partition into \p Other.  This partition becomes empty
+  /// after this.
+  void moveTo(InstPartition &Other) {
+    Other.Set.insert(Set.begin(), Set.end());
+    Set.clear();
+    Other.DepCycle |= DepCycle;
+  }
+
+  /// \brief Populates the partition with a transitive closure of all the
+  /// instructions that the seeded instructions dependent on.
+  void populateUsedSet() {
+    // FIXME: We currently don't use control-dependence but simply include all
+    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
+    // up.
+    for (auto *B : OrigLoop->getBlocks())
+      Set.insert(B->getTerminator());
+
+    // Follow the use-def chains to form a transitive closure of all the
+    // instructions that the originally seeded instructions depend on.
+    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.pop_back_val();
+      // Insert instructions from the loop that we depend on.
+      for (Value *V : I->operand_values()) {
+        auto *I = dyn_cast<Instruction>(V);
+        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
+          Worklist.push_back(I);
+      }
+    }
+  }
+
+  /// \brief Clones the original loop.
+  ///
+  /// Updates LoopInfo and DominatorTree using the information that block \p
+  /// LoopDomBB dominates the loop.
+  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
+                               unsigned Index, LoopInfo *LI,
+                               DominatorTree *DT) {
+    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
+                                          VMap, Twine(".ldist") + Twine(Index),
+                                          LI, DT, ClonedLoopBlocks);
+    return ClonedLoop;
+  }
+
+  /// \brief The cloned loop.  If this partition is mapped to the original loop,
+  /// this is null.
+  const Loop *getClonedLoop() const { return ClonedLoop; }
+
+  /// \brief Returns the loop where this partition ends up after distribution.
+  /// If this partition is mapped to the original loop then use the block from
+  /// the loop.
+  const Loop *getDistributedLoop() const {
+    return ClonedLoop ? ClonedLoop : OrigLoop;
+  }
+
+  /// \brief The VMap that is populated by cloning and then used in
+  /// remapinstruction to remap the cloned instructions.
+  ValueToValueMapTy &getVMap() { return VMap; }
+
+  /// \brief Remaps the cloned instructions using VMap.
+  void remapInstructions() {
+    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
+  }
+
+  /// \brief Based on the set of instructions selected for this partition,
+  /// removes the unnecessary ones.
+  void removeUnusedInsts() {
+    SmallVector<Instruction *, 8> Unused;
+
+    for (auto *Block : OrigLoop->getBlocks())
+      for (auto &Inst : *Block)
+        if (!Set.count(&Inst)) {
+          Instruction *NewInst = &Inst;
+          if (!VMap.empty())
+            NewInst = cast<Instruction>(VMap[NewInst]);
+
+          assert(!isa<BranchInst>(NewInst) &&
+                 "Branches are marked used early on");
+          Unused.push_back(NewInst);
+        }
+
+    // Delete the instructions backwards, as it has a reduced likelihood of
+    // having to update as many def-use and use-def chains.
+    for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) {
+      if (!Inst->use_empty())
+        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+      Inst->eraseFromParent();
+    }
+  }
+
+  void print() const {
+    if (DepCycle)
+      dbgs() << "  (cycle)\n";
+    for (auto *I : Set)
+      // Prefix with the block name.
+      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
+  }
+
+  void printBlocks() const {
+    for (auto *BB : getDistributedLoop()->getBlocks())
+      dbgs() << *BB;
+  }
+
+private:
+  /// \brief Instructions from OrigLoop selected for this partition.
+  InstructionSet Set;
+
+  /// \brief Whether this partition contains a dependence cycle.
+  bool DepCycle;
+
+  /// \brief The original loop.
+  Loop *OrigLoop;
+
+  /// \brief The cloned loop.  If this partition is mapped to the original loop,
+  /// this is null.
+  Loop *ClonedLoop;
+
+  /// \brief The blocks of ClonedLoop including the preheader.  If this
+  /// partition is mapped to the original loop, this is empty.
+  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
+
+  /// \brief These gets populated once the set of instructions have been
+  /// finalized. If this partition is mapped to the original loop, these are not
+  /// set.
+  ValueToValueMapTy VMap;
+};
+
+/// \brief Holds the set of Partitions.  It populates them, merges them and then
+/// clones the loops.
+class InstPartitionContainer {
+  typedef DenseMap<Instruction *, int> InstToPartitionIdT;
+
+public:
+  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
+      : L(L), LI(LI), DT(DT) {}
+
+  /// \brief Returns the number of partitions.
+  unsigned getSize() const { return PartitionContainer.size(); }
+
+  /// \brief Adds \p Inst into the current partition if that is marked to
+  /// contain cycles.  Otherwise start a new partition for it.
+  void addToCyclicPartition(Instruction *Inst) {
+    // If the current partition is non-cyclic.  Start a new one.
+    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
+      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
+    else
+      PartitionContainer.back().add(Inst);
+  }
+
+  /// \brief Adds \p Inst into a partition that is not marked to contain
+  /// dependence cycles.
+  ///
+  //  Initially we isolate memory instructions into as many partitions as
+  //  possible, then later we may merge them back together.
+  void addToNewNonCyclicPartition(Instruction *Inst) {
+    PartitionContainer.emplace_back(Inst, L);
+  }
+
+  /// \brief Merges adjacent non-cyclic partitions.
+  ///
+  /// The idea is that we currently only want to isolate the non-vectorizable
+  /// partition.  We could later allow more distribution among these partition
+  /// too.
+  void mergeAdjacentNonCyclic() {
+    mergeAdjacentPartitionsIf(
+        [](const InstPartition *P) { return !P->hasDepCycle(); });
+  }
+
+  /// \brief If a partition contains only conditional stores, we won't vectorize
+  /// it.  Try to merge it with a previous cyclic partition.
+  void mergeNonIfConvertible() {
+    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
+      if (Partition->hasDepCycle())
+        return true;
+
+      // Now, check if all stores are conditional in this partition.
+      bool seenStore = false;
+
+      for (auto *Inst : *Partition)
+        if (isa<StoreInst>(Inst)) {
+          seenStore = true;
+          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
+            return false;
+        }
+      return seenStore;
+    });
+  }
+
+  /// \brief Merges the partitions according to various heuristics.
+  void mergeBeforePopulating() {
+    mergeAdjacentNonCyclic();
+    if (!DistributeNonIfConvertible)
+      mergeNonIfConvertible();
+  }
+
+  /// \brief Merges partitions in order to ensure that no loads are duplicated.
+  ///
+  /// We can't duplicate loads because that could potentially reorder them.
+  /// LoopAccessAnalysis provides dependency information with the context that
+  /// the order of memory operation is preserved.
+  ///
+  /// Return if any partitions were merged.
+  bool mergeToAvoidDuplicatedLoads() {
+    typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
+    typedef EquivalenceClasses<InstPartition *> ToBeMergedT;
+
+    LoadToPartitionT LoadToPartition;
+    ToBeMergedT ToBeMerged;
+
+    // Step through the partitions and create equivalence between partitions
+    // that contain the same load.  Also put partitions in between them in the
+    // same equivalence class to avoid reordering of memory operations.
+    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
+                                       E = PartitionContainer.end();
+         I != E; ++I) {
+      auto *PartI = &*I;
+
+      // If a load occurs in two partitions PartI and PartJ, merge all
+      // partitions (PartI, PartJ] into PartI.
+      for (Instruction *Inst : *PartI)
+        if (isa<LoadInst>(Inst)) {
+          bool NewElt;
+          LoadToPartitionT::iterator LoadToPart;
+
+          std::tie(LoadToPart, NewElt) =
+              LoadToPartition.insert(std::make_pair(Inst, PartI));
+          if (!NewElt) {
+            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
+                         << "partitions: " << PartI << ", "
+                         << LoadToPart->second << "\n" << *Inst << "\n");
+
+            auto PartJ = I;
+            do {
+              --PartJ;
+              ToBeMerged.unionSets(PartI, &*PartJ);
+            } while (&*PartJ != LoadToPart->second);
+          }
+        }
+    }
+    if (ToBeMerged.empty())
+      return false;
+
+    // Merge the member of an equivalence class into its class leader.  This
+    // makes the members empty.
+    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
+         I != E; ++I) {
+      if (!I->isLeader())
+        continue;
+
+      auto PartI = I->getData();
+      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
+                                   ToBeMerged.member_end())) {
+        PartJ->moveTo(*PartI);
+      }
+    }
+
+    // Remove the empty partitions.
+    PartitionContainer.remove_if(
+        [](const InstPartition &P) { return P.empty(); });
+
+    return true;
+  }
+
+  /// \brief Sets up the mapping between instructions to partitions.  If the
+  /// instruction is duplicated across multiple partitions, set the entry to -1.
+  void setupPartitionIdOnInstructions() {
+    int PartitionID = 0;
+    for (const auto &Partition : PartitionContainer) {
+      for (Instruction *Inst : Partition) {
+        bool NewElt;
+        InstToPartitionIdT::iterator Iter;
+
+        std::tie(Iter, NewElt) =
+            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
+        if (!NewElt)
+          Iter->second = -1;
+      }
+      ++PartitionID;
+    }
+  }
+
+  /// \brief Populates the partition with everything that the seeding
+  /// instructions require.
+  void populateUsedSet() {
+    for (auto &P : PartitionContainer)
+      P.populateUsedSet();
+  }
+
+  /// \brief This performs the main chunk of the work of cloning the loops for
+  /// the partitions.
+  void cloneLoops() {
+    BasicBlock *OrigPH = L->getLoopPreheader();
+    // At this point the predecessor of the preheader is either the memcheck
+    // block or the top part of the original preheader.
+    BasicBlock *Pred = OrigPH->getSinglePredecessor();
+    assert(Pred && "Preheader does not have a single predecessor");
+    BasicBlock *ExitBlock = L->getExitBlock();
+    assert(ExitBlock && "No single exit block");
+    Loop *NewLoop;
+
+    assert(!PartitionContainer.empty() && "at least two partitions expected");
+    // We're cloning the preheader along with the loop so we already made sure
+    // it was empty.
+    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
+           "preheader not empty");
+
+    // Create a loop for each partition except the last.  Clone the original
+    // loop before PH along with adding a preheader for the cloned loop.  Then
+    // update PH to point to the newly added preheader.
+    BasicBlock *TopPH = OrigPH;
+    unsigned Index = getSize() - 1;
+    for (auto I = std::next(PartitionContainer.rbegin()),
+              E = PartitionContainer.rend();
+         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
+      auto *Part = &*I;
+
+      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
+
+      Part->getVMap()[ExitBlock] = TopPH;
+      Part->remapInstructions();
+    }
+    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
+
+    // Now go in forward order and update the immediate dominator for the
+    // preheaders with the exiting block of the previous loop.  Dominance
+    // within the loop is updated in cloneLoopWithPreheader.
+    for (auto Curr = PartitionContainer.cbegin(),
+              Next = std::next(PartitionContainer.cbegin()),
+              E = PartitionContainer.cend();
+         Next != E; ++Curr, ++Next)
+      DT->changeImmediateDominator(
+          Next->getDistributedLoop()->getLoopPreheader(),
+          Curr->getDistributedLoop()->getExitingBlock());
+  }
+
+  /// \brief Removes the dead instructions from the cloned loops.
+  void removeUnusedInsts() {
+    for (auto &Partition : PartitionContainer)
+      Partition.removeUnusedInsts();
+  }
+
+  /// \brief For each memory pointer, it computes the partitionId the pointer is
+  /// used in.
+  ///
+  /// This returns an array of int where the I-th entry corresponds to I-th
+  /// entry in LAI.getRuntimePointerCheck().  If the pointer is used in multiple
+  /// partitions its entry is set to -1.
+  SmallVector<int, 8>
+  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
+    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
+
+    unsigned N = RtPtrCheck->Pointers.size();
+    SmallVector<int, 8> PtrToPartitions(N);
+    for (unsigned I = 0; I < N; ++I) {
+      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
+      auto Instructions =
+          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
+
+      int &Partition = PtrToPartitions[I];
+      // First set it to uninitialized.
+      Partition = -2;
+      for (Instruction *Inst : Instructions) {
+        // Note that this could be -1 if Inst is duplicated across multiple
+        // partitions.
+        int ThisPartition = this->InstToPartitionId[Inst];
+        if (Partition == -2)
+          Partition = ThisPartition;
+        // -1 means belonging to multiple partitions.
+        else if (Partition == -1)
+          break;
+        else if (Partition != (int)ThisPartition)
+          Partition = -1;
+      }
+      assert(Partition != -2 && "Pointer not belonging to any partition");
+    }
+
+    return PtrToPartitions;
+  }
+
+  void print(raw_ostream &OS) const {
+    unsigned Index = 0;
+    for (const auto &P : PartitionContainer) {
+      OS << "Partition " << Index++ << " (" << &P << "):\n";
+      P.print();
+    }
+  }
+
+  void dump() const { print(dbgs()); }
+
+#ifndef NDEBUG
+  friend raw_ostream &operator<<(raw_ostream &OS,
+                                 const InstPartitionContainer &Partitions) {
+    Partitions.print(OS);
+    return OS;
+  }
+#endif
+
+  void printBlocks() const {
+    unsigned Index = 0;
+    for (const auto &P : PartitionContainer) {
+      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
+      P.printBlocks();
+    }
+  }
+
+private:
+  typedef std::list<InstPartition> PartitionContainerT;
+
+  /// \brief List of partitions.
+  PartitionContainerT PartitionContainer;
+
+  /// \brief Mapping from Instruction to partition Id.  If the instruction
+  /// belongs to multiple partitions the entry contains -1.
+  InstToPartitionIdT InstToPartitionId;
+
+  Loop *L;
+  LoopInfo *LI;
+  DominatorTree *DT;
+
+  /// \brief The control structure to merge adjacent partitions if both satisfy
+  /// the \p Predicate.
+  template <class UnaryPredicate>
+  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
+    InstPartition *PrevMatch = nullptr;
+    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
+      auto DoesMatch = Predicate(&*I);
+      if (PrevMatch == nullptr && DoesMatch) {
+        PrevMatch = &*I;
+        ++I;
+      } else if (PrevMatch != nullptr && DoesMatch) {
+        I->moveTo(*PrevMatch);
+        I = PartitionContainer.erase(I);
+      } else {
+        PrevMatch = nullptr;
+        ++I;
+      }
+    }
+  }
+};
+
+/// \brief For each memory instruction, this class maintains difference of the
+/// number of unsafe dependences that start out from this instruction minus
+/// those that end here.
+///
+/// By traversing the memory instructions in program order and accumulating this
+/// number, we know whether any unsafe dependence crosses over a program point.
+class MemoryInstructionDependences {
+  typedef MemoryDepChecker::Dependence Dependence;
+
+public:
+  struct Entry {
+    Instruction *Inst;
+    unsigned NumUnsafeDependencesStartOrEnd;
+
+    Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
+  };
+
+  typedef SmallVector<Entry, 8> AccessesType;
+
+  AccessesType::const_iterator begin() const { return Accesses.begin(); }
+  AccessesType::const_iterator end() const { return Accesses.end(); }
+
+  MemoryInstructionDependences(
+      const SmallVectorImpl<Instruction *> &Instructions,
+      const SmallVectorImpl<Dependence> &Dependences) {
+    Accesses.append(Instructions.begin(), Instructions.end());
+
+    DEBUG(dbgs() << "Backward dependences:\n");
+    for (auto &Dep : Dependences)
+      if (Dep.isPossiblyBackward()) {
+        // Note that the designations source and destination follow the program
+        // order, i.e. source is always first.  (The direction is given by the
+        // DepType.)
+        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
+        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
+
+        DEBUG(Dep.print(dbgs(), 2, Instructions));
+      }
+  }
+
+private:
+  AccessesType Accesses;
+};
+
+/// \brief The pass class.
+class LoopDistribute : public FunctionPass {
+public:
+  LoopDistribute() : FunctionPass(ID) {
+    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    LAA = &getAnalysis<LoopAccessAnalysis>();
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+    // Build up a worklist of inner-loops to vectorize. This is necessary as the
+    // act of distributing a loop creates new loops and can invalidate iterators
+    // across the loops.
+    SmallVector<Loop *, 8> Worklist;
+
+    for (Loop *TopLevelLoop : *LI)
+      for (Loop *L : depth_first(TopLevelLoop))
+        // We only handle inner-most loops.
+        if (L->empty())
+          Worklist.push_back(L);
+
+    // Now walk the identified inner loops.
+    bool Changed = false;
+    for (Loop *L : Worklist)
+      Changed |= processLoop(L);
+
+    // Process each loop nest in the function.
+    return Changed;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addRequired<LoopAccessAnalysis>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  static char ID;
+
+private:
+  /// \brief Filter out checks between pointers from the same partition.
+  ///
+  /// \p PtrToPartition contains the partition number for pointers.  Partition
+  /// number -1 means that the pointer is used in multiple partitions.  In this
+  /// case we can't safely omit the check.
+  SmallVector<RuntimePointerChecking::PointerCheck, 4>
+  includeOnlyCrossPartitionChecks(
+      const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
+      const SmallVectorImpl<int> &PtrToPartition,
+      const RuntimePointerChecking *RtPtrChecking) {
+    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
+                 [&](const RuntimePointerChecking::PointerCheck &Check) {
+                   for (unsigned PtrIdx1 : Check.first->Members)
+                     for (unsigned PtrIdx2 : Check.second->Members)
+                       // Only include this check if there is a pair of pointers
+                       // that require checking and the pointers fall into
+                       // separate partitions.
+                       //
+                       // (Note that we already know at this point that the two
+                       // pointer groups need checking but it doesn't follow
+                       // that each pair of pointers within the two groups need
+                       // checking as well.
+                       //
+                       // In other words we don't want to include a check just
+                       // because there is a pair of pointers between the two
+                       // pointer groups that require checks and a different
+                       // pair whose pointers fall into different partitions.)
+                       if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+                           !RuntimePointerChecking::arePointersInSamePartition(
+                               PtrToPartition, PtrIdx1, PtrIdx2))
+                         return true;
+                   return false;
+                 });
+
+    return Checks;
+  }
+
+  /// \brief Try to distribute an inner-most loop.
+  bool processLoop(Loop *L) {
+    assert(L->empty() && "Only process inner loops.");
+
+    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
+                 << "\" checking " << *L << "\n");
+
+    BasicBlock *PH = L->getLoopPreheader();
+    if (!PH) {
+      DEBUG(dbgs() << "Skipping; no preheader");
+      return false;
+    }
+    if (!L->getExitBlock()) {
+      DEBUG(dbgs() << "Skipping; multiple exit blocks");
+      return false;
+    }
+    // LAA will check that we only have a single exiting block.
+
+    const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
+
+    // Currently, we only distribute to isolate the part of the loop with
+    // dependence cycles to enable partial vectorization.
+    if (LAI.canVectorizeMemory()) {
+      DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization");
+      return false;
+    }
+    auto *Dependences = LAI.getDepChecker().getDependences();
+    if (!Dependences || Dependences->empty()) {
+      DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate");
+      return false;
+    }
+
+    InstPartitionContainer Partitions(L, LI, DT);
+
+    // First, go through each memory operation and assign them to consecutive
+    // partitions (the order of partitions follows program order).  Put those
+    // with unsafe dependences into "cyclic" partition otherwise put each store
+    // in its own "non-cyclic" partition (we'll merge these later).
+    //
+    // Note that a memory operation (e.g. Load2 below) at a program point that
+    // has an unsafe dependence (Store3->Load1) spanning over it must be
+    // included in the same cyclic partition as the dependent operations.  This
+    // is to preserve the original program order after distribution.  E.g.:
+    //
+    //                NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
+    //  Load1   -.                     1                       0->1
+    //  Load2    | /Unsafe/            0                       1
+    //  Store3  -'                    -1                       1->0
+    //  Load4                          0                       0
+    //
+    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
+    // we just keep assigning to the same cyclic partition until
+    // NumUnsafeDependencesActive reaches 0.
+    const MemoryDepChecker &DepChecker = LAI.getDepChecker();
+    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
+                                     *Dependences);
+
+    int NumUnsafeDependencesActive = 0;
+    for (auto &InstDep : MID) {
+      Instruction *I = InstDep.Inst;
+      // We update NumUnsafeDependencesActive post-instruction, catch the
+      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
+      if (NumUnsafeDependencesActive ||
+          InstDep.NumUnsafeDependencesStartOrEnd > 0)
+        Partitions.addToCyclicPartition(I);
+      else
+        Partitions.addToNewNonCyclicPartition(I);
+      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
+      assert(NumUnsafeDependencesActive >= 0 &&
+             "Negative number of dependences active");
+    }
+
+    // Add partitions for values used outside.  These partitions can be out of
+    // order from the original program order.  This is OK because if the
+    // partition uses a load we will merge this partition with the original
+    // partition of the load that we set up in the previous loop (see
+    // mergeToAvoidDuplicatedLoads).
+    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
+    for (auto *Inst : DefsUsedOutside)
+      Partitions.addToNewNonCyclicPartition(Inst);
+
+    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
+    if (Partitions.getSize() < 2)
+      return false;
+
+    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
+    // should be able to vectorize these together.
+    Partitions.mergeBeforePopulating();
+    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
+    if (Partitions.getSize() < 2)
+      return false;
+
+    // Now, populate the partitions with non-memory operations.
+    Partitions.populateUsedSet();
+    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
+
+    // In order to preserve original lexical order for loads, keep them in the
+    // partition that we set up in the MemoryInstructionDependences loop.
+    if (Partitions.mergeToAvoidDuplicatedLoads()) {
+      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
+                   << Partitions);
+      if (Partitions.getSize() < 2)
+        return false;
+    }
+
+    // Don't distribute the loop if we need too many SCEV run-time checks.
+    const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate();
+    if (Pred.getComplexity() > DistributeSCEVCheckThreshold) {
+      DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+      return false;
+    }
+
+    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
+    // We're done forming the partitions set up the reverse mapping from
+    // instructions to partitions.
+    Partitions.setupPartitionIdOnInstructions();
+
+    // To keep things simple have an empty preheader before we version or clone
+    // the loop.  (Also split if this has no predecessor, i.e. entry, because we
+    // rely on PH having a predecessor.)
+    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
+      SplitBlock(PH, PH->getTerminator(), DT, LI);
+
+    // If we need run-time checks, version the loop now.
+    auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI);
+    const auto *RtPtrChecking = LAI.getRuntimePointerChecking();
+    const auto &AllChecks = RtPtrChecking->getChecks();
+    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
+                                                  RtPtrChecking);
+
+    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
+      DEBUG(dbgs() << "\nPointers:\n");
+      DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+      LoopVersioning LVer(LAI, L, LI, DT, SE, false);
+      LVer.setAliasChecks(std::move(Checks));
+      LVer.setSCEVChecks(LAI.PSE.getUnionPredicate());
+      LVer.versionLoop(DefsUsedOutside);
+    }
+
+    // Create identical copies of the original loop for each partition and hook
+    // them up sequentially.
+    Partitions.cloneLoops();
+
+    // Now, we remove the instruction from each loop that don't belong to that
+    // partition.
+    Partitions.removeUnusedInsts();
+    DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
+    DEBUG(Partitions.printBlocks());
+
+    if (LDistVerify) {
+      LI->verify();
+      DT->verifyDomTree();
+    }
+
+    ++NumLoopsDistributed;
+    return true;
+  }
+
+  // Analyses used.
+  LoopInfo *LI;
+  LoopAccessAnalysis *LAA;
+  DominatorTree *DT;
+  ScalarEvolution *SE;
+};
+} // anonymous namespace
+
+char LoopDistribute::ID;
+static const char ldist_name[] = "Loop Distribition";
+
+INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+
+namespace llvm {
+FunctionPass *createLoopDistributePass() { return new LoopDistribute(); }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
new file mode 100644
index 0000000..4521640
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -0,0 +1,1112 @@
+//===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an idiom recognizer that transforms simple loops into a
+// non-loop form.  In cases that this kicks in, it can be a significant
+// performance win.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO List:
+//
+// Future loop memory idioms to recognize:
+//   memcmp, memmove, strlen, etc.
+// Future floating point idioms to recognize in -ffast-math mode:
+//   fpowi
+// Future integer operation idioms to recognize:
+//   ctpop, ctlz, cttz
+//
+// Beware that isel's default lowering for ctpop is highly inefficient for
+// i64 and larger types when i64 is legal and the value has few bits set.  It
+// would be good to enhance isel to emit a loop for ctpop in this case.
+//
+// We should enhance the memset/memcpy recognition to handle multiple stores in
+// the loop.  This would handle things like:
+//   void foo(_Complex float *P)
+//     for (i) { __real__(*P) = 0;  __imag__(*P) = 0; }
+//
+// This could recognize common matrix multiplies and dot product idioms and
+// replace them with calls to BLAS (if linked in??).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-idiom"
+
+STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
+STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+
+namespace {
+
+class LoopIdiomRecognize : public LoopPass {
+  Loop *CurLoop;
+  AliasAnalysis *AA;
+  DominatorTree *DT;
+  LoopInfo *LI;
+  ScalarEvolution *SE;
+  TargetLibraryInfo *TLI;
+  const TargetTransformInfo *TTI;
+  const DataLayout *DL;
+
+public:
+  static char ID;
+  explicit LoopIdiomRecognize() : LoopPass(ID) {
+    initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+  /// This transformation requires natural loop information & requires that
+  /// loop preheaders be inserted into the CFG.
+  ///
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addPreservedID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addPreservedID(LCSSAID);
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addPreserved<ScalarEvolutionWrapperPass>();
+    AU.addPreserved<SCEVAAWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addPreserved<BasicAAWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+
+private:
+  typedef SmallVector<StoreInst *, 8> StoreList;
+  StoreList StoreRefsForMemset;
+  StoreList StoreRefsForMemcpy;
+  bool HasMemset;
+  bool HasMemsetPattern;
+  bool HasMemcpy;
+
+  /// \name Countable Loop Idiom Handling
+  /// @{
+
+  bool runOnCountableLoop();
+  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+                      SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+  void collectStores(BasicBlock *BB);
+  bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemcpy);
+  bool processLoopStore(StoreInst *SI, const SCEV *BECount);
+  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
+
+  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
+                               unsigned StoreAlignment, Value *StoredVal,
+                               Instruction *TheStore, const SCEVAddRecExpr *Ev,
+                               const SCEV *BECount, bool NegStride);
+  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
+
+  /// @}
+  /// \name Noncountable Loop Idiom Handling
+  /// @{
+
+  bool runOnNoncountableLoop();
+
+  bool recognizePopcount();
+  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
+                               PHINode *CntPhi, Value *Var);
+
+  /// @}
+};
+
+} // End anonymous namespace.
+
+char LoopIdiomRecognize::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
+                    false, false)
+
+Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }
+
+/// deleteDeadInstruction - Delete this instruction.  Before we do, go through
+/// and zero out all the operands of this instruction.  If any of them become
+/// dead, delete them and the computation tree that feeds them.
+///
+static void deleteDeadInstruction(Instruction *I,
+                                  const TargetLibraryInfo *TLI) {
+  SmallVector<Value *, 16> Operands(I->value_op_begin(), I->value_op_end());
+  I->replaceAllUsesWith(UndefValue::get(I->getType()));
+  I->eraseFromParent();
+  for (Value *Op : Operands)
+    RecursivelyDeleteTriviallyDeadInstructions(Op, TLI);
+}
+
+//===----------------------------------------------------------------------===//
+//
+//          Implementation of LoopIdiomRecognize
+//
+//===----------------------------------------------------------------------===//
+
+bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  CurLoop = L;
+  // If the loop could not be converted to canonical form, it must have an
+  // indirectbr in it, just give up.
+  if (!L->getLoopPreheader())
+    return false;
+
+  // Disable loop idiom recognition if the function's name is a common idiom.
+  StringRef Name = L->getHeader()->getParent()->getName();
+  if (Name == "memset" || Name == "memcpy")
+    return false;
+
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+      *CurLoop->getHeader()->getParent());
+  DL = &CurLoop->getHeader()->getModule()->getDataLayout();
+
+  HasMemset = TLI->has(LibFunc::memset);
+  HasMemsetPattern = TLI->has(LibFunc::memset_pattern16);
+  HasMemcpy = TLI->has(LibFunc::memcpy);
+
+  if (HasMemset || HasMemsetPattern || HasMemcpy)
+    if (SE->hasLoopInvariantBackedgeTakenCount(L))
+      return runOnCountableLoop();
+
+  return runOnNoncountableLoop();
+}
+
+bool LoopIdiomRecognize::runOnCountableLoop() {
+  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
+  assert(!isa<SCEVCouldNotCompute>(BECount) &&
+         "runOnCountableLoop() called on a loop without a predictable"
+         "backedge-taken count");
+
+  // If this loop executes exactly one time, then it should be peeled, not
+  // optimized by this pass.
+  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+    if (BECst->getAPInt() == 0)
+      return false;
+
+  SmallVector<BasicBlock *, 8> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+
+  DEBUG(dbgs() << "loop-idiom Scanning: F["
+               << CurLoop->getHeader()->getParent()->getName() << "] Loop %"
+               << CurLoop->getHeader()->getName() << "\n");
+
+  bool MadeChange = false;
+  // Scan all the blocks in the loop that are not in subloops.
+  for (auto *BB : CurLoop->getBlocks()) {
+    // Ignore blocks in subloops.
+    if (LI->getLoopFor(BB) != CurLoop)
+      continue;
+
+    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
+  }
+  return MadeChange;
+}
+
+static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) {
+  uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
+  assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) &&
+         "Don't overflow unsigned.");
+  return (unsigned)SizeInBits >> 3;
+}
+
+static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) {
+  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
+  return ConstStride->getAPInt().getZExtValue();
+}
+
+/// getMemSetPatternValue - If a strided store of the specified value is safe to
+/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
+/// be passed in.  Otherwise, return null.
+///
+/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
+/// just replicate their input array and then pass on to memset_pattern16.
+static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
+  // If the value isn't a constant, we can't promote it to being in a constant
+  // array.  We could theoretically do a store to an alloca or something, but
+  // that doesn't seem worthwhile.
+  Constant *C = dyn_cast<Constant>(V);
+  if (!C)
+    return nullptr;
+
+  // Only handle simple values that are a power of two bytes in size.
+  uint64_t Size = DL->getTypeSizeInBits(V->getType());
+  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
+    return nullptr;
+
+  // Don't care enough about darwin/ppc to implement this.
+  if (DL->isBigEndian())
+    return nullptr;
+
+  // Convert to size in bytes.
+  Size /= 8;
+
+  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
+  // if the top and bottom are the same (e.g. for vectors and large integers).
+  if (Size > 16)
+    return nullptr;
+
+  // If the constant is exactly 16 bytes, just use it.
+  if (Size == 16)
+    return C;
+
+  // Otherwise, we'll use an array of the constants.
+  unsigned ArraySize = 16 / Size;
+  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
+  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
+}
+
+bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
+                                      bool &ForMemcpy) {
+  // Don't touch volatile stores.
+  if (!SI->isSimple())
+    return false;
+
+  Value *StoredVal = SI->getValueOperand();
+  Value *StorePtr = SI->getPointerOperand();
+
+  // Reject stores that are so large that they overflow an unsigned.
+  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
+  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
+    return false;
+
+  // See if the pointer expression is an AddRec like {base,+,1} on the current
+  // loop, which indicates a strided store.  If we have something else, it's a
+  // random store we can't handle.
+  const SCEVAddRecExpr *StoreEv =
+      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+    return false;
+
+  // Check to see if we have a constant stride.
+  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
+    return false;
+
+  // See if the store can be turned into a memset.
+
+  // If the stored value is a byte-wise value (like i32 -1), then it may be
+  // turned into a memset of i8 -1, assuming that all the consecutive bytes
+  // are stored.  A store of i32 0x01020304 can never be turned into a memset,
+  // but it can be turned into memset_pattern if the target supports it.
+  Value *SplatValue = isBytewiseValue(StoredVal);
+  Constant *PatternValue = nullptr;
+
+  // If we're allowed to form a memset, and the stored value would be
+  // acceptable for memset, use it.
+  if (HasMemset && SplatValue &&
+      // Verify that the stored value is loop invariant.  If not, we can't
+      // promote the memset.
+      CurLoop->isLoopInvariant(SplatValue)) {
+    // It looks like we can use SplatValue.
+    ForMemset = true;
+    return true;
+  } else if (HasMemsetPattern &&
+             // Don't create memset_pattern16s with address spaces.
+             StorePtr->getType()->getPointerAddressSpace() == 0 &&
+             (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
+    // It looks like we can use PatternValue!
+    ForMemset = true;
+    return true;
+  }
+
+  // Otherwise, see if the store can be turned into a memcpy.
+  if (HasMemcpy) {
+    // Check to see if the stride matches the size of the store.  If so, then we
+    // know that every byte is touched in the loop.
+    unsigned Stride = getStoreStride(StoreEv);
+    unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+    if (StoreSize != Stride && StoreSize != -Stride)
+      return false;
+
+    // The store must be feeding a non-volatile load.
+    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+    if (!LI || !LI->isSimple())
+      return false;
+
+    // See if the pointer expression is an AddRec like {base,+,1} on the current
+    // loop, which indicates a strided load.  If we have something else, it's a
+    // random load we can't handle.
+    const SCEVAddRecExpr *LoadEv =
+        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+      return false;
+
+    // The store and load must share the same stride.
+    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
+      return false;
+
+    // Success.  This store can be converted into a memcpy.
+    ForMemcpy = true;
+    return true;
+  }
+  // This store can't be transformed into a memset/memcpy.
+  return false;
+}
+
+void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
+  StoreRefsForMemset.clear();
+  StoreRefsForMemcpy.clear();
+  for (Instruction &I : *BB) {
+    StoreInst *SI = dyn_cast<StoreInst>(&I);
+    if (!SI)
+      continue;
+
+    bool ForMemset = false;
+    bool ForMemcpy = false;
+    // Make sure this is a strided store with a constant stride.
+    if (!isLegalStore(SI, ForMemset, ForMemcpy))
+      continue;
+
+    // Save the store locations.
+    if (ForMemset)
+      StoreRefsForMemset.push_back(SI);
+    else if (ForMemcpy)
+      StoreRefsForMemcpy.push_back(SI);
+  }
+}
+
+/// runOnLoopBlock - Process the specified block, which lives in a counted loop
+/// with the specified backedge count.  This block is known to be in the current
+/// loop and not in any subloops.
+bool LoopIdiomRecognize::runOnLoopBlock(
+    BasicBlock *BB, const SCEV *BECount,
+    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+  // We can only promote stores in this block if they are unconditionally
+  // executed in the loop.  For a block to be unconditionally executed, it has
+  // to dominate all the exit blocks of the loop.  Verify this now.
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (!DT->dominates(BB, ExitBlocks[i]))
+      return false;
+
+  bool MadeChange = false;
+  // Look for store instructions, which may be optimized to memset/memcpy.
+  collectStores(BB);
+
+  // Look for a single store which can be optimized into a memset.
+  for (auto &SI : StoreRefsForMemset)
+    MadeChange |= processLoopStore(SI, BECount);
+
+  // Optimize the store into a memcpy, if it feeds an similarly strided load.
+  for (auto &SI : StoreRefsForMemcpy)
+    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
+
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+    Instruction *Inst = &*I++;
+    // Look for memset instructions, which may be optimized to a larger memset.
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+      WeakVH InstPtr(&*I);
+      if (!processLoopMemSet(MSI, BECount))
+        continue;
+      MadeChange = true;
+
+      // If processing the memset invalidated our iterator, start over from the
+      // top of the block.
+      if (!InstPtr)
+        I = BB->begin();
+      continue;
+    }
+  }
+
+  return MadeChange;
+}
+
+/// processLoopStore - See if this store can be promoted to a memset.
+bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
+  assert(SI->isSimple() && "Expected only non-volatile stores.");
+
+  Value *StoredVal = SI->getValueOperand();
+  Value *StorePtr = SI->getPointerOperand();
+
+  // Check to see if the stride matches the size of the store.  If so, then we
+  // know that every byte is touched in the loop.
+  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+  unsigned Stride = getStoreStride(StoreEv);
+  unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+  if (StoreSize != Stride && StoreSize != -Stride)
+    return false;
+
+  bool NegStride = StoreSize == -Stride;
+
+  // See if we can optimize just this store in isolation.
+  return processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
+                                 StoredVal, SI, StoreEv, BECount, NegStride);
+}
+
+/// processLoopMemSet - See if this memset can be promoted to a large memset.
+bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
+                                           const SCEV *BECount) {
+  // We can only handle non-volatile memsets with a constant size.
+  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
+    return false;
+
+  // If we're not allowed to hack on memset, we fail.
+  if (!TLI->has(LibFunc::memset))
+    return false;
+
+  Value *Pointer = MSI->getDest();
+
+  // See if the pointer expression is an AddRec like {base,+,1} on the current
+  // loop, which indicates a strided store.  If we have something else, it's a
+  // random store we can't handle.
+  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
+  if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
+    return false;
+
+  // Reject memsets that are so large that they overflow an unsigned.
+  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+  if ((SizeInBytes >> 32) != 0)
+    return false;
+
+  // Check to see if the stride matches the size of the memset.  If so, then we
+  // know that every byte is touched in the loop.
+  const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
+
+  // TODO: Could also handle negative stride here someday, that will require the
+  // validity check in mayLoopAccessLocation to be updated though.
+  if (!Stride || MSI->getLength() != Stride->getValue())
+    return false;
+
+  // Verify that the memset value is loop invariant.  If not, we can't promote
+  // the memset.
+  Value *SplatValue = MSI->getValue();
+  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
+    return false;
+
+  return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
+                                 MSI->getAlignment(), SplatValue, MSI, Ev,
+                                 BECount, /*NegStride=*/false);
+}
+
+/// mayLoopAccessLocation - Return true if the specified loop might access the
+/// specified pointer location, which is a loop-strided access.  The 'Access'
+/// argument specifies what the verboten forms of access are (read or write).
+static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
+                                  const SCEV *BECount, unsigned StoreSize,
+                                  AliasAnalysis &AA,
+                                  Instruction *IgnoredStore) {
+  // Get the location that may be stored across the loop.  Since the access is
+  // strided positively through memory, we say that the modified location starts
+  // at the pointer and has infinite size.
+  uint64_t AccessSize = MemoryLocation::UnknownSize;
+
+  // If the loop iterates a fixed number of times, we can refine the access size
+  // to be exactly the size of the memset, which is (BECount+1)*StoreSize
+  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+    AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
+
+  // TODO: For this to be really effective, we have to dive into the pointer
+  // operand in the store.  Store to &A[i] of 100 will always return may alias
+  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
+  // which will then no-alias a store to &A[100].
+  MemoryLocation StoreLoc(Ptr, AccessSize);
+
+  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
+       ++BI)
+    for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I)
+      if (&*I != IgnoredStore && (AA.getModRefInfo(&*I, StoreLoc) & Access))
+        return true;
+
+  return false;
+}
+
+// If we have a negative stride, Start refers to the end of the memory location
+// we're trying to memset.  Therefore, we need to recompute the base pointer,
+// which is just Start - BECount*Size.
+static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
+                                        Type *IntPtr, unsigned StoreSize,
+                                        ScalarEvolution *SE) {
+  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+  if (StoreSize != 1)
+    Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
+                           SCEV::FlagNUW);
+  return SE->getMinusSCEV(Start, Index);
+}
+
+/// processLoopStridedStore - We see a strided store of some value.  If we can
+/// transform this into a memset or memset_pattern in the loop preheader, do so.
+bool LoopIdiomRecognize::processLoopStridedStore(
+    Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
+    Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev,
+    const SCEV *BECount, bool NegStride) {
+  Value *SplatValue = isBytewiseValue(StoredVal);
+  Constant *PatternValue = nullptr;
+
+  if (!SplatValue)
+    PatternValue = getMemSetPatternValue(StoredVal, DL);
+
+  assert((SplatValue || PatternValue) &&
+         "Expected either splat value or pattern value.");
+
+  // The trip count of the loop and the base pointer of the addrec SCEV is
+  // guaranteed to be loop invariant, which means that it should dominate the
+  // header.  This allows us to insert code for it in the preheader.
+  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  IRBuilder<> Builder(Preheader->getTerminator());
+  SCEVExpander Expander(*SE, *DL, "loop-idiom");
+
+  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
+  Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);
+
+  const SCEV *Start = Ev->getStart();
+  // Handle negative strided loops.
+  if (NegStride)
+    Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
+
+  // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
+  // this into a memset in the loop preheader now if we want.  However, this
+  // would be unsafe to do if there is anything else in the loop that may read
+  // or write to the aliased location.  Check for any overlap by generating the
+  // base pointer and checking the region.
+  Value *BasePtr =
+      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
+  if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize,
+                            *AA, TheStore)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
+    return false;
+  }
+
+  // Okay, everything looks good, insert the memset.
+
+  // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
+  // pointer size if it isn't already.
+  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+
+  const SCEV *NumBytesS =
+      SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW);
+  if (StoreSize != 1) {
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+                               SCEV::FlagNUW);
+  }
+
+  Value *NumBytes =
+      Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+
+  CallInst *NewCall;
+  if (SplatValue) {
+    NewCall =
+        Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
+  } else {
+    // Everything is emitted in default address space
+    Type *Int8PtrTy = DestInt8PtrTy;
+
+    Module *M = TheStore->getModule();
+    Value *MSP =
+        M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
+                               Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr);
+
+    // Otherwise we should form a memset_pattern16.  PatternValue is known to be
+    // an constant array of 16-bytes.  Plop the value into a mergable global.
+    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
+                                            GlobalValue::PrivateLinkage,
+                                            PatternValue, ".memset_pattern");
+    GV->setUnnamedAddr(true); // Ok to merge these.
+    GV->setAlignment(16);
+    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
+    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
+  }
+
+  DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
+               << "    from store to: " << *Ev << " at: " << *TheStore << "\n");
+  NewCall->setDebugLoc(TheStore->getDebugLoc());
+
+  // Okay, the memset has been formed.  Zap the original store and anything that
+  // feeds into it.
+  deleteDeadInstruction(TheStore, TLI);
+  ++NumMemSet;
+  return true;
+}
+
+/// If the stored value is a strided load in the same loop with the same stride
+/// this may be transformable into a memcpy.  This kicks in for stuff like
+///   for (i) A[i] = B[i];
+bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
+                                                    const SCEV *BECount) {
+  assert(SI->isSimple() && "Expected only non-volatile stores.");
+
+  Value *StorePtr = SI->getPointerOperand();
+  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+  unsigned Stride = getStoreStride(StoreEv);
+  unsigned StoreSize = getStoreSizeInBytes(SI, DL);
+  bool NegStride = StoreSize == -Stride;
+
+  // The store must be feeding a non-volatile load.
+  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
+  assert(LI->isSimple() && "Expected only non-volatile stores.");
+
+  // See if the pointer expression is an AddRec like {base,+,1} on the current
+  // loop, which indicates a strided load.  If we have something else, it's a
+  // random load we can't handle.
+  const SCEVAddRecExpr *LoadEv =
+      cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+
+  // The trip count of the loop and the base pointer of the addrec SCEV is
+  // guaranteed to be loop invariant, which means that it should dominate the
+  // header.  This allows us to insert code for it in the preheader.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  IRBuilder<> Builder(Preheader->getTerminator());
+  SCEVExpander Expander(*SE, *DL, "loop-idiom");
+
+  const SCEV *StrStart = StoreEv->getStart();
+  unsigned StrAS = SI->getPointerAddressSpace();
+  Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);
+
+  // Handle negative strided loops.
+  if (NegStride)
+    StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);
+
+  // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
+  // this into a memcpy in the loop preheader now if we want.  However, this
+  // would be unsafe to do if there is anything else in the loop that may read
+  // or write the memory region we're storing to.  This includes the load that
+  // feeds the stores.  Check for an alias by generating the base address and
+  // checking everything.
+  Value *StoreBasePtr = Expander.expandCodeFor(
+      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
+
+  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+                            StoreSize, *AA, SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+    return false;
+  }
+
+  const SCEV *LdStart = LoadEv->getStart();
+  unsigned LdAS = LI->getPointerAddressSpace();
+
+  // Handle negative strided loops.
+  if (NegStride)
+    LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);
+
+  // For a memcpy, we have to make sure that the input array is not being
+  // mutated by the loop.
+  Value *LoadBasePtr = Expander.expandCodeFor(
+      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
+
+  if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
+                            *AA, SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
+    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+    return false;
+  }
+
+  // Okay, everything is safe, we can transform this!
+
+  // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
+  // pointer size if it isn't already.
+  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
+
+  const SCEV *NumBytesS =
+      SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
+  if (StoreSize != 1)
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
+                               SCEV::FlagNUW);
+
+  Value *NumBytes =
+      Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
+
+  CallInst *NewCall =
+      Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
+                           std::min(SI->getAlignment(), LI->getAlignment()));
+  NewCall->setDebugLoc(SI->getDebugLoc());
+
+  DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
+               << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+               << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+
+  // Okay, the memcpy has been formed.  Zap the original store and anything that
+  // feeds into it.
+  deleteDeadInstruction(SI, TLI);
+  ++NumMemCpy;
+  return true;
+}
+
+bool LoopIdiomRecognize::runOnNoncountableLoop() {
+  return recognizePopcount();
+}
+
+/// Check if the given conditional branch is based on the comparison between
+/// a variable and zero, and if the variable is non-zero, the control yields to
+/// the loop entry. If the branch matches the behavior, the variable involved
+/// in the comparion is returned. This function will be called to see if the
+/// precondition and postcondition of the loop are in desirable form.
+static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
+  if (!BI || !BI->isConditional())
+    return nullptr;
+
+  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!Cond)
+    return nullptr;
+
+  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
+  if (!CmpZero || !CmpZero->isZero())
+    return nullptr;
+
+  ICmpInst::Predicate Pred = Cond->getPredicate();
+  if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) ||
+      (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry))
+    return Cond->getOperand(0);
+
+  return nullptr;
+}
+
+/// Return true iff the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction counting the population bit.
+/// 2) \p CntPhi is set to the corresponding phi node.
+/// 3) \p Var is set to the value whose population bits are being counted.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+///    if (x0 != 0)
+///      goto loop-exit // the precondition of the loop
+///    cnt0 = init-val;
+///    do {
+///       x1 = phi (x0, x2);
+///       cnt1 = phi(cnt0, cnt2);
+///
+///       cnt2 = cnt1 + 1;
+///        ...
+///       x2 = x1 & (x1 - 1);
+///        ...
+///    } while(x != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
+                                Instruction *&CntInst, PHINode *&CntPhi,
+                                Value *&Var) {
+  // step 1: Check to see if the look-back branch match this pattern:
+  //    "if (a!=0) goto loop-entry".
+  BasicBlock *LoopEntry;
+  Instruction *DefX2, *CountInst;
+  Value *VarX1, *VarX0;
+  PHINode *PhiX, *CountPhi;
+
+  DefX2 = CountInst = nullptr;
+  VarX1 = VarX0 = nullptr;
+  PhiX = CountPhi = nullptr;
+  LoopEntry = *(CurLoop->block_begin());
+
+  // step 1: Check if the loop-back branch is in desirable form.
+  {
+    if (Value *T = matchCondition(
+            dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+      DefX2 = dyn_cast<Instruction>(T);
+    else
+      return false;
+  }
+
+  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
+  {
+    if (!DefX2 || DefX2->getOpcode() != Instruction::And)
+      return false;
+
+    BinaryOperator *SubOneOp;
+
+    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+      VarX1 = DefX2->getOperand(1);
+    else {
+      VarX1 = DefX2->getOperand(0);
+      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+    }
+    if (!SubOneOp)
+      return false;
+
+    Instruction *SubInst = cast<Instruction>(SubOneOp);
+    ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+    if (!Dec ||
+        !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+          (SubInst->getOpcode() == Instruction::Add &&
+           Dec->isAllOnesValue()))) {
+      return false;
+    }
+  }
+
+  // step 3: Check the recurrence of variable X
+  {
+    PhiX = dyn_cast<PHINode>(VarX1);
+    if (!PhiX ||
+        (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
+      return false;
+    }
+  }
+
+  // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
+  {
+    CountInst = nullptr;
+    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+                              IterE = LoopEntry->end();
+         Iter != IterE; Iter++) {
+      Instruction *Inst = &*Iter;
+      if (Inst->getOpcode() != Instruction::Add)
+        continue;
+
+      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+      if (!Inc || !Inc->isOne())
+        continue;
+
+      PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+      if (!Phi || Phi->getParent() != LoopEntry)
+        continue;
+
+      // Check if the result of the instruction is live of the loop.
+      bool LiveOutLoop = false;
+      for (User *U : Inst->users()) {
+        if ((cast<Instruction>(U))->getParent() != LoopEntry) {
+          LiveOutLoop = true;
+          break;
+        }
+      }
+
+      if (LiveOutLoop) {
+        CountInst = Inst;
+        CountPhi = Phi;
+        break;
+      }
+    }
+
+    if (!CountInst)
+      return false;
+  }
+
+  // step 5: check if the precondition is in this form:
+  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+  {
+    auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+    Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
+    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
+      return false;
+
+    CntInst = CountInst;
+    CntPhi = CountPhi;
+    Var = T;
+  }
+
+  return true;
+}
+
+/// Recognizes a population count idiom in a non-countable loop.
+///
+/// If detected, transforms the relevant code to issue the popcount intrinsic
+/// function call, and returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizePopcount() {
+  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
+    return false;
+
+  // Counting population are usually conducted by few arithmetic instructions.
+  // Such instructions can be easily "absorbed" by vacant slots in a
+  // non-compact loop. Therefore, recognizing popcount idiom only makes sense
+  // in a compact loop.
+
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  BasicBlock *LoopBody = *(CurLoop->block_begin());
+  if (LoopBody->size() >= 20) {
+    // The loop is too big, bail out.
+    return false;
+  }
+
+  // It should have a preheader containing nothing but an unconditional branch.
+  BasicBlock *PH = CurLoop->getLoopPreheader();
+  if (!PH)
+    return false;
+  if (&PH->front() != PH->getTerminator())
+    return false;
+  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
+  if (!EntryBI || EntryBI->isConditional())
+    return false;
+
+  // It should have a precondition block where the generated popcount instrinsic
+  // function can be inserted.
+  auto *PreCondBB = PH->getSinglePredecessor();
+  if (!PreCondBB)
+    return false;
+  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+  if (!PreCondBI || PreCondBI->isUnconditional())
+    return false;
+
+  Instruction *CntInst;
+  PHINode *CntPhi;
+  Value *Val;
+  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
+    return false;
+
+  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
+  return true;
+}
+
+static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+                                       DebugLoc DL) {
+  Value *Ops[] = {Val};
+  Type *Tys[] = {Val->getType()};
+
+  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
+  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+  CI->setDebugLoc(DL);
+
+  return CI;
+}
+
+void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
+                                                 Instruction *CntInst,
+                                                 PHINode *CntPhi, Value *Var) {
+  BasicBlock *PreHead = CurLoop->getLoopPreheader();
+  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+  const DebugLoc DL = CntInst->getDebugLoc();
+
+  // Assuming before transformation, the loop is following:
+  //  if (x) // the precondition
+  //     do { cnt++; x &= x - 1; } while(x);
+
+  // Step 1: Insert the ctpop instruction at the end of the precondition block
+  IRBuilder<> Builder(PreCondBr);
+  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+  {
+    PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+    NewCount = PopCntZext =
+        Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+    if (NewCount != PopCnt)
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+    // TripCnt is exactly the number of iterations the loop has
+    TripCnt = NewCount;
+
+    // If the population counter's initial value is not zero, insert Add Inst.
+    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+    if (!InitConst || !InitConst->isZero()) {
+      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+    }
+  }
+
+  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
+  //   "if (NewCount == 0) loop-exit". Without this change, the intrinsic
+  //   function would be partial dead code, and downstream passes will drag
+  //   it back from the precondition block to the preheader.
+  {
+    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+    Value *Opnd0 = PopCntZext;
+    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+    if (PreCond->getOperand(0) != Var)
+      std::swap(Opnd0, Opnd1);
+
+    ICmpInst *NewPreCond = cast<ICmpInst>(
+        Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+    PreCondBr->setCondition(NewPreCond);
+
+    RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
+  }
+
+  // Step 3: Note that the population count is exactly the trip count of the
+  // loop in question, which enable us to to convert the loop from noncountable
+  // loop into a countable one. The benefit is twofold:
+  //
+  //  - If the loop only counts population, the entire loop becomes dead after
+  //    the transformation. It is a lot easier to prove a countable loop dead
+  //    than to prove a noncountable one. (In some C dialects, an infinite loop
+  //    isn't dead even if it computes nothing useful. In general, DCE needs
+  //    to prove a noncountable loop finite before safely delete it.)
+  //
+  //  - If the loop also performs something else, it remains alive.
+  //    Since it is transformed to countable form, it can be aggressively
+  //    optimized by some optimizations which are in general not applicable
+  //    to a noncountable loop.
+  //
+  // After this step, this loop (conceptually) would look like following:
+  //   newcnt = __builtin_ctpop(x);
+  //   t = newcnt;
+  //   if (x)
+  //     do { cnt++; x &= x-1; t--) } while (t > 0);
+  BasicBlock *Body = *(CurLoop->block_begin());
+  {
+    auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+    ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+    Type *Ty = TripCnt->getType();
+
+    PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+    Builder.SetInsertPoint(LbCond);
+    Instruction *TcDec = cast<Instruction>(
+        Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+                          "tcdec", false, true));
+
+    TcPhi->addIncoming(TripCnt, PreHead);
+    TcPhi->addIncoming(TcDec, Body);
+
+    CmpInst::Predicate Pred =
+        (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
+    LbCond->setPredicate(Pred);
+    LbCond->setOperand(0, TcDec);
+    LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+  }
+
+  // Step 4: All the references to the original population counter outside
+  //  the loop are replaced with the NewCount -- the value returned from
+  //  __builtin_ctpop().
+  CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+  // step 5: Forget the "non-computable" trip-count SCEV associated with the
+  //   loop. The loop would otherwise not be deleted even if it becomes empty.
+  SE->forgetLoop(CurLoop);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
new file mode 100644
index 0000000..b4102fe
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -0,0 +1,195 @@
+//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs lightweight instruction simplification on loop bodies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-instsimplify"
+
+STATISTIC(NumSimplified, "Number of redundant instructions simplified");
+
+namespace {
+  class LoopInstSimplify : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopInstSimplify() : LoopPass(ID) {
+      initializeLoopInstSimplifyPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnLoop(Loop*, LPPassManager&) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addPreservedID(LCSSAID);
+      AU.addPreserved<ScalarEvolutionWrapperPass>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+    }
+  };
+}
+
+char LoopInstSimplify::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify",
+                "Simplify instructions in loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify",
+                "Simplify instructions in loops", false, false)
+
+Pass *llvm::createLoopInstSimplifyPass() {
+  return new LoopInstSimplify();
+}
+
+bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  DominatorTreeWrapperPass *DTWP =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+      *L->getHeader()->getParent());
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+  array_pod_sort(ExitBlocks.begin(), ExitBlocks.end());
+
+  SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+
+  // The bit we are stealing from the pointer represents whether this basic
+  // block is the header of a subloop, in which case we only process its phis.
+  typedef PointerIntPair<BasicBlock*, 1> WorklistItem;
+  SmallVector<WorklistItem, 16> VisitStack;
+  SmallPtrSet<BasicBlock*, 32> Visited;
+
+  bool Changed = false;
+  bool LocalChanged;
+  do {
+    LocalChanged = false;
+
+    VisitStack.clear();
+    Visited.clear();
+
+    VisitStack.push_back(WorklistItem(L->getHeader(), false));
+
+    while (!VisitStack.empty()) {
+      WorklistItem Item = VisitStack.pop_back_val();
+      BasicBlock *BB = Item.getPointer();
+      bool IsSubloopHeader = Item.getInt();
+      const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+      // Simplify instructions in the current basic block.
+      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+        Instruction *I = &*BI++;
+
+        // The first time through the loop ToSimplify is empty and we try to
+        // simplify all instructions. On later iterations ToSimplify is not
+        // empty and we only bother simplifying instructions that are in it.
+        if (!ToSimplify->empty() && !ToSimplify->count(I))
+          continue;
+
+        // Don't bother simplifying unused instructions.
+        if (!I->use_empty()) {
+          Value *V = SimplifyInstruction(I, DL, TLI, DT, &AC);
+          if (V && LI->replacementPreservesLCSSAForm(I, V)) {
+            // Mark all uses for resimplification next time round the loop.
+            for (User *U : I->users())
+              Next->insert(cast<Instruction>(U));
+
+            I->replaceAllUsesWith(V);
+            LocalChanged = true;
+            ++NumSimplified;
+          }
+        }
+        bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+        if (res) {
+          // RecursivelyDeleteTriviallyDeadInstruction can remove
+          // more than one instruction, so simply incrementing the
+          // iterator does not work. When instructions get deleted
+          // re-iterate instead.
+          BI = BB->begin(); BE = BB->end();
+          LocalChanged |= res;
+        }
+
+        if (IsSubloopHeader && !isa<PHINode>(I))
+          break;
+      }
+
+      // Add all successors to the worklist, except for loop exit blocks and the
+      // bodies of subloops. We visit the headers of loops so that we can process
+      // their phis, but we contract the rest of the subloop body and only follow
+      // edges leading back to the original loop.
+      for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
+           ++SI) {
+        BasicBlock *SuccBB = *SI;
+        if (!Visited.insert(SuccBB).second)
+          continue;
+
+        const Loop *SuccLoop = LI->getLoopFor(SuccBB);
+        if (SuccLoop && SuccLoop->getHeader() == SuccBB
+                     && L->contains(SuccLoop)) {
+          VisitStack.push_back(WorklistItem(SuccBB, true));
+
+          SmallVector<BasicBlock*, 8> SubLoopExitBlocks;
+          SuccLoop->getExitBlocks(SubLoopExitBlocks);
+
+          for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
+            BasicBlock *ExitBB = SubLoopExitBlocks[i];
+            if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second)
+              VisitStack.push_back(WorklistItem(ExitBB, false));
+          }
+
+          continue;
+        }
+
+        bool IsExitBlock = std::binary_search(ExitBlocks.begin(),
+                                              ExitBlocks.end(), SuccBB);
+        if (IsExitBlock)
+          continue;
+
+        VisitStack.push_back(WorklistItem(SuccBB, false));
+      }
+    }
+
+    // Place the list of instructions to simplify on the next loop iteration
+    // into ToSimplify.
+    std::swap(ToSimplify, Next);
+    Next->clear();
+
+    Changed |= LocalChanged;
+  } while (LocalChanged);
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
new file mode 100644
index 0000000..4295235
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -0,0 +1,1307 @@
+//===- LoopInterchange.cpp - Loop interchange pass------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This Pass handles loop interchange transform.
+// This pass interchanges loops to provide a more cache-friendly memory access
+// patterns.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-interchange"
+
+namespace {
+
+typedef SmallVector<Loop *, 8> LoopVector;
+
+// TODO: Check if we can use a sparse matrix here.
+typedef std::vector<std::vector<char>> CharMatrix;
+
+// Maximum number of dependencies that can be handled in the dependency matrix.
+static const unsigned MaxMemInstrCount = 100;
+
+// Maximum loop depth supported.
+static const unsigned MaxLoopNestDepth = 10;
+
+struct LoopInterchange;
+
+#ifdef DUMP_DEP_MATRICIES
+void printDepMatrix(CharMatrix &DepMatrix) {
+  for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) {
+    std::vector<char> Vec = *I;
+    for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II)
+      DEBUG(dbgs() << *II << " ");
+    DEBUG(dbgs() << "\n");
+  }
+}
+#endif
+
+static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
+                                     Loop *L, DependenceAnalysis *DA) {
+  typedef SmallVector<Value *, 16> ValueVector;
+  ValueVector MemInstr;
+
+  if (Level > MaxLoopNestDepth) {
+    DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+                 << MaxLoopNestDepth << "\n");
+    return false;
+  }
+
+  // For each block.
+  for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end();
+       BB != BE; ++BB) {
+    // Scan the BB and collect legal loads and stores.
+    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E;
+         ++I) {
+      Instruction *Ins = dyn_cast<Instruction>(I);
+      if (!Ins)
+        return false;
+      LoadInst *Ld = dyn_cast<LoadInst>(I);
+      StoreInst *St = dyn_cast<StoreInst>(I);
+      if (!St && !Ld)
+        continue;
+      if (Ld && !Ld->isSimple())
+        return false;
+      if (St && !St->isSimple())
+        return false;
+      MemInstr.push_back(&*I);
+    }
+  }
+
+  DEBUG(dbgs() << "Found " << MemInstr.size()
+               << " Loads and Stores to analyze\n");
+
+  ValueVector::iterator I, IE, J, JE;
+
+  for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
+    for (J = I, JE = MemInstr.end(); J != JE; ++J) {
+      std::vector<char> Dep;
+      Instruction *Src = dyn_cast<Instruction>(*I);
+      Instruction *Des = dyn_cast<Instruction>(*J);
+      if (Src == Des)
+        continue;
+      if (isa<LoadInst>(Src) && isa<LoadInst>(Des))
+        continue;
+      if (auto D = DA->depends(Src, Des, true)) {
+        DEBUG(dbgs() << "Found Dependency between Src=" << Src << " Des=" << Des
+                     << "\n");
+        if (D->isFlow()) {
+          // TODO: Handle Flow dependence.Check if it is sufficient to populate
+          // the Dependence Matrix with the direction reversed.
+          DEBUG(dbgs() << "Flow dependence not handled");
+          return false;
+        }
+        if (D->isAnti()) {
+          DEBUG(dbgs() << "Found Anti dependence \n");
+          unsigned Levels = D->getLevels();
+          char Direction;
+          for (unsigned II = 1; II <= Levels; ++II) {
+            const SCEV *Distance = D->getDistance(II);
+            const SCEVConstant *SCEVConst =
+                dyn_cast_or_null<SCEVConstant>(Distance);
+            if (SCEVConst) {
+              const ConstantInt *CI = SCEVConst->getValue();
+              if (CI->isNegative())
+                Direction = '<';
+              else if (CI->isZero())
+                Direction = '=';
+              else
+                Direction = '>';
+              Dep.push_back(Direction);
+            } else if (D->isScalar(II)) {
+              Direction = 'S';
+              Dep.push_back(Direction);
+            } else {
+              unsigned Dir = D->getDirection(II);
+              if (Dir == Dependence::DVEntry::LT ||
+                  Dir == Dependence::DVEntry::LE)
+                Direction = '<';
+              else if (Dir == Dependence::DVEntry::GT ||
+                       Dir == Dependence::DVEntry::GE)
+                Direction = '>';
+              else if (Dir == Dependence::DVEntry::EQ)
+                Direction = '=';
+              else
+                Direction = '*';
+              Dep.push_back(Direction);
+            }
+          }
+          while (Dep.size() != Level) {
+            Dep.push_back('I');
+          }
+
+          DepMatrix.push_back(Dep);
+          if (DepMatrix.size() > MaxMemInstrCount) {
+            DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+                         << " dependencies inside loop\n");
+            return false;
+          }
+        }
+      }
+    }
+  }
+
+  // We don't have a DepMatrix to check legality return false.
+  if (DepMatrix.size() == 0)
+    return false;
+  return true;
+}
+
+// A loop is moved from index 'from' to an index 'to'. Update the Dependence
+// matrix by exchanging the two columns.
+static void interChangeDepedencies(CharMatrix &DepMatrix, unsigned FromIndx,
+                                   unsigned ToIndx) {
+  unsigned numRows = DepMatrix.size();
+  for (unsigned i = 0; i < numRows; ++i) {
+    char TmpVal = DepMatrix[i][ToIndx];
+    DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
+    DepMatrix[i][FromIndx] = TmpVal;
+  }
+}
+
+// Checks if outermost non '=','S'or'I' dependence in the dependence matrix is
+// '>'
+static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
+                                   unsigned Column) {
+  for (unsigned i = 0; i <= Column; ++i) {
+    if (DepMatrix[Row][i] == '<')
+      return false;
+    if (DepMatrix[Row][i] == '>')
+      return true;
+  }
+  // All dependencies were '=','S' or 'I'
+  return false;
+}
+
+// Checks if no dependence exist in the dependency matrix in Row before Column.
+static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
+                                 unsigned Column) {
+  for (unsigned i = 0; i < Column; ++i) {
+    if (DepMatrix[Row][i] != '=' || DepMatrix[Row][i] != 'S' ||
+        DepMatrix[Row][i] != 'I')
+      return false;
+  }
+  return true;
+}
+
+static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
+                                unsigned OuterLoopId, char InnerDep,
+                                char OuterDep) {
+
+  if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
+    return false;
+
+  if (InnerDep == OuterDep)
+    return true;
+
+  // It is legal to interchange if and only if after interchange no row has a
+  // '>' direction as the leftmost non-'='.
+
+  if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
+    return true;
+
+  if (InnerDep == '<')
+    return true;
+
+  if (InnerDep == '>') {
+    // If OuterLoopId represents outermost loop then interchanging will make the
+    // 1st dependency as '>'
+    if (OuterLoopId == 0)
+      return false;
+
+    // If all dependencies before OuterloopId are '=','S'or 'I'. Then
+    // interchanging will result in this row having an outermost non '='
+    // dependency of '>'
+    if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
+      return true;
+  }
+
+  return false;
+}
+
+// Checks if it is legal to interchange 2 loops.
+// [Theorem] A permutation of the loops in a perfect nest is legal if and only
+// if
+// the direction matrix, after the same permutation is applied to its columns,
+// has no ">" direction as the leftmost non-"=" direction in any row.
+static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
+                                      unsigned InnerLoopId,
+                                      unsigned OuterLoopId) {
+
+  unsigned NumRows = DepMatrix.size();
+  // For each row check if it is valid to interchange.
+  for (unsigned Row = 0; Row < NumRows; ++Row) {
+    char InnerDep = DepMatrix[Row][InnerLoopId];
+    char OuterDep = DepMatrix[Row][OuterLoopId];
+    if (InnerDep == '*' || OuterDep == '*')
+      return false;
+    else if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep,
+                                  OuterDep))
+      return false;
+  }
+  return true;
+}
+
+static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
+
+  DEBUG(dbgs() << "Calling populateWorklist called\n");
+  LoopVector LoopList;
+  Loop *CurrentLoop = &L;
+  const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
+  while (!Vec->empty()) {
+    // The current loop has multiple subloops in it hence it is not tightly
+    // nested.
+    // Discard all loops above it added into Worklist.
+    if (Vec->size() != 1) {
+      LoopList.clear();
+      return;
+    }
+    LoopList.push_back(CurrentLoop);
+    CurrentLoop = Vec->front();
+    Vec = &CurrentLoop->getSubLoops();
+  }
+  LoopList.push_back(CurrentLoop);
+  V.push_back(std::move(LoopList));
+}
+
+static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
+  PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
+  if (InnerIndexVar)
+    return InnerIndexVar;
+  if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
+    return nullptr;
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PhiVar = cast<PHINode>(I);
+    Type *PhiTy = PhiVar->getType();
+    if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+        !PhiTy->isPointerTy())
+      return nullptr;
+    const SCEVAddRecExpr *AddRec =
+        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
+    if (!AddRec || !AddRec->isAffine())
+      continue;
+    const SCEV *Step = AddRec->getStepRecurrence(*SE);
+    const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+    if (!C)
+      continue;
+    // Found the induction variable.
+    // FIXME: Handle loops with more than one induction variable. Note that,
+    // currently, legality makes sure we have only one induction variable.
+    return PhiVar;
+  }
+  return nullptr;
+}
+
+/// LoopInterchangeLegality checks if it is legal to interchange the loop.
+class LoopInterchangeLegality {
+public:
+  LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+                          LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA)
+      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+        PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {}
+
+  /// Check if the loops can be interchanged.
+  bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
+                           CharMatrix &DepMatrix);
+  /// Check if the loop structure is understood. We do not handle triangular
+  /// loops for now.
+  bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+
+  bool currentLimitations();
+
+  bool hasInnerLoopReduction() { return InnerLoopHasReduction; }
+
+private:
+  bool tightlyNested(Loop *Outer, Loop *Inner);
+  bool containsUnsafeInstructionsInHeader(BasicBlock *BB);
+  bool areAllUsesReductions(Instruction *Ins, Loop *L);
+  bool containsUnsafeInstructionsInLatch(BasicBlock *BB);
+  bool findInductionAndReductions(Loop *L,
+                                  SmallVector<PHINode *, 8> &Inductions,
+                                  SmallVector<PHINode *, 8> &Reductions);
+  Loop *OuterLoop;
+  Loop *InnerLoop;
+
+  ScalarEvolution *SE;
+  LoopInfo *LI;
+  DominatorTree *DT;
+  bool PreserveLCSSA;
+
+  bool InnerLoopHasReduction;
+};
+
+/// LoopInterchangeProfitability checks if it is profitable to interchange the
+/// loop.
+class LoopInterchangeProfitability {
+public:
+  LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE)
+      : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {}
+
+  /// Check if the loop interchange is profitable.
+  bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
+                    CharMatrix &DepMatrix);
+
+private:
+  int getInstrOrderCost();
+
+  Loop *OuterLoop;
+  Loop *InnerLoop;
+
+  /// Scev analysis.
+  ScalarEvolution *SE;
+};
+
+/// LoopInterchangeTransform interchanges the loop.
+class LoopInterchangeTransform {
+public:
+  LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+                           LoopInfo *LI, DominatorTree *DT,
+                           BasicBlock *LoopNestExit,
+                           bool InnerLoopContainsReductions)
+      : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+        LoopExit(LoopNestExit),
+        InnerLoopHasReduction(InnerLoopContainsReductions) {}
+
+  /// Interchange OuterLoop and InnerLoop.
+  bool transform();
+  void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
+  void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
+
+private:
+  void splitInnerLoopLatch(Instruction *);
+  void splitOuterLoopLatch();
+  void splitInnerLoopHeader();
+  bool adjustLoopLinks();
+  void adjustLoopPreheaders();
+  void adjustOuterLoopPreheader();
+  void adjustInnerLoopPreheader();
+  bool adjustLoopBranches();
+  void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred,
+                           BasicBlock *NewPred);
+
+  Loop *OuterLoop;
+  Loop *InnerLoop;
+
+  /// Scev analysis.
+  ScalarEvolution *SE;
+  LoopInfo *LI;
+  DominatorTree *DT;
+  BasicBlock *LoopExit;
+  bool InnerLoopHasReduction;
+};
+
+// Main LoopInterchange Pass.
+struct LoopInterchange : public FunctionPass {
+  static char ID;
+  ScalarEvolution *SE;
+  LoopInfo *LI;
+  DependenceAnalysis *DA;
+  DominatorTree *DT;
+  bool PreserveLCSSA;
+  LoopInterchange()
+      : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) {
+    initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<DependenceAnalysis>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+  }
+
+  bool runOnFunction(Function &F) override {
+    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    DA = &getAnalysis<DependenceAnalysis>();
+    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+    DT = DTWP ? &DTWP->getDomTree() : nullptr;
+    PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+    // Build up a worklist of loop pairs to analyze.
+    SmallVector<LoopVector, 8> Worklist;
+
+    for (Loop *L : *LI)
+      populateWorklist(*L, Worklist);
+
+    DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
+    bool Changed = true;
+    while (!Worklist.empty()) {
+      LoopVector LoopList = Worklist.pop_back_val();
+      Changed = processLoopList(LoopList, F);
+    }
+    return Changed;
+  }
+
+  bool isComputableLoopNest(LoopVector LoopList) {
+    for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) {
+      Loop *L = *I;
+      const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
+      if (ExitCountOuter == SE->getCouldNotCompute()) {
+        DEBUG(dbgs() << "Couldn't compute Backedge count\n");
+        return false;
+      }
+      if (L->getNumBackEdges() != 1) {
+        DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+        return false;
+      }
+      if (!L->getExitingBlock()) {
+        DEBUG(dbgs() << "Loop Doesn't have unique exit block\n");
+        return false;
+      }
+    }
+    return true;
+  }
+
+  unsigned selectLoopForInterchange(LoopVector LoopList) {
+    // TODO: Add a better heuristic to select the loop to be interchanged based
+    // on the dependence matrix. Currently we select the innermost loop.
+    return LoopList.size() - 1;
+  }
+
+  bool processLoopList(LoopVector LoopList, Function &F) {
+
+    bool Changed = false;
+    CharMatrix DependencyMatrix;
+    if (LoopList.size() < 2) {
+      DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+      return false;
+    }
+    if (!isComputableLoopNest(LoopList)) {
+      DEBUG(dbgs() << "Not vaild loop candidate for interchange\n");
+      return false;
+    }
+    Loop *OuterMostLoop = *(LoopList.begin());
+
+    DEBUG(dbgs() << "Processing LoopList of size = " << LoopList.size()
+                 << "\n");
+
+    if (!populateDependencyMatrix(DependencyMatrix, LoopList.size(),
+                                  OuterMostLoop, DA)) {
+      DEBUG(dbgs() << "Populating Dependency matrix failed\n");
+      return false;
+    }
+#ifdef DUMP_DEP_MATRICIES
+    DEBUG(dbgs() << "Dependence before inter change \n");
+    printDepMatrix(DependencyMatrix);
+#endif
+
+    BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
+    BranchInst *OuterMostLoopLatchBI =
+        dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
+    if (!OuterMostLoopLatchBI)
+      return false;
+
+    // Since we currently do not handle LCSSA PHI's any failure in loop
+    // condition will now branch to LoopNestExit.
+    // TODO: This should be removed once we handle LCSSA PHI nodes.
+
+    // Get the Outermost loop exit.
+    BasicBlock *LoopNestExit;
+    if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
+      LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
+    else
+      LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
+
+    if (isa<PHINode>(LoopNestExit->begin())) {
+      DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
+                      "since on failure all loops branch to loop nest exit.\n");
+      return false;
+    }
+
+    unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+    // Move the selected loop outwards to the best possible position.
+    for (unsigned i = SelecLoopId; i > 0; i--) {
+      bool Interchanged =
+          processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
+      if (!Interchanged)
+        return Changed;
+      // Loops interchanged reflect the same in LoopList
+      std::swap(LoopList[i - 1], LoopList[i]);
+
+      // Update the DependencyMatrix
+      interChangeDepedencies(DependencyMatrix, i, i - 1);
+      DT->recalculate(F);
+#ifdef DUMP_DEP_MATRICIES
+      DEBUG(dbgs() << "Dependence after inter change \n");
+      printDepMatrix(DependencyMatrix);
+#endif
+      Changed |= Interchanged;
+    }
+    return Changed;
+  }
+
+  bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
+                   unsigned OuterLoopId, BasicBlock *LoopNestExit,
+                   std::vector<std::vector<char>> &DependencyMatrix) {
+
+    DEBUG(dbgs() << "Processing Innder Loop Id = " << InnerLoopId
+                 << " and OuterLoopId = " << OuterLoopId << "\n");
+    Loop *InnerLoop = LoopList[InnerLoopId];
+    Loop *OuterLoop = LoopList[OuterLoopId];
+
+    LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
+                                PreserveLCSSA);
+    if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+      DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
+      return false;
+    }
+    DEBUG(dbgs() << "Loops are legal to interchange\n");
+    LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
+    if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+      DEBUG(dbgs() << "Interchanging Loops not profitable\n");
+      return false;
+    }
+
+    LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
+                                 LoopNestExit, LIL.hasInnerLoopReduction());
+    LIT.transform();
+    DEBUG(dbgs() << "Loops interchanged\n");
+    return true;
+  }
+};
+
+} // end of namespace
+bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
+  return !std::any_of(Ins->user_begin(), Ins->user_end(), [=](User *U) -> bool {
+    PHINode *UserIns = dyn_cast<PHINode>(U);
+    RecurrenceDescriptor RD;
+    return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD);
+  });
+}
+
+bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
+    BasicBlock *BB) {
+  for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+    // Load corresponding to reduction PHI's are safe while concluding if
+    // tightly nested.
+    if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+      if (!areAllUsesReductions(L, InnerLoop))
+        return true;
+    } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+      return true;
+  }
+  return false;
+}
+
+bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
+    BasicBlock *BB) {
+  for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+    // Stores corresponding to reductions are safe while concluding if tightly
+    // nested.
+    if (StoreInst *L = dyn_cast<StoreInst>(I)) {
+      PHINode *PHI = dyn_cast<PHINode>(L->getOperand(0));
+      if (!PHI)
+        return true;
+    } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+      return true;
+  }
+  return false;
+}
+
+bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
+  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+  DEBUG(dbgs() << "Checking if Loops are Tightly Nested\n");
+
+  // A perfectly nested loop will not have any branch in between the outer and
+  // inner block i.e. outer header will branch to either inner preheader and
+  // outerloop latch.
+  BranchInst *outerLoopHeaderBI =
+      dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+  if (!outerLoopHeaderBI)
+    return false;
+  unsigned num = outerLoopHeaderBI->getNumSuccessors();
+  for (unsigned i = 0; i < num; i++) {
+    if (outerLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader &&
+        outerLoopHeaderBI->getSuccessor(i) != OuterLoopLatch)
+      return false;
+  }
+
+  DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch \n");
+  // We do not have any basic block in between now make sure the outer header
+  // and outer loop latch doesn't contain any unsafe instructions.
+  if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
+      containsUnsafeInstructionsInLatch(OuterLoopLatch))
+    return false;
+
+  DEBUG(dbgs() << "Loops are perfectly nested \n");
+  // We have a perfect loop nest.
+  return true;
+}
+
+
+bool LoopInterchangeLegality::isLoopStructureUnderstood(
+    PHINode *InnerInduction) {
+
+  unsigned Num = InnerInduction->getNumOperands();
+  BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
+  for (unsigned i = 0; i < Num; ++i) {
+    Value *Val = InnerInduction->getOperand(i);
+    if (isa<Constant>(Val))
+      continue;
+    Instruction *I = dyn_cast<Instruction>(Val);
+    if (!I)
+      return false;
+    // TODO: Handle triangular loops.
+    // e.g. for(int i=0;i<N;i++)
+    //        for(int j=i;j<N;j++)
+    unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+    if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+            InnerLoopPreheader &&
+        !OuterLoop->isLoopInvariant(I)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool LoopInterchangeLegality::findInductionAndReductions(
+    Loop *L, SmallVector<PHINode *, 8> &Inductions,
+    SmallVector<PHINode *, 8> &Reductions) {
+  if (!L->getLoopLatch() || !L->getLoopPredecessor())
+    return false;
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+    RecurrenceDescriptor RD;
+    InductionDescriptor ID;
+    PHINode *PHI = cast<PHINode>(I);
+    if (InductionDescriptor::isInductionPHI(PHI, SE, ID))
+      Inductions.push_back(PHI);
+    else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
+      Reductions.push_back(PHI);
+    else {
+      DEBUG(
+          dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
+      return false;
+    }
+  }
+  return true;
+}
+
+static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
+  for (auto I = Block->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PHI = cast<PHINode>(I);
+    // Reduction lcssa phi will have only 1 incoming block that from loop latch.
+    if (PHI->getNumIncomingValues() > 1)
+      return false;
+    Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0));
+    if (!Ins)
+      return false;
+    // Incoming value for lcssa phi's in outer loop exit can only be inner loop
+    // exits lcssa phi else it would not be tightly nested.
+    if (!isa<PHINode>(Ins) && isOuterLoopExitBlock)
+      return false;
+  }
+  return true;
+}
+
+static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
+                                         BasicBlock *LoopHeader) {
+  if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) {
+    unsigned Num = BI->getNumSuccessors();
+    assert(Num == 2);
+    for (unsigned i = 0; i < Num; ++i) {
+      if (BI->getSuccessor(i) == LoopHeader)
+        continue;
+      return BI->getSuccessor(i);
+    }
+  }
+  return nullptr;
+}
+
+// This function indicates the current limitations in the transform as a result
+// of which we do not proceed.
+bool LoopInterchangeLegality::currentLimitations() {
+
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+
+  PHINode *InnerInductionVar;
+  SmallVector<PHINode *, 8> Inductions;
+  SmallVector<PHINode *, 8> Reductions;
+  if (!findInductionAndReductions(InnerLoop, Inductions, Reductions))
+    return true;
+
+  // TODO: Currently we handle only loops with 1 induction variable.
+  if (Inductions.size() != 1) {
+    DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
+                 << "Failed to interchange due to current limitation\n");
+    return true;
+  }
+  if (Reductions.size() > 0)
+    InnerLoopHasReduction = true;
+
+  InnerInductionVar = Inductions.pop_back_val();
+  Reductions.clear();
+  if (!findInductionAndReductions(OuterLoop, Inductions, Reductions))
+    return true;
+
+  // Outer loop cannot have reduction because then loops will not be tightly
+  // nested.
+  if (!Reductions.empty())
+    return true;
+  // TODO: Currently we handle only loops with 1 induction variable.
+  if (Inductions.size() != 1)
+    return true;
+
+  // TODO: Triangular loops are not handled for now.
+  if (!isLoopStructureUnderstood(InnerInductionVar)) {
+    DEBUG(dbgs() << "Loop structure not understood by pass\n");
+    return true;
+  }
+
+  // TODO: We only handle LCSSA PHI's corresponding to reduction for now.
+  BasicBlock *LoopExitBlock =
+      getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
+  if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true))
+    return true;
+
+  LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
+  if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false))
+    return true;
+
+  // TODO: Current limitation: Since we split the inner loop latch at the point
+  // were induction variable is incremented (induction.next); We cannot have
+  // more than 1 user of induction.next since it would result in broken code
+  // after split.
+  // e.g.
+  // for(i=0;i<N;i++) {
+  //    for(j = 0;j<M;j++) {
+  //      A[j+1][i+2] = A[j][i]+k;
+  //  }
+  // }
+  bool FoundInduction = false;
+  Instruction *InnerIndexVarInc = nullptr;
+  if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
+    InnerIndexVarInc =
+        dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
+  else
+    InnerIndexVarInc =
+        dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
+
+  if (!InnerIndexVarInc)
+    return true;
+
+  // Since we split the inner loop latch on this induction variable. Make sure
+  // we do not have any instruction between the induction variable and branch
+  // instruction.
+
+  for (auto I = InnerLoopLatch->rbegin(), E = InnerLoopLatch->rend();
+       I != E && !FoundInduction; ++I) {
+    if (isa<BranchInst>(*I) || isa<CmpInst>(*I) || isa<TruncInst>(*I))
+      continue;
+    const Instruction &Ins = *I;
+    // We found an instruction. If this is not induction variable then it is not
+    // safe to split this loop latch.
+    if (!Ins.isIdenticalTo(InnerIndexVarInc))
+      return true;
+    else
+      FoundInduction = true;
+  }
+  // The loop latch ended and we didn't find the induction variable return as
+  // current limitation.
+  if (!FoundInduction)
+    return true;
+
+  return false;
+}
+
+bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
+                                                  unsigned OuterLoopId,
+                                                  CharMatrix &DepMatrix) {
+
+  if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
+    DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+                 << "and OuterLoopId = " << OuterLoopId
+                 << "due to dependence\n");
+    return false;
+  }
+
+  // Create unique Preheaders if we already do not have one.
+  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+
+  // Create  a unique outer preheader -
+  // 1) If OuterLoop preheader is not present.
+  // 2) If OuterLoop Preheader is same as OuterLoop Header
+  // 3) If OuterLoop Preheader is same as Header of the previous loop.
+  // 4) If OuterLoop Preheader is Entry node.
+  if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() ||
+      isa<PHINode>(OuterLoopPreHeader->begin()) ||
+      !OuterLoopPreHeader->getUniquePredecessor()) {
+    OuterLoopPreHeader =
+        InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA);
+  }
+
+  if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() ||
+      InnerLoopPreHeader == OuterLoop->getHeader()) {
+    InnerLoopPreHeader =
+        InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA);
+  }
+
+  // TODO: The loops could not be interchanged due to current limitations in the
+  // transform module.
+  if (currentLimitations()) {
+    DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+    return false;
+  }
+
+  // Check if the loops are tightly nested.
+  if (!tightlyNested(OuterLoop, InnerLoop)) {
+    DEBUG(dbgs() << "Loops not tightly nested\n");
+    return false;
+  }
+
+  return true;
+}
+
+int LoopInterchangeProfitability::getInstrOrderCost() {
+  unsigned GoodOrder, BadOrder;
+  BadOrder = GoodOrder = 0;
+  for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end();
+       BI != BE; ++BI) {
+    for (auto I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) {
+      const Instruction &Ins = *I;
+      if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
+        unsigned NumOp = GEP->getNumOperands();
+        bool FoundInnerInduction = false;
+        bool FoundOuterInduction = false;
+        for (unsigned i = 0; i < NumOp; ++i) {
+          const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
+          const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
+          if (!AR)
+            continue;
+
+          // If we find the inner induction after an outer induction e.g.
+          // for(int i=0;i<N;i++)
+          //   for(int j=0;j<N;j++)
+          //     A[i][j] = A[i-1][j-1]+k;
+          // then it is a good order.
+          if (AR->getLoop() == InnerLoop) {
+            // We found an InnerLoop induction after OuterLoop induction. It is
+            // a good order.
+            FoundInnerInduction = true;
+            if (FoundOuterInduction) {
+              GoodOrder++;
+              break;
+            }
+          }
+          // If we find the outer induction after an inner induction e.g.
+          // for(int i=0;i<N;i++)
+          //   for(int j=0;j<N;j++)
+          //     A[j][i] = A[j-1][i-1]+k;
+          // then it is a bad order.
+          if (AR->getLoop() == OuterLoop) {
+            // We found an OuterLoop induction after InnerLoop induction. It is
+            // a bad order.
+            FoundOuterInduction = true;
+            if (FoundInnerInduction) {
+              BadOrder++;
+              break;
+            }
+          }
+        }
+      }
+    }
+  }
+  return GoodOrder - BadOrder;
+}
+
+static bool isProfitabileForVectorization(unsigned InnerLoopId,
+                                          unsigned OuterLoopId,
+                                          CharMatrix &DepMatrix) {
+  // TODO: Improve this heuristic to catch more cases.
+  // If the inner loop is loop independent or doesn't carry any dependency it is
+  // profitable to move this to outer position.
+  unsigned Row = DepMatrix.size();
+  for (unsigned i = 0; i < Row; ++i) {
+    if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I')
+      return false;
+    // TODO: We need to improve this heuristic.
+    if (DepMatrix[i][OuterLoopId] != '=')
+      return false;
+  }
+  // If outer loop has dependence and inner loop is loop independent then it is
+  // profitable to interchange to enable parallelism.
+  return true;
+}
+
+bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
+                                                unsigned OuterLoopId,
+                                                CharMatrix &DepMatrix) {
+
+  // TODO: Add better profitability checks.
+  // e.g
+  // 1) Construct dependency matrix and move the one with no loop carried dep
+  //    inside to enable vectorization.
+
+  // This is rough cost estimation algorithm. It counts the good and bad order
+  // of induction variables in the instruction and allows reordering if number
+  // of bad orders is more than good.
+  int Cost = 0;
+  Cost += getInstrOrderCost();
+  DEBUG(dbgs() << "Cost = " << Cost << "\n");
+  if (Cost < 0)
+    return true;
+
+  // It is not profitable as per current cache profitability model. But check if
+  // we can move this loop outside to improve parallelism.
+  bool ImprovesPar =
+      isProfitabileForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+  return ImprovesPar;
+}
+
+void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
+                                               Loop *InnerLoop) {
+  for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E;
+       ++I) {
+    if (*I == InnerLoop) {
+      OuterLoop->removeChildLoop(I);
+      return;
+    }
+  }
+  llvm_unreachable("Couldn't find loop");
+}
+
+void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
+                                                Loop *OuterLoop) {
+  Loop *OuterLoopParent = OuterLoop->getParentLoop();
+  if (OuterLoopParent) {
+    // Remove the loop from its parent loop.
+    removeChildLoop(OuterLoopParent, OuterLoop);
+    removeChildLoop(OuterLoop, InnerLoop);
+    OuterLoopParent->addChildLoop(InnerLoop);
+  } else {
+    removeChildLoop(OuterLoop, InnerLoop);
+    LI->changeTopLevelLoop(OuterLoop, InnerLoop);
+  }
+
+  while (!InnerLoop->empty())
+    OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin()));
+
+  InnerLoop->addChildLoop(OuterLoop);
+}
+
+bool LoopInterchangeTransform::transform() {
+
+  DEBUG(dbgs() << "transform\n");
+  bool Transformed = false;
+  Instruction *InnerIndexVar;
+
+  if (InnerLoop->getSubLoops().size() == 0) {
+    BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+    DEBUG(dbgs() << "Calling Split Inner Loop\n");
+    PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
+    if (!InductionPHI) {
+      DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+      return false;
+    }
+
+    if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
+    else
+      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
+
+    //
+    // Split at the place were the induction variable is
+    // incremented/decremented.
+    // TODO: This splitting logic may not work always. Fix this.
+    splitInnerLoopLatch(InnerIndexVar);
+    DEBUG(dbgs() << "splitInnerLoopLatch Done\n");
+
+    // Splits the inner loops phi nodes out into a separate basic block.
+    splitInnerLoopHeader();
+    DEBUG(dbgs() << "splitInnerLoopHeader Done\n");
+  }
+
+  Transformed |= adjustLoopLinks();
+  if (!Transformed) {
+    DEBUG(dbgs() << "adjustLoopLinks Failed\n");
+    return false;
+  }
+
+  restructureLoops(InnerLoop, OuterLoop);
+  return true;
+}
+
+void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
+  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+  BasicBlock *InnerLoopLatchPred = InnerLoopLatch;
+  InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
+}
+
+void LoopInterchangeTransform::splitOuterLoopLatch() {
+  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+  BasicBlock *OuterLatchLcssaPhiBlock = OuterLoopLatch;
+  OuterLoopLatch = SplitBlock(OuterLatchLcssaPhiBlock,
+                              OuterLoopLatch->getFirstNonPHI(), DT, LI);
+}
+
+void LoopInterchangeTransform::splitInnerLoopHeader() {
+
+  // Split the inner loop header out. Here make sure that the reduction PHI's
+  // stay in the innerloop body.
+  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  if (InnerLoopHasReduction) {
+    // FIXME: Check if the induction PHI will always be the first PHI.
+    BasicBlock *New = InnerLoopHeader->splitBasicBlock(
+        ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
+    if (LI)
+      if (Loop *L = LI->getLoopFor(InnerLoopHeader))
+        L->addBasicBlockToLoop(New, *LI);
+
+    // Adjust Reduction PHI's in the block.
+    SmallVector<PHINode *, 8> PHIVec;
+    for (auto I = New->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PHI = dyn_cast<PHINode>(I);
+      Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader);
+      PHI->replaceAllUsesWith(V);
+      PHIVec.push_back((PHI));
+    }
+    for (auto I = PHIVec.begin(), E = PHIVec.end(); I != E; ++I) {
+      PHINode *P = *I;
+      P->eraseFromParent();
+    }
+  } else {
+    SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+  }
+
+  DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
+                  "InnerLoopHeader \n");
+}
+
+/// \brief Move all instructions except the terminator from FromBB right before
+/// InsertBefore
+static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
+  auto &ToList = InsertBefore->getParent()->getInstList();
+  auto &FromList = FromBB->getInstList();
+
+  ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
+                FromBB->getTerminator()->getIterator());
+}
+
+void LoopInterchangeTransform::adjustOuterLoopPreheader() {
+  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+  BasicBlock *InnerPreHeader = InnerLoop->getLoopPreheader();
+
+  moveBBContents(OuterLoopPreHeader, InnerPreHeader->getTerminator());
+}
+
+void LoopInterchangeTransform::adjustInnerLoopPreheader() {
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *OuterHeader = OuterLoop->getHeader();
+
+  moveBBContents(InnerLoopPreHeader, OuterHeader->getTerminator());
+}
+
+void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
+                                                   BasicBlock *OldPred,
+                                                   BasicBlock *NewPred) {
+  for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PHI = cast<PHINode>(I);
+    unsigned Num = PHI->getNumIncomingValues();
+    for (unsigned i = 0; i < Num; ++i) {
+      if (PHI->getIncomingBlock(i) == OldPred)
+        PHI->setIncomingBlock(i, NewPred);
+    }
+  }
+}
+
+bool LoopInterchangeTransform::adjustLoopBranches() {
+
+  DEBUG(dbgs() << "adjustLoopBranches called\n");
+  // Adjust the loop preheader
+  BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+  BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+  BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
+  BasicBlock *InnerLoopLatchPredecessor =
+      InnerLoopLatch->getUniquePredecessor();
+  BasicBlock *InnerLoopLatchSuccessor;
+  BasicBlock *OuterLoopLatchSuccessor;
+
+  BranchInst *OuterLoopLatchBI =
+      dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
+  BranchInst *InnerLoopLatchBI =
+      dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
+  BranchInst *OuterLoopHeaderBI =
+      dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+  BranchInst *InnerLoopHeaderBI =
+      dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
+
+  if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
+      !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
+      !InnerLoopHeaderBI)
+    return false;
+
+  BranchInst *InnerLoopLatchPredecessorBI =
+      dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
+  BranchInst *OuterLoopPredecessorBI =
+      dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
+
+  if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
+    return false;
+  BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
+  if (!InnerLoopHeaderSuccessor)
+    return false;
+
+  // Adjust Loop Preheader and headers
+
+  unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
+  for (unsigned i = 0; i < NumSucc; ++i) {
+    if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
+      OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
+  }
+
+  NumSucc = OuterLoopHeaderBI->getNumSuccessors();
+  for (unsigned i = 0; i < NumSucc; ++i) {
+    if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
+      OuterLoopHeaderBI->setSuccessor(i, LoopExit);
+    else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
+      OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
+  }
+
+  // Adjust reduction PHI's now that the incoming block has changed.
+  updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
+                      OuterLoopHeader);
+
+  BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
+  InnerLoopHeaderBI->eraseFromParent();
+
+  // -------------Adjust loop latches-----------
+  if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
+    InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
+  else
+    InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
+
+  NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
+  for (unsigned i = 0; i < NumSucc; ++i) {
+    if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
+      InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
+  }
+
+  // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with
+  // the value and remove this PHI node from inner loop.
+  SmallVector<PHINode *, 8> LcssaVec;
+  for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) {
+    PHINode *LcssaPhi = cast<PHINode>(I);
+    LcssaVec.push_back(LcssaPhi);
+  }
+  for (auto I = LcssaVec.begin(), E = LcssaVec.end(); I != E; ++I) {
+    PHINode *P = *I;
+    Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
+    P->replaceAllUsesWith(Incoming);
+    P->eraseFromParent();
+  }
+
+  if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
+    OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
+  else
+    OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
+
+  if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
+    InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
+  else
+    InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
+
+  updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
+
+  if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
+    OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
+  } else {
+    OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
+  }
+
+  return true;
+}
+void LoopInterchangeTransform::adjustLoopPreheaders() {
+
+  // We have interchanged the preheaders so we need to interchange the data in
+  // the preheader as well.
+  // This is because the content of inner preheader was previously executed
+  // inside the outer loop.
+  BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+  BranchInst *InnerTermBI =
+      cast<BranchInst>(InnerLoopPreHeader->getTerminator());
+
+  // These instructions should now be executed inside the loop.
+  // Move instruction into a new block after outer header.
+  moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator());
+  // These instructions were not executed previously in the loop so move them to
+  // the older inner loop preheader.
+  moveBBContents(OuterLoopPreHeader, InnerTermBI);
+}
+
+bool LoopInterchangeTransform::adjustLoopLinks() {
+
+  // Adjust all branches in the inner and outer loop.
+  bool Changed = adjustLoopBranches();
+  if (Changed)
+    adjustLoopPreheaders();
+  return Changed;
+}
+
+char LoopInterchange::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
+                      "Interchanges loops for cache reuse", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+
+INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
+                    "Interchanges loops for cache reuse", false, false)
+
+Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
new file mode 100644
index 0000000..1064d08
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -0,0 +1,566 @@
+//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implement a loop-aware load elimination pass.
+//
+// It uses LoopAccessAnalysis to identify loop-carried dependences with a
+// distance of one between stores and loads.  These form the candidates for the
+// transformation.  The source value of each store then propagated to the user
+// of the corresponding load.  This makes the load dead.
+//
+// The pass can also version the loop and add memchecks in order to prove that
+// may-aliasing stores can't change the value in memory before it's read by the
+// load.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <forward_list>
+
+#define LLE_OPTION "loop-load-elim"
+#define DEBUG_TYPE LLE_OPTION
+
+using namespace llvm;
+
+static cl::opt<unsigned> CheckPerElim(
+    "runtime-check-per-loop-load-elim", cl::Hidden,
+    cl::desc("Max number of memchecks allowed per eliminated load on average"),
+    cl::init(1));
+
+static cl::opt<unsigned> LoadElimSCEVCheckThreshold(
+    "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden,
+    cl::desc("The maximum number of SCEV checks allowed for Loop "
+             "Load Elimination"));
+
+
+STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
+
+namespace {
+
+/// \brief Represent a store-to-forwarding candidate.
+struct StoreToLoadForwardingCandidate {
+  LoadInst *Load;
+  StoreInst *Store;
+
+  StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
+      : Load(Load), Store(Store) {}
+
+  /// \brief Return true if the dependence from the store to the load has a
+  /// distance of one.  E.g. A[i+1] = A[i]
+  bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const {
+    Value *LoadPtr = Load->getPointerOperand();
+    Value *StorePtr = Store->getPointerOperand();
+    Type *LoadPtrType = LoadPtr->getType();
+    Type *LoadType = LoadPtrType->getPointerElementType();
+
+    assert(LoadPtrType->getPointerAddressSpace() ==
+               StorePtr->getType()->getPointerAddressSpace() &&
+           LoadType == StorePtr->getType()->getPointerElementType() &&
+           "Should be a known dependence");
+
+    auto &DL = Load->getParent()->getModule()->getDataLayout();
+    unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
+
+    auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr));
+    auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr));
+
+    // We don't need to check non-wrapping here because forward/backward
+    // dependence wouldn't be valid if these weren't monotonic accesses.
+    auto *Dist = cast<SCEVConstant>(
+        PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
+    const APInt &Val = Dist->getAPInt();
+    return Val.abs() == TypeByteSize;
+  }
+
+  Value *getLoadPtr() const { return Load->getPointerOperand(); }
+
+#ifndef NDEBUG
+  friend raw_ostream &operator<<(raw_ostream &OS,
+                                 const StoreToLoadForwardingCandidate &Cand) {
+    OS << *Cand.Store << " -->\n";
+    OS.indent(2) << *Cand.Load << "\n";
+    return OS;
+  }
+#endif
+};
+
+/// \brief Check if the store dominates all latches, so as long as there is no
+/// intervening store this value will be loaded in the next iteration.
+bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
+                                  DominatorTree *DT) {
+  SmallVector<BasicBlock *, 8> Latches;
+  L->getLoopLatches(Latches);
+  return std::all_of(Latches.begin(), Latches.end(),
+                     [&](const BasicBlock *Latch) {
+                       return DT->dominates(StoreBlock, Latch);
+                     });
+}
+
+/// \brief The per-loop class that does most of the work.
+class LoadEliminationForLoop {
+public:
+  LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
+                         DominatorTree *DT)
+      : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.PSE) {}
+
+  /// \brief Look through the loop-carried and loop-independent dependences in
+  /// this loop and find store->load dependences.
+  ///
+  /// Note that no candidate is returned if LAA has failed to analyze the loop
+  /// (e.g. if it's not bottom-tested, contains volatile memops, etc.)
+  std::forward_list<StoreToLoadForwardingCandidate>
+  findStoreToLoadDependences(const LoopAccessInfo &LAI) {
+    std::forward_list<StoreToLoadForwardingCandidate> Candidates;
+
+    const auto *Deps = LAI.getDepChecker().getDependences();
+    if (!Deps)
+      return Candidates;
+
+    // Find store->load dependences (consequently true dep).  Both lexically
+    // forward and backward dependences qualify.  Disqualify loads that have
+    // other unknown dependences.
+
+    SmallSet<Instruction *, 4> LoadsWithUnknownDepedence;
+
+    for (const auto &Dep : *Deps) {
+      Instruction *Source = Dep.getSource(LAI);
+      Instruction *Destination = Dep.getDestination(LAI);
+
+      if (Dep.Type == MemoryDepChecker::Dependence::Unknown) {
+        if (isa<LoadInst>(Source))
+          LoadsWithUnknownDepedence.insert(Source);
+        if (isa<LoadInst>(Destination))
+          LoadsWithUnknownDepedence.insert(Destination);
+        continue;
+      }
+
+      if (Dep.isBackward())
+        // Note that the designations source and destination follow the program
+        // order, i.e. source is always first.  (The direction is given by the
+        // DepType.)
+        std::swap(Source, Destination);
+      else
+        assert(Dep.isForward() && "Needs to be a forward dependence");
+
+      auto *Store = dyn_cast<StoreInst>(Source);
+      if (!Store)
+        continue;
+      auto *Load = dyn_cast<LoadInst>(Destination);
+      if (!Load)
+        continue;
+      Candidates.emplace_front(Load, Store);
+    }
+
+    if (!LoadsWithUnknownDepedence.empty())
+      Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
+        return LoadsWithUnknownDepedence.count(C.Load);
+      });
+
+    return Candidates;
+  }
+
+  /// \brief Return the index of the instruction according to program order.
+  unsigned getInstrIndex(Instruction *Inst) {
+    auto I = InstOrder.find(Inst);
+    assert(I != InstOrder.end() && "No index for instruction");
+    return I->second;
+  }
+
+  /// \brief If a load has multiple candidates associated (i.e. different
+  /// stores), it means that it could be forwarding from multiple stores
+  /// depending on control flow.  Remove these candidates.
+  ///
+  /// Here, we rely on LAA to include the relevant loop-independent dependences.
+  /// LAA is known to omit these in the very simple case when the read and the
+  /// write within an alias set always takes place using the *same* pointer.
+  ///
+  /// However, we know that this is not the case here, i.e. we can rely on LAA
+  /// to provide us with loop-independent dependences for the cases we're
+  /// interested.  Consider the case for example where a loop-independent
+  /// dependece S1->S2 invalidates the forwarding S3->S2.
+  ///
+  ///         A[i]   = ...   (S1)
+  ///         ...    = A[i]  (S2)
+  ///         A[i+1] = ...   (S3)
+  ///
+  /// LAA will perform dependence analysis here because there are two
+  /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
+  void removeDependencesFromMultipleStores(
+      std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
+    // If Store is nullptr it means that we have multiple stores forwarding to
+    // this store.
+    typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>
+        LoadToSingleCandT;
+    LoadToSingleCandT LoadToSingleCand;
+
+    for (const auto &Cand : Candidates) {
+      bool NewElt;
+      LoadToSingleCandT::iterator Iter;
+
+      std::tie(Iter, NewElt) =
+          LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
+      if (!NewElt) {
+        const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
+        // Already multiple stores forward to this load.
+        if (OtherCand == nullptr)
+          continue;
+
+        // Handle the very basic of case when the two stores are in the same
+        // block so deciding which one forwards is easy.  The later one forwards
+        // as long as they both have a dependence distance of one to the load.
+        if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
+            Cand.isDependenceDistanceOfOne(PSE) &&
+            OtherCand->isDependenceDistanceOfOne(PSE)) {
+          // They are in the same block, the later one will forward to the load.
+          if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
+            OtherCand = &Cand;
+        } else
+          OtherCand = nullptr;
+      }
+    }
+
+    Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
+      if (LoadToSingleCand[Cand.Load] != &Cand) {
+        DEBUG(dbgs() << "Removing from candidates: \n" << Cand
+                     << "  The load may have multiple stores forwarding to "
+                     << "it\n");
+        return true;
+      }
+      return false;
+    });
+  }
+
+  /// \brief Given two pointers operations by their RuntimePointerChecking
+  /// indices, return true if they require an alias check.
+  ///
+  /// We need a check if one is a pointer for a candidate load and the other is
+  /// a pointer for a possibly intervening store.
+  bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
+                     const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath,
+                     const std::set<Value *> &CandLoadPtrs) {
+    Value *Ptr1 =
+        LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
+    Value *Ptr2 =
+        LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue;
+    return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
+            (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
+  }
+
+  /// \brief Return pointers that are possibly written to on the path from a
+  /// forwarding store to a load.
+  ///
+  /// These pointers need to be alias-checked against the forwarding candidates.
+  SmallSet<Value *, 4> findPointersWrittenOnForwardingPath(
+      const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+    // From FirstStore to LastLoad neither of the elimination candidate loads
+    // should overlap with any of the stores.
+    //
+    // E.g.:
+    //
+    // st1 C[i]
+    // ld1 B[i] <-------,
+    // ld0 A[i] <----,  |              * LastLoad
+    // ...           |  |
+    // st2 E[i]      |  |
+    // st3 B[i+1] -- | -'              * FirstStore
+    // st0 A[i+1] ---'
+    // st4 D[i]
+    //
+    // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
+    // ld0.
+
+    LoadInst *LastLoad =
+        std::max_element(Candidates.begin(), Candidates.end(),
+                         [&](const StoreToLoadForwardingCandidate &A,
+                             const StoreToLoadForwardingCandidate &B) {
+                           return getInstrIndex(A.Load) < getInstrIndex(B.Load);
+                         })
+            ->Load;
+    StoreInst *FirstStore =
+        std::min_element(Candidates.begin(), Candidates.end(),
+                         [&](const StoreToLoadForwardingCandidate &A,
+                             const StoreToLoadForwardingCandidate &B) {
+                           return getInstrIndex(A.Store) <
+                                  getInstrIndex(B.Store);
+                         })
+            ->Store;
+
+    // We're looking for stores after the first forwarding store until the end
+    // of the loop, then from the beginning of the loop until the last
+    // forwarded-to load.  Collect the pointer for the stores.
+    SmallSet<Value *, 4> PtrsWrittenOnFwdingPath;
+
+    auto InsertStorePtr = [&](Instruction *I) {
+      if (auto *S = dyn_cast<StoreInst>(I))
+        PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
+    };
+    const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
+    std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
+                  MemInstrs.end(), InsertStorePtr);
+    std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
+                  InsertStorePtr);
+
+    return PtrsWrittenOnFwdingPath;
+  }
+
+  /// \brief Determine the pointer alias checks to prove that there are no
+  /// intervening stores.
+  SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks(
+      const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+
+    SmallSet<Value *, 4> PtrsWrittenOnFwdingPath =
+        findPointersWrittenOnForwardingPath(Candidates);
+
+    // Collect the pointers of the candidate loads.
+    // FIXME: SmallSet does not work with std::inserter.
+    std::set<Value *> CandLoadPtrs;
+    std::transform(Candidates.begin(), Candidates.end(),
+                   std::inserter(CandLoadPtrs, CandLoadPtrs.begin()),
+                   std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr));
+
+    const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
+    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
+                 [&](const RuntimePointerChecking::PointerCheck &Check) {
+                   for (auto PtrIdx1 : Check.first->Members)
+                     for (auto PtrIdx2 : Check.second->Members)
+                       if (needsChecking(PtrIdx1, PtrIdx2,
+                                         PtrsWrittenOnFwdingPath, CandLoadPtrs))
+                         return true;
+                   return false;
+                 });
+
+    DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
+    DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+
+    return Checks;
+  }
+
+  /// \brief Perform the transformation for a candidate.
+  void
+  propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
+                                  SCEVExpander &SEE) {
+    //
+    // loop:
+    //      %x = load %gep_i
+    //         = ... %x
+    //      store %y, %gep_i_plus_1
+    //
+    // =>
+    //
+    // ph:
+    //      %x.initial = load %gep_0
+    // loop:
+    //      %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+    //      %x = load %gep_i            <---- now dead
+    //         = ... %x.storeforward
+    //      store %y, %gep_i_plus_1
+
+    Value *Ptr = Cand.Load->getPointerOperand();
+    auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
+    auto *PH = L->getLoopPreheader();
+    Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
+                                          PH->getTerminator());
+    Value *Initial =
+        new LoadInst(InitialPtr, "load_initial", PH->getTerminator());
+    PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
+                                   &L->getHeader()->front());
+    PHI->addIncoming(Initial, PH);
+    PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
+
+    Cand.Load->replaceAllUsesWith(PHI);
+  }
+
+  /// \brief Top-level driver for each loop: find store->load forwarding
+  /// candidates, add run-time checks and perform transformation.
+  bool processLoop() {
+    DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+                 << "\" checking " << *L << "\n");
+    // Look for store-to-load forwarding cases across the
+    // backedge. E.g.:
+    //
+    // loop:
+    //      %x = load %gep_i
+    //         = ... %x
+    //      store %y, %gep_i_plus_1
+    //
+    // =>
+    //
+    // ph:
+    //      %x.initial = load %gep_0
+    // loop:
+    //      %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+    //      %x = load %gep_i            <---- now dead
+    //         = ... %x.storeforward
+    //      store %y, %gep_i_plus_1
+
+    // First start with store->load dependences.
+    auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
+    if (StoreToLoadDependences.empty())
+      return false;
+
+    // Generate an index for each load and store according to the original
+    // program order.  This will be used later.
+    InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
+
+    // To keep things simple for now, remove those where the load is potentially
+    // fed by multiple stores.
+    removeDependencesFromMultipleStores(StoreToLoadDependences);
+    if (StoreToLoadDependences.empty())
+      return false;
+
+    // Filter the candidates further.
+    SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
+    unsigned NumForwarding = 0;
+    for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) {
+      DEBUG(dbgs() << "Candidate " << Cand);
+      // Make sure that the stored values is available everywhere in the loop in
+      // the next iteration.
+      if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
+        continue;
+
+      // Check whether the SCEV difference is the same as the induction step,
+      // thus we load the value in the next iteration.
+      if (!Cand.isDependenceDistanceOfOne(PSE))
+        continue;
+
+      ++NumForwarding;
+      DEBUG(dbgs()
+            << NumForwarding
+            << ". Valid store-to-load forwarding across the loop backedge\n");
+      Candidates.push_back(Cand);
+    }
+    if (Candidates.empty())
+      return false;
+
+    // Check intervening may-alias stores.  These need runtime checks for alias
+    // disambiguation.
+    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks =
+        collectMemchecks(Candidates);
+
+    // Too many checks are likely to outweigh the benefits of forwarding.
+    if (Checks.size() > Candidates.size() * CheckPerElim) {
+      DEBUG(dbgs() << "Too many run-time checks needed.\n");
+      return false;
+    }
+
+    if (LAI.PSE.getUnionPredicate().getComplexity() >
+        LoadElimSCEVCheckThreshold) {
+      DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+      return false;
+    }
+
+    // Point of no-return, start the transformation.  First, version the loop if
+    // necessary.
+    if (!Checks.empty() || !LAI.PSE.getUnionPredicate().isAlwaysTrue()) {
+      LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false);
+      LV.setAliasChecks(std::move(Checks));
+      LV.setSCEVChecks(LAI.PSE.getUnionPredicate());
+      LV.versionLoop();
+    }
+
+    // Next, propagate the value stored by the store to the users of the load.
+    // Also for the first iteration, generate the initial value of the load.
+    SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(),
+                     "storeforward");
+    for (const auto &Cand : Candidates)
+      propagateStoredValueToLoadUsers(Cand, SEE);
+    NumLoopLoadEliminted += NumForwarding;
+
+    return true;
+  }
+
+private:
+  Loop *L;
+
+  /// \brief Maps the load/store instructions to their index according to
+  /// program order.
+  DenseMap<Instruction *, unsigned> InstOrder;
+
+  // Analyses used.
+  LoopInfo *LI;
+  const LoopAccessInfo &LAI;
+  DominatorTree *DT;
+  PredicatedScalarEvolution PSE;
+};
+
+/// \brief The pass.  Most of the work is delegated to the per-loop
+/// LoadEliminationForLoop class.
+class LoopLoadElimination : public FunctionPass {
+public:
+  LoopLoadElimination() : FunctionPass(ID) {
+    initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    auto *LAA = &getAnalysis<LoopAccessAnalysis>();
+    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+    // Build up a worklist of inner-loops to vectorize. This is necessary as the
+    // act of distributing a loop creates new loops and can invalidate iterators
+    // across the loops.
+    SmallVector<Loop *, 8> Worklist;
+
+    for (Loop *TopLevelLoop : *LI)
+      for (Loop *L : depth_first(TopLevelLoop))
+        // We only handle inner-most loops.
+        if (L->empty())
+          Worklist.push_back(L);
+
+    // Now walk the identified inner loops.
+    bool Changed = false;
+    for (Loop *L : Worklist) {
+      const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
+      // The actual work is performed by LoadEliminationForLoop.
+      LoadEliminationForLoop LEL(L, LI, LAI, DT);
+      Changed |= LEL.processLoop();
+    }
+
+    // Process each loop nest in the function.
+    return Changed;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addRequired<LoopAccessAnalysis>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  static char ID;
+};
+}
+
+char LoopLoadElimination::ID;
+static const char LLE_name[] = "Loop Load Elimination";
+
+INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+
+namespace llvm {
+FunctionPass *createLoopLoadEliminationPass() {
+  return new LoopLoadElimination();
+}
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
new file mode 100644
index 0000000..27c2d88
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -0,0 +1,1527 @@
+//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop reroller.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reroll"
+
+STATISTIC(NumRerolledLoops, "Number of rerolled loops");
+
+static cl::opt<unsigned>
+MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
+  cl::desc("The maximum increment for loop rerolling"));
+
+static cl::opt<unsigned>
+NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
+                          cl::Hidden,
+                          cl::desc("The maximum number of failures to tolerate"
+                                   " during fuzzy matching. (default: 400)"));
+
+// This loop re-rolling transformation aims to transform loops like this:
+//
+// int foo(int a);
+// void bar(int *x) {
+//   for (int i = 0; i < 500; i += 3) {
+//     foo(i);
+//     foo(i+1);
+//     foo(i+2);
+//   }
+// }
+//
+// into a loop like this:
+//
+// void bar(int *x) {
+//   for (int i = 0; i < 500; ++i)
+//     foo(i);
+// }
+//
+// It does this by looking for loops that, besides the latch code, are composed
+// of isomorphic DAGs of instructions, with each DAG rooted at some increment
+// to the induction variable, and where each DAG is isomorphic to the DAG
+// rooted at the induction variable (excepting the sub-DAGs which root the
+// other induction-variable increments). In other words, we're looking for loop
+// bodies of the form:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1                <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2                <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// where each f(i) is a set of instructions that, collectively, are a function
+// only of i (and other loop-invariant values).
+//
+// As a special case, we can also reroll loops like this:
+//
+// int foo(int);
+// void bar(int *x) {
+//   for (int i = 0; i < 500; ++i) {
+//     x[3*i] = foo(0);
+//     x[3*i+1] = foo(0);
+//     x[3*i+2] = foo(0);
+//   }
+// }
+//
+// into this:
+//
+// void bar(int *x) {
+//   for (int i = 0; i < 1500; ++i)
+//     x[i] = foo(0);
+// }
+//
+// in which case, we're looking for inputs like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+
+namespace {
+  enum IterationLimits {
+    /// The maximum number of iterations that we'll try and reroll. This
+    /// has to be less than 25 in order to fit into a SmallBitVector.
+    IL_MaxRerollIterations = 16,
+    /// The bitvector index used by loop induction variables and other
+    /// instructions that belong to all iterations.
+    IL_All,
+    IL_End
+  };
+
+  class LoopReroll : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopReroll() : LoopPass(ID) {
+      initializeLoopRerollPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AAResultsWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+    }
+
+  protected:
+    AliasAnalysis *AA;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    TargetLibraryInfo *TLI;
+    DominatorTree *DT;
+    bool PreserveLCSSA;
+
+    typedef SmallVector<Instruction *, 16> SmallInstructionVector;
+    typedef SmallSet<Instruction *, 16>   SmallInstructionSet;
+
+    // Map between induction variable and its increment
+    DenseMap<Instruction *, int64_t> IVToIncMap;
+
+    // A chain of isomorphic instructions, identified by a single-use PHI
+    // representing a reduction. Only the last value may be used outside the
+    // loop.
+    struct SimpleLoopReduction {
+      SimpleLoopReduction(Instruction *P, Loop *L)
+        : Valid(false), Instructions(1, P) {
+        assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
+        add(L);
+      }
+
+      bool valid() const {
+        return Valid;
+      }
+
+      Instruction *getPHI() const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions.front();
+      }
+
+      Instruction *getReducedValue() const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions.back();
+      }
+
+      Instruction *get(size_t i) const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions[i+1];
+      }
+
+      Instruction *operator [] (size_t i) const { return get(i); }
+
+      // The size, ignoring the initial PHI.
+      size_t size() const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions.size()-1;
+      }
+
+      typedef SmallInstructionVector::iterator iterator;
+      typedef SmallInstructionVector::const_iterator const_iterator;
+
+      iterator begin() {
+        assert(Valid && "Using invalid reduction");
+        return std::next(Instructions.begin());
+      }
+
+      const_iterator begin() const {
+        assert(Valid && "Using invalid reduction");
+        return std::next(Instructions.begin());
+      }
+
+      iterator end() { return Instructions.end(); }
+      const_iterator end() const { return Instructions.end(); }
+
+    protected:
+      bool Valid;
+      SmallInstructionVector Instructions;
+
+      void add(Loop *L);
+    };
+
+    // The set of all reductions, and state tracking of possible reductions
+    // during loop instruction processing.
+    struct ReductionTracker {
+      typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector;
+
+      // Add a new possible reduction.
+      void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
+
+      // Setup to track possible reductions corresponding to the provided
+      // rerolling scale. Only reductions with a number of non-PHI instructions
+      // that is divisible by the scale are considered. Three instructions sets
+      // are filled in:
+      //   - A set of all possible instructions in eligible reductions.
+      //   - A set of all PHIs in eligible reductions
+      //   - A set of all reduced values (last instructions) in eligible
+      //     reductions.
+      void restrictToScale(uint64_t Scale,
+                           SmallInstructionSet &PossibleRedSet,
+                           SmallInstructionSet &PossibleRedPHISet,
+                           SmallInstructionSet &PossibleRedLastSet) {
+        PossibleRedIdx.clear();
+        PossibleRedIter.clear();
+        Reds.clear();
+
+        for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
+          if (PossibleReds[i].size() % Scale == 0) {
+            PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
+            PossibleRedPHISet.insert(PossibleReds[i].getPHI());
+
+            PossibleRedSet.insert(PossibleReds[i].getPHI());
+            PossibleRedIdx[PossibleReds[i].getPHI()] = i;
+            for (Instruction *J : PossibleReds[i]) {
+              PossibleRedSet.insert(J);
+              PossibleRedIdx[J] = i;
+            }
+          }
+      }
+
+      // The functions below are used while processing the loop instructions.
+
+      // Are the two instructions both from reductions, and furthermore, from
+      // the same reduction?
+      bool isPairInSame(Instruction *J1, Instruction *J2) {
+        DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
+        if (J1I != PossibleRedIdx.end()) {
+          DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
+          if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
+            return true;
+        }
+
+        return false;
+      }
+
+      // The two provided instructions, the first from the base iteration, and
+      // the second from iteration i, form a matched pair. If these are part of
+      // a reduction, record that fact.
+      void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
+        if (PossibleRedIdx.count(J1)) {
+          assert(PossibleRedIdx.count(J2) &&
+                 "Recording reduction vs. non-reduction instruction?");
+
+          PossibleRedIter[J1] = 0;
+          PossibleRedIter[J2] = i;
+
+          int Idx = PossibleRedIdx[J1];
+          assert(Idx == PossibleRedIdx[J2] &&
+                 "Recording pair from different reductions?");
+          Reds.insert(Idx);
+        }
+      }
+
+      // The functions below can be called after we've finished processing all
+      // instructions in the loop, and we know which reductions were selected.
+
+      bool validateSelected();
+      void replaceSelected();
+
+    protected:
+      // The vector of all possible reductions (for any scale).
+      SmallReductionVector PossibleReds;
+
+      DenseMap<Instruction *, int> PossibleRedIdx;
+      DenseMap<Instruction *, int> PossibleRedIter;
+      DenseSet<int> Reds;
+    };
+
+    // A DAGRootSet models an induction variable being used in a rerollable
+    // loop. For example,
+    //
+    //   x[i*3+0] = y1
+    //   x[i*3+1] = y2
+    //   x[i*3+2] = y3
+    //
+    //   Base instruction -> i*3
+    //                    +---+----+
+    //                   /    |     \
+    //               ST[y1]  +1     +2  <-- Roots
+    //                        |      |
+    //                      ST[y2] ST[y3]
+    //
+    // There may be multiple DAGRoots, for example:
+    //
+    //   x[i*2+0] = ...   (1)
+    //   x[i*2+1] = ...   (1)
+    //   x[i*2+4] = ...   (2)
+    //   x[i*2+5] = ...   (2)
+    //   x[(i+1234)*2+5678] = ... (3)
+    //   x[(i+1234)*2+5679] = ... (3)
+    //
+    // The loop will be rerolled by adding a new loop induction variable,
+    // one for the Base instruction in each DAGRootSet.
+    //
+    struct DAGRootSet {
+      Instruction *BaseInst;
+      SmallInstructionVector Roots;
+      // The instructions between IV and BaseInst (but not including BaseInst).
+      SmallInstructionSet SubsumedInsts;
+    };
+
+    // The set of all DAG roots, and state tracking of all roots
+    // for a particular induction variable.
+    struct DAGRootTracker {
+      DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
+                     ScalarEvolution *SE, AliasAnalysis *AA,
+                     TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
+                     bool PreserveLCSSA,
+                     DenseMap<Instruction *, int64_t> &IncrMap)
+          : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
+            PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap) {}
+
+      /// Stage 1: Find all the DAG roots for the induction variable.
+      bool findRoots();
+      /// Stage 2: Validate if the found roots are valid.
+      bool validate(ReductionTracker &Reductions);
+      /// Stage 3: Assuming validate() returned true, perform the
+      /// replacement.
+      /// @param IterCount The maximum iteration count of L.
+      void replace(const SCEV *IterCount);
+
+    protected:
+      typedef MapVector<Instruction*, SmallBitVector> UsesTy;
+
+      bool findRootsRecursive(Instruction *IVU,
+                              SmallInstructionSet SubsumedInsts);
+      bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
+      bool collectPossibleRoots(Instruction *Base,
+                                std::map<int64_t,Instruction*> &Roots);
+
+      bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
+      void collectInLoopUserSet(const SmallInstructionVector &Roots,
+                                const SmallInstructionSet &Exclude,
+                                const SmallInstructionSet &Final,
+                                DenseSet<Instruction *> &Users);
+      void collectInLoopUserSet(Instruction *Root,
+                                const SmallInstructionSet &Exclude,
+                                const SmallInstructionSet &Final,
+                                DenseSet<Instruction *> &Users);
+
+      UsesTy::iterator nextInstr(int Val, UsesTy &In,
+                                 const SmallInstructionSet &Exclude,
+                                 UsesTy::iterator *StartI=nullptr);
+      bool isBaseInst(Instruction *I);
+      bool isRootInst(Instruction *I);
+      bool instrDependsOn(Instruction *I,
+                          UsesTy::iterator Start,
+                          UsesTy::iterator End);
+
+      LoopReroll *Parent;
+
+      // Members of Parent, replicated here for brevity.
+      Loop *L;
+      ScalarEvolution *SE;
+      AliasAnalysis *AA;
+      TargetLibraryInfo *TLI;
+      DominatorTree *DT;
+      LoopInfo *LI;
+      bool PreserveLCSSA;
+
+      // The loop induction variable.
+      Instruction *IV;
+      // Loop step amount.
+      int64_t Inc;
+      // Loop reroll count; if Inc == 1, this records the scaling applied
+      // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
+      // If Inc is not 1, Scale = Inc.
+      uint64_t Scale;
+      // The roots themselves.
+      SmallVector<DAGRootSet,16> RootSets;
+      // All increment instructions for IV.
+      SmallInstructionVector LoopIncs;
+      // Map of all instructions in the loop (in order) to the iterations
+      // they are used in (or specially, IL_All for instructions
+      // used in the loop increment mechanism).
+      UsesTy Uses;
+      // Map between induction variable and its increment
+      DenseMap<Instruction *, int64_t> &IVToIncMap;
+    };
+
+    void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
+    void collectPossibleReductions(Loop *L,
+           ReductionTracker &Reductions);
+    bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
+                ReductionTracker &Reductions);
+  };
+}
+
+char LoopReroll::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+
+Pass *llvm::createLoopRerollPass() {
+  return new LoopReroll;
+}
+
+// Returns true if the provided instruction is used outside the given loop.
+// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
+// non-loop blocks to be outside the loop.
+static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
+  for (User *U : I->users()) {
+    if (!L->contains(cast<Instruction>(U)))
+      return true;
+  }
+  return false;
+}
+
+// Collect the list of loop induction variables with respect to which it might
+// be possible to reroll the loop.
+void LoopReroll::collectPossibleIVs(Loop *L,
+                                    SmallInstructionVector &PossibleIVs) {
+  BasicBlock *Header = L->getHeader();
+  for (BasicBlock::iterator I = Header->begin(),
+       IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+    if (!isa<PHINode>(I))
+      continue;
+    if (!I->getType()->isIntegerTy())
+      continue;
+
+    if (const SCEVAddRecExpr *PHISCEV =
+            dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) {
+      if (PHISCEV->getLoop() != L)
+        continue;
+      if (!PHISCEV->isAffine())
+        continue;
+      if (const SCEVConstant *IncSCEV =
+          dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) {
+        const APInt &AInt = IncSCEV->getAPInt().abs();
+        if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc))
+          continue;
+        IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
+        DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
+                     << "\n");
+        PossibleIVs.push_back(&*I);
+      }
+    }
+  }
+}
+
+// Add the remainder of the reduction-variable chain to the instruction vector
+// (the initial PHINode has already been added). If successful, the object is
+// marked as valid.
+void LoopReroll::SimpleLoopReduction::add(Loop *L) {
+  assert(!Valid && "Cannot add to an already-valid chain");
+
+  // The reduction variable must be a chain of single-use instructions
+  // (including the PHI), except for the last value (which is used by the PHI
+  // and also outside the loop).
+  Instruction *C = Instructions.front();
+  if (C->user_empty())
+    return;
+
+  do {
+    C = cast<Instruction>(*C->user_begin());
+    if (C->hasOneUse()) {
+      if (!C->isBinaryOp())
+        return;
+
+      if (!(isa<PHINode>(Instructions.back()) ||
+            C->isSameOperationAs(Instructions.back())))
+        return;
+
+      Instructions.push_back(C);
+    }
+  } while (C->hasOneUse());
+
+  if (Instructions.size() < 2 ||
+      !C->isSameOperationAs(Instructions.back()) ||
+      C->use_empty())
+    return;
+
+  // C is now the (potential) last instruction in the reduction chain.
+  for (User *U : C->users()) {
+    // The only in-loop user can be the initial PHI.
+    if (L->contains(cast<Instruction>(U)))
+      if (cast<Instruction>(U) != Instructions.front())
+        return;
+  }
+
+  Instructions.push_back(C);
+  Valid = true;
+}
+
+// Collect the vector of possible reduction variables.
+void LoopReroll::collectPossibleReductions(Loop *L,
+  ReductionTracker &Reductions) {
+  BasicBlock *Header = L->getHeader();
+  for (BasicBlock::iterator I = Header->begin(),
+       IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+    if (!isa<PHINode>(I))
+      continue;
+    if (!I->getType()->isSingleValueType())
+      continue;
+
+    SimpleLoopReduction SLR(&*I, L);
+    if (!SLR.valid())
+      continue;
+
+    DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
+          SLR.size() << " chained instructions)\n");
+    Reductions.addSLR(SLR);
+  }
+}
+
+// Collect the set of all users of the provided root instruction. This set of
+// users contains not only the direct users of the root instruction, but also
+// all users of those users, and so on. There are two exceptions:
+//
+//   1. Instructions in the set of excluded instructions are never added to the
+//   use set (even if they are users). This is used, for example, to exclude
+//   including root increments in the use set of the primary IV.
+//
+//   2. Instructions in the set of final instructions are added to the use set
+//   if they are users, but their users are not added. This is used, for
+//   example, to prevent a reduction update from forcing all later reduction
+//   updates into the use set.
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+  Instruction *Root, const SmallInstructionSet &Exclude,
+  const SmallInstructionSet &Final,
+  DenseSet<Instruction *> &Users) {
+  SmallInstructionVector Queue(1, Root);
+  while (!Queue.empty()) {
+    Instruction *I = Queue.pop_back_val();
+    if (!Users.insert(I).second)
+      continue;
+
+    if (!Final.count(I))
+      for (Use &U : I->uses()) {
+        Instruction *User = cast<Instruction>(U.getUser());
+        if (PHINode *PN = dyn_cast<PHINode>(User)) {
+          // Ignore "wrap-around" uses to PHIs of this loop's header.
+          if (PN->getIncomingBlock(U) == L->getHeader())
+            continue;
+        }
+
+        if (L->contains(User) && !Exclude.count(User)) {
+          Queue.push_back(User);
+        }
+      }
+
+    // We also want to collect single-user "feeder" values.
+    for (User::op_iterator OI = I->op_begin(),
+         OIE = I->op_end(); OI != OIE; ++OI) {
+      if (Instruction *Op = dyn_cast<Instruction>(*OI))
+        if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
+            !Final.count(Op))
+          Queue.push_back(Op);
+    }
+  }
+}
+
+// Collect all of the users of all of the provided root instructions (combined
+// into a single set).
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+  const SmallInstructionVector &Roots,
+  const SmallInstructionSet &Exclude,
+  const SmallInstructionSet &Final,
+  DenseSet<Instruction *> &Users) {
+  for (SmallInstructionVector::const_iterator I = Roots.begin(),
+       IE = Roots.end(); I != IE; ++I)
+    collectInLoopUserSet(*I, Exclude, Final, Users);
+}
+
+static bool isSimpleLoadStore(Instruction *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->isSimple();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isSimple();
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+    return !MI->isVolatile();
+  return false;
+}
+
+/// Return true if IVU is a "simple" arithmetic operation.
+/// This is used for narrowing the search space for DAGRoots; only arithmetic
+/// and GEPs can be part of a DAGRoot.
+static bool isSimpleArithmeticOp(User *IVU) {
+  if (Instruction *I = dyn_cast<Instruction>(IVU)) {
+    switch (I->getOpcode()) {
+    default: return false;
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::Shl:
+    case Instruction::AShr:
+    case Instruction::LShr:
+    case Instruction::GetElementPtr:
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool isLoopIncrement(User *U, Instruction *IV) {
+  BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
+  if (!BO || BO->getOpcode() != Instruction::Add)
+    return false;
+
+  for (auto *UU : BO->users()) {
+    PHINode *PN = dyn_cast<PHINode>(UU);
+    if (PN && PN == IV)
+      return true;
+  }
+  return false;
+}
+
+bool LoopReroll::DAGRootTracker::
+collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
+  SmallInstructionVector BaseUsers;
+
+  for (auto *I : Base->users()) {
+    ConstantInt *CI = nullptr;
+
+    if (isLoopIncrement(I, IV)) {
+      LoopIncs.push_back(cast<Instruction>(I));
+      continue;
+    }
+
+    // The root nodes must be either GEPs, ORs or ADDs.
+    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+      if (BO->getOpcode() == Instruction::Add ||
+          BO->getOpcode() == Instruction::Or)
+        CI = dyn_cast<ConstantInt>(BO->getOperand(1));
+    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+      Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
+      CI = dyn_cast<ConstantInt>(LastOperand);
+    }
+
+    if (!CI) {
+      if (Instruction *II = dyn_cast<Instruction>(I)) {
+        BaseUsers.push_back(II);
+        continue;
+      } else {
+        DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n");
+        return false;
+      }
+    }
+
+    int64_t V = std::abs(CI->getValue().getSExtValue());
+    if (Roots.find(V) != Roots.end())
+      // No duplicates, please.
+      return false;
+
+    Roots[V] = cast<Instruction>(I);
+  }
+
+  if (Roots.empty())
+    return false;
+
+  // If we found non-loop-inc, non-root users of Base, assume they are
+  // for the zeroth root index. This is because "add %a, 0" gets optimized
+  // away.
+  if (BaseUsers.size()) {
+    if (Roots.find(0) != Roots.end()) {
+      DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
+      return false;
+    }
+    Roots[0] = Base;
+  }
+
+  // Calculate the number of users of the base, or lowest indexed, iteration.
+  unsigned NumBaseUses = BaseUsers.size();
+  if (NumBaseUses == 0)
+    NumBaseUses = Roots.begin()->second->getNumUses();
+
+  // Check that every node has the same number of users.
+  for (auto &KV : Roots) {
+    if (KV.first == 0)
+      continue;
+    if (KV.second->getNumUses() != NumBaseUses) {
+      DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
+            << "#Base=" << NumBaseUses << ", #Root=" <<
+            KV.second->getNumUses() << "\n");
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool LoopReroll::DAGRootTracker::
+findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
+  // Does the user look like it could be part of a root set?
+  // All its users must be simple arithmetic ops.
+  if (I->getNumUses() > IL_MaxRerollIterations)
+    return false;
+
+  if ((I->getOpcode() == Instruction::Mul ||
+       I->getOpcode() == Instruction::PHI) &&
+      I != IV &&
+      findRootsBase(I, SubsumedInsts))
+    return true;
+
+  SubsumedInsts.insert(I);
+
+  for (User *V : I->users()) {
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (std::find(LoopIncs.begin(), LoopIncs.end(), I) != LoopIncs.end())
+      continue;
+
+    if (!I || !isSimpleArithmeticOp(I) ||
+        !findRootsRecursive(I, SubsumedInsts))
+      return false;
+  }
+  return true;
+}
+
+bool LoopReroll::DAGRootTracker::
+findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
+
+  // The base instruction needs to be a multiply so
+  // that we can erase it.
+  if (IVU->getOpcode() != Instruction::Mul &&
+      IVU->getOpcode() != Instruction::PHI)
+    return false;
+
+  std::map<int64_t, Instruction*> V;
+  if (!collectPossibleRoots(IVU, V))
+    return false;
+
+  // If we didn't get a root for index zero, then IVU must be
+  // subsumed.
+  if (V.find(0) == V.end())
+    SubsumedInsts.insert(IVU);
+
+  // Partition the vector into monotonically increasing indexes.
+  DAGRootSet DRS;
+  DRS.BaseInst = nullptr;
+
+  for (auto &KV : V) {
+    if (!DRS.BaseInst) {
+      DRS.BaseInst = KV.second;
+      DRS.SubsumedInsts = SubsumedInsts;
+    } else if (DRS.Roots.empty()) {
+      DRS.Roots.push_back(KV.second);
+    } else if (V.find(KV.first - 1) != V.end()) {
+      DRS.Roots.push_back(KV.second);
+    } else {
+      // Linear sequence terminated.
+      RootSets.push_back(DRS);
+      DRS.BaseInst = KV.second;
+      DRS.SubsumedInsts = SubsumedInsts;
+      DRS.Roots.clear();
+    }
+  }
+  RootSets.push_back(DRS);
+
+  return true;
+}
+
+bool LoopReroll::DAGRootTracker::findRoots() {
+  Inc = IVToIncMap[IV];
+
+  assert(RootSets.empty() && "Unclean state!");
+  if (std::abs(Inc) == 1) {
+    for (auto *IVU : IV->users()) {
+      if (isLoopIncrement(IVU, IV))
+        LoopIncs.push_back(cast<Instruction>(IVU));
+    }
+    if (!findRootsRecursive(IV, SmallInstructionSet()))
+      return false;
+    LoopIncs.push_back(IV);
+  } else {
+    if (!findRootsBase(IV, SmallInstructionSet()))
+      return false;
+  }
+
+  // Ensure all sets have the same size.
+  if (RootSets.empty()) {
+    DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
+    return false;
+  }
+  for (auto &V : RootSets) {
+    if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
+      DEBUG(dbgs()
+            << "LRR: Aborting because not all root sets have the same size\n");
+      return false;
+    }
+  }
+
+  // And ensure all loop iterations are consecutive. We rely on std::map
+  // providing ordered traversal.
+  for (auto &V : RootSets) {
+    const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(V.BaseInst));
+    if (!ADR)
+      return false;
+
+    // Consider a DAGRootSet with N-1 roots (so N different values including
+    //   BaseInst).
+    // Define d = Roots[0] - BaseInst, which should be the same as
+    //   Roots[I] - Roots[I-1] for all I in [1..N).
+    // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
+    //   loop iteration J.
+    //
+    // Now, For the loop iterations to be consecutive:
+    //   D = d * N
+
+    unsigned N = V.Roots.size() + 1;
+    const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(V.Roots[0]), ADR);
+    const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
+    if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) {
+      DEBUG(dbgs() << "LRR: Aborting because iterations are not consecutive\n");
+      return false;
+    }
+  }
+  Scale = RootSets[0].Roots.size() + 1;
+
+  if (Scale > IL_MaxRerollIterations) {
+    DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+          << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations
+          << "\n");
+    return false;
+  }
+
+  DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n");
+
+  return true;
+}
+
+bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
+  // Populate the MapVector with all instructions in the block, in order first,
+  // so we can iterate over the contents later in perfect order.
+  for (auto &I : *L->getHeader()) {
+    Uses[&I].resize(IL_End);
+  }
+
+  SmallInstructionSet Exclude;
+  for (auto &DRS : RootSets) {
+    Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+    Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+    Exclude.insert(DRS.BaseInst);
+  }
+  Exclude.insert(LoopIncs.begin(), LoopIncs.end());
+
+  for (auto &DRS : RootSets) {
+    DenseSet<Instruction*> VBase;
+    collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
+    for (auto *I : VBase) {
+      Uses[I].set(0);
+    }
+
+    unsigned Idx = 1;
+    for (auto *Root : DRS.Roots) {
+      DenseSet<Instruction*> V;
+      collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
+
+      // While we're here, check the use sets are the same size.
+      if (V.size() != VBase.size()) {
+        DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+        return false;
+      }
+
+      for (auto *I : V) {
+        Uses[I].set(Idx);
+      }
+      ++Idx;
+    }
+
+    // Make sure our subsumed instructions are remembered too.
+    for (auto *I : DRS.SubsumedInsts) {
+      Uses[I].set(IL_All);
+    }
+  }
+
+  // Make sure the loop increments are also accounted for.
+
+  Exclude.clear();
+  for (auto &DRS : RootSets) {
+    Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+    Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+    Exclude.insert(DRS.BaseInst);
+  }
+
+  DenseSet<Instruction*> V;
+  collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
+  for (auto *I : V) {
+    Uses[I].set(IL_All);
+  }
+
+  return true;
+
+}
+
+/// Get the next instruction in "In" that is a member of set Val.
+/// Start searching from StartI, and do not return anything in Exclude.
+/// If StartI is not given, start from In.begin().
+LoopReroll::DAGRootTracker::UsesTy::iterator
+LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
+                                      const SmallInstructionSet &Exclude,
+                                      UsesTy::iterator *StartI) {
+  UsesTy::iterator I = StartI ? *StartI : In.begin();
+  while (I != In.end() && (I->second.test(Val) == 0 ||
+                           Exclude.count(I->first) != 0))
+    ++I;
+  return I;
+}
+
+bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
+  for (auto &DRS : RootSets) {
+    if (DRS.BaseInst == I)
+      return true;
+  }
+  return false;
+}
+
+bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
+  for (auto &DRS : RootSets) {
+    if (std::find(DRS.Roots.begin(), DRS.Roots.end(), I) != DRS.Roots.end())
+      return true;
+  }
+  return false;
+}
+
+/// Return true if instruction I depends on any instruction between
+/// Start and End.
+bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
+                                                UsesTy::iterator Start,
+                                                UsesTy::iterator End) {
+  for (auto *U : I->users()) {
+    for (auto It = Start; It != End; ++It)
+      if (U == It->first)
+        return true;
+  }
+  return false;
+}
+
+static bool isIgnorableInst(const Instruction *I) {
+  if (isa<DbgInfoIntrinsic>(I))
+    return true;
+  const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+  if (!II)
+    return false;
+  switch (II->getIntrinsicID()) {
+    default:
+      return false;
+    case llvm::Intrinsic::annotation:
+    case Intrinsic::ptr_annotation:
+    case Intrinsic::var_annotation:
+    // TODO: the following intrinsics may also be whitelisted:
+    //   lifetime_start, lifetime_end, invariant_start, invariant_end
+      return true;
+  }
+  return false;
+}
+
+bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
+  // We now need to check for equivalence of the use graph of each root with
+  // that of the primary induction variable (excluding the roots). Our goal
+  // here is not to solve the full graph isomorphism problem, but rather to
+  // catch common cases without a lot of work. As a result, we will assume
+  // that the relative order of the instructions in each unrolled iteration
+  // is the same (although we will not make an assumption about how the
+  // different iterations are intermixed). Note that while the order must be
+  // the same, the instructions may not be in the same basic block.
+
+  // An array of just the possible reductions for this scale factor. When we
+  // collect the set of all users of some root instructions, these reduction
+  // instructions are treated as 'final' (their uses are not considered).
+  // This is important because we don't want the root use set to search down
+  // the reduction chain.
+  SmallInstructionSet PossibleRedSet;
+  SmallInstructionSet PossibleRedLastSet;
+  SmallInstructionSet PossibleRedPHISet;
+  Reductions.restrictToScale(Scale, PossibleRedSet,
+                             PossibleRedPHISet, PossibleRedLastSet);
+
+  // Populate "Uses" with where each instruction is used.
+  if (!collectUsedInstructions(PossibleRedSet))
+    return false;
+
+  // Make sure we mark the reduction PHIs as used in all iterations.
+  for (auto *I : PossibleRedPHISet) {
+    Uses[I].set(IL_All);
+  }
+
+  // Make sure all instructions in the loop are in one and only one
+  // set.
+  for (auto &KV : Uses) {
+    if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
+      DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+            << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+      return false;
+    }
+  }
+
+  DEBUG(
+    for (auto &KV : Uses) {
+      dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
+    }
+    );
+
+  for (unsigned Iter = 1; Iter < Scale; ++Iter) {
+    // In addition to regular aliasing information, we need to look for
+    // instructions from later (future) iterations that have side effects
+    // preventing us from reordering them past other instructions with side
+    // effects.
+    bool FutureSideEffects = false;
+    AliasSetTracker AST(*AA);
+    // The map between instructions in f(%iv.(i+1)) and f(%iv).
+    DenseMap<Value *, Value *> BaseMap;
+
+    // Compare iteration Iter to the base.
+    SmallInstructionSet Visited;
+    auto BaseIt = nextInstr(0, Uses, Visited);
+    auto RootIt = nextInstr(Iter, Uses, Visited);
+    auto LastRootIt = Uses.begin();
+
+    while (BaseIt != Uses.end() && RootIt != Uses.end()) {
+      Instruction *BaseInst = BaseIt->first;
+      Instruction *RootInst = RootIt->first;
+
+      // Skip over the IV or root instructions; only match their users.
+      bool Continue = false;
+      if (isBaseInst(BaseInst)) {
+        Visited.insert(BaseInst);
+        BaseIt = nextInstr(0, Uses, Visited);
+        Continue = true;
+      }
+      if (isRootInst(RootInst)) {
+        LastRootIt = RootIt;
+        Visited.insert(RootInst);
+        RootIt = nextInstr(Iter, Uses, Visited);
+        Continue = true;
+      }
+      if (Continue) continue;
+
+      if (!BaseInst->isSameOperationAs(RootInst)) {
+        // Last chance saloon. We don't try and solve the full isomorphism
+        // problem, but try and at least catch the case where two instructions
+        // *of different types* are round the wrong way. We won't be able to
+        // efficiently tell, given two ADD instructions, which way around we
+        // should match them, but given an ADD and a SUB, we can at least infer
+        // which one is which.
+        //
+        // This should allow us to deal with a greater subset of the isomorphism
+        // problem. It does however change a linear algorithm into a quadratic
+        // one, so limit the number of probes we do.
+        auto TryIt = RootIt;
+        unsigned N = NumToleratedFailedMatches;
+        while (TryIt != Uses.end() &&
+               !BaseInst->isSameOperationAs(TryIt->first) &&
+               N--) {
+          ++TryIt;
+          TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
+        }
+
+        if (TryIt == Uses.end() || TryIt == RootIt ||
+            instrDependsOn(TryIt->first, RootIt, TryIt)) {
+          DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                " vs. " << *RootInst << "\n");
+          return false;
+        }
+
+        RootIt = TryIt;
+        RootInst = TryIt->first;
+      }
+
+      // All instructions between the last root and this root
+      // may belong to some other iteration. If they belong to a
+      // future iteration, then they're dangerous to alias with.
+      //
+      // Note that because we allow a limited amount of flexibility in the order
+      // that we visit nodes, LastRootIt might be *before* RootIt, in which
+      // case we've already checked this set of instructions so we shouldn't
+      // do anything.
+      for (; LastRootIt < RootIt; ++LastRootIt) {
+        Instruction *I = LastRootIt->first;
+        if (LastRootIt->second.find_first() < (int)Iter)
+          continue;
+        if (I->mayWriteToMemory())
+          AST.add(I);
+        // Note: This is specifically guarded by a check on isa<PHINode>,
+        // which while a valid (somewhat arbitrary) micro-optimization, is
+        // needed because otherwise isSafeToSpeculativelyExecute returns
+        // false on PHI nodes.
+        if (!isa<PHINode>(I) && !isSimpleLoadStore(I) &&
+            !isSafeToSpeculativelyExecute(I))
+          // Intervening instructions cause side effects.
+          FutureSideEffects = true;
+      }
+
+      // Make sure that this instruction, which is in the use set of this
+      // root instruction, does not also belong to the base set or the set of
+      // some other root instruction.
+      if (RootIt->second.count() > 1) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                        " vs. " << *RootInst << " (prev. case overlap)\n");
+        return false;
+      }
+
+      // Make sure that we don't alias with any instruction in the alias set
+      // tracker. If we do, then we depend on a future iteration, and we
+      // can't reroll.
+      if (RootInst->mayReadFromMemory())
+        for (auto &K : AST) {
+          if (K.aliasesUnknownInst(RootInst, *AA)) {
+            DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                            " vs. " << *RootInst << " (depends on future store)\n");
+            return false;
+          }
+        }
+
+      // If we've past an instruction from a future iteration that may have
+      // side effects, and this instruction might also, then we can't reorder
+      // them, and this matching fails. As an exception, we allow the alias
+      // set tracker to handle regular (simple) load/store dependencies.
+      if (FutureSideEffects && ((!isSimpleLoadStore(BaseInst) &&
+                                 !isSafeToSpeculativelyExecute(BaseInst)) ||
+                                (!isSimpleLoadStore(RootInst) &&
+                                 !isSafeToSpeculativelyExecute(RootInst)))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                        " vs. " << *RootInst <<
+                        " (side effects prevent reordering)\n");
+        return false;
+      }
+
+      // For instructions that are part of a reduction, if the operation is
+      // associative, then don't bother matching the operands (because we
+      // already know that the instructions are isomorphic, and the order
+      // within the iteration does not matter). For non-associative reductions,
+      // we do need to match the operands, because we need to reject
+      // out-of-order instructions within an iteration!
+      // For example (assume floating-point addition), we need to reject this:
+      //   x += a[i]; x += b[i];
+      //   x += a[i+1]; x += b[i+1];
+      //   x += b[i+2]; x += a[i+2];
+      bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
+
+      if (!(InReduction && BaseInst->isAssociative())) {
+        bool Swapped = false, SomeOpMatched = false;
+        for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
+          Value *Op2 = RootInst->getOperand(j);
+
+          // If this is part of a reduction (and the operation is not
+          // associatve), then we match all operands, but not those that are
+          // part of the reduction.
+          if (InReduction)
+            if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
+              if (Reductions.isPairInSame(RootInst, Op2I))
+                continue;
+
+          DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
+          if (BMI != BaseMap.end()) {
+            Op2 = BMI->second;
+          } else {
+            for (auto &DRS : RootSets) {
+              if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
+                Op2 = DRS.BaseInst;
+                break;
+              }
+            }
+          }
+
+          if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+            // If we've not already decided to swap the matched operands, and
+            // we've not already matched our first operand (note that we could
+            // have skipped matching the first operand because it is part of a
+            // reduction above), and the instruction is commutative, then try
+            // the swapped match.
+            if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
+                BaseInst->getOperand(!j) == Op2) {
+              Swapped = true;
+            } else {
+              DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+                    << " vs. " << *RootInst << " (operand " << j << ")\n");
+              return false;
+            }
+          }
+
+          SomeOpMatched = true;
+        }
+      }
+
+      if ((!PossibleRedLastSet.count(BaseInst) &&
+           hasUsesOutsideLoop(BaseInst, L)) ||
+          (!PossibleRedLastSet.count(RootInst) &&
+           hasUsesOutsideLoop(RootInst, L))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
+                        " vs. " << *RootInst << " (uses outside loop)\n");
+        return false;
+      }
+
+      Reductions.recordPair(BaseInst, RootInst, Iter);
+      BaseMap.insert(std::make_pair(RootInst, BaseInst));
+
+      LastRootIt = RootIt;
+      Visited.insert(BaseInst);
+      Visited.insert(RootInst);
+      BaseIt = nextInstr(0, Uses, Visited);
+      RootIt = nextInstr(Iter, Uses, Visited);
+    }
+    assert (BaseIt == Uses.end() && RootIt == Uses.end() &&
+            "Mismatched set sizes!");
+  }
+
+  DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
+                  *IV << "\n");
+
+  return true;
+}
+
+void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
+  BasicBlock *Header = L->getHeader();
+  // Remove instructions associated with non-base iterations.
+  for (BasicBlock::reverse_iterator J = Header->rbegin();
+       J != Header->rend();) {
+    unsigned I = Uses[&*J].find_first();
+    if (I > 0 && I < IL_All) {
+      Instruction *D = &*J;
+      DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
+      D->eraseFromParent();
+      continue;
+    }
+
+    ++J;
+  }
+  bool Negative = IVToIncMap[IV] < 0;
+  const DataLayout &DL = Header->getModule()->getDataLayout();
+
+  // We need to create a new induction variable for each different BaseInst.
+  for (auto &DRS : RootSets) {
+    // Insert the new induction variable.
+    const SCEVAddRecExpr *RealIVSCEV =
+      cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+    const SCEV *Start = RealIVSCEV->getStart();
+    const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>(SE->getAddRecExpr(
+        Start, SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1), L,
+        SCEV::FlagAnyWrap));
+    { // Limit the lifetime of SCEVExpander.
+      SCEVExpander Expander(*SE, DL, "reroll");
+      Value *NewIV = Expander.expandCodeFor(H, IV->getType(), &Header->front());
+
+      for (auto &KV : Uses) {
+        if (KV.second.find_first() == 0)
+          KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV);
+      }
+
+      if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
+        // FIXME: Why do we need this check?
+        if (Uses[BI].find_first() == IL_All) {
+          const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
+
+          // Iteration count SCEV minus 1
+          const SCEV *ICMinus1SCEV = SE->getMinusSCEV(
+              ICSCEV, SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1));
+
+          Value *ICMinus1; // Iteration count minus 1
+          if (isa<SCEVConstant>(ICMinus1SCEV)) {
+            ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI);
+          } else {
+            BasicBlock *Preheader = L->getLoopPreheader();
+            if (!Preheader)
+              Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
+
+            ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
+                                              Preheader->getTerminator());
+          }
+
+          Value *Cond =
+            new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond");
+          BI->setCondition(Cond);
+
+          if (BI->getSuccessor(1) != Header)
+            BI->swapSuccessors();
+        }
+      }
+    }
+  }
+
+  SimplifyInstructionsInBlock(Header, TLI);
+  DeleteDeadPHIs(Header, TLI);
+}
+
+// Validate the selected reductions. All iterations must have an isomorphic
+// part of the reduction chain and, for non-associative reductions, the chain
+// entries must appear in order.
+bool LoopReroll::ReductionTracker::validateSelected() {
+  // For a non-associative reduction, the chain entries must appear in order.
+  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+       RI != RIE; ++RI) {
+    int i = *RI;
+    int PrevIter = 0, BaseCount = 0, Count = 0;
+    for (Instruction *J : PossibleReds[i]) {
+      // Note that all instructions in the chain must have been found because
+      // all instructions in the function must have been assigned to some
+      // iteration.
+      int Iter = PossibleRedIter[J];
+      if (Iter != PrevIter && Iter != PrevIter + 1 &&
+          !PossibleReds[i].getReducedValue()->isAssociative()) {
+        DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
+                        J << "\n");
+        return false;
+      }
+
+      if (Iter != PrevIter) {
+        if (Count != BaseCount) {
+          DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
+                " reduction use count " << Count <<
+                " is not equal to the base use count " <<
+                BaseCount << "\n");
+          return false;
+        }
+
+        Count = 0;
+      }
+
+      ++Count;
+      if (Iter == 0)
+        ++BaseCount;
+
+      PrevIter = Iter;
+    }
+  }
+
+  return true;
+}
+
+// For all selected reductions, remove all parts except those in the first
+// iteration (and the PHI). Replace outside uses of the reduced value with uses
+// of the first-iteration reduced value (in other words, reroll the selected
+// reductions).
+void LoopReroll::ReductionTracker::replaceSelected() {
+  // Fixup reductions to refer to the last instruction associated with the
+  // first iteration (not the last).
+  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+       RI != RIE; ++RI) {
+    int i = *RI;
+    int j = 0;
+    for (int e = PossibleReds[i].size(); j != e; ++j)
+      if (PossibleRedIter[PossibleReds[i][j]] != 0) {
+        --j;
+        break;
+      }
+
+    // Replace users with the new end-of-chain value.
+    SmallInstructionVector Users;
+    for (User *U : PossibleReds[i].getReducedValue()->users()) {
+      Users.push_back(cast<Instruction>(U));
+    }
+
+    for (SmallInstructionVector::iterator J = Users.begin(),
+         JE = Users.end(); J != JE; ++J)
+      (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+                              PossibleReds[i][j]);
+  }
+}
+
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1                <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2                <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+// be intermixed with eachother. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+//   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+//     the next unmatched instruction in f(%iv.(i+1)).
+//   - Ensure that both matched instructions don't have any external users
+//     (with the exception of last-in-chain reduction instructions).
+//   - Track the (aliasing) write set, and other side effects, of all
+//     instructions that belong to future iterations that come before the matched
+//     instructions. If the matched instructions read from that write set, then
+//     f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+//     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+//     if any of these future instructions had side effects (could not be
+//     speculatively executed), and so do the matched instructions, when we
+//     cannot reorder those side-effect-producing instructions, and rerolling
+//     fails.
+//
+// Finally, we make sure that all loop instructions are either loop increment
+// roots, belong to simple latch code, parts of validated reductions, part of
+// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
+// have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+                        const SCEV *IterCount,
+                        ReductionTracker &Reductions) {
+  DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
+                          IVToIncMap);
+
+  if (!DAGRoots.findRoots())
+    return false;
+  DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
+                  *IV << "\n");
+
+  if (!DAGRoots.validate(Reductions))
+    return false;
+  if (!Reductions.validateSelected())
+    return false;
+  // At this point, we've validated the rerolling, and we're committed to
+  // making changes!
+
+  Reductions.replaceSelected();
+  DAGRoots.replace(IterCount);
+
+  ++NumRerolledLoops;
+  return true;
+}
+
+bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+  BasicBlock *Header = L->getHeader();
+  DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
+        "] Loop %" << Header->getName() << " (" <<
+        L->getNumBlocks() << " block(s))\n");
+
+  bool Changed = false;
+
+  // For now, we'll handle only single BB loops.
+  if (L->getNumBlocks() > 1)
+    return Changed;
+
+  if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+    return Changed;
+
+  const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
+  const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType()));
+  DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
+
+  // First, we need to find the induction variable with respect to which we can
+  // reroll (there may be several possible options).
+  SmallInstructionVector PossibleIVs;
+  IVToIncMap.clear();
+  collectPossibleIVs(L, PossibleIVs);
+
+  if (PossibleIVs.empty()) {
+    DEBUG(dbgs() << "LRR: No possible IVs found\n");
+    return Changed;
+  }
+
+  ReductionTracker Reductions;
+  collectPossibleReductions(L, Reductions);
+
+  // For each possible IV, collect the associated possible set of 'root' nodes
+  // (i+1, i+2, etc.).
+  for (SmallInstructionVector::iterator I = PossibleIVs.begin(),
+       IE = PossibleIVs.end(); I != IE; ++I)
+    if (reroll(*I, L, Header, IterCount, Reductions)) {
+      Changed = true;
+      break;
+    }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
new file mode 100644
index 0000000..5e6c2da
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -0,0 +1,624 @@
+//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Loop Rotation Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-rotate"
+
+static cl::opt<unsigned>
+DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden,
+       cl::desc("The default maximum header size for automatic loop rotation"));
+
+STATISTIC(NumRotated, "Number of loops rotated");
+
+/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
+/// old header into the preheader.  If there were uses of the values produced by
+/// these instruction that were outside of the loop, we have to insert PHI nodes
+/// to merge the two values.  Do this now.
+static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
+                                            BasicBlock *OrigPreheader,
+                                            ValueToValueMapTy &ValueMap) {
+  // Remove PHI node entries that are no longer live.
+  BasicBlock::iterator I, E = OrigHeader->end();
+  for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
+    PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
+
+  // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
+  // as necessary.
+  SSAUpdater SSA;
+  for (I = OrigHeader->begin(); I != E; ++I) {
+    Value *OrigHeaderVal = &*I;
+
+    // If there are no uses of the value (e.g. because it returns void), there
+    // is nothing to rewrite.
+    if (OrigHeaderVal->use_empty())
+      continue;
+
+    Value *OrigPreHeaderVal = ValueMap[OrigHeaderVal];
+
+    // The value now exits in two versions: the initial value in the preheader
+    // and the loop "next" value in the original header.
+    SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+    SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
+    SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
+
+    // Visit each use of the OrigHeader instruction.
+    for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
+         UE = OrigHeaderVal->use_end(); UI != UE; ) {
+      // Grab the use before incrementing the iterator.
+      Use &U = *UI;
+
+      // Increment the iterator before removing the use from the list.
+      ++UI;
+
+      // SSAUpdater can't handle a non-PHI use in the same block as an
+      // earlier def. We can easily handle those cases manually.
+      Instruction *UserInst = cast<Instruction>(U.getUser());
+      if (!isa<PHINode>(UserInst)) {
+        BasicBlock *UserBB = UserInst->getParent();
+
+        // The original users in the OrigHeader are already using the
+        // original definitions.
+        if (UserBB == OrigHeader)
+          continue;
+
+        // Users in the OrigPreHeader need to use the value to which the
+        // original definitions are mapped.
+        if (UserBB == OrigPreheader) {
+          U = OrigPreHeaderVal;
+          continue;
+        }
+      }
+
+      // Anything else can be handled by SSAUpdater.
+      SSA.RewriteUse(U);
+    }
+  }
+}
+
+/// Rotate loop LP. Return true if the loop is rotated.
+///
+/// \param SimplifiedLatch is true if the latch was just folded into the final
+/// loop exit. In this case we may want to rotate even though the new latch is
+/// now an exiting branch. This rotation would have happened had the latch not
+/// been simplified. However, if SimplifiedLatch is false, then we avoid
+/// rotating loops in which the latch exits to avoid excessive or endless
+/// rotation. LoopRotate should be repeatable and converge to a canonical
+/// form. This property is satisfied because simplifying the loop latch can only
+/// happen once across multiple invocations of the LoopRotate pass.
+static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
+                       const TargetTransformInfo *TTI, AssumptionCache *AC,
+                       DominatorTree *DT, ScalarEvolution *SE,
+                       bool SimplifiedLatch) {
+  // If the loop has only one block then there is not much to rotate.
+  if (L->getBlocks().size() == 1)
+    return false;
+
+  BasicBlock *OrigHeader = L->getHeader();
+  BasicBlock *OrigLatch = L->getLoopLatch();
+
+  BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+  if (!BI || BI->isUnconditional())
+    return false;
+
+  // If the loop header is not one of the loop exiting blocks then
+  // either this loop is already rotated or it is not
+  // suitable for loop rotation transformations.
+  if (!L->isLoopExiting(OrigHeader))
+    return false;
+
+  // If the loop latch already contains a branch that leaves the loop then the
+  // loop is already rotated.
+  if (!OrigLatch)
+    return false;
+
+  // Rotate if either the loop latch does *not* exit the loop, or if the loop
+  // latch was just simplified.
+  if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch)
+    return false;
+
+  // Check size of original header and reject loop if it is very big or we can't
+  // duplicate blocks inside it.
+  {
+    SmallPtrSet<const Value *, 32> EphValues;
+    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+    CodeMetrics Metrics;
+    Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
+    if (Metrics.notDuplicatable) {
+      DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+            << " instructions: "; L->dump());
+      return false;
+    }
+    if (Metrics.NumInsts > MaxHeaderSize)
+      return false;
+  }
+
+  // Now, this loop is suitable for rotation.
+  BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+  // If the loop could not be converted to canonical form, it must have an
+  // indirectbr in it, just give up.
+  if (!OrigPreheader)
+    return false;
+
+  // Anything ScalarEvolution may know about this loop or the PHI nodes
+  // in its header will soon be invalidated.
+  if (SE)
+    SE->forgetLoop(L);
+
+  DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+
+  // Find new Loop header. NewHeader is a Header's one and only successor
+  // that is inside loop.  Header's other successor is outside the
+  // loop.  Otherwise loop is not suitable for rotation.
+  BasicBlock *Exit = BI->getSuccessor(0);
+  BasicBlock *NewHeader = BI->getSuccessor(1);
+  if (L->contains(Exit))
+    std::swap(Exit, NewHeader);
+  assert(NewHeader && "Unable to determine new loop header");
+  assert(L->contains(NewHeader) && !L->contains(Exit) &&
+         "Unable to determine loop header and exit blocks");
+
+  // This code assumes that the new header has exactly one predecessor.
+  // Remove any single-entry PHI nodes in it.
+  assert(NewHeader->getSinglePredecessor() &&
+         "New header doesn't have one pred!");
+  FoldSingleEntryPHINodes(NewHeader);
+
+  // Begin by walking OrigHeader and populating ValueMap with an entry for
+  // each Instruction.
+  BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+  ValueToValueMapTy ValueMap;
+
+  // For PHI nodes, the value available in OldPreHeader is just the
+  // incoming value from OldPreHeader.
+  for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+    ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
+
+  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+  // For the rest of the instructions, either hoist to the OrigPreheader if
+  // possible or create a clone in the OldPreHeader if not.
+  TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
+  while (I != E) {
+    Instruction *Inst = &*I++;
+
+    // If the instruction's operands are invariant and it doesn't read or write
+    // memory, then it is safe to hoist.  Doing this doesn't change the order of
+    // execution in the preheader, but does prevent the instruction from
+    // executing in each iteration of the loop.  This means it is safe to hoist
+    // something that might trap, but isn't safe to hoist something that reads
+    // memory (without proving that the loop doesn't write).
+    if (L->hasLoopInvariantOperands(Inst) &&
+        !Inst->mayReadFromMemory() && !Inst->mayWriteToMemory() &&
+        !isa<TerminatorInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst) &&
+        !isa<AllocaInst>(Inst)) {
+      Inst->moveBefore(LoopEntryBranch);
+      continue;
+    }
+
+    // Otherwise, create a duplicate of the instruction.
+    Instruction *C = Inst->clone();
+
+    // Eagerly remap the operands of the instruction.
+    RemapInstruction(C, ValueMap,
+                     RF_NoModuleLevelChanges|RF_IgnoreMissingEntries);
+
+    // With the operands remapped, see if the instruction constant folds or is
+    // otherwise simplifyable.  This commonly occurs because the entry from PHI
+    // nodes allows icmps and other instructions to fold.
+    // FIXME: Provide TLI, DT, AC to SimplifyInstruction.
+    Value *V = SimplifyInstruction(C, DL);
+    if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+      // If so, then delete the temporary instruction and stick the folded value
+      // in the map.
+      delete C;
+      ValueMap[Inst] = V;
+    } else {
+      // Otherwise, stick the new instruction into the new block!
+      C->setName(Inst->getName());
+      C->insertBefore(LoopEntryBranch);
+      ValueMap[Inst] = C;
+    }
+  }
+
+  // Along with all the other instructions, we just cloned OrigHeader's
+  // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+  // successors by duplicating their incoming values for OrigHeader.
+  TerminatorInst *TI = OrigHeader->getTerminator();
+  for (BasicBlock *SuccBB : TI->successors())
+    for (BasicBlock::iterator BI = SuccBB->begin();
+         PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+      PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+  // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+  // OrigPreHeader's old terminator (the original branch into the loop), and
+  // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+  LoopEntryBranch->eraseFromParent();
+
+  // If there were any uses of instructions in the duplicated block outside the
+  // loop, update them, inserting PHI nodes as required
+  RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
+
+  // NewHeader is now the header of the loop.
+  L->moveToHeader(NewHeader);
+  assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+
+  // At this point, we've finished our major CFG changes.  As part of cloning
+  // the loop into the preheader we've simplified instructions and the
+  // duplicated conditional branch may now be branching on a constant.  If it is
+  // branching on a constant and if that constant means that we enter the loop,
+  // then we fold away the cond branch to an uncond branch.  This simplifies the
+  // loop in cases important for nested loops, and it also means we don't have
+  // to split as many edges.
+  BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+  assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+  if (!isa<ConstantInt>(PHBI->getCondition()) ||
+      PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero())
+          != NewHeader) {
+    // The conditional branch can't be folded, handle the general case.
+    // Update DominatorTree to reflect the CFG change we just made.  Then split
+    // edges as necessary to preserve LoopSimplify form.
+    if (DT) {
+      // Everything that was dominated by the old loop header is now dominated
+      // by the original loop preheader. Conceptually the header was merged
+      // into the preheader, even though we reuse the actual block as a new
+      // loop latch.
+      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+                                                   OrigHeaderNode->end());
+      DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
+      for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
+        DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
+
+      assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
+      assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
+
+      // Update OrigHeader to be dominated by the new header block.
+      DT->changeImmediateDominator(OrigHeader, OrigLatch);
+    }
+
+    // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+    // thus is not a preheader anymore.
+    // Split the edge to form a real preheader.
+    BasicBlock *NewPH = SplitCriticalEdge(
+        OrigPreheader, NewHeader,
+        CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+    NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+    // Preserve canonical loop form, which means that 'Exit' should have only
+    // one predecessor. Note that Exit could be an exit block for multiple
+    // nested loops, causing both of the edges to now be critical and need to
+    // be split.
+    SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+    bool SplitLatchEdge = false;
+    for (SmallVectorImpl<BasicBlock *>::iterator PI = ExitPreds.begin(),
+                                                 PE = ExitPreds.end();
+         PI != PE; ++PI) {
+      // We only need to split loop exit edges.
+      Loop *PredLoop = LI->getLoopFor(*PI);
+      if (!PredLoop || PredLoop->contains(Exit))
+        continue;
+      if (isa<IndirectBrInst>((*PI)->getTerminator()))
+        continue;
+      SplitLatchEdge |= L->getLoopLatch() == *PI;
+      BasicBlock *ExitSplit = SplitCriticalEdge(
+          *PI, Exit, CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+      ExitSplit->moveBefore(Exit);
+    }
+    assert(SplitLatchEdge &&
+           "Despite splitting all preds, failed to split latch exit?");
+  } else {
+    // We can fold the conditional branch in the preheader, this makes things
+    // simpler. The first step is to remove the extra edge to the Exit block.
+    Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+    BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+    NewBI->setDebugLoc(PHBI->getDebugLoc());
+    PHBI->eraseFromParent();
+
+    // With our CFG finalized, update DomTree if it is available.
+    if (DT) {
+      // Update OrigHeader to be dominated by the new header block.
+      DT->changeImmediateDominator(NewHeader, OrigPreheader);
+      DT->changeImmediateDominator(OrigHeader, OrigLatch);
+
+      // Brute force incremental dominator tree update. Call
+      // findNearestCommonDominator on all CFG predecessors of each child of the
+      // original header.
+      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+                                                   OrigHeaderNode->end());
+      bool Changed;
+      do {
+        Changed = false;
+        for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
+          DomTreeNode *Node = HeaderChildren[I];
+          BasicBlock *BB = Node->getBlock();
+
+          pred_iterator PI = pred_begin(BB);
+          BasicBlock *NearestDom = *PI;
+          for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
+            NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
+
+          // Remember if this changes the DomTree.
+          if (Node->getIDom()->getBlock() != NearestDom) {
+            DT->changeImmediateDominator(BB, NearestDom);
+            Changed = true;
+          }
+        }
+
+      // If the dominator changed, this may have an effect on other
+      // predecessors, continue until we reach a fixpoint.
+      } while (Changed);
+    }
+  }
+
+  assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+  assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+
+  // Now that the CFG and DomTree are in a consistent state again, try to merge
+  // the OrigHeader block into OrigLatch.  This will succeed if they are
+  // connected by an unconditional branch.  This is just a cleanup so the
+  // emitted code isn't too gross in this common case.
+  MergeBlockIntoPredecessor(OrigHeader, DT, LI);
+
+  DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
+  ++NumRotated;
+  return true;
+}
+
+/// Determine whether the instructions in this range may be safely and cheaply
+/// speculated. This is not an important enough situation to develop complex
+/// heuristics. We handle a single arithmetic instruction along with any type
+/// conversions.
+static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
+                                  BasicBlock::iterator End, Loop *L) {
+  bool seenIncrement = false;
+  bool MultiExitLoop = false;
+
+  if (!L->getExitingBlock())
+    MultiExitLoop = true;
+
+  for (BasicBlock::iterator I = Begin; I != End; ++I) {
+
+    if (!isSafeToSpeculativelyExecute(&*I))
+      return false;
+
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    switch (I->getOpcode()) {
+    default:
+      return false;
+    case Instruction::GetElementPtr:
+      // GEPs are cheap if all indices are constant.
+      if (!cast<GEPOperator>(I)->hasAllConstantIndices())
+        return false;
+      // fall-thru to increment case
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr: {
+      Value *IVOpnd = !isa<Constant>(I->getOperand(0))
+                          ? I->getOperand(0)
+                          : !isa<Constant>(I->getOperand(1))
+                                ? I->getOperand(1)
+                                : nullptr;
+      if (!IVOpnd)
+        return false;
+
+      // If increment operand is used outside of the loop, this speculation
+      // could cause extra live range interference.
+      if (MultiExitLoop) {
+        for (User *UseI : IVOpnd->users()) {
+          auto *UserInst = cast<Instruction>(UseI);
+          if (!L->contains(UserInst))
+            return false;
+        }
+      }
+
+      if (seenIncrement)
+        return false;
+      seenIncrement = true;
+      break;
+    }
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      // ignore type conversions
+      break;
+    }
+  }
+  return true;
+}
+
+/// Fold the loop tail into the loop exit by speculating the loop tail
+/// instructions. Typically, this is a single post-increment. In the case of a
+/// simple 2-block loop, hoisting the increment can be much better than
+/// duplicating the entire loop header. In the case of loops with early exits,
+/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
+/// canonical form so downstream passes can handle it.
+///
+/// I don't believe this invalidates SCEV.
+static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) {
+  BasicBlock *Latch = L->getLoopLatch();
+  if (!Latch || Latch->hasAddressTaken())
+    return false;
+
+  BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
+  if (!Jmp || !Jmp->isUnconditional())
+    return false;
+
+  BasicBlock *LastExit = Latch->getSinglePredecessor();
+  if (!LastExit || !L->isLoopExiting(LastExit))
+    return false;
+
+  BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
+  if (!BI)
+    return false;
+
+  if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
+    return false;
+
+  DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
+        << LastExit->getName() << "\n");
+
+  // Hoist the instructions from Latch into LastExit.
+  LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
+                                 Latch->begin(), Jmp->getIterator());
+
+  unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
+  BasicBlock *Header = Jmp->getSuccessor(0);
+  assert(Header == L->getHeader() && "expected a backward branch");
+
+  // Remove Latch from the CFG so that LastExit becomes the new Latch.
+  BI->setSuccessor(FallThruPath, Header);
+  Latch->replaceSuccessorsPhiUsesWith(LastExit);
+  Jmp->eraseFromParent();
+
+  // Nuke the Latch block.
+  assert(Latch->empty() && "unable to evacuate Latch");
+  LI->removeBlock(Latch);
+  if (DT)
+    DT->eraseNode(Latch);
+  Latch->eraseFromParent();
+  return true;
+}
+
+/// Rotate \c L as many times as possible. Return true if the loop is rotated
+/// at least once.
+static bool iterativelyRotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI,
+                                  const TargetTransformInfo *TTI,
+                                  AssumptionCache *AC, DominatorTree *DT,
+                                  ScalarEvolution *SE) {
+  // Save the loop metadata.
+  MDNode *LoopMD = L->getLoopID();
+
+  // Simplify the loop latch before attempting to rotate the header
+  // upward. Rotation may not be needed if the loop tail can be folded into the
+  // loop exit.
+  bool SimplifiedLatch = simplifyLoopLatch(L, LI, DT);
+
+  // One loop can be rotated multiple times.
+  bool MadeChange = false;
+  while (rotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE, SimplifiedLatch)) {
+    MadeChange = true;
+    SimplifiedLatch = false;
+  }
+
+  // Restore the loop metadata.
+  // NB! We presume LoopRotation DOESN'T ADD its own metadata.
+  if ((MadeChange || SimplifiedLatch) && LoopMD)
+    L->setLoopID(LoopMD);
+
+  return MadeChange;
+}
+
+namespace {
+
+class LoopRotate : public LoopPass {
+  unsigned MaxHeaderSize;
+
+public:
+  static char ID; // Pass ID, replacement for typeid
+  LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) {
+    initializeLoopRotatePass(*PassRegistry::getPassRegistry());
+    if (SpecifiedMaxHeaderSize == -1)
+      MaxHeaderSize = DefaultRotationThreshold;
+    else
+      MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
+  }
+
+  // LCSSA form makes instruction renaming easier.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<AAResultsWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addPreservedID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addPreservedID(LCSSAID);
+    AU.addPreserved<ScalarEvolutionWrapperPass>();
+    AU.addPreserved<SCEVAAWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addPreserved<BasicAAWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+    if (skipOptnoneFunction(L))
+      return false;
+    Function &F = *L->getHeader()->getParent();
+
+    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+    auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+    auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+    auto *SE = SEWP ? &SEWP->getSE() : nullptr;
+
+    return iterativelyRotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE);
+  }
+};
+}
+
+char LoopRotate::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
+
+Pass *llvm::createLoopRotatePass(int MaxHeaderSize) {
+  return new LoopRotate(MaxHeaderSize);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
new file mode 100644
index 0000000..2101225
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -0,0 +1,5024 @@
+//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into forms suitable for efficient execution
+// on the target.
+//
+// This pass performs a strength reduction on array references inside loops that
+// have as one or more of their components the loop induction variable, it
+// rewrites expressions to take advantage of scaled-index addressing modes
+// available on the target, and it performs a variety of other optimizations
+// related to loop induction variables.
+//
+// Terminology note: this code has a lot of handling for "post-increment" or
+// "post-inc" users. This is not talking about post-increment addressing modes;
+// it is instead talking about code like this:
+//
+//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
+//   ...
+//   %i.next = add %i, 1
+//   %c = icmp eq %i.next, %n
+//
+// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
+// it's useful to think about these as the same register, with some uses using
+// the value of the register before the add and some using it after. In this
+// example, the icmp is a post-increment user, since it uses %i.next, which is
+// the value of the induction variable after the increment. The other common
+// case of post-increment users is users outside the loop.
+//
+// TODO: More sophistication in the way Formulae are generated and filtered.
+//
+// TODO: Handle multiple loops at a time.
+//
+// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
+//       of a GlobalValue?
+//
+// TODO: When truncation is free, truncate ICmp users' operands to make it a
+//       smaller encoding (on x86 at least).
+//
+// TODO: When a negated register is used by an add (such as in a list of
+//       multiple base registers, or as the increment expression in an addrec),
+//       we may not actually need both reg and (-1 * reg) in registers; the
+//       negation can be implemented by using a sub instead of an add. The
+//       lack of support for taking this into consideration when making
+//       register pressure decisions is partly worked around by the "Special"
+//       use kind.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reduce"
+
+/// MaxIVUsers is an arbitrary threshold that provides an early opportunitiy for
+/// bail out. This threshold is far beyond the number of users that LSR can
+/// conceivably solve, so it should not affect generated code, but catches the
+/// worst cases before LSR burns too much compile time and stack space.
+static const unsigned MaxIVUsers = 200;
+
+// Temporary flag to cleanup congruent phis after LSR phi expansion.
+// It's currently disabled until we can determine whether it's truly useful or
+// not. The flag should be removed after the v3.0 release.
+// This is now needed for ivchains.
+static cl::opt<bool> EnablePhiElim(
+  "enable-lsr-phielim", cl::Hidden, cl::init(true),
+  cl::desc("Enable LSR phi elimination"));
+
+#ifndef NDEBUG
+// Stress test IV chain generation.
+static cl::opt<bool> StressIVChain(
+  "stress-ivchain", cl::Hidden, cl::init(false),
+  cl::desc("Stress test LSR IV chains"));
+#else
+static bool StressIVChain = false;
+#endif
+
+namespace {
+
+struct MemAccessTy {
+  /// Used in situations where the accessed memory type is unknown.
+  static const unsigned UnknownAddressSpace = ~0u;
+
+  Type *MemTy;
+  unsigned AddrSpace;
+
+  MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {}
+
+  MemAccessTy(Type *Ty, unsigned AS) :
+    MemTy(Ty), AddrSpace(AS) {}
+
+  bool operator==(MemAccessTy Other) const {
+    return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
+  }
+
+  bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
+
+  static MemAccessTy getUnknown(LLVMContext &Ctx) {
+    return MemAccessTy(Type::getVoidTy(Ctx), UnknownAddressSpace);
+  }
+};
+
+/// This class holds data which is used to order reuse candidates.
+class RegSortData {
+public:
+  /// This represents the set of LSRUse indices which reference
+  /// a particular register.
+  SmallBitVector UsedByIndices;
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+void RegSortData::print(raw_ostream &OS) const {
+  OS << "[NumUses=" << UsedByIndices.count() << ']';
+}
+
+LLVM_DUMP_METHOD
+void RegSortData::dump() const {
+  print(errs()); errs() << '\n';
+}
+
+namespace {
+
+/// Map register candidates to information about how they are used.
+class RegUseTracker {
+  typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
+
+  RegUsesTy RegUsesMap;
+  SmallVector<const SCEV *, 16> RegSequence;
+
+public:
+  void countRegister(const SCEV *Reg, size_t LUIdx);
+  void dropRegister(const SCEV *Reg, size_t LUIdx);
+  void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
+
+  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
+
+  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
+
+  void clear();
+
+  typedef SmallVectorImpl<const SCEV *>::iterator iterator;
+  typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
+  iterator begin() { return RegSequence.begin(); }
+  iterator end()   { return RegSequence.end(); }
+  const_iterator begin() const { return RegSequence.begin(); }
+  const_iterator end() const   { return RegSequence.end(); }
+};
+
+}
+
+void
+RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
+  std::pair<RegUsesTy::iterator, bool> Pair =
+    RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
+  RegSortData &RSD = Pair.first->second;
+  if (Pair.second)
+    RegSequence.push_back(Reg);
+  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
+  RSD.UsedByIndices.set(LUIdx);
+}
+
+void
+RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
+  RegUsesTy::iterator It = RegUsesMap.find(Reg);
+  assert(It != RegUsesMap.end());
+  RegSortData &RSD = It->second;
+  assert(RSD.UsedByIndices.size() > LUIdx);
+  RSD.UsedByIndices.reset(LUIdx);
+}
+
+void
+RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
+  assert(LUIdx <= LastLUIdx);
+
+  // Update RegUses. The data structure is not optimized for this purpose;
+  // we must iterate through it and update each of the bit vectors.
+  for (auto &Pair : RegUsesMap) {
+    SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
+    if (LUIdx < UsedByIndices.size())
+      UsedByIndices[LUIdx] =
+        LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
+    UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
+  }
+}
+
+bool
+RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
+  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+  if (I == RegUsesMap.end())
+    return false;
+  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
+  int i = UsedByIndices.find_first();
+  if (i == -1) return false;
+  if ((size_t)i != LUIdx) return true;
+  return UsedByIndices.find_next(i) != -1;
+}
+
+const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
+  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+  assert(I != RegUsesMap.end() && "Unknown register!");
+  return I->second.UsedByIndices;
+}
+
+void RegUseTracker::clear() {
+  RegUsesMap.clear();
+  RegSequence.clear();
+}
+
+namespace {
+
+/// This class holds information that describes a formula for computing
+/// satisfying a use. It may include broken-out immediates and scaled registers.
+struct Formula {
+  /// Global base address used for complex addressing.
+  GlobalValue *BaseGV;
+
+  /// Base offset for complex addressing.
+  int64_t BaseOffset;
+
+  /// Whether any complex addressing has a base register.
+  bool HasBaseReg;
+
+  /// The scale of any complex addressing.
+  int64_t Scale;
+
+  /// The list of "base" registers for this use. When this is non-empty. The
+  /// canonical representation of a formula is
+  /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
+  /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+  /// #1 enforces that the scaled register is always used when at least two
+  /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
+  /// #2 enforces that 1 * reg is reg.
+  /// This invariant can be temporarly broken while building a formula.
+  /// However, every formula inserted into the LSRInstance must be in canonical
+  /// form.
+  SmallVector<const SCEV *, 4> BaseRegs;
+
+  /// The 'scaled' register for this use. This should be non-null when Scale is
+  /// not zero.
+  const SCEV *ScaledReg;
+
+  /// An additional constant offset which added near the use. This requires a
+  /// temporary register, but the offset itself can live in an add immediate
+  /// field rather than a register.
+  int64_t UnfoldedOffset;
+
+  Formula()
+      : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0),
+        ScaledReg(nullptr), UnfoldedOffset(0) {}
+
+  void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
+
+  bool isCanonical() const;
+
+  void canonicalize();
+
+  bool unscale();
+
+  size_t getNumRegs() const;
+  Type *getType() const;
+
+  void deleteBaseReg(const SCEV *&S);
+
+  bool referencesReg(const SCEV *S) const;
+  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
+                                  const RegUseTracker &RegUses) const;
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+/// Recursion helper for initialMatch.
+static void DoInitialMatch(const SCEV *S, Loop *L,
+                           SmallVectorImpl<const SCEV *> &Good,
+                           SmallVectorImpl<const SCEV *> &Bad,
+                           ScalarEvolution &SE) {
+  // Collect expressions which properly dominate the loop header.
+  if (SE.properlyDominates(S, L->getHeader())) {
+    Good.push_back(S);
+    return;
+  }
+
+  // Look at add operands.
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    for (const SCEV *S : Add->operands())
+      DoInitialMatch(S, L, Good, Bad, SE);
+    return;
+  }
+
+  // Look at addrec operands.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+    if (!AR->getStart()->isZero()) {
+      DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
+      DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
+                                      AR->getStepRecurrence(SE),
+                                      // FIXME: AR->getNoWrapFlags()
+                                      AR->getLoop(), SCEV::FlagAnyWrap),
+                     L, Good, Bad, SE);
+      return;
+    }
+
+  // Handle a multiplication by -1 (negation) if it didn't fold.
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
+    if (Mul->getOperand(0)->isAllOnesValue()) {
+      SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
+      const SCEV *NewMul = SE.getMulExpr(Ops);
+
+      SmallVector<const SCEV *, 4> MyGood;
+      SmallVector<const SCEV *, 4> MyBad;
+      DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
+      const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
+        SE.getEffectiveSCEVType(NewMul->getType())));
+      for (const SCEV *S : MyGood)
+        Good.push_back(SE.getMulExpr(NegOne, S));
+      for (const SCEV *S : MyBad)
+        Bad.push_back(SE.getMulExpr(NegOne, S));
+      return;
+    }
+
+  // Ok, we can't do anything interesting. Just stuff the whole thing into a
+  // register and hope for the best.
+  Bad.push_back(S);
+}
+
+/// Incorporate loop-variant parts of S into this Formula, attempting to keep
+/// all loop-invariant and loop-computable values in a single base register.
+void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
+  SmallVector<const SCEV *, 4> Good;
+  SmallVector<const SCEV *, 4> Bad;
+  DoInitialMatch(S, L, Good, Bad, SE);
+  if (!Good.empty()) {
+    const SCEV *Sum = SE.getAddExpr(Good);
+    if (!Sum->isZero())
+      BaseRegs.push_back(Sum);
+    HasBaseReg = true;
+  }
+  if (!Bad.empty()) {
+    const SCEV *Sum = SE.getAddExpr(Bad);
+    if (!Sum->isZero())
+      BaseRegs.push_back(Sum);
+    HasBaseReg = true;
+  }
+  canonicalize();
+}
+
+/// \brief Check whether or not this formula statisfies the canonical
+/// representation.
+/// \see Formula::BaseRegs.
+bool Formula::isCanonical() const {
+  if (ScaledReg)
+    return Scale != 1 || !BaseRegs.empty();
+  return BaseRegs.size() <= 1;
+}
+
+/// \brief Helper method to morph a formula into its canonical representation.
+/// \see Formula::BaseRegs.
+/// Every formula having more than one base register, must use the ScaledReg
+/// field. Otherwise, we would have to do special cases everywhere in LSR
+/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
+/// On the other hand, 1*reg should be canonicalized into reg.
+void Formula::canonicalize() {
+  if (isCanonical())
+    return;
+  // So far we did not need this case. This is easy to implement but it is
+  // useless to maintain dead code. Beside it could hurt compile time.
+  assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+  // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
+  ScaledReg = BaseRegs.back();
+  BaseRegs.pop_back();
+  Scale = 1;
+  size_t BaseRegsSize = BaseRegs.size();
+  size_t Try = 0;
+  // If ScaledReg is an invariant, try to find a variant expression.
+  while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
+    std::swap(ScaledReg, BaseRegs[Try++]);
+}
+
+/// \brief Get rid of the scale in the formula.
+/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
+/// \return true if it was possible to get rid of the scale, false otherwise.
+/// \note After this operation the formula may not be in the canonical form.
+bool Formula::unscale() {
+  if (Scale != 1)
+    return false;
+  Scale = 0;
+  BaseRegs.push_back(ScaledReg);
+  ScaledReg = nullptr;
+  return true;
+}
+
+/// Return the total number of register operands used by this formula. This does
+/// not include register uses implied by non-constant addrec strides.
+size_t Formula::getNumRegs() const {
+  return !!ScaledReg + BaseRegs.size();
+}
+
+/// Return the type of this formula, if it has one, or null otherwise. This type
+/// is meaningless except for the bit size.
+Type *Formula::getType() const {
+  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
+         ScaledReg ? ScaledReg->getType() :
+         BaseGV ? BaseGV->getType() :
+         nullptr;
+}
+
+/// Delete the given base reg from the BaseRegs list.
+void Formula::deleteBaseReg(const SCEV *&S) {
+  if (&S != &BaseRegs.back())
+    std::swap(S, BaseRegs.back());
+  BaseRegs.pop_back();
+}
+
+/// Test if this formula references the given register.
+bool Formula::referencesReg(const SCEV *S) const {
+  return S == ScaledReg ||
+         std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
+}
+
+/// Test whether this formula uses registers which are used by uses other than
+/// the use with the given index.
+bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
+                                         const RegUseTracker &RegUses) const {
+  if (ScaledReg)
+    if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
+      return true;
+  for (const SCEV *BaseReg : BaseRegs)
+    if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
+      return true;
+  return false;
+}
+
+void Formula::print(raw_ostream &OS) const {
+  bool First = true;
+  if (BaseGV) {
+    if (!First) OS << " + "; else First = false;
+    BaseGV->printAsOperand(OS, /*PrintType=*/false);
+  }
+  if (BaseOffset != 0) {
+    if (!First) OS << " + "; else First = false;
+    OS << BaseOffset;
+  }
+  for (const SCEV *BaseReg : BaseRegs) {
+    if (!First) OS << " + "; else First = false;
+    OS << "reg(" << *BaseReg << ')';
+  }
+  if (HasBaseReg && BaseRegs.empty()) {
+    if (!First) OS << " + "; else First = false;
+    OS << "**error: HasBaseReg**";
+  } else if (!HasBaseReg && !BaseRegs.empty()) {
+    if (!First) OS << " + "; else First = false;
+    OS << "**error: !HasBaseReg**";
+  }
+  if (Scale != 0) {
+    if (!First) OS << " + "; else First = false;
+    OS << Scale << "*reg(";
+    if (ScaledReg)
+      OS << *ScaledReg;
+    else
+      OS << "<unknown>";
+    OS << ')';
+  }
+  if (UnfoldedOffset != 0) {
+    if (!First) OS << " + ";
+    OS << "imm(" << UnfoldedOffset << ')';
+  }
+}
+
+LLVM_DUMP_METHOD
+void Formula::dump() const {
+  print(errs()); errs() << '\n';
+}
+
+/// Return true if the given addrec can be sign-extended without changing its
+/// value.
+static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+  Type *WideTy =
+    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
+  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+}
+
+/// Return true if the given add can be sign-extended without changing its
+/// value.
+static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
+  Type *WideTy =
+    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
+  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
+}
+
+/// Return true if the given mul can be sign-extended without changing its
+/// value.
+static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
+  Type *WideTy =
+    IntegerType::get(SE.getContext(),
+                     SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
+  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
+}
+
+/// Return an expression for LHS /s RHS, if it can be determined and if the
+/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
+/// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that
+/// the multiplication may overflow, which is useful when the result will be
+/// used in a context where the most significant bits are ignored.
+static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
+                                ScalarEvolution &SE,
+                                bool IgnoreSignificantBits = false) {
+  // Handle the trivial case, which works for any SCEV type.
+  if (LHS == RHS)
+    return SE.getConstant(LHS->getType(), 1);
+
+  // Handle a few RHS special cases.
+  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
+  if (RC) {
+    const APInt &RA = RC->getAPInt();
+    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
+    // some folding.
+    if (RA.isAllOnesValue())
+      return SE.getMulExpr(LHS, RC);
+    // Handle x /s 1 as x.
+    if (RA == 1)
+      return LHS;
+  }
+
+  // Check for a division of a constant by a constant.
+  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
+    if (!RC)
+      return nullptr;
+    const APInt &LA = C->getAPInt();
+    const APInt &RA = RC->getAPInt();
+    if (LA.srem(RA) != 0)
+      return nullptr;
+    return SE.getConstant(LA.sdiv(RA));
+  }
+
+  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
+    if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
+      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
+                                      IgnoreSignificantBits);
+      if (!Step) return nullptr;
+      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
+                                       IgnoreSignificantBits);
+      if (!Start) return nullptr;
+      // FlagNW is independent of the start value, step direction, and is
+      // preserved with smaller magnitude steps.
+      // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+      return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
+    }
+    return nullptr;
+  }
+
+  // Distribute the sdiv over add operands, if the add doesn't overflow.
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
+    if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
+      SmallVector<const SCEV *, 8> Ops;
+      for (const SCEV *S : Add->operands()) {
+        const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
+        if (!Op) return nullptr;
+        Ops.push_back(Op);
+      }
+      return SE.getAddExpr(Ops);
+    }
+    return nullptr;
+  }
+
+  // Check for a multiply operand that we can pull RHS out of.
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
+    if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
+      SmallVector<const SCEV *, 4> Ops;
+      bool Found = false;
+      for (const SCEV *S : Mul->operands()) {
+        if (!Found)
+          if (const SCEV *Q = getExactSDiv(S, RHS, SE,
+                                           IgnoreSignificantBits)) {
+            S = Q;
+            Found = true;
+          }
+        Ops.push_back(S);
+      }
+      return Found ? SE.getMulExpr(Ops) : nullptr;
+    }
+    return nullptr;
+  }
+
+  // Otherwise we don't know.
+  return nullptr;
+}
+
+/// If S involves the addition of a constant integer value, return that integer
+/// value, and mutate S to point to a new SCEV with that value excluded.
+static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
+  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+    if (C->getAPInt().getMinSignedBits() <= 64) {
+      S = SE.getConstant(C->getType(), 0);
+      return C->getValue()->getSExtValue();
+    }
+  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+    int64_t Result = ExtractImmediate(NewOps.front(), SE);
+    if (Result != 0)
+      S = SE.getAddExpr(NewOps);
+    return Result;
+  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+    int64_t Result = ExtractImmediate(NewOps.front(), SE);
+    if (Result != 0)
+      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+                           SCEV::FlagAnyWrap);
+    return Result;
+  }
+  return 0;
+}
+
+/// If S involves the addition of a GlobalValue address, return that symbol, and
+/// mutate S to point to a new SCEV with that value excluded.
+static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
+  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+    if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
+      S = SE.getConstant(GV->getType(), 0);
+      return GV;
+    }
+  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+    GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
+    if (Result)
+      S = SE.getAddExpr(NewOps);
+    return Result;
+  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+    GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
+    if (Result)
+      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+                           SCEV::FlagAnyWrap);
+    return Result;
+  }
+  return nullptr;
+}
+
+/// Returns true if the specified instruction is using the specified value as an
+/// address.
+static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+  bool isAddress = isa<LoadInst>(Inst);
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+    if (SI->getOperand(1) == OperandVal)
+      isAddress = true;
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    // Addressing modes can also be folded into prefetches and a variety
+    // of intrinsics.
+    switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::prefetch:
+      case Intrinsic::x86_sse_storeu_ps:
+      case Intrinsic::x86_sse2_storeu_pd:
+      case Intrinsic::x86_sse2_storeu_dq:
+      case Intrinsic::x86_sse2_storel_dq:
+        if (II->getArgOperand(0) == OperandVal)
+          isAddress = true;
+        break;
+    }
+  }
+  return isAddress;
+}
+
+/// Return the type of the memory being accessed.
+static MemAccessTy getAccessType(const Instruction *Inst) {
+  MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
+  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+    AccessTy.MemTy = SI->getOperand(0)->getType();
+    AccessTy.AddrSpace = SI->getPointerAddressSpace();
+  } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+    AccessTy.AddrSpace = LI->getPointerAddressSpace();
+  } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    // Addressing modes can also be folded into prefetches and a variety
+    // of intrinsics.
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::x86_sse_storeu_ps:
+    case Intrinsic::x86_sse2_storeu_pd:
+    case Intrinsic::x86_sse2_storeu_dq:
+    case Intrinsic::x86_sse2_storel_dq:
+      AccessTy.MemTy = II->getArgOperand(0)->getType();
+      break;
+    }
+  }
+
+  // All pointers have the same requirements, so canonicalize them to an
+  // arbitrary pointer type to minimize variation.
+  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
+    AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
+                                      PTy->getAddressSpace());
+
+  return AccessTy;
+}
+
+/// Return true if this AddRec is already a phi in its loop.
+static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+  for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    if (SE.isSCEVable(PN->getType()) &&
+        (SE.getEffectiveSCEVType(PN->getType()) ==
+         SE.getEffectiveSCEVType(AR->getType())) &&
+        SE.getSCEV(PN) == AR)
+      return true;
+  }
+  return false;
+}
+
+/// Check if expanding this expression is likely to incur significant cost. This
+/// is tricky because SCEV doesn't track which expressions are actually computed
+/// by the current IR.
+///
+/// We currently allow expansion of IV increments that involve adds,
+/// multiplication by constants, and AddRecs from existing phis.
+///
+/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
+/// obvious multiple of the UDivExpr.
+static bool isHighCostExpansion(const SCEV *S,
+                                SmallPtrSetImpl<const SCEV*> &Processed,
+                                ScalarEvolution &SE) {
+  // Zero/One operand expressions
+  switch (S->getSCEVType()) {
+  case scUnknown:
+  case scConstant:
+    return false;
+  case scTruncate:
+    return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
+                               Processed, SE);
+  case scZeroExtend:
+    return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
+                               Processed, SE);
+  case scSignExtend:
+    return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
+                               Processed, SE);
+  }
+
+  if (!Processed.insert(S).second)
+    return false;
+
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    for (const SCEV *S : Add->operands()) {
+      if (isHighCostExpansion(S, Processed, SE))
+        return true;
+    }
+    return false;
+  }
+
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+    if (Mul->getNumOperands() == 2) {
+      // Multiplication by a constant is ok
+      if (isa<SCEVConstant>(Mul->getOperand(0)))
+        return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
+
+      // If we have the value of one operand, check if an existing
+      // multiplication already generates this expression.
+      if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
+        Value *UVal = U->getValue();
+        for (User *UR : UVal->users()) {
+          // If U is a constant, it may be used by a ConstantExpr.
+          Instruction *UI = dyn_cast<Instruction>(UR);
+          if (UI && UI->getOpcode() == Instruction::Mul &&
+              SE.isSCEVable(UI->getType())) {
+            return SE.getSCEV(UI) == Mul;
+          }
+        }
+      }
+    }
+  }
+
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    if (isExistingPhi(AR, SE))
+      return false;
+  }
+
+  // Fow now, consider any other type of expression (div/mul/min/max) high cost.
+  return true;
+}
+
+/// If any of the instructions is the specified set are trivially dead, delete
+/// them and see if this makes any of their operands subsequently dead.
+static bool
+DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
+  bool Changed = false;
+
+  while (!DeadInsts.empty()) {
+    Value *V = DeadInsts.pop_back_val();
+    Instruction *I = dyn_cast_or_null<Instruction>(V);
+
+    if (!I || !isInstructionTriviallyDead(I))
+      continue;
+
+    for (Use &O : I->operands())
+      if (Instruction *U = dyn_cast<Instruction>(O)) {
+        O = nullptr;
+        if (U->use_empty())
+          DeadInsts.emplace_back(U);
+      }
+
+    I->eraseFromParent();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+namespace {
+class LSRUse;
+}
+
+/// \brief Check if the addressing mode defined by \p F is completely
+/// folded in \p LU at isel time.
+/// This includes address-mode folding and special icmp tricks.
+/// This function returns true if \p LU can accommodate what \p F
+/// defines and up to 1 base + 1 scaled + offset.
+/// In other words, if \p F has several base registers, this function may
+/// still return true. Therefore, users still need to account for
+/// additional base registers and/or unfolded offsets to derive an
+/// accurate cost model.
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+                                 const LSRUse &LU, const Formula &F);
+// Get the cost of the scaling factor used in F for LU.
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+                                     const LSRUse &LU, const Formula &F);
+
+namespace {
+
+/// This class is used to measure and compare candidate formulae.
+class Cost {
+  /// TODO: Some of these could be merged. Also, a lexical ordering
+  /// isn't always optimal.
+  unsigned NumRegs;
+  unsigned AddRecCost;
+  unsigned NumIVMuls;
+  unsigned NumBaseAdds;
+  unsigned ImmCost;
+  unsigned SetupCost;
+  unsigned ScaleCost;
+
+public:
+  Cost()
+    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
+      SetupCost(0), ScaleCost(0) {}
+
+  bool operator<(const Cost &Other) const;
+
+  void Lose();
+
+#ifndef NDEBUG
+  // Once any of the metrics loses, they must all remain losers.
+  bool isValid() {
+    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+             | ImmCost | SetupCost | ScaleCost) != ~0u)
+      || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+           & ImmCost & SetupCost & ScaleCost) == ~0u);
+  }
+#endif
+
+  bool isLoser() {
+    assert(isValid() && "invalid cost");
+    return NumRegs == ~0u;
+  }
+
+  void RateFormula(const TargetTransformInfo &TTI,
+                   const Formula &F,
+                   SmallPtrSetImpl<const SCEV *> &Regs,
+                   const DenseSet<const SCEV *> &VisitedRegs,
+                   const Loop *L,
+                   const SmallVectorImpl<int64_t> &Offsets,
+                   ScalarEvolution &SE, DominatorTree &DT,
+                   const LSRUse &LU,
+                   SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+
+private:
+  void RateRegister(const SCEV *Reg,
+                    SmallPtrSetImpl<const SCEV *> &Regs,
+                    const Loop *L,
+                    ScalarEvolution &SE, DominatorTree &DT);
+  void RatePrimaryRegister(const SCEV *Reg,
+                           SmallPtrSetImpl<const SCEV *> &Regs,
+                           const Loop *L,
+                           ScalarEvolution &SE, DominatorTree &DT,
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
+};
+
+}
+
+/// Tally up interesting quantities from the given register.
+void Cost::RateRegister(const SCEV *Reg,
+                        SmallPtrSetImpl<const SCEV *> &Regs,
+                        const Loop *L,
+                        ScalarEvolution &SE, DominatorTree &DT) {
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+    // If this is an addrec for another loop, don't second-guess its addrec phi
+    // nodes. LSR isn't currently smart enough to reason about more than one
+    // loop at a time. LSR has already run on inner loops, will not run on outer
+    // loops, and cannot be expected to change sibling loops.
+    if (AR->getLoop() != L) {
+      // If the AddRec exists, consider it's register free and leave it alone.
+      if (isExistingPhi(AR, SE))
+        return;
+
+      // Otherwise, do not consider this formula at all.
+      Lose();
+      return;
+    }
+    AddRecCost += 1; /// TODO: This should be a function of the stride.
+
+    // Add the step value register, if it needs one.
+    // TODO: The non-affine case isn't precisely modeled here.
+    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
+      if (!Regs.count(AR->getOperand(1))) {
+        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
+        if (isLoser())
+          return;
+      }
+    }
+  }
+  ++NumRegs;
+
+  // Rough heuristic; favor registers which don't require extra setup
+  // instructions in the preheader.
+  if (!isa<SCEVUnknown>(Reg) &&
+      !isa<SCEVConstant>(Reg) &&
+      !(isa<SCEVAddRecExpr>(Reg) &&
+        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
+         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
+    ++SetupCost;
+
+    NumIVMuls += isa<SCEVMulExpr>(Reg) &&
+                 SE.hasComputableLoopEvolution(Reg, L);
+}
+
+/// Record this register in the set. If we haven't seen it before, rate
+/// it. Optional LoserRegs provides a way to declare any formula that refers to
+/// one of those regs an instant loser.
+void Cost::RatePrimaryRegister(const SCEV *Reg,
+                               SmallPtrSetImpl<const SCEV *> &Regs,
+                               const Loop *L,
+                               ScalarEvolution &SE, DominatorTree &DT,
+                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+  if (LoserRegs && LoserRegs->count(Reg)) {
+    Lose();
+    return;
+  }
+  if (Regs.insert(Reg).second) {
+    RateRegister(Reg, Regs, L, SE, DT);
+    if (LoserRegs && isLoser())
+      LoserRegs->insert(Reg);
+  }
+}
+
+void Cost::RateFormula(const TargetTransformInfo &TTI,
+                       const Formula &F,
+                       SmallPtrSetImpl<const SCEV *> &Regs,
+                       const DenseSet<const SCEV *> &VisitedRegs,
+                       const Loop *L,
+                       const SmallVectorImpl<int64_t> &Offsets,
+                       ScalarEvolution &SE, DominatorTree &DT,
+                       const LSRUse &LU,
+                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+  // Tally up the registers.
+  if (const SCEV *ScaledReg = F.ScaledReg) {
+    if (VisitedRegs.count(ScaledReg)) {
+      Lose();
+      return;
+    }
+    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
+    if (isLoser())
+      return;
+  }
+  for (const SCEV *BaseReg : F.BaseRegs) {
+    if (VisitedRegs.count(BaseReg)) {
+      Lose();
+      return;
+    }
+    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
+    if (isLoser())
+      return;
+  }
+
+  // Determine how many (unfolded) adds we'll need inside the loop.
+  size_t NumBaseParts = F.getNumRegs();
+  if (NumBaseParts > 1)
+    // Do not count the base and a possible second register if the target
+    // allows to fold 2 registers.
+    NumBaseAdds +=
+        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
+  NumBaseAdds += (F.UnfoldedOffset != 0);
+
+  // Accumulate non-free scaling amounts.
+  ScaleCost += getScalingFactorCost(TTI, LU, F);
+
+  // Tally up the non-zero immediates.
+  for (int64_t O : Offsets) {
+    int64_t Offset = (uint64_t)O + F.BaseOffset;
+    if (F.BaseGV)
+      ImmCost += 64; // Handle symbolic values conservatively.
+                     // TODO: This should probably be the pointer size.
+    else if (Offset != 0)
+      ImmCost += APInt(64, Offset, true).getMinSignedBits();
+  }
+  assert(isValid() && "invalid cost");
+}
+
+/// Set this cost to a losing value.
+void Cost::Lose() {
+  NumRegs = ~0u;
+  AddRecCost = ~0u;
+  NumIVMuls = ~0u;
+  NumBaseAdds = ~0u;
+  ImmCost = ~0u;
+  SetupCost = ~0u;
+  ScaleCost = ~0u;
+}
+
+/// Choose the lower cost.
+bool Cost::operator<(const Cost &Other) const {
+  return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
+                  ImmCost, SetupCost) <
+         std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
+                  Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
+                  Other.SetupCost);
+}
+
+void Cost::print(raw_ostream &OS) const {
+  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
+  if (AddRecCost != 0)
+    OS << ", with addrec cost " << AddRecCost;
+  if (NumIVMuls != 0)
+    OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
+  if (NumBaseAdds != 0)
+    OS << ", plus " << NumBaseAdds << " base add"
+       << (NumBaseAdds == 1 ? "" : "s");
+  if (ScaleCost != 0)
+    OS << ", plus " << ScaleCost << " scale cost";
+  if (ImmCost != 0)
+    OS << ", plus " << ImmCost << " imm cost";
+  if (SetupCost != 0)
+    OS << ", plus " << SetupCost << " setup cost";
+}
+
+LLVM_DUMP_METHOD
+void Cost::dump() const {
+  print(errs()); errs() << '\n';
+}
+
+namespace {
+
+/// An operand value in an instruction which is to be replaced with some
+/// equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+  /// The instruction which will be updated.
+  Instruction *UserInst;
+
+  /// The operand of the instruction which will be replaced. The operand may be
+  /// used more than once; every instance will be replaced.
+  Value *OperandValToReplace;
+
+  /// If this user is to use the post-incremented value of an induction
+  /// variable, this variable is non-null and holds the loop associated with the
+  /// induction variable.
+  PostIncLoopSet PostIncLoops;
+
+  /// The index of the LSRUse describing the expression which this fixup needs,
+  /// minus an offset (below).
+  size_t LUIdx;
+
+  /// A constant offset to be added to the LSRUse expression.  This allows
+  /// multiple fixups to share the same LSRUse with different offsets, for
+  /// example in an unrolled loop.
+  int64_t Offset;
+
+  bool isUseFullyOutsideLoop(const Loop *L) const;
+
+  LSRFixup();
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+LSRFixup::LSRFixup()
+  : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)),
+    Offset(0) {}
+
+/// Test whether this fixup always uses its value outside of the given loop.
+bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
+  // PHI nodes use their value in their incoming blocks.
+  if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (PN->getIncomingValue(i) == OperandValToReplace &&
+          L->contains(PN->getIncomingBlock(i)))
+        return false;
+    return true;
+  }
+
+  return !L->contains(UserInst);
+}
+
+void LSRFixup::print(raw_ostream &OS) const {
+  OS << "UserInst=";
+  // Store is common and interesting enough to be worth special-casing.
+  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
+    OS << "store ";
+    Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
+  } else if (UserInst->getType()->isVoidTy())
+    OS << UserInst->getOpcodeName();
+  else
+    UserInst->printAsOperand(OS, /*PrintType=*/false);
+
+  OS << ", OperandValToReplace=";
+  OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
+
+  for (const Loop *PIL : PostIncLoops) {
+    OS << ", PostIncLoop=";
+    PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+  }
+
+  if (LUIdx != ~size_t(0))
+    OS << ", LUIdx=" << LUIdx;
+
+  if (Offset != 0)
+    OS << ", Offset=" << Offset;
+}
+
+LLVM_DUMP_METHOD
+void LSRFixup::dump() const {
+  print(errs()); errs() << '\n';
+}
+
+namespace {
+
+/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
+/// SmallVectors of const SCEV*.
+struct UniquifierDenseMapInfo {
+  static SmallVector<const SCEV *, 4> getEmptyKey() {
+    SmallVector<const SCEV *, 4>  V;
+    V.push_back(reinterpret_cast<const SCEV *>(-1));
+    return V;
+  }
+
+  static SmallVector<const SCEV *, 4> getTombstoneKey() {
+    SmallVector<const SCEV *, 4> V;
+    V.push_back(reinterpret_cast<const SCEV *>(-2));
+    return V;
+  }
+
+  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
+    return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+  }
+
+  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
+                      const SmallVector<const SCEV *, 4> &RHS) {
+    return LHS == RHS;
+  }
+};
+
+/// This class holds the state that LSR keeps for each use in IVUsers, as well
+/// as uses invented by LSR itself. It includes information about what kinds of
+/// things can be folded into the user, information about the user itself, and
+/// information about how the use may be satisfied.  TODO: Represent multiple
+/// users of the same expression in common?
+class LSRUse {
+  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
+
+public:
+  /// An enum for a kind of use, indicating what types of scaled and immediate
+  /// operands it might support.
+  enum KindType {
+    Basic,   ///< A normal use, with no folding.
+    Special, ///< A special case of basic, allowing -1 scales.
+    Address, ///< An address use; folding according to TargetLowering
+    ICmpZero ///< An equality icmp with both operands folded into one.
+    // TODO: Add a generic icmp too?
+  };
+
+  typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
+
+  KindType Kind;
+  MemAccessTy AccessTy;
+
+  SmallVector<int64_t, 8> Offsets;
+  int64_t MinOffset;
+  int64_t MaxOffset;
+
+  /// This records whether all of the fixups using this LSRUse are outside of
+  /// the loop, in which case some special-case heuristics may be used.
+  bool AllFixupsOutsideLoop;
+
+  /// RigidFormula is set to true to guarantee that this use will be associated
+  /// with a single formula--the one that initially matched. Some SCEV
+  /// expressions cannot be expanded. This allows LSR to consider the registers
+  /// used by those expressions without the need to expand them later after
+  /// changing the formula.
+  bool RigidFormula;
+
+  /// This records the widest use type for any fixup using this
+  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
+  /// fixup widths to be equivalent, because the narrower one may be relying on
+  /// the implicit truncation to truncate away bogus bits.
+  Type *WidestFixupType;
+
+  /// A list of ways to build a value that can satisfy this user.  After the
+  /// list is populated, one of these is selected heuristically and used to
+  /// formulate a replacement for OperandValToReplace in UserInst.
+  SmallVector<Formula, 12> Formulae;
+
+  /// The set of register candidates used by all formulae in this LSRUse.
+  SmallPtrSet<const SCEV *, 4> Regs;
+
+  LSRUse(KindType K, MemAccessTy AT)
+      : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
+        AllFixupsOutsideLoop(true), RigidFormula(false),
+        WidestFixupType(nullptr) {}
+
+  bool HasFormulaWithSameRegs(const Formula &F) const;
+  bool InsertFormula(const Formula &F);
+  void DeleteFormula(Formula &F);
+  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+/// Test whether this use as a formula which has the same registers as the given
+/// formula.
+bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
+  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+  if (F.ScaledReg) Key.push_back(F.ScaledReg);
+  // Unstable sort by host order ok, because this is only used for uniquifying.
+  std::sort(Key.begin(), Key.end());
+  return Uniquifier.count(Key);
+}
+
+/// If the given formula has not yet been inserted, add it to the list, and
+/// return true. Return false otherwise.  The formula must be in canonical form.
+bool LSRUse::InsertFormula(const Formula &F) {
+  assert(F.isCanonical() && "Invalid canonical representation");
+
+  if (!Formulae.empty() && RigidFormula)
+    return false;
+
+  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+  if (F.ScaledReg) Key.push_back(F.ScaledReg);
+  // Unstable sort by host order ok, because this is only used for uniquifying.
+  std::sort(Key.begin(), Key.end());
+
+  if (!Uniquifier.insert(Key).second)
+    return false;
+
+  // Using a register to hold the value of 0 is not profitable.
+  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
+         "Zero allocated in a scaled register!");
+#ifndef NDEBUG
+  for (const SCEV *BaseReg : F.BaseRegs)
+    assert(!BaseReg->isZero() && "Zero allocated in a base register!");
+#endif
+
+  // Add the formula to the list.
+  Formulae.push_back(F);
+
+  // Record registers now being used by this use.
+  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+  if (F.ScaledReg)
+    Regs.insert(F.ScaledReg);
+
+  return true;
+}
+
+/// Remove the given formula from this use's list.
+void LSRUse::DeleteFormula(Formula &F) {
+  if (&F != &Formulae.back())
+    std::swap(F, Formulae.back());
+  Formulae.pop_back();
+}
+
+/// Recompute the Regs field, and update RegUses.
+void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
+  // Now that we've filtered out some formulae, recompute the Regs set.
+  SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
+  Regs.clear();
+  for (const Formula &F : Formulae) {
+    if (F.ScaledReg) Regs.insert(F.ScaledReg);
+    Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+  }
+
+  // Update the RegTracker.
+  for (const SCEV *S : OldRegs)
+    if (!Regs.count(S))
+      RegUses.dropRegister(S, LUIdx);
+}
+
+void LSRUse::print(raw_ostream &OS) const {
+  OS << "LSR Use: Kind=";
+  switch (Kind) {
+  case Basic:    OS << "Basic"; break;
+  case Special:  OS << "Special"; break;
+  case ICmpZero: OS << "ICmpZero"; break;
+  case Address:
+    OS << "Address of ";
+    if (AccessTy.MemTy->isPointerTy())
+      OS << "pointer"; // the full pointer type could be really verbose
+    else {
+      OS << *AccessTy.MemTy;
+    }
+
+    OS << " in addrspace(" << AccessTy.AddrSpace << ')';
+  }
+
+  OS << ", Offsets={";
+  bool NeedComma = false;
+  for (int64_t O : Offsets) {
+    if (NeedComma) OS << ',';
+    OS << O;
+    NeedComma = true;
+  }
+  OS << '}';
+
+  if (AllFixupsOutsideLoop)
+    OS << ", all-fixups-outside-loop";
+
+  if (WidestFixupType)
+    OS << ", widest fixup type: " << *WidestFixupType;
+}
+
+LLVM_DUMP_METHOD
+void LSRUse::dump() const {
+  print(errs()); errs() << '\n';
+}
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
+                                 GlobalValue *BaseGV, int64_t BaseOffset,
+                                 bool HasBaseReg, int64_t Scale) {
+  switch (Kind) {
+  case LSRUse::Address:
+    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
+                                     HasBaseReg, Scale, AccessTy.AddrSpace);
+
+  case LSRUse::ICmpZero:
+    // There's not even a target hook for querying whether it would be legal to
+    // fold a GV into an ICmp.
+    if (BaseGV)
+      return false;
+
+    // ICmp only has two operands; don't allow more than two non-trivial parts.
+    if (Scale != 0 && HasBaseReg && BaseOffset != 0)
+      return false;
+
+    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
+    // putting the scaled register in the other operand of the icmp.
+    if (Scale != 0 && Scale != -1)
+      return false;
+
+    // If we have low-level target information, ask the target if it can fold an
+    // integer immediate on an icmp.
+    if (BaseOffset != 0) {
+      // We have one of:
+      // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
+      // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
+      // Offs is the ICmp immediate.
+      if (Scale == 0)
+        // The cast does the right thing with INT64_MIN.
+        BaseOffset = -(uint64_t)BaseOffset;
+      return TTI.isLegalICmpImmediate(BaseOffset);
+    }
+
+    // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
+    return true;
+
+  case LSRUse::Basic:
+    // Only handle single-register values.
+    return !BaseGV && Scale == 0 && BaseOffset == 0;
+
+  case LSRUse::Special:
+    // Special case Basic to handle -1 scales.
+    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
+  }
+
+  llvm_unreachable("Invalid LSRUse Kind!");
+}
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+                                 int64_t MinOffset, int64_t MaxOffset,
+                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
+                                 GlobalValue *BaseGV, int64_t BaseOffset,
+                                 bool HasBaseReg, int64_t Scale) {
+  // Check for overflow.
+  if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
+      (MinOffset > 0))
+    return false;
+  MinOffset = (uint64_t)BaseOffset + MinOffset;
+  if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
+      (MaxOffset > 0))
+    return false;
+  MaxOffset = (uint64_t)BaseOffset + MaxOffset;
+
+  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
+                              HasBaseReg, Scale) &&
+         isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
+                              HasBaseReg, Scale);
+}
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+                                 int64_t MinOffset, int64_t MaxOffset,
+                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
+                                 const Formula &F) {
+  // For the purpose of isAMCompletelyFolded either having a canonical formula
+  // or a scale not equal to zero is correct.
+  // Problems may arise from non canonical formulae having a scale == 0.
+  // Strictly speaking it would best to just rely on canonical formulae.
+  // However, when we generate the scaled formulae, we first check that the
+  // scaling factor is profitable before computing the actual ScaledReg for
+  // compile time sake.
+  assert((F.isCanonical() || F.Scale != 0));
+  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+                              F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+/// Test whether we know how to expand the current formula.
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+                       int64_t MaxOffset, LSRUse::KindType Kind,
+                       MemAccessTy AccessTy, GlobalValue *BaseGV,
+                       int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
+  // We know how to expand completely foldable formulae.
+  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+                              BaseOffset, HasBaseReg, Scale) ||
+         // Or formulae that use a base register produced by a sum of base
+         // registers.
+         (Scale == 1 &&
+          isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+                               BaseGV, BaseOffset, true, 0));
+}
+
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+                       int64_t MaxOffset, LSRUse::KindType Kind,
+                       MemAccessTy AccessTy, const Formula &F) {
+  return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
+                    F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+                                 const LSRUse &LU, const Formula &F) {
+  return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+                              LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
+                              F.Scale);
+}
+
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+                                     const LSRUse &LU, const Formula &F) {
+  if (!F.Scale)
+    return 0;
+
+  // If the use is not completely folded in that instruction, we will have to
+  // pay an extra cost only for scale != 1.
+  if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+                            LU.AccessTy, F))
+    return F.Scale != 1;
+
+  switch (LU.Kind) {
+  case LSRUse::Address: {
+    // Check the scaling factor cost with both the min and max offsets.
+    int ScaleCostMinOffset = TTI.getScalingFactorCost(
+        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
+        F.Scale, LU.AccessTy.AddrSpace);
+    int ScaleCostMaxOffset = TTI.getScalingFactorCost(
+        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
+        F.Scale, LU.AccessTy.AddrSpace);
+
+    assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
+           "Legal addressing mode has an illegal cost!");
+    return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
+  }
+  case LSRUse::ICmpZero:
+  case LSRUse::Basic:
+  case LSRUse::Special:
+    // The use is completely folded, i.e., everything is folded into the
+    // instruction.
+    return 0;
+  }
+
+  llvm_unreachable("Invalid LSRUse Kind!");
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+                             LSRUse::KindType Kind, MemAccessTy AccessTy,
+                             GlobalValue *BaseGV, int64_t BaseOffset,
+                             bool HasBaseReg) {
+  // Fast-path: zero is always foldable.
+  if (BaseOffset == 0 && !BaseGV) return true;
+
+  // Conservatively, create an address with an immediate and a
+  // base and a scale.
+  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+  // Canonicalize a scale of 1 to a base register if the formula doesn't
+  // already have a base register.
+  if (!HasBaseReg && Scale == 1) {
+    Scale = 0;
+    HasBaseReg = true;
+  }
+
+  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
+                              HasBaseReg, Scale);
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+                             ScalarEvolution &SE, int64_t MinOffset,
+                             int64_t MaxOffset, LSRUse::KindType Kind,
+                             MemAccessTy AccessTy, const SCEV *S,
+                             bool HasBaseReg) {
+  // Fast-path: zero is always foldable.
+  if (S->isZero()) return true;
+
+  // Conservatively, create an address with an immediate and a
+  // base and a scale.
+  int64_t BaseOffset = ExtractImmediate(S, SE);
+  GlobalValue *BaseGV = ExtractSymbol(S, SE);
+
+  // If there's anything else involved, it's not foldable.
+  if (!S->isZero()) return false;
+
+  // Fast-path: zero is always foldable.
+  if (BaseOffset == 0 && !BaseGV) return true;
+
+  // Conservatively, create an address with an immediate and a
+  // base and a scale.
+  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+                              BaseOffset, HasBaseReg, Scale);
+}
+
+namespace {
+
+/// An individual increment in a Chain of IV increments.  Relate an IV user to
+/// an expression that computes the IV it uses from the IV used by the previous
+/// link in the Chain.
+///
+/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
+/// original IVOperand. The head of the chain's IVOperand is only valid during
+/// chain collection, before LSR replaces IV users. During chain generation,
+/// IncExpr can be used to find the new IVOperand that computes the same
+/// expression.
+struct IVInc {
+  Instruction *UserInst;
+  Value* IVOperand;
+  const SCEV *IncExpr;
+
+  IVInc(Instruction *U, Value *O, const SCEV *E):
+    UserInst(U), IVOperand(O), IncExpr(E) {}
+};
+
+// The list of IV increments in program order.  We typically add the head of a
+// chain without finding subsequent links.
+struct IVChain {
+  SmallVector<IVInc,1> Incs;
+  const SCEV *ExprBase;
+
+  IVChain() : ExprBase(nullptr) {}
+
+  IVChain(const IVInc &Head, const SCEV *Base)
+    : Incs(1, Head), ExprBase(Base) {}
+
+  typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
+
+  // Return the first increment in the chain.
+  const_iterator begin() const {
+    assert(!Incs.empty());
+    return std::next(Incs.begin());
+  }
+  const_iterator end() const {
+    return Incs.end();
+  }
+
+  // Returns true if this chain contains any increments.
+  bool hasIncs() const { return Incs.size() >= 2; }
+
+  // Add an IVInc to the end of this chain.
+  void add(const IVInc &X) { Incs.push_back(X); }
+
+  // Returns the last UserInst in the chain.
+  Instruction *tailUserInst() const { return Incs.back().UserInst; }
+
+  // Returns true if IncExpr can be profitably added to this chain.
+  bool isProfitableIncrement(const SCEV *OperExpr,
+                             const SCEV *IncExpr,
+                             ScalarEvolution&);
+};
+
+/// Helper for CollectChains to track multiple IV increment uses.  Distinguish
+/// between FarUsers that definitely cross IV increments and NearUsers that may
+/// be used between IV increments.
+struct ChainUsers {
+  SmallPtrSet<Instruction*, 4> FarUsers;
+  SmallPtrSet<Instruction*, 4> NearUsers;
+};
+
+/// This class holds state for the main loop strength reduction logic.
+class LSRInstance {
+  IVUsers &IU;
+  ScalarEvolution &SE;
+  DominatorTree &DT;
+  LoopInfo &LI;
+  const TargetTransformInfo &TTI;
+  Loop *const L;
+  bool Changed;
+
+  /// This is the insert position that the current loop's induction variable
+  /// increment should be placed. In simple loops, this is the latch block's
+  /// terminator. But in more complicated cases, this is a position which will
+  /// dominate all the in-loop post-increment users.
+  Instruction *IVIncInsertPos;
+
+  /// Interesting factors between use strides.
+  SmallSetVector<int64_t, 8> Factors;
+
+  /// Interesting use types, to facilitate truncation reuse.
+  SmallSetVector<Type *, 4> Types;
+
+  /// The list of operands which are to be replaced.
+  SmallVector<LSRFixup, 16> Fixups;
+
+  /// The list of interesting uses.
+  SmallVector<LSRUse, 16> Uses;
+
+  /// Track which uses use which register candidates.
+  RegUseTracker RegUses;
+
+  // Limit the number of chains to avoid quadratic behavior. We don't expect to
+  // have more than a few IV increment chains in a loop. Missing a Chain falls
+  // back to normal LSR behavior for those uses.
+  static const unsigned MaxChains = 8;
+
+  /// IV users can form a chain of IV increments.
+  SmallVector<IVChain, MaxChains> IVChainVec;
+
+  /// IV users that belong to profitable IVChains.
+  SmallPtrSet<Use*, MaxChains> IVIncSet;
+
+  void OptimizeShadowIV();
+  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
+  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
+  void OptimizeLoopTermCond();
+
+  void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+                        SmallVectorImpl<ChainUsers> &ChainUsersVec);
+  void FinalizeChain(IVChain &Chain);
+  void CollectChains();
+  void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+                       SmallVectorImpl<WeakVH> &DeadInsts);
+
+  void CollectInterestingTypesAndFactors();
+  void CollectFixupsAndInitialFormulae();
+
+  LSRFixup &getNewFixup() {
+    Fixups.push_back(LSRFixup());
+    return Fixups.back();
+  }
+
+  // Support for sharing of LSRUses between LSRFixups.
+  typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
+  UseMapTy UseMap;
+
+  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
+                          LSRUse::KindType Kind, MemAccessTy AccessTy);
+
+  std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
+                                    MemAccessTy AccessTy);
+
+  void DeleteUse(LSRUse &LU, size_t LUIdx);
+
+  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
+
+  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+  void CountRegisters(const Formula &F, size_t LUIdx);
+  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+
+  void CollectLoopInvariantFixupsAndFormulae();
+
+  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
+                              unsigned Depth = 0);
+
+  void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+                                  const Formula &Base, unsigned Depth,
+                                  size_t Idx, bool IsScaledReg = false);
+  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+                                   const Formula &Base, size_t Idx,
+                                   bool IsScaledReg = false);
+  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+                                   const Formula &Base,
+                                   const SmallVectorImpl<int64_t> &Worklist,
+                                   size_t Idx, bool IsScaledReg = false);
+  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateCrossUseConstantOffsets();
+  void GenerateAllReuseFormulae();
+
+  void FilterOutUndesirableDedicatedRegisters();
+
+  size_t EstimateSearchSpaceComplexity() const;
+  void NarrowSearchSpaceByDetectingSupersets();
+  void NarrowSearchSpaceByCollapsingUnrolledCode();
+  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  void NarrowSearchSpaceByPickingWinnerRegs();
+  void NarrowSearchSpaceUsingHeuristics();
+
+  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+                    Cost &SolutionCost,
+                    SmallVectorImpl<const Formula *> &Workspace,
+                    const Cost &CurCost,
+                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
+                    DenseSet<const SCEV *> &VisitedRegs) const;
+  void Solve(SmallVectorImpl<const Formula *> &Solution) const;
+
+  BasicBlock::iterator
+    HoistInsertPosition(BasicBlock::iterator IP,
+                        const SmallVectorImpl<Instruction *> &Inputs) const;
+  BasicBlock::iterator
+    AdjustInsertPositionForExpand(BasicBlock::iterator IP,
+                                  const LSRFixup &LF,
+                                  const LSRUse &LU,
+                                  SCEVExpander &Rewriter) const;
+
+  Value *Expand(const LSRFixup &LF,
+                const Formula &F,
+                BasicBlock::iterator IP,
+                SCEVExpander &Rewriter,
+                SmallVectorImpl<WeakVH> &DeadInsts) const;
+  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
+                     const Formula &F,
+                     SCEVExpander &Rewriter,
+                     SmallVectorImpl<WeakVH> &DeadInsts) const;
+  void Rewrite(const LSRFixup &LF,
+               const Formula &F,
+               SCEVExpander &Rewriter,
+               SmallVectorImpl<WeakVH> &DeadInsts) const;
+  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
+
+public:
+  LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
+              LoopInfo &LI, const TargetTransformInfo &TTI);
+
+  bool getChanged() const { return Changed; }
+
+  void print_factors_and_types(raw_ostream &OS) const;
+  void print_fixups(raw_ostream &OS) const;
+  void print_uses(raw_ostream &OS) const;
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+/// If IV is used in a int-to-float cast inside the loop then try to eliminate
+/// the cast operation.
+void LSRInstance::OptimizeShadowIV() {
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return;
+
+  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
+       UI != E; /* empty */) {
+    IVUsers::const_iterator CandidateUI = UI;
+    ++UI;
+    Instruction *ShadowUse = CandidateUI->getUser();
+    Type *DestTy = nullptr;
+    bool IsSigned = false;
+
+    /* If shadow use is a int->float cast then insert a second IV
+       to eliminate this cast.
+
+         for (unsigned i = 0; i < n; ++i)
+           foo((double)i);
+
+       is transformed into
+
+         double d = 0.0;
+         for (unsigned i = 0; i < n; ++i, ++d)
+           foo(d);
+    */
+    if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
+      IsSigned = false;
+      DestTy = UCast->getDestTy();
+    }
+    else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
+      IsSigned = true;
+      DestTy = SCast->getDestTy();
+    }
+    if (!DestTy) continue;
+
+    // If target does not support DestTy natively then do not apply
+    // this transformation.
+    if (!TTI.isTypeLegal(DestTy)) continue;
+
+    PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
+    if (!PH) continue;
+    if (PH->getNumIncomingValues() != 2) continue;
+
+    Type *SrcTy = PH->getType();
+    int Mantissa = DestTy->getFPMantissaWidth();
+    if (Mantissa == -1) continue;
+    if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
+      continue;
+
+    unsigned Entry, Latch;
+    if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
+      Entry = 0;
+      Latch = 1;
+    } else {
+      Entry = 1;
+      Latch = 0;
+    }
+
+    ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
+    if (!Init) continue;
+    Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
+                                        (double)Init->getSExtValue() :
+                                        (double)Init->getZExtValue());
+
+    BinaryOperator *Incr =
+      dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
+    if (!Incr) continue;
+    if (Incr->getOpcode() != Instruction::Add
+        && Incr->getOpcode() != Instruction::Sub)
+      continue;
+
+    /* Initialize new IV, double d = 0.0 in above example. */
+    ConstantInt *C = nullptr;
+    if (Incr->getOperand(0) == PH)
+      C = dyn_cast<ConstantInt>(Incr->getOperand(1));
+    else if (Incr->getOperand(1) == PH)
+      C = dyn_cast<ConstantInt>(Incr->getOperand(0));
+    else
+      continue;
+
+    if (!C) continue;
+
+    // Ignore negative constants, as the code below doesn't handle them
+    // correctly. TODO: Remove this restriction.
+    if (!C->getValue().isStrictlyPositive()) continue;
+
+    /* Add new PHINode. */
+    PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
+
+    /* create new increment. '++d' in above example. */
+    Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
+    BinaryOperator *NewIncr =
+      BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
+                               Instruction::FAdd : Instruction::FSub,
+                             NewPH, CFP, "IV.S.next.", Incr);
+
+    NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
+    NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
+
+    /* Remove cast operation */
+    ShadowUse->replaceAllUsesWith(NewPH);
+    ShadowUse->eraseFromParent();
+    Changed = true;
+    break;
+  }
+}
+
+/// If Cond has an operand that is an expression of an IV, set the IV user and
+/// stride information and return true, otherwise return false.
+bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+  for (IVStrideUse &U : IU)
+    if (U.getUser() == Cond) {
+      // NOTE: we could handle setcc instructions with multiple uses here, but
+      // InstCombine does it as well for simple uses, it's not clear that it
+      // occurs enough in real life to handle.
+      CondUse = &U;
+      return true;
+    }
+  return false;
+}
+
+/// Rewrite the loop's terminating condition if it uses a max computation.
+///
+/// This is a narrow solution to a specific, but acute, problem. For loops
+/// like this:
+///
+///   i = 0;
+///   do {
+///     p[i] = 0.0;
+///   } while (++i < n);
+///
+/// the trip count isn't just 'n', because 'n' might not be positive. And
+/// unfortunately this can come up even for loops where the user didn't use
+/// a C do-while loop. For example, seemingly well-behaved top-test loops
+/// will commonly be lowered like this:
+//
+///   if (n > 0) {
+///     i = 0;
+///     do {
+///       p[i] = 0.0;
+///     } while (++i < n);
+///   }
+///
+/// and then it's possible for subsequent optimization to obscure the if
+/// test in such a way that indvars can't find it.
+///
+/// When indvars can't find the if test in loops like this, it creates a
+/// max expression, which allows it to give the loop a canonical
+/// induction variable:
+///
+///   i = 0;
+///   max = n < 1 ? 1 : n;
+///   do {
+///     p[i] = 0.0;
+///   } while (++i != max);
+///
+/// Canonical induction variables are necessary because the loop passes
+/// are designed around them. The most obvious example of this is the
+/// LoopInfo analysis, which doesn't remember trip count values. It
+/// expects to be able to rediscover the trip count each time it is
+/// needed, and it does this using a simple analysis that only succeeds if
+/// the loop has a canonical induction variable.
+///
+/// However, when it comes time to generate code, the maximum operation
+/// can be quite costly, especially if it's inside of an outer loop.
+///
+/// This function solves this problem by detecting this type of loop and
+/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
+/// the instructions for the maximum computation.
+///
+ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
+  // Check that the loop matches the pattern we're looking for.
+  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
+      Cond->getPredicate() != CmpInst::ICMP_NE)
+    return Cond;
+
+  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
+  if (!Sel || !Sel->hasOneUse()) return Cond;
+
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return Cond;
+  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
+
+  // Add one to the backedge-taken count to get the trip count.
+  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
+  if (IterationCount != SE.getSCEV(Sel)) return Cond;
+
+  // Check for a max calculation that matches the pattern. There's no check
+  // for ICMP_ULE here because the comparison would be with zero, which
+  // isn't interesting.
+  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+  const SCEVNAryExpr *Max = nullptr;
+  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
+    Pred = ICmpInst::ICMP_SLE;
+    Max = S;
+  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
+    Pred = ICmpInst::ICMP_SLT;
+    Max = S;
+  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
+    Pred = ICmpInst::ICMP_ULT;
+    Max = U;
+  } else {
+    // No match; bail.
+    return Cond;
+  }
+
+  // To handle a max with more than two operands, this optimization would
+  // require additional checking and setup.
+  if (Max->getNumOperands() != 2)
+    return Cond;
+
+  const SCEV *MaxLHS = Max->getOperand(0);
+  const SCEV *MaxRHS = Max->getOperand(1);
+
+  // ScalarEvolution canonicalizes constants to the left. For < and >, look
+  // for a comparison with 1. For <= and >=, a comparison with zero.
+  if (!MaxLHS ||
+      (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
+    return Cond;
+
+  // Check the relevant induction variable for conformance to
+  // the pattern.
+  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+  if (!AR || !AR->isAffine() ||
+      AR->getStart() != One ||
+      AR->getStepRecurrence(SE) != One)
+    return Cond;
+
+  assert(AR->getLoop() == L &&
+         "Loop condition operand is an addrec in a different loop!");
+
+  // Check the right operand of the select, and remember it, as it will
+  // be used in the new comparison instruction.
+  Value *NewRHS = nullptr;
+  if (ICmpInst::isTrueWhenEqual(Pred)) {
+    // Look for n+1, and grab n.
+    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
+      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+         if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+           NewRHS = BO->getOperand(0);
+    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
+      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+        if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+          NewRHS = BO->getOperand(0);
+    if (!NewRHS)
+      return Cond;
+  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
+    NewRHS = Sel->getOperand(1);
+  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
+    NewRHS = Sel->getOperand(2);
+  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
+    NewRHS = SU->getValue();
+  else
+    // Max doesn't match expected pattern.
+    return Cond;
+
+  // Determine the new comparison opcode. It may be signed or unsigned,
+  // and the original comparison may be either equality or inequality.
+  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
+    Pred = CmpInst::getInversePredicate(Pred);
+
+  // Ok, everything looks ok to change the condition into an SLT or SGE and
+  // delete the max calculation.
+  ICmpInst *NewCond =
+    new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
+
+  // Delete the max calculation instructions.
+  Cond->replaceAllUsesWith(NewCond);
+  CondUse->setUser(NewCond);
+  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
+  Cond->eraseFromParent();
+  Sel->eraseFromParent();
+  if (Cmp->use_empty())
+    Cmp->eraseFromParent();
+  return NewCond;
+}
+
+/// Change loop terminating condition to use the postinc iv when possible.
+void
+LSRInstance::OptimizeLoopTermCond() {
+  SmallPtrSet<Instruction *, 4> PostIncs;
+
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  SmallVector<BasicBlock*, 8> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  for (BasicBlock *ExitingBlock : ExitingBlocks) {
+
+    // Get the terminating condition for the loop if possible.  If we
+    // can, we want to change it to use a post-incremented version of its
+    // induction variable, to allow coalescing the live ranges for the IV into
+    // one register value.
+
+    BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+    if (!TermBr)
+      continue;
+    // FIXME: Overly conservative, termination condition could be an 'or' etc..
+    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+      continue;
+
+    // Search IVUsesByStride to find Cond's IVUse if there is one.
+    IVStrideUse *CondUse = nullptr;
+    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+    if (!FindIVUserForCond(Cond, CondUse))
+      continue;
+
+    // If the trip count is computed in terms of a max (due to ScalarEvolution
+    // being unable to find a sufficient guard, for example), change the loop
+    // comparison to use SLT or ULT instead of NE.
+    // One consequence of doing this now is that it disrupts the count-down
+    // optimization. That's not always a bad thing though, because in such
+    // cases it may still be worthwhile to avoid a max.
+    Cond = OptimizeMax(Cond, CondUse);
+
+    // If this exiting block dominates the latch block, it may also use
+    // the post-inc value if it won't be shared with other uses.
+    // Check for dominance.
+    if (!DT.dominates(ExitingBlock, LatchBlock))
+      continue;
+
+    // Conservatively avoid trying to use the post-inc value in non-latch
+    // exits if there may be pre-inc users in intervening blocks.
+    if (LatchBlock != ExitingBlock)
+      for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
+        // Test if the use is reachable from the exiting block. This dominator
+        // query is a conservative approximation of reachability.
+        if (&*UI != CondUse &&
+            !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
+          // Conservatively assume there may be reuse if the quotient of their
+          // strides could be a legal scale.
+          const SCEV *A = IU.getStride(*CondUse, L);
+          const SCEV *B = IU.getStride(*UI, L);
+          if (!A || !B) continue;
+          if (SE.getTypeSizeInBits(A->getType()) !=
+              SE.getTypeSizeInBits(B->getType())) {
+            if (SE.getTypeSizeInBits(A->getType()) >
+                SE.getTypeSizeInBits(B->getType()))
+              B = SE.getSignExtendExpr(B, A->getType());
+            else
+              A = SE.getSignExtendExpr(A, B->getType());
+          }
+          if (const SCEVConstant *D =
+                dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
+            const ConstantInt *C = D->getValue();
+            // Stride of one or negative one can have reuse with non-addresses.
+            if (C->isOne() || C->isAllOnesValue())
+              goto decline_post_inc;
+            // Avoid weird situations.
+            if (C->getValue().getMinSignedBits() >= 64 ||
+                C->getValue().isMinSignedValue())
+              goto decline_post_inc;
+            // Check for possible scaled-address reuse.
+            MemAccessTy AccessTy = getAccessType(UI->getUser());
+            int64_t Scale = C->getSExtValue();
+            if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+                                          /*BaseOffset=*/0,
+                                          /*HasBaseReg=*/false, Scale,
+                                          AccessTy.AddrSpace))
+              goto decline_post_inc;
+            Scale = -Scale;
+            if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+                                          /*BaseOffset=*/0,
+                                          /*HasBaseReg=*/false, Scale,
+                                          AccessTy.AddrSpace))
+              goto decline_post_inc;
+          }
+        }
+
+    DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
+                 << *Cond << '\n');
+
+    // It's possible for the setcc instruction to be anywhere in the loop, and
+    // possible for it to have multiple users.  If it is not immediately before
+    // the exiting block branch, move it.
+    if (&*++BasicBlock::iterator(Cond) != TermBr) {
+      if (Cond->hasOneUse()) {
+        Cond->moveBefore(TermBr);
+      } else {
+        // Clone the terminating condition and insert into the loopend.
+        ICmpInst *OldCond = Cond;
+        Cond = cast<ICmpInst>(Cond->clone());
+        Cond->setName(L->getHeader()->getName() + ".termcond");
+        ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
+
+        // Clone the IVUse, as the old use still exists!
+        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
+        TermBr->replaceUsesOfWith(OldCond, Cond);
+      }
+    }
+
+    // If we get to here, we know that we can transform the setcc instruction to
+    // use the post-incremented version of the IV, allowing us to coalesce the
+    // live ranges for the IV correctly.
+    CondUse->transformToPostInc(L);
+    Changed = true;
+
+    PostIncs.insert(Cond);
+  decline_post_inc:;
+  }
+
+  // Determine an insertion point for the loop induction variable increment. It
+  // must dominate all the post-inc comparisons we just set up, and it must
+  // dominate the loop latch edge.
+  IVIncInsertPos = L->getLoopLatch()->getTerminator();
+  for (Instruction *Inst : PostIncs) {
+    BasicBlock *BB =
+      DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
+                                    Inst->getParent());
+    if (BB == Inst->getParent())
+      IVIncInsertPos = Inst;
+    else if (BB != IVIncInsertPos->getParent())
+      IVIncInsertPos = BB->getTerminator();
+  }
+}
+
+/// Determine if the given use can accommodate a fixup at the given offset and
+/// other details. If so, update the use and return true.
+bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
+                                     bool HasBaseReg, LSRUse::KindType Kind,
+                                     MemAccessTy AccessTy) {
+  int64_t NewMinOffset = LU.MinOffset;
+  int64_t NewMaxOffset = LU.MaxOffset;
+  MemAccessTy NewAccessTy = AccessTy;
+
+  // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
+  // something conservative, however this can pessimize in the case that one of
+  // the uses will have all its uses outside the loop, for example.
+  if (LU.Kind != Kind)
+    return false;
+
+  // Check for a mismatched access type, and fall back conservatively as needed.
+  // TODO: Be less conservative when the type is similar and can use the same
+  // addressing modes.
+  if (Kind == LSRUse::Address) {
+    if (AccessTy != LU.AccessTy)
+      NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext());
+  }
+
+  // Conservatively assume HasBaseReg is true for now.
+  if (NewOffset < LU.MinOffset) {
+    if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
+                          LU.MaxOffset - NewOffset, HasBaseReg))
+      return false;
+    NewMinOffset = NewOffset;
+  } else if (NewOffset > LU.MaxOffset) {
+    if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
+                          NewOffset - LU.MinOffset, HasBaseReg))
+      return false;
+    NewMaxOffset = NewOffset;
+  }
+
+  // Update the use.
+  LU.MinOffset = NewMinOffset;
+  LU.MaxOffset = NewMaxOffset;
+  LU.AccessTy = NewAccessTy;
+  if (NewOffset != LU.Offsets.back())
+    LU.Offsets.push_back(NewOffset);
+  return true;
+}
+
+/// Return an LSRUse index and an offset value for a fixup which needs the given
+/// expression, with the given kind and optional access type.  Either reuse an
+/// existing use or create a new one, as needed.
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
+                                               LSRUse::KindType Kind,
+                                               MemAccessTy AccessTy) {
+  const SCEV *Copy = Expr;
+  int64_t Offset = ExtractImmediate(Expr, SE);
+
+  // Basic uses can't accept any offset, for example.
+  if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
+                        Offset, /*HasBaseReg=*/ true)) {
+    Expr = Copy;
+    Offset = 0;
+  }
+
+  std::pair<UseMapTy::iterator, bool> P =
+    UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
+  if (!P.second) {
+    // A use already existed with this base.
+    size_t LUIdx = P.first->second;
+    LSRUse &LU = Uses[LUIdx];
+    if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
+      // Reuse this use.
+      return std::make_pair(LUIdx, Offset);
+  }
+
+  // Create a new use.
+  size_t LUIdx = Uses.size();
+  P.first->second = LUIdx;
+  Uses.push_back(LSRUse(Kind, AccessTy));
+  LSRUse &LU = Uses[LUIdx];
+
+  // We don't need to track redundant offsets, but we don't need to go out
+  // of our way here to avoid them.
+  if (LU.Offsets.empty() || Offset != LU.Offsets.back())
+    LU.Offsets.push_back(Offset);
+
+  LU.MinOffset = Offset;
+  LU.MaxOffset = Offset;
+  return std::make_pair(LUIdx, Offset);
+}
+
+/// Delete the given use from the Uses list.
+void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
+  if (&LU != &Uses.back())
+    std::swap(LU, Uses.back());
+  Uses.pop_back();
+
+  // Update RegUses.
+  RegUses.swapAndDropUse(LUIdx, Uses.size());
+}
+
+/// Look for a use distinct from OrigLU which is has a formula that has the same
+/// registers as the given formula.
+LSRUse *
+LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
+                                       const LSRUse &OrigLU) {
+  // Search all uses for the formula. This could be more clever.
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    // Check whether this use is close enough to OrigLU, to see whether it's
+    // worthwhile looking through its formulae.
+    // Ignore ICmpZero uses because they may contain formulae generated by
+    // GenerateICmpZeroScales, in which case adding fixup offsets may
+    // be invalid.
+    if (&LU != &OrigLU &&
+        LU.Kind != LSRUse::ICmpZero &&
+        LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
+        LU.WidestFixupType == OrigLU.WidestFixupType &&
+        LU.HasFormulaWithSameRegs(OrigF)) {
+      // Scan through this use's formulae.
+      for (const Formula &F : LU.Formulae) {
+        // Check to see if this formula has the same registers and symbols
+        // as OrigF.
+        if (F.BaseRegs == OrigF.BaseRegs &&
+            F.ScaledReg == OrigF.ScaledReg &&
+            F.BaseGV == OrigF.BaseGV &&
+            F.Scale == OrigF.Scale &&
+            F.UnfoldedOffset == OrigF.UnfoldedOffset) {
+          if (F.BaseOffset == 0)
+            return &LU;
+          // This is the formula where all the registers and symbols matched;
+          // there aren't going to be any others. Since we declined it, we
+          // can skip the rest of the formulae and proceed to the next LSRUse.
+          break;
+        }
+      }
+    }
+  }
+
+  // Nothing looked good.
+  return nullptr;
+}
+
+void LSRInstance::CollectInterestingTypesAndFactors() {
+  SmallSetVector<const SCEV *, 4> Strides;
+
+  // Collect interesting types and strides.
+  SmallVector<const SCEV *, 4> Worklist;
+  for (const IVStrideUse &U : IU) {
+    const SCEV *Expr = IU.getExpr(U);
+
+    // Collect interesting types.
+    Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
+
+    // Add strides for mentioned loops.
+    Worklist.push_back(Expr);
+    do {
+      const SCEV *S = Worklist.pop_back_val();
+      if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+        if (AR->getLoop() == L)
+          Strides.insert(AR->getStepRecurrence(SE));
+        Worklist.push_back(AR->getStart());
+      } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+        Worklist.append(Add->op_begin(), Add->op_end());
+      }
+    } while (!Worklist.empty());
+  }
+
+  // Compute interesting factors from the set of interesting strides.
+  for (SmallSetVector<const SCEV *, 4>::const_iterator
+       I = Strides.begin(), E = Strides.end(); I != E; ++I)
+    for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
+         std::next(I); NewStrideIter != E; ++NewStrideIter) {
+      const SCEV *OldStride = *I;
+      const SCEV *NewStride = *NewStrideIter;
+
+      if (SE.getTypeSizeInBits(OldStride->getType()) !=
+          SE.getTypeSizeInBits(NewStride->getType())) {
+        if (SE.getTypeSizeInBits(OldStride->getType()) >
+            SE.getTypeSizeInBits(NewStride->getType()))
+          NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
+        else
+          OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
+      }
+      if (const SCEVConstant *Factor =
+            dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
+                                                        SE, true))) {
+        if (Factor->getAPInt().getMinSignedBits() <= 64)
+          Factors.insert(Factor->getAPInt().getSExtValue());
+      } else if (const SCEVConstant *Factor =
+                   dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
+                                                               NewStride,
+                                                               SE, true))) {
+        if (Factor->getAPInt().getMinSignedBits() <= 64)
+          Factors.insert(Factor->getAPInt().getSExtValue());
+      }
+    }
+
+  // If all uses use the same type, don't bother looking for truncation-based
+  // reuse.
+  if (Types.size() == 1)
+    Types.clear();
+
+  DEBUG(print_factors_and_types(dbgs()));
+}
+
+/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
+/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
+/// IVStrideUses, we could partially skip this.
+static User::op_iterator
+findIVOperand(User::op_iterator OI, User::op_iterator OE,
+              Loop *L, ScalarEvolution &SE) {
+  for(; OI != OE; ++OI) {
+    if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
+      if (!SE.isSCEVable(Oper->getType()))
+        continue;
+
+      if (const SCEVAddRecExpr *AR =
+          dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
+        if (AR->getLoop() == L)
+          break;
+      }
+    }
+  }
+  return OI;
+}
+
+/// IVChain logic must consistenctly peek base TruncInst operands, so wrap it in
+/// a convenient helper.
+static Value *getWideOperand(Value *Oper) {
+  if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
+    return Trunc->getOperand(0);
+  return Oper;
+}
+
+/// Return true if we allow an IV chain to include both types.
+static bool isCompatibleIVType(Value *LVal, Value *RVal) {
+  Type *LType = LVal->getType();
+  Type *RType = RVal->getType();
+  return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
+}
+
+/// Return an approximation of this SCEV expression's "base", or NULL for any
+/// constant. Returning the expression itself is conservative. Returning a
+/// deeper subexpression is more precise and valid as long as it isn't less
+/// complex than another subexpression. For expressions involving multiple
+/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
+/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
+/// IVInc==b-a.
+///
+/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
+/// SCEVUnknown, we simply return the rightmost SCEV operand.
+static const SCEV *getExprBase(const SCEV *S) {
+  switch (S->getSCEVType()) {
+  default: // uncluding scUnknown.
+    return S;
+  case scConstant:
+    return nullptr;
+  case scTruncate:
+    return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
+  case scZeroExtend:
+    return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
+  case scSignExtend:
+    return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
+  case scAddExpr: {
+    // Skip over scaled operands (scMulExpr) to follow add operands as long as
+    // there's nothing more complex.
+    // FIXME: not sure if we want to recognize negation.
+    const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
+    for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
+           E(Add->op_begin()); I != E; ++I) {
+      const SCEV *SubExpr = *I;
+      if (SubExpr->getSCEVType() == scAddExpr)
+        return getExprBase(SubExpr);
+
+      if (SubExpr->getSCEVType() != scMulExpr)
+        return SubExpr;
+    }
+    return S; // all operands are scaled, be conservative.
+  }
+  case scAddRecExpr:
+    return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
+  }
+}
+
+/// Return true if the chain increment is profitable to expand into a loop
+/// invariant value, which may require its own register. A profitable chain
+/// increment will be an offset relative to the same base. We allow such offsets
+/// to potentially be used as chain increment as long as it's not obviously
+/// expensive to expand using real instructions.
+bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
+                                    const SCEV *IncExpr,
+                                    ScalarEvolution &SE) {
+  // Aggressively form chains when -stress-ivchain.
+  if (StressIVChain)
+    return true;
+
+  // Do not replace a constant offset from IV head with a nonconstant IV
+  // increment.
+  if (!isa<SCEVConstant>(IncExpr)) {
+    const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
+    if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
+      return 0;
+  }
+
+  SmallPtrSet<const SCEV*, 8> Processed;
+  return !isHighCostExpansion(IncExpr, Processed, SE);
+}
+
+/// Return true if the number of registers needed for the chain is estimated to
+/// be less than the number required for the individual IV users. First prohibit
+/// any IV users that keep the IV live across increments (the Users set should
+/// be empty). Next count the number and type of increments in the chain.
+///
+/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
+/// effectively use postinc addressing modes. Only consider it profitable it the
+/// increments can be computed in fewer registers when chained.
+///
+/// TODO: Consider IVInc free if it's already used in another chains.
+static bool
+isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
+                  ScalarEvolution &SE, const TargetTransformInfo &TTI) {
+  if (StressIVChain)
+    return true;
+
+  if (!Chain.hasIncs())
+    return false;
+
+  if (!Users.empty()) {
+    DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
+          for (Instruction *Inst : Users) {
+            dbgs() << "  " << *Inst << "\n";
+          });
+    return false;
+  }
+  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+
+  // The chain itself may require a register, so intialize cost to 1.
+  int cost = 1;
+
+  // A complete chain likely eliminates the need for keeping the original IV in
+  // a register. LSR does not currently know how to form a complete chain unless
+  // the header phi already exists.
+  if (isa<PHINode>(Chain.tailUserInst())
+      && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
+    --cost;
+  }
+  const SCEV *LastIncExpr = nullptr;
+  unsigned NumConstIncrements = 0;
+  unsigned NumVarIncrements = 0;
+  unsigned NumReusedIncrements = 0;
+  for (const IVInc &Inc : Chain) {
+    if (Inc.IncExpr->isZero())
+      continue;
+
+    // Incrementing by zero or some constant is neutral. We assume constants can
+    // be folded into an addressing mode or an add's immediate operand.
+    if (isa<SCEVConstant>(Inc.IncExpr)) {
+      ++NumConstIncrements;
+      continue;
+    }
+
+    if (Inc.IncExpr == LastIncExpr)
+      ++NumReusedIncrements;
+    else
+      ++NumVarIncrements;
+
+    LastIncExpr = Inc.IncExpr;
+  }
+  // An IV chain with a single increment is handled by LSR's postinc
+  // uses. However, a chain with multiple increments requires keeping the IV's
+  // value live longer than it needs to be if chained.
+  if (NumConstIncrements > 1)
+    --cost;
+
+  // Materializing increment expressions in the preheader that didn't exist in
+  // the original code may cost a register. For example, sign-extended array
+  // indices can produce ridiculous increments like this:
+  // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
+  cost += NumVarIncrements;
+
+  // Reusing variable increments likely saves a register to hold the multiple of
+  // the stride.
+  cost -= NumReusedIncrements;
+
+  DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+               << "\n");
+
+  return cost < 0;
+}
+
+/// Add this IV user to an existing chain or make it the head of a new chain.
+void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+                                   SmallVectorImpl<ChainUsers> &ChainUsersVec) {
+  // When IVs are used as types of varying widths, they are generally converted
+  // to a wider type with some uses remaining narrow under a (free) trunc.
+  Value *const NextIV = getWideOperand(IVOper);
+  const SCEV *const OperExpr = SE.getSCEV(NextIV);
+  const SCEV *const OperExprBase = getExprBase(OperExpr);
+
+  // Visit all existing chains. Check if its IVOper can be computed as a
+  // profitable loop invariant increment from the last link in the Chain.
+  unsigned ChainIdx = 0, NChains = IVChainVec.size();
+  const SCEV *LastIncExpr = nullptr;
+  for (; ChainIdx < NChains; ++ChainIdx) {
+    IVChain &Chain = IVChainVec[ChainIdx];
+
+    // Prune the solution space aggressively by checking that both IV operands
+    // are expressions that operate on the same unscaled SCEVUnknown. This
+    // "base" will be canceled by the subsequent getMinusSCEV call. Checking
+    // first avoids creating extra SCEV expressions.
+    if (!StressIVChain && Chain.ExprBase != OperExprBase)
+      continue;
+
+    Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
+    if (!isCompatibleIVType(PrevIV, NextIV))
+      continue;
+
+    // A phi node terminates a chain.
+    if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
+      continue;
+
+    // The increment must be loop-invariant so it can be kept in a register.
+    const SCEV *PrevExpr = SE.getSCEV(PrevIV);
+    const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
+    if (!SE.isLoopInvariant(IncExpr, L))
+      continue;
+
+    if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
+      LastIncExpr = IncExpr;
+      break;
+    }
+  }
+  // If we haven't found a chain, create a new one, unless we hit the max. Don't
+  // bother for phi nodes, because they must be last in the chain.
+  if (ChainIdx == NChains) {
+    if (isa<PHINode>(UserInst))
+      return;
+    if (NChains >= MaxChains && !StressIVChain) {
+      DEBUG(dbgs() << "IV Chain Limit\n");
+      return;
+    }
+    LastIncExpr = OperExpr;
+    // IVUsers may have skipped over sign/zero extensions. We don't currently
+    // attempt to form chains involving extensions unless they can be hoisted
+    // into this loop's AddRec.
+    if (!isa<SCEVAddRecExpr>(LastIncExpr))
+      return;
+    ++NChains;
+    IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
+                                 OperExprBase));
+    ChainUsersVec.resize(NChains);
+    DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
+                 << ") IV=" << *LastIncExpr << "\n");
+  } else {
+    DEBUG(dbgs() << "IV Chain#" << ChainIdx << "  Inc: (" << *UserInst
+                 << ") IV+" << *LastIncExpr << "\n");
+    // Add this IV user to the end of the chain.
+    IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
+  }
+  IVChain &Chain = IVChainVec[ChainIdx];
+
+  SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
+  // This chain's NearUsers become FarUsers.
+  if (!LastIncExpr->isZero()) {
+    ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
+                                            NearUsers.end());
+    NearUsers.clear();
+  }
+
+  // All other uses of IVOperand become near uses of the chain.
+  // We currently ignore intermediate values within SCEV expressions, assuming
+  // they will eventually be used be the current chain, or can be computed
+  // from one of the chain increments. To be more precise we could
+  // transitively follow its user and only add leaf IV users to the set.
+  for (User *U : IVOper->users()) {
+    Instruction *OtherUse = dyn_cast<Instruction>(U);
+    if (!OtherUse)
+      continue;
+    // Uses in the chain will no longer be uses if the chain is formed.
+    // Include the head of the chain in this iteration (not Chain.begin()).
+    IVChain::const_iterator IncIter = Chain.Incs.begin();
+    IVChain::const_iterator IncEnd = Chain.Incs.end();
+    for( ; IncIter != IncEnd; ++IncIter) {
+      if (IncIter->UserInst == OtherUse)
+        break;
+    }
+    if (IncIter != IncEnd)
+      continue;
+
+    if (SE.isSCEVable(OtherUse->getType())
+        && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
+        && IU.isIVUserOrOperand(OtherUse)) {
+      continue;
+    }
+    NearUsers.insert(OtherUse);
+  }
+
+  // Since this user is part of the chain, it's no longer considered a use
+  // of the chain.
+  ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
+}
+
+/// Populate the vector of Chains.
+///
+/// This decreases ILP at the architecture level. Targets with ample registers,
+/// multiple memory ports, and no register renaming probably don't want
+/// this. However, such targets should probably disable LSR altogether.
+///
+/// The job of LSR is to make a reasonable choice of induction variables across
+/// the loop. Subsequent passes can easily "unchain" computation exposing more
+/// ILP *within the loop* if the target wants it.
+///
+/// Finding the best IV chain is potentially a scheduling problem. Since LSR
+/// will not reorder memory operations, it will recognize this as a chain, but
+/// will generate redundant IV increments. Ideally this would be corrected later
+/// by a smart scheduler:
+///        = A[i]
+///        = A[i+x]
+/// A[i]   =
+/// A[i+x] =
+///
+/// TODO: Walk the entire domtree within this loop, not just the path to the
+/// loop latch. This will discover chains on side paths, but requires
+/// maintaining multiple copies of the Chains state.
+void LSRInstance::CollectChains() {
+  DEBUG(dbgs() << "Collecting IV Chains.\n");
+  SmallVector<ChainUsers, 8> ChainUsersVec;
+
+  SmallVector<BasicBlock *,8> LatchPath;
+  BasicBlock *LoopHeader = L->getHeader();
+  for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
+       Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
+    LatchPath.push_back(Rung->getBlock());
+  }
+  LatchPath.push_back(LoopHeader);
+
+  // Walk the instruction stream from the loop header to the loop latch.
+  for (SmallVectorImpl<BasicBlock *>::reverse_iterator
+         BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend();
+       BBIter != BBEnd; ++BBIter) {
+    for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end();
+         I != E; ++I) {
+      // Skip instructions that weren't seen by IVUsers analysis.
+      if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&*I))
+        continue;
+
+      // Ignore users that are part of a SCEV expression. This way we only
+      // consider leaf IV Users. This effectively rediscovers a portion of
+      // IVUsers analysis but in program order this time.
+      if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(&*I)))
+        continue;
+
+      // Remove this instruction from any NearUsers set it may be in.
+      for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
+           ChainIdx < NChains; ++ChainIdx) {
+        ChainUsersVec[ChainIdx].NearUsers.erase(&*I);
+      }
+      // Search for operands that can be chained.
+      SmallPtrSet<Instruction*, 4> UniqueOperands;
+      User::op_iterator IVOpEnd = I->op_end();
+      User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
+      while (IVOpIter != IVOpEnd) {
+        Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
+        if (UniqueOperands.insert(IVOpInst).second)
+          ChainInstruction(&*I, IVOpInst, ChainUsersVec);
+        IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+      }
+    } // Continue walking down the instructions.
+  } // Continue walking down the domtree.
+  // Visit phi backedges to determine if the chain can generate the IV postinc.
+  for (BasicBlock::iterator I = L->getHeader()->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    if (!SE.isSCEVable(PN->getType()))
+      continue;
+
+    Instruction *IncV =
+      dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
+    if (IncV)
+      ChainInstruction(PN, IncV, ChainUsersVec);
+  }
+  // Remove any unprofitable chains.
+  unsigned ChainIdx = 0;
+  for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
+       UsersIdx < NChains; ++UsersIdx) {
+    if (!isProfitableChain(IVChainVec[UsersIdx],
+                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+      continue;
+    // Preserve the chain at UsesIdx.
+    if (ChainIdx != UsersIdx)
+      IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
+    FinalizeChain(IVChainVec[ChainIdx]);
+    ++ChainIdx;
+  }
+  IVChainVec.resize(ChainIdx);
+}
+
+void LSRInstance::FinalizeChain(IVChain &Chain) {
+  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+  DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
+
+  for (const IVInc &Inc : Chain) {
+    DEBUG(dbgs() << "        Inc: " << Inc.UserInst << "\n");
+    auto UseI = std::find(Inc.UserInst->op_begin(), Inc.UserInst->op_end(),
+                          Inc.IVOperand);
+    assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
+    IVIncSet.insert(UseI);
+  }
+}
+
+/// Return true if the IVInc can be folded into an addressing mode.
+static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
+                             Value *Operand, const TargetTransformInfo &TTI) {
+  const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
+  if (!IncConst || !isAddressUse(UserInst, Operand))
+    return false;
+
+  if (IncConst->getAPInt().getMinSignedBits() > 64)
+    return false;
+
+  MemAccessTy AccessTy = getAccessType(UserInst);
+  int64_t IncOffset = IncConst->getValue()->getSExtValue();
+  if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
+                        IncOffset, /*HaseBaseReg=*/false))
+    return false;
+
+  return true;
+}
+
+/// Generate an add or subtract for each IVInc in a chain to materialize the IV
+/// user's operand from the previous IV user's operand.
+void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+                                  SmallVectorImpl<WeakVH> &DeadInsts) {
+  // Find the new IVOperand for the head of the chain. It may have been replaced
+  // by LSR.
+  const IVInc &Head = Chain.Incs[0];
+  User::op_iterator IVOpEnd = Head.UserInst->op_end();
+  // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
+  User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
+                                             IVOpEnd, L, SE);
+  Value *IVSrc = nullptr;
+  while (IVOpIter != IVOpEnd) {
+    IVSrc = getWideOperand(*IVOpIter);
+
+    // If this operand computes the expression that the chain needs, we may use
+    // it. (Check this after setting IVSrc which is used below.)
+    //
+    // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
+    // narrow for the chain, so we can no longer use it. We do allow using a
+    // wider phi, assuming the LSR checked for free truncation. In that case we
+    // should already have a truncate on this operand such that
+    // getSCEV(IVSrc) == IncExpr.
+    if (SE.getSCEV(*IVOpIter) == Head.IncExpr
+        || SE.getSCEV(IVSrc) == Head.IncExpr) {
+      break;
+    }
+    IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+  }
+  if (IVOpIter == IVOpEnd) {
+    // Gracefully give up on this chain.
+    DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
+    return;
+  }
+
+  DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
+  Type *IVTy = IVSrc->getType();
+  Type *IntTy = SE.getEffectiveSCEVType(IVTy);
+  const SCEV *LeftOverExpr = nullptr;
+  for (const IVInc &Inc : Chain) {
+    Instruction *InsertPt = Inc.UserInst;
+    if (isa<PHINode>(InsertPt))
+      InsertPt = L->getLoopLatch()->getTerminator();
+
+    // IVOper will replace the current IV User's operand. IVSrc is the IV
+    // value currently held in a register.
+    Value *IVOper = IVSrc;
+    if (!Inc.IncExpr->isZero()) {
+      // IncExpr was the result of subtraction of two narrow values, so must
+      // be signed.
+      const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
+      LeftOverExpr = LeftOverExpr ?
+        SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
+    }
+    if (LeftOverExpr && !LeftOverExpr->isZero()) {
+      // Expand the IV increment.
+      Rewriter.clearPostInc();
+      Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
+      const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
+                                             SE.getUnknown(IncV));
+      IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
+
+      // If an IV increment can't be folded, use it as the next IV value.
+      if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
+        assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
+        IVSrc = IVOper;
+        LeftOverExpr = nullptr;
+      }
+    }
+    Type *OperTy = Inc.IVOperand->getType();
+    if (IVTy != OperTy) {
+      assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
+             "cannot extend a chained IV");
+      IRBuilder<> Builder(InsertPt);
+      IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
+    }
+    Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
+    DeadInsts.emplace_back(Inc.IVOperand);
+  }
+  // If LSR created a new, wider phi, we may also replace its postinc. We only
+  // do this if we also found a wide value for the head of the chain.
+  if (isa<PHINode>(Chain.tailUserInst())) {
+    for (BasicBlock::iterator I = L->getHeader()->begin();
+         PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
+      if (!isCompatibleIVType(Phi, IVSrc))
+        continue;
+      Instruction *PostIncV = dyn_cast<Instruction>(
+        Phi->getIncomingValueForBlock(L->getLoopLatch()));
+      if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
+        continue;
+      Value *IVOper = IVSrc;
+      Type *PostIncTy = PostIncV->getType();
+      if (IVTy != PostIncTy) {
+        assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
+        IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
+        Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
+        IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
+      }
+      Phi->replaceUsesOfWith(PostIncV, IVOper);
+      DeadInsts.emplace_back(PostIncV);
+    }
+  }
+}
+
+void LSRInstance::CollectFixupsAndInitialFormulae() {
+  for (const IVStrideUse &U : IU) {
+    Instruction *UserInst = U.getUser();
+    // Skip IV users that are part of profitable IV Chains.
+    User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(),
+                                       U.getOperandValToReplace());
+    assert(UseI != UserInst->op_end() && "cannot find IV operand");
+    if (IVIncSet.count(UseI))
+      continue;
+
+    // Record the uses.
+    LSRFixup &LF = getNewFixup();
+    LF.UserInst = UserInst;
+    LF.OperandValToReplace = U.getOperandValToReplace();
+    LF.PostIncLoops = U.getPostIncLoops();
+
+    LSRUse::KindType Kind = LSRUse::Basic;
+    MemAccessTy AccessTy;
+    if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
+      Kind = LSRUse::Address;
+      AccessTy = getAccessType(LF.UserInst);
+    }
+
+    const SCEV *S = IU.getExpr(U);
+
+    // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
+    // (N - i == 0), and this allows (N - i) to be the expression that we work
+    // with rather than just N or i, so we can consider the register
+    // requirements for both N and i at the same time. Limiting this code to
+    // equality icmps is not a problem because all interesting loops use
+    // equality icmps, thanks to IndVarSimplify.
+    if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
+      if (CI->isEquality()) {
+        // Swap the operands if needed to put the OperandValToReplace on the
+        // left, for consistency.
+        Value *NV = CI->getOperand(1);
+        if (NV == LF.OperandValToReplace) {
+          CI->setOperand(1, CI->getOperand(0));
+          CI->setOperand(0, NV);
+          NV = CI->getOperand(1);
+          Changed = true;
+        }
+
+        // x == y  -->  x - y == 0
+        const SCEV *N = SE.getSCEV(NV);
+        if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
+          // S is normalized, so normalize N before folding it into S
+          // to keep the result normalized.
+          N = TransformForPostIncUse(Normalize, N, CI, nullptr,
+                                     LF.PostIncLoops, SE, DT);
+          Kind = LSRUse::ICmpZero;
+          S = SE.getMinusSCEV(N, S);
+        }
+
+        // -1 and the negations of all interesting strides (except the negation
+        // of -1) are now also interesting.
+        for (size_t i = 0, e = Factors.size(); i != e; ++i)
+          if (Factors[i] != -1)
+            Factors.insert(-(uint64_t)Factors[i]);
+        Factors.insert(-1);
+      }
+
+    // Set up the initial formula for this use.
+    std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
+    LF.LUIdx = P.first;
+    LF.Offset = P.second;
+    LSRUse &LU = Uses[LF.LUIdx];
+    LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+    if (!LU.WidestFixupType ||
+        SE.getTypeSizeInBits(LU.WidestFixupType) <
+        SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+      LU.WidestFixupType = LF.OperandValToReplace->getType();
+
+    // If this is the first use of this LSRUse, give it a formula.
+    if (LU.Formulae.empty()) {
+      InsertInitialFormula(S, LU, LF.LUIdx);
+      CountRegisters(LU.Formulae.back(), LF.LUIdx);
+    }
+  }
+
+  DEBUG(print_fixups(dbgs()));
+}
+
+/// Insert a formula for the given expression into the given use, separating out
+/// loop-variant portions from loop-invariant and loop-computable portions.
+void
+LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
+  // Mark uses whose expressions cannot be expanded.
+  if (!isSafeToExpand(S, SE))
+    LU.RigidFormula = true;
+
+  Formula F;
+  F.initialMatch(S, L, SE);
+  bool Inserted = InsertFormula(LU, LUIdx, F);
+  assert(Inserted && "Initial formula already exists!"); (void)Inserted;
+}
+
+/// Insert a simple single-register formula for the given expression into the
+/// given use.
+void
+LSRInstance::InsertSupplementalFormula(const SCEV *S,
+                                       LSRUse &LU, size_t LUIdx) {
+  Formula F;
+  F.BaseRegs.push_back(S);
+  F.HasBaseReg = true;
+  bool Inserted = InsertFormula(LU, LUIdx, F);
+  assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
+}
+
+/// Note which registers are used by the given formula, updating RegUses.
+void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
+  if (F.ScaledReg)
+    RegUses.countRegister(F.ScaledReg, LUIdx);
+  for (const SCEV *BaseReg : F.BaseRegs)
+    RegUses.countRegister(BaseReg, LUIdx);
+}
+
+/// If the given formula has not yet been inserted, add it to the list, and
+/// return true. Return false otherwise.
+bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
+  // Do not insert formula that we will not be able to expand.
+  assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
+         "Formula is illegal");
+  if (!LU.InsertFormula(F))
+    return false;
+
+  CountRegisters(F, LUIdx);
+  return true;
+}
+
+/// Check for other uses of loop-invariant values which we're tracking. These
+/// other uses will pin these values in registers, making them less profitable
+/// for elimination.
+/// TODO: This currently misses non-constant addrec step registers.
+/// TODO: Should this give more weight to users inside the loop?
+void
+LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
+  SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
+  SmallPtrSet<const SCEV *, 32> Visited;
+
+  while (!Worklist.empty()) {
+    const SCEV *S = Worklist.pop_back_val();
+
+    // Don't process the same SCEV twice
+    if (!Visited.insert(S).second)
+      continue;
+
+    if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
+      Worklist.append(N->op_begin(), N->op_end());
+    else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
+      Worklist.push_back(C->getOperand());
+    else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+      Worklist.push_back(D->getLHS());
+      Worklist.push_back(D->getRHS());
+    } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
+      const Value *V = US->getValue();
+      if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+        // Look for instructions defined outside the loop.
+        if (L->contains(Inst)) continue;
+      } else if (isa<UndefValue>(V))
+        // Undef doesn't have a live range, so it doesn't matter.
+        continue;
+      for (const Use &U : V->uses()) {
+        const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
+        // Ignore non-instructions.
+        if (!UserInst)
+          continue;
+        // Ignore instructions in other functions (as can happen with
+        // Constants).
+        if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
+          continue;
+        // Ignore instructions not dominated by the loop.
+        const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
+          UserInst->getParent() :
+          cast<PHINode>(UserInst)->getIncomingBlock(
+            PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+        if (!DT.dominates(L->getHeader(), UseBB))
+          continue;
+        // Don't bother if the instruction is in a BB which ends in an EHPad.
+        if (UseBB->getTerminator()->isEHPad())
+          continue;
+        // Ignore uses which are part of other SCEV expressions, to avoid
+        // analyzing them multiple times.
+        if (SE.isSCEVable(UserInst->getType())) {
+          const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
+          // If the user is a no-op, look through to its uses.
+          if (!isa<SCEVUnknown>(UserS))
+            continue;
+          if (UserS == US) {
+            Worklist.push_back(
+              SE.getUnknown(const_cast<Instruction *>(UserInst)));
+            continue;
+          }
+        }
+        // Ignore icmp instructions which are already being analyzed.
+        if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
+          unsigned OtherIdx = !U.getOperandNo();
+          Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
+          if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
+            continue;
+        }
+
+        LSRFixup &LF = getNewFixup();
+        LF.UserInst = const_cast<Instruction *>(UserInst);
+        LF.OperandValToReplace = U;
+        std::pair<size_t, int64_t> P = getUse(
+            S, LSRUse::Basic, MemAccessTy());
+        LF.LUIdx = P.first;
+        LF.Offset = P.second;
+        LSRUse &LU = Uses[LF.LUIdx];
+        LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+        if (!LU.WidestFixupType ||
+            SE.getTypeSizeInBits(LU.WidestFixupType) <
+            SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+          LU.WidestFixupType = LF.OperandValToReplace->getType();
+        InsertSupplementalFormula(US, LU, LF.LUIdx);
+        CountRegisters(LU.Formulae.back(), Uses.size() - 1);
+        break;
+      }
+    }
+  }
+}
+
+/// Split S into subexpressions which can be pulled out into separate
+/// registers. If C is non-null, multiply each subexpression by C.
+///
+/// Return remainder expression after factoring the subexpressions captured by
+/// Ops. If Ops is complete, return NULL.
+static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
+                                   SmallVectorImpl<const SCEV *> &Ops,
+                                   const Loop *L,
+                                   ScalarEvolution &SE,
+                                   unsigned Depth = 0) {
+  // Arbitrarily cap recursion to protect compile time.
+  if (Depth >= 3)
+    return S;
+
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    // Break out add operands.
+    for (const SCEV *S : Add->operands()) {
+      const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
+      if (Remainder)
+        Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+    }
+    return nullptr;
+  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    // Split a non-zero base out of an addrec.
+    if (AR->getStart()->isZero())
+      return S;
+
+    const SCEV *Remainder = CollectSubexprs(AR->getStart(),
+                                            C, Ops, L, SE, Depth+1);
+    // Split the non-zero AddRec unless it is part of a nested recurrence that
+    // does not pertain to this loop.
+    if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
+      Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+      Remainder = nullptr;
+    }
+    if (Remainder != AR->getStart()) {
+      if (!Remainder)
+        Remainder = SE.getConstant(AR->getType(), 0);
+      return SE.getAddRecExpr(Remainder,
+                              AR->getStepRecurrence(SE),
+                              AR->getLoop(),
+                              //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+                              SCEV::FlagAnyWrap);
+    }
+  } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+    // Break (C * (a + b + c)) into C*a + C*b + C*c.
+    if (Mul->getNumOperands() != 2)
+      return S;
+    if (const SCEVConstant *Op0 =
+        dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
+      C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
+      const SCEV *Remainder =
+        CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
+      if (Remainder)
+        Ops.push_back(SE.getMulExpr(C, Remainder));
+      return nullptr;
+    }
+  }
+  return S;
+}
+
+/// \brief Helper function for LSRInstance::GenerateReassociations.
+void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+                                             const Formula &Base,
+                                             unsigned Depth, size_t Idx,
+                                             bool IsScaledReg) {
+  const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+  SmallVector<const SCEV *, 8> AddOps;
+  const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
+  if (Remainder)
+    AddOps.push_back(Remainder);
+
+  if (AddOps.size() == 1)
+    return;
+
+  for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
+                                                     JE = AddOps.end();
+       J != JE; ++J) {
+
+    // Loop-variant "unknown" values are uninteresting; we won't be able to
+    // do anything meaningful with them.
+    if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
+      continue;
+
+    // Don't pull a constant into a register if the constant could be folded
+    // into an immediate field.
+    if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+                         LU.AccessTy, *J, Base.getNumRegs() > 1))
+      continue;
+
+    // Collect all operands except *J.
+    SmallVector<const SCEV *, 8> InnerAddOps(
+        ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
+    InnerAddOps.append(std::next(J),
+                       ((const SmallVector<const SCEV *, 8> &)AddOps).end());
+
+    // Don't leave just a constant behind in a register if the constant could
+    // be folded into an immediate field.
+    if (InnerAddOps.size() == 1 &&
+        isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+                         LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
+      continue;
+
+    const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
+    if (InnerSum->isZero())
+      continue;
+    Formula F = Base;
+
+    // Add the remaining pieces of the add back into the new formula.
+    const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+    if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
+        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+                                InnerSumSC->getValue()->getZExtValue())) {
+      F.UnfoldedOffset =
+          (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
+      if (IsScaledReg)
+        F.ScaledReg = nullptr;
+      else
+        F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
+    } else if (IsScaledReg)
+      F.ScaledReg = InnerSum;
+    else
+      F.BaseRegs[Idx] = InnerSum;
+
+    // Add J as its own register, or an unfolded immediate.
+    const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
+    if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+                                SC->getValue()->getZExtValue()))
+      F.UnfoldedOffset =
+          (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
+    else
+      F.BaseRegs.push_back(*J);
+    // We may have changed the number of register in base regs, adjust the
+    // formula accordingly.
+    F.canonicalize();
+
+    if (InsertFormula(LU, LUIdx, F))
+      // If that formula hadn't been seen before, recurse to find more like
+      // it.
+      GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
+  }
+}
+
+/// Split out subexpressions from adds and the bases of addrecs.
+void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
+                                         Formula Base, unsigned Depth) {
+  assert(Base.isCanonical() && "Input must be in the canonical form");
+  // Arbitrarily cap recursion to protect compile time.
+  if (Depth >= 3)
+    return;
+
+  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+    GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
+
+  if (Base.Scale == 1)
+    GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
+                               /* Idx */ -1, /* IsScaledReg */ true);
+}
+
+///  Generate a formula consisting of all of the loop-dominating registers added
+/// into a single register.
+void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
+                                       Formula Base) {
+  // This method is only interesting on a plurality of registers.
+  if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1)
+    return;
+
+  // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
+  // processing the formula.
+  Base.unscale();
+  Formula F = Base;
+  F.BaseRegs.clear();
+  SmallVector<const SCEV *, 4> Ops;
+  for (const SCEV *BaseReg : Base.BaseRegs) {
+    if (SE.properlyDominates(BaseReg, L->getHeader()) &&
+        !SE.hasComputableLoopEvolution(BaseReg, L))
+      Ops.push_back(BaseReg);
+    else
+      F.BaseRegs.push_back(BaseReg);
+  }
+  if (Ops.size() > 1) {
+    const SCEV *Sum = SE.getAddExpr(Ops);
+    // TODO: If Sum is zero, it probably means ScalarEvolution missed an
+    // opportunity to fold something. For now, just ignore such cases
+    // rather than proceed with zero in a register.
+    if (!Sum->isZero()) {
+      F.BaseRegs.push_back(Sum);
+      F.canonicalize();
+      (void)InsertFormula(LU, LUIdx, F);
+    }
+  }
+}
+
+/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
+void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+                                              const Formula &Base, size_t Idx,
+                                              bool IsScaledReg) {
+  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+  GlobalValue *GV = ExtractSymbol(G, SE);
+  if (G->isZero() || !GV)
+    return;
+  Formula F = Base;
+  F.BaseGV = GV;
+  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+    return;
+  if (IsScaledReg)
+    F.ScaledReg = G;
+  else
+    F.BaseRegs[Idx] = G;
+  (void)InsertFormula(LU, LUIdx, F);
+}
+
+/// Generate reuse formulae using symbolic offsets.
+void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
+                                          Formula Base) {
+  // We can't add a symbolic offset if the address already contains one.
+  if (Base.BaseGV) return;
+
+  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+    GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
+  if (Base.Scale == 1)
+    GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
+                                /* IsScaledReg */ true);
+}
+
+/// \brief Helper function for LSRInstance::GenerateConstantOffsets.
+void LSRInstance::GenerateConstantOffsetsImpl(
+    LSRUse &LU, unsigned LUIdx, const Formula &Base,
+    const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
+  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+  for (int64_t Offset : Worklist) {
+    Formula F = Base;
+    F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+    if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
+                   LU.AccessTy, F)) {
+      // Add the offset to the base register.
+      const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
+      // If it cancelled out, drop the base register, otherwise update it.
+      if (NewG->isZero()) {
+        if (IsScaledReg) {
+          F.Scale = 0;
+          F.ScaledReg = nullptr;
+        } else
+          F.deleteBaseReg(F.BaseRegs[Idx]);
+        F.canonicalize();
+      } else if (IsScaledReg)
+        F.ScaledReg = NewG;
+      else
+        F.BaseRegs[Idx] = NewG;
+
+      (void)InsertFormula(LU, LUIdx, F);
+    }
+  }
+
+  int64_t Imm = ExtractImmediate(G, SE);
+  if (G->isZero() || Imm == 0)
+    return;
+  Formula F = Base;
+  F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
+  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+    return;
+  if (IsScaledReg)
+    F.ScaledReg = G;
+  else
+    F.BaseRegs[Idx] = G;
+  (void)InsertFormula(LU, LUIdx, F);
+}
+
+/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
+void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
+                                          Formula Base) {
+  // TODO: For now, just add the min and max offset, because it usually isn't
+  // worthwhile looking at everything inbetween.
+  SmallVector<int64_t, 2> Worklist;
+  Worklist.push_back(LU.MinOffset);
+  if (LU.MaxOffset != LU.MinOffset)
+    Worklist.push_back(LU.MaxOffset);
+
+  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
+  if (Base.Scale == 1)
+    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
+                                /* IsScaledReg */ true);
+}
+
+/// For ICmpZero, check to see if we can scale up the comparison. For example, x
+/// == y -> x*c == y*c.
+void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
+                                         Formula Base) {
+  if (LU.Kind != LSRUse::ICmpZero) return;
+
+  // Determine the integer type for the base formula.
+  Type *IntTy = Base.getType();
+  if (!IntTy) return;
+  if (SE.getTypeSizeInBits(IntTy) > 64) return;
+
+  // Don't do this if there is more than one offset.
+  if (LU.MinOffset != LU.MaxOffset) return;
+
+  assert(!Base.BaseGV && "ICmpZero use is not legal!");
+
+  // Check each interesting stride.
+  for (int64_t Factor : Factors) {
+    // Check that the multiplication doesn't overflow.
+    if (Base.BaseOffset == INT64_MIN && Factor == -1)
+      continue;
+    int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
+    if (NewBaseOffset / Factor != Base.BaseOffset)
+      continue;
+    // If the offset will be truncated at this use, check that it is in bounds.
+    if (!IntTy->isPointerTy() &&
+        !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
+      continue;
+
+    // Check that multiplying with the use offset doesn't overflow.
+    int64_t Offset = LU.MinOffset;
+    if (Offset == INT64_MIN && Factor == -1)
+      continue;
+    Offset = (uint64_t)Offset * Factor;
+    if (Offset / Factor != LU.MinOffset)
+      continue;
+    // If the offset will be truncated at this use, check that it is in bounds.
+    if (!IntTy->isPointerTy() &&
+        !ConstantInt::isValueValidForType(IntTy, Offset))
+      continue;
+
+    Formula F = Base;
+    F.BaseOffset = NewBaseOffset;
+
+    // Check that this scale is legal.
+    if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
+      continue;
+
+    // Compensate for the use having MinOffset built into it.
+    F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
+
+    const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+
+    // Check that multiplying with each base register doesn't overflow.
+    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
+      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
+      if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
+        goto next;
+    }
+
+    // Check that multiplying with the scaled register doesn't overflow.
+    if (F.ScaledReg) {
+      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
+      if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
+        continue;
+    }
+
+    // Check that multiplying with the unfolded offset doesn't overflow.
+    if (F.UnfoldedOffset != 0) {
+      if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
+        continue;
+      F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
+      if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+        continue;
+      // If the offset will be truncated, check that it is in bounds.
+      if (!IntTy->isPointerTy() &&
+          !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
+        continue;
+    }
+
+    // If we make it here and it's legal, add it.
+    (void)InsertFormula(LU, LUIdx, F);
+  next:;
+  }
+}
+
+/// Generate stride factor reuse formulae by making use of scaled-offset address
+/// modes, for example.
+void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
+  // Determine the integer type for the base formula.
+  Type *IntTy = Base.getType();
+  if (!IntTy) return;
+
+  // If this Formula already has a scaled register, we can't add another one.
+  // Try to unscale the formula to generate a better scale.
+  if (Base.Scale != 0 && !Base.unscale())
+    return;
+
+  assert(Base.Scale == 0 && "unscale did not did its job!");
+
+  // Check each interesting stride.
+  for (int64_t Factor : Factors) {
+    Base.Scale = Factor;
+    Base.HasBaseReg = Base.BaseRegs.size() > 1;
+    // Check whether this scale is going to be legal.
+    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+                    Base)) {
+      // As a special-case, handle special out-of-loop Basic users specially.
+      // TODO: Reconsider this special case.
+      if (LU.Kind == LSRUse::Basic &&
+          isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
+                     LU.AccessTy, Base) &&
+          LU.AllFixupsOutsideLoop)
+        LU.Kind = LSRUse::Special;
+      else
+        continue;
+    }
+    // For an ICmpZero, negating a solitary base register won't lead to
+    // new solutions.
+    if (LU.Kind == LSRUse::ICmpZero &&
+        !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
+      continue;
+    // For each addrec base reg, apply the scale, if possible.
+    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+      if (const SCEVAddRecExpr *AR =
+            dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
+        const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+        if (FactorS->isZero())
+          continue;
+        // Divide out the factor, ignoring high bits, since we'll be
+        // scaling the value back up in the end.
+        if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
+          // TODO: This could be optimized to avoid all the copying.
+          Formula F = Base;
+          F.ScaledReg = Quotient;
+          F.deleteBaseReg(F.BaseRegs[i]);
+          // The canonical representation of 1*reg is reg, which is already in
+          // Base. In that case, do not try to insert the formula, it will be
+          // rejected anyway.
+          if (F.Scale == 1 && F.BaseRegs.empty())
+            continue;
+          (void)InsertFormula(LU, LUIdx, F);
+        }
+      }
+  }
+}
+
+/// Generate reuse formulae from different IV types.
+void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
+  // Don't bother truncating symbolic values.
+  if (Base.BaseGV) return;
+
+  // Determine the integer type for the base formula.
+  Type *DstTy = Base.getType();
+  if (!DstTy) return;
+  DstTy = SE.getEffectiveSCEVType(DstTy);
+
+  for (Type *SrcTy : Types) {
+    if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
+      Formula F = Base;
+
+      if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
+      for (const SCEV *&BaseReg : F.BaseRegs)
+        BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
+
+      // TODO: This assumes we've done basic processing on all uses and
+      // have an idea what the register usage is.
+      if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
+        continue;
+
+      (void)InsertFormula(LU, LUIdx, F);
+    }
+  }
+}
+
+namespace {
+
+/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
+/// modifications so that the search phase doesn't have to worry about the data
+/// structures moving underneath it.
+struct WorkItem {
+  size_t LUIdx;
+  int64_t Imm;
+  const SCEV *OrigReg;
+
+  WorkItem(size_t LI, int64_t I, const SCEV *R)
+    : LUIdx(LI), Imm(I), OrigReg(R) {}
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+void WorkItem::print(raw_ostream &OS) const {
+  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
+     << " , add offset " << Imm;
+}
+
+LLVM_DUMP_METHOD
+void WorkItem::dump() const {
+  print(errs()); errs() << '\n';
+}
+
+/// Look for registers which are a constant distance apart and try to form reuse
+/// opportunities between them.
+void LSRInstance::GenerateCrossUseConstantOffsets() {
+  // Group the registers by their value without any added constant offset.
+  typedef std::map<int64_t, const SCEV *> ImmMapTy;
+  DenseMap<const SCEV *, ImmMapTy> Map;
+  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
+  SmallVector<const SCEV *, 8> Sequence;
+  for (const SCEV *Use : RegUses) {
+    const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
+    int64_t Imm = ExtractImmediate(Reg, SE);
+    auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
+    if (Pair.second)
+      Sequence.push_back(Reg);
+    Pair.first->second.insert(std::make_pair(Imm, Use));
+    UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
+  }
+
+  // Now examine each set of registers with the same base value. Build up
+  // a list of work to do and do the work in a separate step so that we're
+  // not adding formulae and register counts while we're searching.
+  SmallVector<WorkItem, 32> WorkItems;
+  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
+  for (const SCEV *Reg : Sequence) {
+    const ImmMapTy &Imms = Map.find(Reg)->second;
+
+    // It's not worthwhile looking for reuse if there's only one offset.
+    if (Imms.size() == 1)
+      continue;
+
+    DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
+          for (const auto &Entry : Imms)
+            dbgs() << ' ' << Entry.first;
+          dbgs() << '\n');
+
+    // Examine each offset.
+    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
+         J != JE; ++J) {
+      const SCEV *OrigReg = J->second;
+
+      int64_t JImm = J->first;
+      const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
+
+      if (!isa<SCEVConstant>(OrigReg) &&
+          UsedByIndicesMap[Reg].count() == 1) {
+        DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
+        continue;
+      }
+
+      // Conservatively examine offsets between this orig reg a few selected
+      // other orig regs.
+      ImmMapTy::const_iterator OtherImms[] = {
+        Imms.begin(), std::prev(Imms.end()),
+        Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
+                         2)
+      };
+      for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
+        ImmMapTy::const_iterator M = OtherImms[i];
+        if (M == J || M == JE) continue;
+
+        // Compute the difference between the two.
+        int64_t Imm = (uint64_t)JImm - M->first;
+        for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
+             LUIdx = UsedByIndices.find_next(LUIdx))
+          // Make a memo of this use, offset, and register tuple.
+          if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
+            WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
+      }
+    }
+  }
+
+  Map.clear();
+  Sequence.clear();
+  UsedByIndicesMap.clear();
+  UniqueItems.clear();
+
+  // Now iterate through the worklist and add new formulae.
+  for (const WorkItem &WI : WorkItems) {
+    size_t LUIdx = WI.LUIdx;
+    LSRUse &LU = Uses[LUIdx];
+    int64_t Imm = WI.Imm;
+    const SCEV *OrigReg = WI.OrigReg;
+
+    Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
+    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
+    unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
+
+    // TODO: Use a more targeted data structure.
+    for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
+      Formula F = LU.Formulae[L];
+      // FIXME: The code for the scaled and unscaled registers looks
+      // very similar but slightly different. Investigate if they
+      // could be merged. That way, we would not have to unscale the
+      // Formula.
+      F.unscale();
+      // Use the immediate in the scaled register.
+      if (F.ScaledReg == OrigReg) {
+        int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
+        // Don't create 50 + reg(-50).
+        if (F.referencesReg(SE.getSCEV(
+                   ConstantInt::get(IntTy, -(uint64_t)Offset))))
+          continue;
+        Formula NewF = F;
+        NewF.BaseOffset = Offset;
+        if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+                        NewF))
+          continue;
+        NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
+
+        // If the new scale is a constant in a register, and adding the constant
+        // value to the immediate would produce a value closer to zero than the
+        // immediate itself, then the formula isn't worthwhile.
+        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
+          if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
+              (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
+                  .ule(std::abs(NewF.BaseOffset)))
+            continue;
+
+        // OK, looks good.
+        NewF.canonicalize();
+        (void)InsertFormula(LU, LUIdx, NewF);
+      } else {
+        // Use the immediate in a base register.
+        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
+          const SCEV *BaseReg = F.BaseRegs[N];
+          if (BaseReg != OrigReg)
+            continue;
+          Formula NewF = F;
+          NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
+          if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+                          LU.Kind, LU.AccessTy, NewF)) {
+            if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+              continue;
+            NewF = F;
+            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+          }
+          NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
+
+          // If the new formula has a constant in a register, and adding the
+          // constant value to the immediate would produce a value closer to
+          // zero than the immediate itself, then the formula isn't worthwhile.
+          for (const SCEV *NewReg : NewF.BaseRegs)
+            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
+              if ((C->getAPInt() + NewF.BaseOffset)
+                      .abs()
+                      .slt(std::abs(NewF.BaseOffset)) &&
+                  (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
+                      countTrailingZeros<uint64_t>(NewF.BaseOffset))
+                goto skip_formula;
+
+          // Ok, looks good.
+          NewF.canonicalize();
+          (void)InsertFormula(LU, LUIdx, NewF);
+          break;
+        skip_formula:;
+        }
+      }
+    }
+  }
+}
+
+/// Generate formulae for each use.
+void
+LSRInstance::GenerateAllReuseFormulae() {
+  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
+  // queries are more precise.
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
+  }
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateScales(LU, LUIdx, LU.Formulae[i]);
+  }
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
+  }
+
+  GenerateCrossUseConstantOffsets();
+
+  DEBUG(dbgs() << "\n"
+                  "After generating reuse formulae:\n";
+        print_uses(dbgs()));
+}
+
+/// If there are multiple formulae with the same set of registers used
+/// by other uses, pick the best one and delete the others.
+void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
+  DenseSet<const SCEV *> VisitedRegs;
+  SmallPtrSet<const SCEV *, 16> Regs;
+  SmallPtrSet<const SCEV *, 16> LoserRegs;
+#ifndef NDEBUG
+  bool ChangedFormulae = false;
+#endif
+
+  // Collect the best formula for each unique set of shared registers. This
+  // is reset for each use.
+  typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>
+    BestFormulaeTy;
+  BestFormulaeTy BestFormulae;
+
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+    bool Any = false;
+    for (size_t FIdx = 0, NumForms = LU.Formulae.size();
+         FIdx != NumForms; ++FIdx) {
+      Formula &F = LU.Formulae[FIdx];
+
+      // Some formulas are instant losers. For example, they may depend on
+      // nonexistent AddRecs from other loops. These need to be filtered
+      // immediately, otherwise heuristics could choose them over others leading
+      // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
+      // avoids the need to recompute this information across formulae using the
+      // same bad AddRec. Passing LoserRegs is also essential unless we remove
+      // the corresponding bad register from the Regs set.
+      Cost CostF;
+      Regs.clear();
+      CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
+                        &LoserRegs);
+      if (CostF.isLoser()) {
+        // During initial formula generation, undesirable formulae are generated
+        // by uses within other loops that have some non-trivial address mode or
+        // use the postinc form of the IV. LSR needs to provide these formulae
+        // as the basis of rediscovering the desired formula that uses an AddRec
+        // corresponding to the existing phi. Once all formulae have been
+        // generated, these initial losers may be pruned.
+        DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
+              dbgs() << "\n");
+      }
+      else {
+        SmallVector<const SCEV *, 4> Key;
+        for (const SCEV *Reg : F.BaseRegs) {
+          if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
+            Key.push_back(Reg);
+        }
+        if (F.ScaledReg &&
+            RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
+          Key.push_back(F.ScaledReg);
+        // Unstable sort by host order ok, because this is only used for
+        // uniquifying.
+        std::sort(Key.begin(), Key.end());
+
+        std::pair<BestFormulaeTy::const_iterator, bool> P =
+          BestFormulae.insert(std::make_pair(Key, FIdx));
+        if (P.second)
+          continue;
+
+        Formula &Best = LU.Formulae[P.first->second];
+
+        Cost CostBest;
+        Regs.clear();
+        CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
+                             DT, LU);
+        if (CostF < CostBest)
+          std::swap(F, Best);
+        DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
+              dbgs() << "\n"
+                        "    in favor of formula "; Best.print(dbgs());
+              dbgs() << '\n');
+      }
+#ifndef NDEBUG
+      ChangedFormulae = true;
+#endif
+      LU.DeleteFormula(F);
+      --FIdx;
+      --NumForms;
+      Any = true;
+    }
+
+    // Now that we've filtered out some formulae, recompute the Regs set.
+    if (Any)
+      LU.RecomputeRegs(LUIdx, RegUses);
+
+    // Reset this to prepare for the next use.
+    BestFormulae.clear();
+  }
+
+  DEBUG(if (ChangedFormulae) {
+          dbgs() << "\n"
+                    "After filtering out undesirable candidates:\n";
+          print_uses(dbgs());
+        });
+}
+
+// This is a rough guess that seems to work fairly well.
+static const size_t ComplexityLimit = UINT16_MAX;
+
+/// Estimate the worst-case number of solutions the solver might have to
+/// consider. It almost never considers this many solutions because it prune the
+/// search space, but the pruning isn't always sufficient.
+size_t LSRInstance::EstimateSearchSpaceComplexity() const {
+  size_t Power = 1;
+  for (const LSRUse &LU : Uses) {
+    size_t FSize = LU.Formulae.size();
+    if (FSize >= ComplexityLimit) {
+      Power = ComplexityLimit;
+      break;
+    }
+    Power *= FSize;
+    if (Power >= ComplexityLimit)
+      break;
+  }
+  return Power;
+}
+
+/// When one formula uses a superset of the registers of another formula, it
+/// won't help reduce register pressure (though it may not necessarily hurt
+/// register pressure); remove it to simplify the system.
+void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
+  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
+                    "which use a superset of registers used by other "
+                    "formulae.\n");
+
+    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+      LSRUse &LU = Uses[LUIdx];
+      bool Any = false;
+      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+        Formula &F = LU.Formulae[i];
+        // Look for a formula with a constant or GV in a register. If the use
+        // also has a formula with that same value in an immediate field,
+        // delete the one that uses a register.
+        for (SmallVectorImpl<const SCEV *>::const_iterator
+             I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
+          if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
+            Formula NewF = F;
+            NewF.BaseOffset += C->getValue()->getSExtValue();
+            NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+                                (I - F.BaseRegs.begin()));
+            if (LU.HasFormulaWithSameRegs(NewF)) {
+              DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
+              LU.DeleteFormula(F);
+              --i;
+              --e;
+              Any = true;
+              break;
+            }
+          } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
+            if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
+              if (!F.BaseGV) {
+                Formula NewF = F;
+                NewF.BaseGV = GV;
+                NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+                                    (I - F.BaseRegs.begin()));
+                if (LU.HasFormulaWithSameRegs(NewF)) {
+                  DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
+                        dbgs() << '\n');
+                  LU.DeleteFormula(F);
+                  --i;
+                  --e;
+                  Any = true;
+                  break;
+                }
+              }
+          }
+        }
+      }
+      if (Any)
+        LU.RecomputeRegs(LUIdx, RegUses);
+    }
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+}
+
+/// When there are many registers for expressions like A, A+1, A+2, etc.,
+/// allocate a single register for them.
+void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+    return;
+
+  DEBUG(dbgs() << "The search space is too complex.\n"
+                  "Narrowing the search space by assuming that uses separated "
+                  "by a constant offset will use the same registers.\n");
+
+  // This is especially useful for unrolled loops.
+
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    for (const Formula &F : LU.Formulae) {
+      if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
+        continue;
+
+      LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
+      if (!LUThatHas)
+        continue;
+
+      if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
+                              LU.Kind, LU.AccessTy))
+        continue;
+
+      DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
+
+      LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+
+      // Update the relocs to reference the new use.
+      for (LSRFixup &Fixup : Fixups) {
+        if (Fixup.LUIdx == LUIdx) {
+          Fixup.LUIdx = LUThatHas - &Uses.front();
+          Fixup.Offset += F.BaseOffset;
+          // Add the new offset to LUThatHas' offset list.
+          if (LUThatHas->Offsets.back() != Fixup.Offset) {
+            LUThatHas->Offsets.push_back(Fixup.Offset);
+            if (Fixup.Offset > LUThatHas->MaxOffset)
+              LUThatHas->MaxOffset = Fixup.Offset;
+            if (Fixup.Offset < LUThatHas->MinOffset)
+              LUThatHas->MinOffset = Fixup.Offset;
+          }
+          DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
+        }
+        if (Fixup.LUIdx == NumUses-1)
+          Fixup.LUIdx = LUIdx;
+      }
+
+      // Delete formulae from the new use which are no longer legal.
+      bool Any = false;
+      for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
+        Formula &F = LUThatHas->Formulae[i];
+        if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
+                        LUThatHas->Kind, LUThatHas->AccessTy, F)) {
+          DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
+                dbgs() << '\n');
+          LUThatHas->DeleteFormula(F);
+          --i;
+          --e;
+          Any = true;
+        }
+      }
+
+      if (Any)
+        LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
+
+      // Delete the old use.
+      DeleteUse(LU, LUIdx);
+      --LUIdx;
+      --NumUses;
+      break;
+    }
+  }
+
+  DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
+/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
+/// we've done more filtering, as it may be able to find more formulae to
+/// eliminate.
+void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
+  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+                    "undesirable dedicated registers.\n");
+
+    FilterOutUndesirableDedicatedRegisters();
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+}
+
+/// Pick a register which seems likely to be profitable, and then in any use
+/// which has any reference to that register, delete all formulae which do not
+/// reference that register.
+void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
+  // With all other options exhausted, loop until the system is simple
+  // enough to handle.
+  SmallPtrSet<const SCEV *, 4> Taken;
+  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    // Ok, we have too many of formulae on our hands to conveniently handle.
+    // Use a rough heuristic to thin out the list.
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    // Pick the register which is used by the most LSRUses, which is likely
+    // to be a good reuse register candidate.
+    const SCEV *Best = nullptr;
+    unsigned BestNum = 0;
+    for (const SCEV *Reg : RegUses) {
+      if (Taken.count(Reg))
+        continue;
+      if (!Best)
+        Best = Reg;
+      else {
+        unsigned Count = RegUses.getUsedByIndices(Reg).count();
+        if (Count > BestNum) {
+          Best = Reg;
+          BestNum = Count;
+        }
+      }
+    }
+
+    DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
+                 << " will yield profitable reuse.\n");
+    Taken.insert(Best);
+
+    // In any use with formulae which references this register, delete formulae
+    // which don't reference it.
+    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+      LSRUse &LU = Uses[LUIdx];
+      if (!LU.Regs.count(Best)) continue;
+
+      bool Any = false;
+      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+        Formula &F = LU.Formulae[i];
+        if (!F.referencesReg(Best)) {
+          DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
+          LU.DeleteFormula(F);
+          --e;
+          --i;
+          Any = true;
+          assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
+          continue;
+        }
+      }
+
+      if (Any)
+        LU.RecomputeRegs(LUIdx, RegUses);
+    }
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+}
+
+/// If there are an extraordinary number of formulae to choose from, use some
+/// rough heuristics to prune down the number of formulae. This keeps the main
+/// solver from taking an extraordinary amount of time in some worst-case
+/// scenarios.
+void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+  NarrowSearchSpaceByDetectingSupersets();
+  NarrowSearchSpaceByCollapsingUnrolledCode();
+  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  NarrowSearchSpaceByPickingWinnerRegs();
+}
+
+/// This is the recursive solver.
+void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+                               Cost &SolutionCost,
+                               SmallVectorImpl<const Formula *> &Workspace,
+                               const Cost &CurCost,
+                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
+                               DenseSet<const SCEV *> &VisitedRegs) const {
+  // Some ideas:
+  //  - prune more:
+  //    - use more aggressive filtering
+  //    - sort the formula so that the most profitable solutions are found first
+  //    - sort the uses too
+  //  - search faster:
+  //    - don't compute a cost, and then compare. compare while computing a cost
+  //      and bail early.
+  //    - track register sets with SmallBitVector
+
+  const LSRUse &LU = Uses[Workspace.size()];
+
+  // If this use references any register that's already a part of the
+  // in-progress solution, consider it a requirement that a formula must
+  // reference that register in order to be considered. This prunes out
+  // unprofitable searching.
+  SmallSetVector<const SCEV *, 4> ReqRegs;
+  for (const SCEV *S : CurRegs)
+    if (LU.Regs.count(S))
+      ReqRegs.insert(S);
+
+  SmallPtrSet<const SCEV *, 16> NewRegs;
+  Cost NewCost;
+  for (const Formula &F : LU.Formulae) {
+    // Ignore formulae which may not be ideal in terms of register reuse of
+    // ReqRegs.  The formula should use all required registers before
+    // introducing new ones.
+    int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
+    for (const SCEV *Reg : ReqRegs) {
+      if ((F.ScaledReg && F.ScaledReg == Reg) ||
+          std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) !=
+          F.BaseRegs.end()) {
+        --NumReqRegsToFind;
+        if (NumReqRegsToFind == 0)
+          break;
+      }
+    }
+    if (NumReqRegsToFind != 0) {
+      // If none of the formulae satisfied the required registers, then we could
+      // clear ReqRegs and try again. Currently, we simply give up in this case.
+      continue;
+    }
+
+    // Evaluate the cost of the current formula. If it's already worse than
+    // the current best, prune the search at that point.
+    NewCost = CurCost;
+    NewRegs = CurRegs;
+    NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
+                        LU);
+    if (NewCost < SolutionCost) {
+      Workspace.push_back(&F);
+      if (Workspace.size() != Uses.size()) {
+        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
+                     NewRegs, VisitedRegs);
+        if (F.getNumRegs() == 1 && Workspace.size() == 1)
+          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
+      } else {
+        DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
+              dbgs() << ".\n Regs:";
+              for (const SCEV *S : NewRegs)
+                dbgs() << ' ' << *S;
+              dbgs() << '\n');
+
+        SolutionCost = NewCost;
+        Solution = Workspace;
+      }
+      Workspace.pop_back();
+    }
+  }
+}
+
+/// Choose one formula from each use. Return the results in the given Solution
+/// vector.
+void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
+  SmallVector<const Formula *, 8> Workspace;
+  Cost SolutionCost;
+  SolutionCost.Lose();
+  Cost CurCost;
+  SmallPtrSet<const SCEV *, 16> CurRegs;
+  DenseSet<const SCEV *> VisitedRegs;
+  Workspace.reserve(Uses.size());
+
+  // SolveRecurse does all the work.
+  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
+               CurRegs, VisitedRegs);
+  if (Solution.empty()) {
+    DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+    return;
+  }
+
+  // Ok, we've now made all our decisions.
+  DEBUG(dbgs() << "\n"
+                  "The chosen solution requires "; SolutionCost.print(dbgs());
+        dbgs() << ":\n";
+        for (size_t i = 0, e = Uses.size(); i != e; ++i) {
+          dbgs() << "  ";
+          Uses[i].print(dbgs());
+          dbgs() << "\n"
+                    "    ";
+          Solution[i]->print(dbgs());
+          dbgs() << '\n';
+        });
+
+  assert(Solution.size() == Uses.size() && "Malformed solution!");
+}
+
+/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree far as
+/// we can go while still being dominated by the input positions. This helps
+/// canonicalize the insert position, which encourages sharing.
+BasicBlock::iterator
+LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
+                                 const SmallVectorImpl<Instruction *> &Inputs)
+                                                                         const {
+  for (;;) {
+    const Loop *IPLoop = LI.getLoopFor(IP->getParent());
+    unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
+
+    BasicBlock *IDom;
+    for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
+      if (!Rung) return IP;
+      Rung = Rung->getIDom();
+      if (!Rung) return IP;
+      IDom = Rung->getBlock();
+
+      // Don't climb into a loop though.
+      const Loop *IDomLoop = LI.getLoopFor(IDom);
+      unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
+      if (IDomDepth <= IPLoopDepth &&
+          (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
+        break;
+    }
+
+    bool AllDominate = true;
+    Instruction *BetterPos = nullptr;
+    Instruction *Tentative = IDom->getTerminator();
+    for (Instruction *Inst : Inputs) {
+      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
+        AllDominate = false;
+        break;
+      }
+      // Attempt to find an insert position in the middle of the block,
+      // instead of at the end, so that it can be used for other expansions.
+      if (IDom == Inst->getParent() &&
+          (!BetterPos || !DT.dominates(Inst, BetterPos)))
+        BetterPos = &*std::next(BasicBlock::iterator(Inst));
+    }
+    if (!AllDominate)
+      break;
+    if (BetterPos)
+      IP = BetterPos->getIterator();
+    else
+      IP = Tentative->getIterator();
+  }
+
+  return IP;
+}
+
+/// Determine an input position which will be dominated by the operands and
+/// which will dominate the result.
+BasicBlock::iterator
+LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
+                                           const LSRFixup &LF,
+                                           const LSRUse &LU,
+                                           SCEVExpander &Rewriter) const {
+  // Collect some instructions which must be dominated by the
+  // expanding replacement. These must be dominated by any operands that
+  // will be required in the expansion.
+  SmallVector<Instruction *, 4> Inputs;
+  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
+    Inputs.push_back(I);
+  if (LU.Kind == LSRUse::ICmpZero)
+    if (Instruction *I =
+          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
+      Inputs.push_back(I);
+  if (LF.PostIncLoops.count(L)) {
+    if (LF.isUseFullyOutsideLoop(L))
+      Inputs.push_back(L->getLoopLatch()->getTerminator());
+    else
+      Inputs.push_back(IVIncInsertPos);
+  }
+  // The expansion must also be dominated by the increment positions of any
+  // loops it for which it is using post-inc mode.
+  for (const Loop *PIL : LF.PostIncLoops) {
+    if (PIL == L) continue;
+
+    // Be dominated by the loop exit.
+    SmallVector<BasicBlock *, 4> ExitingBlocks;
+    PIL->getExitingBlocks(ExitingBlocks);
+    if (!ExitingBlocks.empty()) {
+      BasicBlock *BB = ExitingBlocks[0];
+      for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
+        BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
+      Inputs.push_back(BB->getTerminator());
+    }
+  }
+
+  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
+         && !isa<DbgInfoIntrinsic>(LowestIP) &&
+         "Insertion point must be a normal instruction");
+
+  // Then, climb up the immediate dominator tree as far as we can go while
+  // still being dominated by the input positions.
+  BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
+
+  // Don't insert instructions before PHI nodes.
+  while (isa<PHINode>(IP)) ++IP;
+
+  // Ignore landingpad instructions.
+  while (!isa<TerminatorInst>(IP) && IP->isEHPad()) ++IP;
+
+  // Ignore debug intrinsics.
+  while (isa<DbgInfoIntrinsic>(IP)) ++IP;
+
+  // Set IP below instructions recently inserted by SCEVExpander. This keeps the
+  // IP consistent across expansions and allows the previously inserted
+  // instructions to be reused by subsequent expansion.
+  while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
+    ++IP;
+
+  return IP;
+}
+
+/// Emit instructions for the leading candidate expression for this LSRUse (this
+/// is called "expanding").
+Value *LSRInstance::Expand(const LSRFixup &LF,
+                           const Formula &F,
+                           BasicBlock::iterator IP,
+                           SCEVExpander &Rewriter,
+                           SmallVectorImpl<WeakVH> &DeadInsts) const {
+  const LSRUse &LU = Uses[LF.LUIdx];
+  if (LU.RigidFormula)
+    return LF.OperandValToReplace;
+
+  // Determine an input position which will be dominated by the operands and
+  // which will dominate the result.
+  IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
+
+  // Inform the Rewriter if we have a post-increment use, so that it can
+  // perform an advantageous expansion.
+  Rewriter.setPostInc(LF.PostIncLoops);
+
+  // This is the type that the user actually needs.
+  Type *OpTy = LF.OperandValToReplace->getType();
+  // This will be the type that we'll initially expand to.
+  Type *Ty = F.getType();
+  if (!Ty)
+    // No type known; just expand directly to the ultimate type.
+    Ty = OpTy;
+  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
+    // Expand directly to the ultimate type if it's the right size.
+    Ty = OpTy;
+  // This is the type to do integer arithmetic in.
+  Type *IntTy = SE.getEffectiveSCEVType(Ty);
+
+  // Build up a list of operands to add together to form the full base.
+  SmallVector<const SCEV *, 8> Ops;
+
+  // Expand the BaseRegs portion.
+  for (const SCEV *Reg : F.BaseRegs) {
+    assert(!Reg->isZero() && "Zero allocated in a base register!");
+
+    // If we're expanding for a post-inc user, make the post-inc adjustment.
+    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
+    Reg = TransformForPostIncUse(Denormalize, Reg,
+                                 LF.UserInst, LF.OperandValToReplace,
+                                 Loops, SE, DT);
+
+    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP)));
+  }
+
+  // Expand the ScaledReg portion.
+  Value *ICmpScaledV = nullptr;
+  if (F.Scale != 0) {
+    const SCEV *ScaledS = F.ScaledReg;
+
+    // If we're expanding for a post-inc user, make the post-inc adjustment.
+    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
+    ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
+                                     LF.UserInst, LF.OperandValToReplace,
+                                     Loops, SE, DT);
+
+    if (LU.Kind == LSRUse::ICmpZero) {
+      // Expand ScaleReg as if it was part of the base regs.
+      if (F.Scale == 1)
+        Ops.push_back(
+            SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)));
+      else {
+        // An interesting way of "folding" with an icmp is to use a negated
+        // scale, which we'll implement by inserting it into the other operand
+        // of the icmp.
+        assert(F.Scale == -1 &&
+               "The only scale supported by ICmpZero uses is -1!");
+        ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP);
+      }
+    } else {
+      // Otherwise just expand the scaled register and an explicit scale,
+      // which is expected to be matched as part of the address.
+
+      // Flush the operand list to suppress SCEVExpander hoisting address modes.
+      // Unless the addressing mode will not be folded.
+      if (!Ops.empty() && LU.Kind == LSRUse::Address &&
+          isAMCompletelyFolded(TTI, LU, F)) {
+        Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
+        Ops.clear();
+        Ops.push_back(SE.getUnknown(FullV));
+      }
+      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP));
+      if (F.Scale != 1)
+        ScaledS =
+            SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
+      Ops.push_back(ScaledS);
+    }
+  }
+
+  // Expand the GV portion.
+  if (F.BaseGV) {
+    // Flush the operand list to suppress SCEVExpander hoisting.
+    if (!Ops.empty()) {
+      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
+      Ops.clear();
+      Ops.push_back(SE.getUnknown(FullV));
+    }
+    Ops.push_back(SE.getUnknown(F.BaseGV));
+  }
+
+  // Flush the operand list to suppress SCEVExpander hoisting of both folded and
+  // unfolded offsets. LSR assumes they both live next to their uses.
+  if (!Ops.empty()) {
+    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
+    Ops.clear();
+    Ops.push_back(SE.getUnknown(FullV));
+  }
+
+  // Expand the immediate portion.
+  int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
+  if (Offset != 0) {
+    if (LU.Kind == LSRUse::ICmpZero) {
+      // The other interesting way of "folding" with an ICmpZero is to use a
+      // negated immediate.
+      if (!ICmpScaledV)
+        ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
+      else {
+        Ops.push_back(SE.getUnknown(ICmpScaledV));
+        ICmpScaledV = ConstantInt::get(IntTy, Offset);
+      }
+    } else {
+      // Just add the immediate values. These again are expected to be matched
+      // as part of the address.
+      Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
+    }
+  }
+
+  // Expand the unfolded offset portion.
+  int64_t UnfoldedOffset = F.UnfoldedOffset;
+  if (UnfoldedOffset != 0) {
+    // Just add the immediate values.
+    Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+                                                       UnfoldedOffset)));
+  }
+
+  // Emit instructions summing all the operands.
+  const SCEV *FullS = Ops.empty() ?
+                      SE.getConstant(IntTy, 0) :
+                      SE.getAddExpr(Ops);
+  Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP);
+
+  // We're done expanding now, so reset the rewriter.
+  Rewriter.clearPostInc();
+
+  // An ICmpZero Formula represents an ICmp which we're handling as a
+  // comparison against zero. Now that we've expanded an expression for that
+  // form, update the ICmp's other operand.
+  if (LU.Kind == LSRUse::ICmpZero) {
+    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
+    DeadInsts.emplace_back(CI->getOperand(1));
+    assert(!F.BaseGV && "ICmp does not support folding a global value and "
+                           "a scale at the same time!");
+    if (F.Scale == -1) {
+      if (ICmpScaledV->getType() != OpTy) {
+        Instruction *Cast =
+          CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
+                                                   OpTy, false),
+                           ICmpScaledV, OpTy, "tmp", CI);
+        ICmpScaledV = Cast;
+      }
+      CI->setOperand(1, ICmpScaledV);
+    } else {
+      // A scale of 1 means that the scale has been expanded as part of the
+      // base regs.
+      assert((F.Scale == 0 || F.Scale == 1) &&
+             "ICmp does not support folding a global value and "
+             "a scale at the same time!");
+      Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
+                                           -(uint64_t)Offset);
+      if (C->getType() != OpTy)
+        C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+                                                          OpTy, false),
+                                  C, OpTy);
+
+      CI->setOperand(1, C);
+    }
+  }
+
+  return FullV;
+}
+
+/// Helper for Rewrite. PHI nodes are special because the use of their operands
+/// effectively happens in their predecessor blocks, so the expression may need
+/// to be expanded in multiple places.
+void LSRInstance::RewriteForPHI(PHINode *PN,
+                                const LSRFixup &LF,
+                                const Formula &F,
+                                SCEVExpander &Rewriter,
+                                SmallVectorImpl<WeakVH> &DeadInsts) const {
+  DenseMap<BasicBlock *, Value *> Inserted;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+      BasicBlock *BB = PN->getIncomingBlock(i);
+
+      // If this is a critical edge, split the edge so that we do not insert
+      // the code on all predecessor/successor paths.  We do this unless this
+      // is the canonical backedge for this loop, which complicates post-inc
+      // users.
+      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
+          !isa<IndirectBrInst>(BB->getTerminator())) {
+        BasicBlock *Parent = PN->getParent();
+        Loop *PNLoop = LI.getLoopFor(Parent);
+        if (!PNLoop || Parent != PNLoop->getHeader()) {
+          // Split the critical edge.
+          BasicBlock *NewBB = nullptr;
+          if (!Parent->isLandingPad()) {
+            NewBB = SplitCriticalEdge(BB, Parent,
+                                      CriticalEdgeSplittingOptions(&DT, &LI)
+                                          .setMergeIdenticalEdges()
+                                          .setDontDeleteUselessPHIs());
+          } else {
+            SmallVector<BasicBlock*, 2> NewBBs;
+            SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
+            NewBB = NewBBs[0];
+          }
+          // If NewBB==NULL, then SplitCriticalEdge refused to split because all
+          // phi predecessors are identical. The simple thing to do is skip
+          // splitting in this case rather than complicate the API.
+          if (NewBB) {
+            // If PN is outside of the loop and BB is in the loop, we want to
+            // move the block to be immediately before the PHI block, not
+            // immediately after BB.
+            if (L->contains(BB) && !L->contains(PN))
+              NewBB->moveBefore(PN->getParent());
+
+            // Splitting the edge can reduce the number of PHI entries we have.
+            e = PN->getNumIncomingValues();
+            BB = NewBB;
+            i = PN->getBasicBlockIndex(BB);
+          }
+        }
+      }
+
+      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
+        Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
+      if (!Pair.second)
+        PN->setIncomingValue(i, Pair.first->second);
+      else {
+        Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(),
+                              Rewriter, DeadInsts);
+
+        // If this is reuse-by-noop-cast, insert the noop cast.
+        Type *OpTy = LF.OperandValToReplace->getType();
+        if (FullV->getType() != OpTy)
+          FullV =
+            CastInst::Create(CastInst::getCastOpcode(FullV, false,
+                                                     OpTy, false),
+                             FullV, LF.OperandValToReplace->getType(),
+                             "tmp", BB->getTerminator());
+
+        PN->setIncomingValue(i, FullV);
+        Pair.first->second = FullV;
+      }
+    }
+}
+
+/// Emit instructions for the leading candidate expression for this LSRUse (this
+/// is called "expanding"), and update the UserInst to reference the newly
+/// expanded value.
+void LSRInstance::Rewrite(const LSRFixup &LF,
+                          const Formula &F,
+                          SCEVExpander &Rewriter,
+                          SmallVectorImpl<WeakVH> &DeadInsts) const {
+  // First, find an insertion point that dominates UserInst. For PHI nodes,
+  // find the nearest block which dominates all the relevant uses.
+  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
+    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts);
+  } else {
+    Value *FullV =
+        Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
+
+    // If this is reuse-by-noop-cast, insert the noop cast.
+    Type *OpTy = LF.OperandValToReplace->getType();
+    if (FullV->getType() != OpTy) {
+      Instruction *Cast =
+        CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
+                         FullV, OpTy, "tmp", LF.UserInst);
+      FullV = Cast;
+    }
+
+    // Update the user. ICmpZero is handled specially here (for now) because
+    // Expand may have updated one of the operands of the icmp already, and
+    // its new value may happen to be equal to LF.OperandValToReplace, in
+    // which case doing replaceUsesOfWith leads to replacing both operands
+    // with the same value. TODO: Reorganize this.
+    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
+      LF.UserInst->setOperand(0, FullV);
+    else
+      LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
+  }
+
+  DeadInsts.emplace_back(LF.OperandValToReplace);
+}
+
+/// Rewrite all the fixup locations with new values, following the chosen
+/// solution.
+void LSRInstance::ImplementSolution(
+    const SmallVectorImpl<const Formula *> &Solution) {
+  // Keep track of instructions we may have made dead, so that
+  // we can remove them after we are done working.
+  SmallVector<WeakVH, 16> DeadInsts;
+
+  SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
+                        "lsr");
+#ifndef NDEBUG
+  Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+  Rewriter.disableCanonicalMode();
+  Rewriter.enableLSRMode();
+  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
+
+  // Mark phi nodes that terminate chains so the expander tries to reuse them.
+  for (const IVChain &Chain : IVChainVec) {
+    if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
+      Rewriter.setChainedPhi(PN);
+  }
+
+  // Expand the new value definitions and update the users.
+  for (const LSRFixup &Fixup : Fixups) {
+    Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts);
+
+    Changed = true;
+  }
+
+  for (const IVChain &Chain : IVChainVec) {
+    GenerateIVChain(Chain, Rewriter, DeadInsts);
+    Changed = true;
+  }
+  // Clean up after ourselves. This must be done before deleting any
+  // instructions.
+  Rewriter.clear();
+
+  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
+}
+
+LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
+                         DominatorTree &DT, LoopInfo &LI,
+                         const TargetTransformInfo &TTI)
+    : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false),
+      IVIncInsertPos(nullptr) {
+  // If LoopSimplify form is not available, stay out of trouble.
+  if (!L->isLoopSimplifyForm())
+    return;
+
+  // If there's no interesting work to be done, bail early.
+  if (IU.empty()) return;
+
+  // If there's too much analysis to be done, bail early. We won't be able to
+  // model the problem anyway.
+  unsigned NumUsers = 0;
+  for (const IVStrideUse &U : IU) {
+    if (++NumUsers > MaxIVUsers) {
+      (void)U;
+      DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n");
+      return;
+    }
+  }
+
+#ifndef NDEBUG
+  // All dominating loops must have preheaders, or SCEVExpander may not be able
+  // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
+  //
+  // IVUsers analysis should only create users that are dominated by simple loop
+  // headers. Since this loop should dominate all of its users, its user list
+  // should be empty if this loop itself is not within a simple loop nest.
+  for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
+       Rung; Rung = Rung->getIDom()) {
+    BasicBlock *BB = Rung->getBlock();
+    const Loop *DomLoop = LI.getLoopFor(BB);
+    if (DomLoop && DomLoop->getHeader() == BB) {
+      assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
+    }
+  }
+#endif // DEBUG
+
+  DEBUG(dbgs() << "\nLSR on loop ";
+        L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
+        dbgs() << ":\n");
+
+  // First, perform some low-level loop optimizations.
+  OptimizeShadowIV();
+  OptimizeLoopTermCond();
+
+  // If loop preparation eliminates all interesting IV users, bail.
+  if (IU.empty()) return;
+
+  // Skip nested loops until we can model them better with formulae.
+  if (!L->empty()) {
+    DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+    return;
+  }
+
+  // Start collecting data and preparing for the solver.
+  CollectChains();
+  CollectInterestingTypesAndFactors();
+  CollectFixupsAndInitialFormulae();
+  CollectLoopInvariantFixupsAndFormulae();
+
+  assert(!Uses.empty() && "IVUsers reported at least one use");
+  DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
+        print_uses(dbgs()));
+
+  // Now use the reuse data to generate a bunch of interesting ways
+  // to formulate the values needed for the uses.
+  GenerateAllReuseFormulae();
+
+  FilterOutUndesirableDedicatedRegisters();
+  NarrowSearchSpaceUsingHeuristics();
+
+  SmallVector<const Formula *, 8> Solution;
+  Solve(Solution);
+
+  // Release memory that is no longer needed.
+  Factors.clear();
+  Types.clear();
+  RegUses.clear();
+
+  if (Solution.empty())
+    return;
+
+#ifndef NDEBUG
+  // Formulae should be legal.
+  for (const LSRUse &LU : Uses) {
+    for (const Formula &F : LU.Formulae)
+      assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+                        F) && "Illegal formula generated!");
+  };
+#endif
+
+  // Now that we've decided what we want, make it so.
+  ImplementSolution(Solution);
+}
+
+void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
+  if (Factors.empty() && Types.empty()) return;
+
+  OS << "LSR has identified the following interesting factors and types: ";
+  bool First = true;
+
+  for (int64_t Factor : Factors) {
+    if (!First) OS << ", ";
+    First = false;
+    OS << '*' << Factor;
+  }
+
+  for (Type *Ty : Types) {
+    if (!First) OS << ", ";
+    First = false;
+    OS << '(' << *Ty << ')';
+  }
+  OS << '\n';
+}
+
+void LSRInstance::print_fixups(raw_ostream &OS) const {
+  OS << "LSR is examining the following fixup sites:\n";
+  for (const LSRFixup &LF : Fixups) {
+    dbgs() << "  ";
+    LF.print(OS);
+    OS << '\n';
+  }
+}
+
+void LSRInstance::print_uses(raw_ostream &OS) const {
+  OS << "LSR is examining the following uses:\n";
+  for (const LSRUse &LU : Uses) {
+    dbgs() << "  ";
+    LU.print(OS);
+    OS << '\n';
+    for (const Formula &F : LU.Formulae) {
+      OS << "    ";
+      F.print(OS);
+      OS << '\n';
+    }
+  }
+}
+
+void LSRInstance::print(raw_ostream &OS) const {
+  print_factors_and_types(OS);
+  print_fixups(OS);
+  print_uses(OS);
+}
+
+LLVM_DUMP_METHOD
+void LSRInstance::dump() const {
+  print(errs()); errs() << '\n';
+}
+
+namespace {
+
+class LoopStrengthReduce : public LoopPass {
+public:
+  static char ID; // Pass ID, replacement for typeid
+  LoopStrengthReduce();
+
+private:
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+}
+
+char LoopStrengthReduce::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
+                "Loop Strength Reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(IVUsers)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
+                "Loop Strength Reduction", false, false)
+
+
+Pass *llvm::createLoopStrengthReducePass() {
+  return new LoopStrengthReduce();
+}
+
+LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
+  initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
+}
+
+void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
+  // We split critical edges, so we change the CFG.  However, we do update
+  // many analyses if they are around.
+  AU.addPreservedID(LoopSimplifyID);
+
+  AU.addRequired<LoopInfoWrapperPass>();
+  AU.addPreserved<LoopInfoWrapperPass>();
+  AU.addRequiredID(LoopSimplifyID);
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addPreserved<DominatorTreeWrapperPass>();
+  AU.addRequired<ScalarEvolutionWrapperPass>();
+  AU.addPreserved<ScalarEvolutionWrapperPass>();
+  // Requiring LoopSimplify a second time here prevents IVUsers from running
+  // twice, since LoopSimplify was invalidated by running ScalarEvolution.
+  AU.addRequiredID(LoopSimplifyID);
+  AU.addRequired<IVUsers>();
+  AU.addPreserved<IVUsers>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
+}
+
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  auto &IU = getAnalysis<IVUsers>();
+  auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+      *L->getHeader()->getParent());
+  bool Changed = false;
+
+  // Run the main LSR transformation.
+  Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();
+
+  // Remove any extra phis created by processing inner loops.
+  Changed |= DeleteDeadPHIs(L->getHeader());
+  if (EnablePhiElim && L->isLoopSimplifyForm()) {
+    SmallVector<WeakVH, 16> DeadInsts;
+    const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+    SCEVExpander Rewriter(getAnalysis<ScalarEvolutionWrapperPass>().getSE(), DL,
+                          "lsr");
+#ifndef NDEBUG
+    Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+    unsigned numFolded = Rewriter.replaceCongruentIVs(
+        L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts,
+        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+            *L->getHeader()->getParent()));
+    if (numFolded) {
+      Changed = true;
+      DeleteTriviallyDeadInstructions(DeadInsts);
+      DeleteDeadPHIs(L->getHeader());
+    }
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
new file mode 100644
index 0000000..ecef6db
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -0,0 +1,1030 @@
+//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop unroller.  It works best when loops have
+// been canonicalized by the -indvars pass, allowing it to determine the trip
+// counts of loops easily.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <climits>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+static cl::opt<unsigned>
+    UnrollThreshold("unroll-threshold", cl::Hidden,
+                    cl::desc("The baseline cost threshold for loop unrolling"));
+
+static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold(
+    "unroll-percent-dynamic-cost-saved-threshold", cl::Hidden,
+    cl::desc("The percentage of estimated dynamic cost which must be saved by "
+             "unrolling to allow unrolling up to the max threshold."));
+
+static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount(
+    "unroll-dynamic-cost-savings-discount", cl::Hidden,
+    cl::desc("This is the amount discounted from the total unroll cost when "
+             "the unrolled form has a high dynamic cost savings (triggered by "
+             "the '-unroll-perecent-dynamic-cost-saved-threshold' flag)."));
+
+static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
+    "unroll-max-iteration-count-to-analyze", cl::init(0), cl::Hidden,
+    cl::desc("Don't allow loop unrolling to simulate more than this number of"
+             "iterations when checking full unroll profitability"));
+
+static cl::opt<unsigned>
+UnrollCount("unroll-count", cl::Hidden,
+  cl::desc("Use this unroll count for all loops including those with "
+           "unroll_count pragma values, for testing purposes"));
+
+static cl::opt<bool>
+UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
+  cl::desc("Allows loops to be partially unrolled until "
+           "-unroll-threshold loop size is reached."));
+
+static cl::opt<bool>
+UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
+  cl::desc("Unroll loops with run-time trip counts"));
+
+static cl::opt<unsigned>
+PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
+  cl::desc("Unrolled size limit for loops with an unroll(full) or "
+           "unroll_count pragma."));
+
+
+/// A magic value for use with the Threshold parameter to indicate
+/// that the loop unroll should be performed regardless of how much
+/// code expansion would result.
+static const unsigned NoThreshold = UINT_MAX;
+
+/// Default unroll count for loops with run-time trip count if
+/// -unroll-count is not set
+static const unsigned DefaultUnrollRuntimeCount = 8;
+
+/// Gather the various unrolling parameters based on the defaults, compiler
+/// flags, TTI overrides, pragmas, and user specified parameters.
+static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
+    Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
+    Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
+    Optional<bool> UserRuntime, unsigned PragmaCount, bool PragmaFullUnroll,
+    bool PragmaEnableUnroll, unsigned TripCount) {
+  TargetTransformInfo::UnrollingPreferences UP;
+
+  // Set up the defaults
+  UP.Threshold = 150;
+  UP.PercentDynamicCostSavedThreshold = 20;
+  UP.DynamicCostSavingsDiscount = 2000;
+  UP.OptSizeThreshold = 50;
+  UP.PartialThreshold = UP.Threshold;
+  UP.PartialOptSizeThreshold = UP.OptSizeThreshold;
+  UP.Count = 0;
+  UP.MaxCount = UINT_MAX;
+  UP.Partial = false;
+  UP.Runtime = false;
+  UP.AllowExpensiveTripCount = false;
+
+  // Override with any target specific settings
+  TTI.getUnrollingPreferences(L, UP);
+
+  // Apply size attributes
+  if (L->getHeader()->getParent()->optForSize()) {
+    UP.Threshold = UP.OptSizeThreshold;
+    UP.PartialThreshold = UP.PartialOptSizeThreshold;
+  }
+
+  // Apply unroll count pragmas
+  if (PragmaCount)
+    UP.Count = PragmaCount;
+  else if (PragmaFullUnroll)
+    UP.Count = TripCount;
+
+  // Apply any user values specified by cl::opt
+  if (UnrollThreshold.getNumOccurrences() > 0) {
+    UP.Threshold = UnrollThreshold;
+    UP.PartialThreshold = UnrollThreshold;
+  }
+  if (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0)
+    UP.PercentDynamicCostSavedThreshold =
+        UnrollPercentDynamicCostSavedThreshold;
+  if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0)
+    UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount;
+  if (UnrollCount.getNumOccurrences() > 0)
+    UP.Count = UnrollCount;
+  if (UnrollAllowPartial.getNumOccurrences() > 0)
+    UP.Partial = UnrollAllowPartial;
+  if (UnrollRuntime.getNumOccurrences() > 0)
+    UP.Runtime = UnrollRuntime;
+
+  // Apply user values provided by argument
+  if (UserThreshold.hasValue()) {
+    UP.Threshold = *UserThreshold;
+    UP.PartialThreshold = *UserThreshold;
+  }
+  if (UserCount.hasValue())
+    UP.Count = *UserCount;
+  if (UserAllowPartial.hasValue())
+    UP.Partial = *UserAllowPartial;
+  if (UserRuntime.hasValue())
+    UP.Runtime = *UserRuntime;
+
+  if (PragmaCount > 0 ||
+      ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0)) {
+    // If the loop has an unrolling pragma, we want to be more aggressive with
+    // unrolling limits. Set thresholds to at least the PragmaTheshold value
+    // which is larger than the default limits.
+    if (UP.Threshold != NoThreshold)
+      UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
+    if (UP.PartialThreshold != NoThreshold)
+      UP.PartialThreshold =
+          std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
+  }
+
+  return UP;
+}
+
+namespace {
+// This class is used to get an estimate of the optimization effects that we
+// could get from complete loop unrolling. It comes from the fact that some
+// loads might be replaced with concrete constant values and that could trigger
+// a chain of instruction simplifications.
+//
+// E.g. we might have:
+//   int a[] = {0, 1, 0};
+//   v = 0;
+//   for (i = 0; i < 3; i ++)
+//     v += b[i]*a[i];
+// If we completely unroll the loop, we would get:
+//   v = b[0]*a[0] + b[1]*a[1] + b[2]*a[2]
+// Which then will be simplified to:
+//   v = b[0]* 0 + b[1]* 1 + b[2]* 0
+// And finally:
+//   v = b[1]
+class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> {
+  typedef InstVisitor<UnrolledInstAnalyzer, bool> Base;
+  friend class InstVisitor<UnrolledInstAnalyzer, bool>;
+  struct SimplifiedAddress {
+    Value *Base = nullptr;
+    ConstantInt *Offset = nullptr;
+  };
+
+public:
+  UnrolledInstAnalyzer(unsigned Iteration,
+                       DenseMap<Value *, Constant *> &SimplifiedValues,
+                       ScalarEvolution &SE)
+      : SimplifiedValues(SimplifiedValues), SE(SE) {
+      IterationNumber = SE.getConstant(APInt(64, Iteration));
+  }
+
+  // Allow access to the initial visit method.
+  using Base::visit;
+
+private:
+  /// \brief A cache of pointer bases and constant-folded offsets corresponding
+  /// to GEP (or derived from GEP) instructions.
+  ///
+  /// In order to find the base pointer one needs to perform non-trivial
+  /// traversal of the corresponding SCEV expression, so it's good to have the
+  /// results saved.
+  DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses;
+
+  /// \brief SCEV expression corresponding to number of currently simulated
+  /// iteration.
+  const SCEV *IterationNumber;
+
+  /// \brief A Value->Constant map for keeping values that we managed to
+  /// constant-fold on the given iteration.
+  ///
+  /// While we walk the loop instructions, we build up and maintain a mapping
+  /// of simplified values specific to this iteration.  The idea is to propagate
+  /// any special information we have about loads that can be replaced with
+  /// constants after complete unrolling, and account for likely simplifications
+  /// post-unrolling.
+  DenseMap<Value *, Constant *> &SimplifiedValues;
+
+  ScalarEvolution &SE;
+
+  /// \brief Try to simplify instruction \param I using its SCEV expression.
+  ///
+  /// The idea is that some AddRec expressions become constants, which then
+  /// could trigger folding of other instructions. However, that only happens
+  /// for expressions whose start value is also constant, which isn't always the
+  /// case. In another common and important case the start value is just some
+  /// address (i.e. SCEVUnknown) - in this case we compute the offset and save
+  /// it along with the base address instead.
+  bool simplifyInstWithSCEV(Instruction *I) {
+    if (!SE.isSCEVable(I->getType()))
+      return false;
+
+    const SCEV *S = SE.getSCEV(I);
+    if (auto *SC = dyn_cast<SCEVConstant>(S)) {
+      SimplifiedValues[I] = SC->getValue();
+      return true;
+    }
+
+    auto *AR = dyn_cast<SCEVAddRecExpr>(S);
+    if (!AR)
+      return false;
+
+    const SCEV *ValueAtIteration = AR->evaluateAtIteration(IterationNumber, SE);
+    // Check if the AddRec expression becomes a constant.
+    if (auto *SC = dyn_cast<SCEVConstant>(ValueAtIteration)) {
+      SimplifiedValues[I] = SC->getValue();
+      return true;
+    }
+
+    // Check if the offset from the base address becomes a constant.
+    auto *Base = dyn_cast<SCEVUnknown>(SE.getPointerBase(S));
+    if (!Base)
+      return false;
+    auto *Offset =
+        dyn_cast<SCEVConstant>(SE.getMinusSCEV(ValueAtIteration, Base));
+    if (!Offset)
+      return false;
+    SimplifiedAddress Address;
+    Address.Base = Base->getValue();
+    Address.Offset = Offset->getValue();
+    SimplifiedAddresses[I] = Address;
+    return true;
+  }
+
+  /// Base case for the instruction visitor.
+  bool visitInstruction(Instruction &I) {
+    return simplifyInstWithSCEV(&I);
+  }
+
+  /// Try to simplify binary operator I.
+  ///
+  /// TODO: Probably it's worth to hoist the code for estimating the
+  /// simplifications effects to a separate class, since we have a very similar
+  /// code in InlineCost already.
+  bool visitBinaryOperator(BinaryOperator &I) {
+    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+    if (!isa<Constant>(LHS))
+      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+        LHS = SimpleLHS;
+    if (!isa<Constant>(RHS))
+      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+        RHS = SimpleRHS;
+
+    Value *SimpleV = nullptr;
+    const DataLayout &DL = I.getModule()->getDataLayout();
+    if (auto FI = dyn_cast<FPMathOperator>(&I))
+      SimpleV =
+          SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
+    else
+      SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+
+    if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
+      SimplifiedValues[&I] = C;
+
+    if (SimpleV)
+      return true;
+    return Base::visitBinaryOperator(I);
+  }
+
+  /// Try to fold load I.
+  bool visitLoad(LoadInst &I) {
+    Value *AddrOp = I.getPointerOperand();
+
+    auto AddressIt = SimplifiedAddresses.find(AddrOp);
+    if (AddressIt == SimplifiedAddresses.end())
+      return false;
+    ConstantInt *SimplifiedAddrOp = AddressIt->second.Offset;
+
+    auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base);
+    // We're only interested in loads that can be completely folded to a
+    // constant.
+    if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant())
+      return false;
+
+    ConstantDataSequential *CDS =
+        dyn_cast<ConstantDataSequential>(GV->getInitializer());
+    if (!CDS)
+      return false;
+
+    // We might have a vector load from an array. FIXME: for now we just bail
+    // out in this case, but we should be able to resolve and simplify such
+    // loads.
+    if(!CDS->isElementTypeCompatible(I.getType()))
+      return false;
+
+    int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U;
+    assert(SimplifiedAddrOp->getValue().getActiveBits() < 64 &&
+           "Unexpectedly large index value.");
+    int64_t Index = SimplifiedAddrOp->getSExtValue() / ElemSize;
+    if (Index >= CDS->getNumElements()) {
+      // FIXME: For now we conservatively ignore out of bound accesses, but
+      // we're allowed to perform the optimization in this case.
+      return false;
+    }
+
+    Constant *CV = CDS->getElementAsConstant(Index);
+    assert(CV && "Constant expected.");
+    SimplifiedValues[&I] = CV;
+
+    return true;
+  }
+
+  bool visitCastInst(CastInst &I) {
+    // Propagate constants through casts.
+    Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+    if (!COp)
+      COp = SimplifiedValues.lookup(I.getOperand(0));
+    if (COp)
+      if (Constant *C =
+              ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
+        SimplifiedValues[&I] = C;
+        return true;
+      }
+
+    return Base::visitCastInst(I);
+  }
+
+  bool visitCmpInst(CmpInst &I) {
+    Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+    // First try to handle simplified comparisons.
+    if (!isa<Constant>(LHS))
+      if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+        LHS = SimpleLHS;
+    if (!isa<Constant>(RHS))
+      if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+        RHS = SimpleRHS;
+
+    if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) {
+      auto SimplifiedLHS = SimplifiedAddresses.find(LHS);
+      if (SimplifiedLHS != SimplifiedAddresses.end()) {
+        auto SimplifiedRHS = SimplifiedAddresses.find(RHS);
+        if (SimplifiedRHS != SimplifiedAddresses.end()) {
+          SimplifiedAddress &LHSAddr = SimplifiedLHS->second;
+          SimplifiedAddress &RHSAddr = SimplifiedRHS->second;
+          if (LHSAddr.Base == RHSAddr.Base) {
+            LHS = LHSAddr.Offset;
+            RHS = RHSAddr.Offset;
+          }
+        }
+      }
+    }
+
+    if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
+      if (Constant *CRHS = dyn_cast<Constant>(RHS)) {
+        if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
+          SimplifiedValues[&I] = C;
+          return true;
+        }
+      }
+    }
+
+    return Base::visitCmpInst(I);
+  }
+};
+} // namespace
+
+
+namespace {
+struct EstimatedUnrollCost {
+  /// \brief The estimated cost after unrolling.
+  int UnrolledCost;
+
+  /// \brief The estimated dynamic cost of executing the instructions in the
+  /// rolled form.
+  int RolledDynamicCost;
+};
+}
+
+/// \brief Figure out if the loop is worth full unrolling.
+///
+/// Complete loop unrolling can make some loads constant, and we need to know
+/// if that would expose any further optimization opportunities.  This routine
+/// estimates this optimization.  It computes cost of unrolled loop
+/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
+/// dynamic cost we mean that we won't count costs of blocks that are known not
+/// to be executed (i.e. if we have a branch in the loop and we know that at the
+/// given iteration its condition would be resolved to true, we won't add up the
+/// cost of the 'false'-block).
+/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
+/// the analysis failed (no benefits expected from the unrolling, or the loop is
+/// too big to analyze), the returned value is None.
+static Optional<EstimatedUnrollCost>
+analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
+                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
+                      int MaxUnrolledLoopSize) {
+  // We want to be able to scale offsets by the trip count and add more offsets
+  // to them without checking for overflows, and we already don't want to
+  // analyze *massive* trip counts, so we force the max to be reasonably small.
+  assert(UnrollMaxIterationsCountToAnalyze < (INT_MAX / 2) &&
+         "The unroll iterations max is too large!");
+
+  // Don't simulate loops with a big or unknown tripcount
+  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
+      TripCount > UnrollMaxIterationsCountToAnalyze)
+    return None;
+
+  SmallSetVector<BasicBlock *, 16> BBWorklist;
+  DenseMap<Value *, Constant *> SimplifiedValues;
+  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
+
+  // The estimated cost of the unrolled form of the loop. We try to estimate
+  // this by simplifying as much as we can while computing the estimate.
+  int UnrolledCost = 0;
+  // We also track the estimated dynamic (that is, actually executed) cost in
+  // the rolled form. This helps identify cases when the savings from unrolling
+  // aren't just exposing dead control flows, but actual reduced dynamic
+  // instructions due to the simplifications which we expect to occur after
+  // unrolling.
+  int RolledDynamicCost = 0;
+
+  // Ensure that we don't violate the loop structure invariants relied on by
+  // this analysis.
+  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
+  assert(L->isLCSSAForm(DT) &&
+         "Must have loops in LCSSA form to track live-out values.");
+
+  DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
+
+  // Simulate execution of each iteration of the loop counting instructions,
+  // which would be simplified.
+  // Since the same load will take different values on different iterations,
+  // we literally have to go through all loop's iterations.
+  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
+    DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
+
+    // Prepare for the iteration by collecting any simplified entry or backedge
+    // inputs.
+    for (Instruction &I : *L->getHeader()) {
+      auto *PHI = dyn_cast<PHINode>(&I);
+      if (!PHI)
+        break;
+
+      // The loop header PHI nodes must have exactly two input: one from the
+      // loop preheader and one from the loop latch.
+      assert(
+          PHI->getNumIncomingValues() == 2 &&
+          "Must have an incoming value only for the preheader and the latch.");
+
+      Value *V = PHI->getIncomingValueForBlock(
+          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
+      Constant *C = dyn_cast<Constant>(V);
+      if (Iteration != 0 && !C)
+        C = SimplifiedValues.lookup(V);
+      if (C)
+        SimplifiedInputValues.push_back({PHI, C});
+    }
+
+    // Now clear and re-populate the map for the next iteration.
+    SimplifiedValues.clear();
+    while (!SimplifiedInputValues.empty())
+      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
+
+    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE);
+
+    BBWorklist.clear();
+    BBWorklist.insert(L->getHeader());
+    // Note that we *must not* cache the size, this loop grows the worklist.
+    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
+      BasicBlock *BB = BBWorklist[Idx];
+
+      // Visit all instructions in the given basic block and try to simplify
+      // it.  We don't change the actual IR, just count optimization
+      // opportunities.
+      for (Instruction &I : *BB) {
+        int InstCost = TTI.getUserCost(&I);
+
+        // Visit the instruction to analyze its loop cost after unrolling,
+        // and if the visitor returns false, include this instruction in the
+        // unrolled cost.
+        if (!Analyzer.visit(I))
+          UnrolledCost += InstCost;
+        else {
+          DEBUG(dbgs() << "  " << I
+                       << " would be simplified if loop is unrolled.\n");
+          (void)0;
+        }
+
+        // Also track this instructions expected cost when executing the rolled
+        // loop form.
+        RolledDynamicCost += InstCost;
+
+        // If unrolled body turns out to be too big, bail out.
+        if (UnrolledCost > MaxUnrolledLoopSize) {
+          DEBUG(dbgs() << "  Exceeded threshold.. exiting.\n"
+                       << "  UnrolledCost: " << UnrolledCost
+                       << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
+                       << "\n");
+          return None;
+        }
+      }
+
+      TerminatorInst *TI = BB->getTerminator();
+
+      // Add in the live successors by first checking whether we have terminator
+      // that may be simplified based on the values simplified by this call.
+      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+        if (BI->isConditional()) {
+          if (Constant *SimpleCond =
+                  SimplifiedValues.lookup(BI->getCondition())) {
+            BasicBlock *Succ = nullptr;
+            // Just take the first successor if condition is undef
+            if (isa<UndefValue>(SimpleCond))
+              Succ = BI->getSuccessor(0);
+            else
+              Succ = BI->getSuccessor(
+                  cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0);
+            if (L->contains(Succ))
+              BBWorklist.insert(Succ);
+            continue;
+          }
+        }
+      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+        if (Constant *SimpleCond =
+                SimplifiedValues.lookup(SI->getCondition())) {
+          BasicBlock *Succ = nullptr;
+          // Just take the first successor if condition is undef
+          if (isa<UndefValue>(SimpleCond))
+            Succ = SI->getSuccessor(0);
+          else
+            Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond))
+                       .getCaseSuccessor();
+          if (L->contains(Succ))
+            BBWorklist.insert(Succ);
+          continue;
+        }
+      }
+
+      // Add BB's successors to the worklist.
+      for (BasicBlock *Succ : successors(BB))
+        if (L->contains(Succ))
+          BBWorklist.insert(Succ);
+    }
+
+    // If we found no optimization opportunities on the first iteration, we
+    // won't find them on later ones too.
+    if (UnrolledCost == RolledDynamicCost) {
+      DEBUG(dbgs() << "  No opportunities found.. exiting.\n"
+                   << "  UnrolledCost: " << UnrolledCost << "\n");
+      return None;
+    }
+  }
+  DEBUG(dbgs() << "Analysis finished:\n"
+               << "UnrolledCost: " << UnrolledCost << ", "
+               << "RolledDynamicCost: " << RolledDynamicCost << "\n");
+  return {{UnrolledCost, RolledDynamicCost}};
+}
+
+/// ApproximateLoopSize - Approximate the size of the loop.
+static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
+                                    bool &NotDuplicatable,
+                                    const TargetTransformInfo &TTI,
+                                    AssumptionCache *AC) {
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+  CodeMetrics Metrics;
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I)
+    Metrics.analyzeBasicBlock(*I, TTI, EphValues);
+  NumCalls = Metrics.NumInlineCandidates;
+  NotDuplicatable = Metrics.notDuplicatable;
+
+  unsigned LoopSize = Metrics.NumInsts;
+
+  // Don't allow an estimate of size zero.  This would allows unrolling of loops
+  // with huge iteration counts, which is a compile time problem even if it's
+  // not a problem for code quality. Also, the code using this size may assume
+  // that each loop has at least three instructions (likely a conditional
+  // branch, a comparison feeding that branch, and some kind of loop increment
+  // feeding that comparison instruction).
+  LoopSize = std::max(LoopSize, 3u);
+
+  return LoopSize;
+}
+
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count").  If no such metadata node exists, then nullptr is
+// returned.
+static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+  if (MDNode *LoopID = L->getLoopID())
+    return GetUnrollMetadata(LoopID, Name);
+  return nullptr;
+}
+
+// Returns true if the loop has an unroll(full) pragma.
+static bool HasUnrollFullPragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
+}
+
+// Returns true if the loop has an unroll(enable) pragma. This metadata is used
+// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
+static bool HasUnrollEnablePragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
+}
+
+// Returns true if the loop has an unroll(disable) pragma.
+static bool HasUnrollDisablePragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
+}
+
+// Returns true if the loop has an runtime unroll(disable) pragma.
+static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
+  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
+}
+
+// If loop has an unroll_count pragma return the (necessarily
+// positive) value from the pragma.  Otherwise return 0.
+static unsigned UnrollCountPragmaValue(const Loop *L) {
+  MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
+  if (MD) {
+    assert(MD->getNumOperands() == 2 &&
+           "Unroll count hint metadata should have two operands.");
+    unsigned Count =
+        mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+    assert(Count >= 1 && "Unroll count must be positive.");
+    return Count;
+  }
+  return 0;
+}
+
+// Remove existing unroll metadata and add unroll disable metadata to
+// indicate the loop has already been unrolled.  This prevents a loop
+// from being unrolled more than is directed by a pragma if the loop
+// unrolling pass is run more than once (which it generally is).
+static void SetLoopAlreadyUnrolled(Loop *L) {
+  MDNode *LoopID = L->getLoopID();
+  if (!LoopID) return;
+
+  // First remove any existing loop unrolling metadata.
+  SmallVector<Metadata *, 4> MDs;
+  // Reserve first location for self reference to the LoopID metadata node.
+  MDs.push_back(nullptr);
+  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+    bool IsUnrollMetadata = false;
+    MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+    if (MD) {
+      const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+      IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
+    }
+    if (!IsUnrollMetadata)
+      MDs.push_back(LoopID->getOperand(i));
+  }
+
+  // Add unroll(disable) metadata to disable future unrolling.
+  LLVMContext &Context = L->getHeader()->getContext();
+  SmallVector<Metadata *, 1> DisableOperands;
+  DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
+  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+  MDs.push_back(DisableNode);
+
+  MDNode *NewLoopID = MDNode::get(Context, MDs);
+  // Set operand 0 to refer to the loop id itself.
+  NewLoopID->replaceOperandWith(0, NewLoopID);
+  L->setLoopID(NewLoopID);
+}
+
+static bool canUnrollCompletely(Loop *L, unsigned Threshold,
+                                unsigned PercentDynamicCostSavedThreshold,
+                                unsigned DynamicCostSavingsDiscount,
+                                uint64_t UnrolledCost,
+                                uint64_t RolledDynamicCost) {
+  if (Threshold == NoThreshold) {
+    DEBUG(dbgs() << "  Can fully unroll, because no threshold is set.\n");
+    return true;
+  }
+
+  if (UnrolledCost <= Threshold) {
+    DEBUG(dbgs() << "  Can fully unroll, because unrolled cost: "
+                 << UnrolledCost << "<" << Threshold << "\n");
+    return true;
+  }
+
+  assert(UnrolledCost && "UnrolledCost can't be 0 at this point.");
+  assert(RolledDynamicCost >= UnrolledCost &&
+         "Cannot have a higher unrolled cost than a rolled cost!");
+
+  // Compute the percentage of the dynamic cost in the rolled form that is
+  // saved when unrolled. If unrolling dramatically reduces the estimated
+  // dynamic cost of the loop, we use a higher threshold to allow more
+  // unrolling.
+  unsigned PercentDynamicCostSaved =
+      (uint64_t)(RolledDynamicCost - UnrolledCost) * 100ull / RolledDynamicCost;
+
+  if (PercentDynamicCostSaved >= PercentDynamicCostSavedThreshold &&
+      (int64_t)UnrolledCost - (int64_t)DynamicCostSavingsDiscount <=
+          (int64_t)Threshold) {
+    DEBUG(dbgs() << "  Can fully unroll, because unrolling will reduce the "
+                    "expected dynamic cost by " << PercentDynamicCostSaved
+                 << "% (threshold: " << PercentDynamicCostSavedThreshold
+                 << "%)\n"
+                 << "  and the unrolled cost (" << UnrolledCost
+                 << ") is less than the max threshold ("
+                 << DynamicCostSavingsDiscount << ").\n");
+    return true;
+  }
+
+  DEBUG(dbgs() << "  Too large to fully unroll:\n");
+  DEBUG(dbgs() << "    Threshold: " << Threshold << "\n");
+  DEBUG(dbgs() << "    Max threshold: " << DynamicCostSavingsDiscount << "\n");
+  DEBUG(dbgs() << "    Percent cost saved threshold: "
+               << PercentDynamicCostSavedThreshold << "%\n");
+  DEBUG(dbgs() << "    Unrolled cost: " << UnrolledCost << "\n");
+  DEBUG(dbgs() << "    Rolled dynamic cost: " << RolledDynamicCost << "\n");
+  DEBUG(dbgs() << "    Percent cost saved: " << PercentDynamicCostSaved
+               << "\n");
+  return false;
+}
+
+static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
+                            ScalarEvolution *SE, const TargetTransformInfo &TTI,
+                            AssumptionCache &AC, bool PreserveLCSSA,
+                            Optional<unsigned> ProvidedCount,
+                            Optional<unsigned> ProvidedThreshold,
+                            Optional<bool> ProvidedAllowPartial,
+                            Optional<bool> ProvidedRuntime) {
+  BasicBlock *Header = L->getHeader();
+  DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName()
+        << "] Loop %" << Header->getName() << "\n");
+
+  if (HasUnrollDisablePragma(L)) {
+    return false;
+  }
+  bool PragmaFullUnroll = HasUnrollFullPragma(L);
+  bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
+  unsigned PragmaCount = UnrollCountPragmaValue(L);
+  bool HasPragma = PragmaFullUnroll || PragmaEnableUnroll || PragmaCount > 0;
+
+  // Find trip count and trip multiple if count is not available
+  unsigned TripCount = 0;
+  unsigned TripMultiple = 1;
+  // If there are multiple exiting blocks but one of them is the latch, use the
+  // latch for the trip count estimation. Otherwise insist on a single exiting
+  // block for the trip count estimation.
+  BasicBlock *ExitingBlock = L->getLoopLatch();
+  if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
+    ExitingBlock = L->getExitingBlock();
+  if (ExitingBlock) {
+    TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
+    TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
+  }
+
+  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+      L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
+      ProvidedRuntime, PragmaCount, PragmaFullUnroll, PragmaEnableUnroll,
+      TripCount);
+
+  unsigned Count = UP.Count;
+  bool CountSetExplicitly = Count != 0;
+  // Use a heuristic count if we didn't set anything explicitly.
+  if (!CountSetExplicitly)
+    Count = TripCount == 0 ? DefaultUnrollRuntimeCount : TripCount;
+  if (TripCount && Count > TripCount)
+    Count = TripCount;
+
+  unsigned NumInlineCandidates;
+  bool notDuplicatable;
+  unsigned LoopSize =
+      ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, &AC);
+  DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
+
+  // When computing the unrolled size, note that the conditional branch on the
+  // backedge and the comparison feeding it are not replicated like the rest of
+  // the loop body (which is why 2 is subtracted).
+  uint64_t UnrolledSize = (uint64_t)(LoopSize-2) * Count + 2;
+  if (notDuplicatable) {
+    DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable"
+                 << " instructions.\n");
+    return false;
+  }
+  if (NumInlineCandidates != 0) {
+    DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
+    return false;
+  }
+
+  // Given Count, TripCount and thresholds determine the type of
+  // unrolling which is to be performed.
+  enum { Full = 0, Partial = 1, Runtime = 2 };
+  int Unrolling;
+  if (TripCount && Count == TripCount) {
+    Unrolling = Partial;
+    // If the loop is really small, we don't need to run an expensive analysis.
+    if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount,
+                            UnrolledSize, UnrolledSize)) {
+      Unrolling = Full;
+    } else {
+      // The loop isn't that small, but we still can fully unroll it if that
+      // helps to remove a significant number of instructions.
+      // To check that, run additional analysis on the loop.
+      if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
+              L, TripCount, DT, *SE, TTI,
+              UP.Threshold + UP.DynamicCostSavingsDiscount))
+        if (canUnrollCompletely(L, UP.Threshold,
+                                UP.PercentDynamicCostSavedThreshold,
+                                UP.DynamicCostSavingsDiscount,
+                                Cost->UnrolledCost, Cost->RolledDynamicCost)) {
+          Unrolling = Full;
+        }
+    }
+  } else if (TripCount && Count < TripCount) {
+    Unrolling = Partial;
+  } else {
+    Unrolling = Runtime;
+  }
+
+  // Reduce count based on the type of unrolling and the threshold values.
+  unsigned OriginalCount = Count;
+  bool AllowRuntime = PragmaEnableUnroll || (PragmaCount > 0) || UP.Runtime;
+  // Don't unroll a runtime trip count loop with unroll full pragma.
+  if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) {
+    AllowRuntime = false;
+  }
+  if (Unrolling == Partial) {
+    bool AllowPartial = PragmaEnableUnroll || UP.Partial;
+    if (!AllowPartial && !CountSetExplicitly) {
+      DEBUG(dbgs() << "  will not try to unroll partially because "
+                   << "-unroll-allow-partial not given\n");
+      return false;
+    }
+    if (UP.PartialThreshold != NoThreshold &&
+        UnrolledSize > UP.PartialThreshold) {
+      // Reduce unroll count to be modulo of TripCount for partial unrolling.
+      Count = (std::max(UP.PartialThreshold, 3u) - 2) / (LoopSize - 2);
+      while (Count != 0 && TripCount % Count != 0)
+        Count--;
+    }
+  } else if (Unrolling == Runtime) {
+    if (!AllowRuntime && !CountSetExplicitly) {
+      DEBUG(dbgs() << "  will not try to unroll loop with runtime trip count "
+                   << "-unroll-runtime not given\n");
+      return false;
+    }
+    // Reduce unroll count to be the largest power-of-two factor of
+    // the original count which satisfies the threshold limit.
+    while (Count != 0 && UnrolledSize > UP.PartialThreshold) {
+      Count >>= 1;
+      UnrolledSize = (LoopSize-2) * Count + 2;
+    }
+    if (Count > UP.MaxCount)
+      Count = UP.MaxCount;
+    DEBUG(dbgs() << "  partially unrolling with count: " << Count << "\n");
+  }
+
+  if (HasPragma) {
+    if (PragmaCount != 0)
+      // If loop has an unroll count pragma mark loop as unrolled to prevent
+      // unrolling beyond that requested by the pragma.
+      SetLoopAlreadyUnrolled(L);
+
+    // Emit optimization remarks if we are unable to unroll the loop
+    // as directed by a pragma.
+    DebugLoc LoopLoc = L->getStartLoc();
+    Function *F = Header->getParent();
+    LLVMContext &Ctx = F->getContext();
+    if ((PragmaCount > 0) && Count != OriginalCount) {
+      emitOptimizationRemarkMissed(
+          Ctx, DEBUG_TYPE, *F, LoopLoc,
+          "Unable to unroll loop the number of times directed by "
+          "unroll_count pragma because unrolled size is too large.");
+    } else if (PragmaFullUnroll && !TripCount) {
+      emitOptimizationRemarkMissed(
+          Ctx, DEBUG_TYPE, *F, LoopLoc,
+          "Unable to fully unroll loop as directed by unroll(full) pragma "
+          "because loop has a runtime trip count.");
+    } else if (PragmaEnableUnroll && Count != TripCount && Count < 2) {
+      emitOptimizationRemarkMissed(
+          Ctx, DEBUG_TYPE, *F, LoopLoc,
+          "Unable to unroll loop as directed by unroll(enable) pragma because "
+          "unrolled size is too large.");
+    } else if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
+               Count != TripCount) {
+      emitOptimizationRemarkMissed(
+          Ctx, DEBUG_TYPE, *F, LoopLoc,
+          "Unable to fully unroll loop as directed by unroll pragma because "
+          "unrolled size is too large.");
+    }
+  }
+
+  if (Unrolling != Full && Count < 2) {
+    // Partial unrolling by 1 is a nop.  For full unrolling, a factor
+    // of 1 makes sense because loop control can be eliminated.
+    return false;
+  }
+
+  // Unroll the loop.
+  if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount,
+                  TripMultiple, LI, SE, &DT, &AC, PreserveLCSSA))
+    return false;
+
+  return true;
+}
+
+namespace {
+class LoopUnroll : public LoopPass {
+public:
+  static char ID; // Pass ID, replacement for typeid
+  LoopUnroll(Optional<unsigned> Threshold = None,
+             Optional<unsigned> Count = None,
+             Optional<bool> AllowPartial = None, Optional<bool> Runtime = None)
+      : LoopPass(ID), ProvidedCount(Count), ProvidedThreshold(Threshold),
+        ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime) {
+    initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
+  }
+
+  Optional<unsigned> ProvidedCount;
+  Optional<unsigned> ProvidedThreshold;
+  Optional<bool> ProvidedAllowPartial;
+  Optional<bool> ProvidedRuntime;
+
+  bool runOnLoop(Loop *L, LPPassManager &) override {
+    if (skipOptnoneFunction(L))
+      return false;
+
+    Function &F = *L->getHeader()->getParent();
+
+    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    const TargetTransformInfo &TTI =
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+    return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, PreserveLCSSA, ProvidedCount,
+                           ProvidedThreshold, ProvidedAllowPartial,
+                           ProvidedRuntime);
+  }
+
+  /// This transformation requires natural loop information & requires that
+  /// loop preheaders be inserted into the CFG...
+  ///
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addPreservedID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addPreservedID(LCSSAID);
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addPreserved<ScalarEvolutionWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
+    // If loop unroll does not preserve dom info then LCSSA pass on next
+    // loop will receive invalid dom info.
+    // For now, recreate dom info, if loop is unrolled.
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+};
+}
+
+char LoopUnroll::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+
+Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
+                                 int Runtime) {
+  // TODO: It would make more sense for this function to take the optionals
+  // directly, but that's dangerous since it would silently break out of tree
+  // callers.
+  return new LoopUnroll(Threshold == -1 ? None : Optional<unsigned>(Threshold),
+                        Count == -1 ? None : Optional<unsigned>(Count),
+                        AllowPartial == -1 ? None
+                                           : Optional<bool>(AllowPartial),
+                        Runtime == -1 ? None : Optional<bool>(Runtime));
+}
+
+Pass *llvm::createSimpleLoopUnrollPass() {
+  return llvm::createLoopUnrollPass(-1, -1, 0, 0);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
new file mode 100644
index 0000000..95d7f8a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -0,0 +1,1342 @@
+//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops that contain branches on loop-invariant conditions
+// to have multiple loops.  For example, it turns the left into the right code:
+//
+//  for (...)                  if (lic)
+//    A                          for (...)
+//    if (lic)                     A; B; C
+//      B                      else
+//    C                          for (...)
+//                                 A; C
+//
+// This can increase the size of the code exponentially (doubling it every time
+// a loop is unswitched) so we only unswitch if the resultant code will be
+// smaller than a threshold.
+//
+// This pass expects LICM to be run before it to hoist invariant conditions out
+// of the loop, to make the unswitching opportunity obvious.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <map>
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unswitch"
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumSelects , "Number of selects unswitched");
+STATISTIC(NumTrivial , "Number of unswitches that are trivial");
+STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
+STATISTIC(TotalInsts,  "Total number of instructions analyzed");
+
+// The specific value of 100 here was chosen based only on intuition and a
+// few specific examples.
+static cl::opt<unsigned>
+Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
+          cl::init(100), cl::Hidden);
+
+static cl::opt<bool>
+LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency",
+    cl::init(false), cl::Hidden,
+    cl::desc("Enable the use of the block frequency analysis to access PGO "
+             "heuristics to minimize code growth in cold regions."));
+
+static cl::opt<unsigned>
+ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden,
+    cl::desc("Coldness threshold in percentage. The loop header frequency "
+             "(relative to the entry frequency) is compared with this "
+             "threshold to determine if non-trivial unswitching should be "
+             "enabled."));
+
+namespace {
+
+  class LUAnalysisCache {
+
+    typedef DenseMap<const SwitchInst*, SmallPtrSet<const Value *, 8> >
+      UnswitchedValsMap;
+
+    typedef UnswitchedValsMap::iterator UnswitchedValsIt;
+
+    struct LoopProperties {
+      unsigned CanBeUnswitchedCount;
+      unsigned WasUnswitchedCount;
+      unsigned SizeEstimation;
+      UnswitchedValsMap UnswitchedVals;
+    };
+
+    // Here we use std::map instead of DenseMap, since we need to keep valid
+    // LoopProperties pointer for current loop for better performance.
+    typedef std::map<const Loop*, LoopProperties> LoopPropsMap;
+    typedef LoopPropsMap::iterator LoopPropsMapIt;
+
+    LoopPropsMap LoopsProperties;
+    UnswitchedValsMap *CurLoopInstructions;
+    LoopProperties *CurrentLoopProperties;
+
+    // A loop unswitching with an estimated cost above this threshold
+    // is not performed. MaxSize is turned into unswitching quota for
+    // the current loop, and reduced correspondingly, though note that
+    // the quota is returned by releaseMemory() when the loop has been
+    // processed, so that MaxSize will return to its previous
+    // value. So in most cases MaxSize will equal the Threshold flag
+    // when a new loop is processed. An exception to that is that
+    // MaxSize will have a smaller value while processing nested loops
+    // that were introduced due to loop unswitching of an outer loop.
+    //
+    // FIXME: The way that MaxSize works is subtle and depends on the
+    // pass manager processing loops and calling releaseMemory() in a
+    // specific order. It would be good to find a more straightforward
+    // way of doing what MaxSize does.
+    unsigned MaxSize;
+
+  public:
+    LUAnalysisCache()
+        : CurLoopInstructions(nullptr), CurrentLoopProperties(nullptr),
+          MaxSize(Threshold) {}
+
+    // Analyze loop. Check its size, calculate is it possible to unswitch
+    // it. Returns true if we can unswitch this loop.
+    bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
+                   AssumptionCache *AC);
+
+    // Clean all data related to given loop.
+    void forgetLoop(const Loop *L);
+
+    // Mark case value as unswitched.
+    // Since SI instruction can be partly unswitched, in order to avoid
+    // extra unswitching in cloned loops keep track all unswitched values.
+    void setUnswitched(const SwitchInst *SI, const Value *V);
+
+    // Check was this case value unswitched before or not.
+    bool isUnswitched(const SwitchInst *SI, const Value *V);
+
+    // Returns true if another unswitching could be done within the cost
+    // threshold.
+    bool CostAllowsUnswitching();
+
+    // Clone all loop-unswitch related loop properties.
+    // Redistribute unswitching quotas.
+    // Note, that new loop data is stored inside the VMap.
+    void cloneData(const Loop *NewLoop, const Loop *OldLoop,
+                   const ValueToValueMapTy &VMap);
+  };
+
+  class LoopUnswitch : public LoopPass {
+    LoopInfo *LI;  // Loop information
+    LPPassManager *LPM;
+    AssumptionCache *AC;
+
+    // Used to check if second loop needs processing after
+    // RewriteLoopBodyWithConditionConstant rewrites first loop.
+    std::vector<Loop*> LoopProcessWorklist;
+
+    LUAnalysisCache BranchesInfo;
+
+    bool EnabledPGO;
+
+    // BFI and ColdEntryFreq are only used when PGO and
+    // LoopUnswitchWithBlockFrequency are enabled.
+    BlockFrequencyInfo BFI;
+    BlockFrequency ColdEntryFreq;
+
+    bool OptimizeForSize;
+    bool redoLoop;
+
+    Loop *currentLoop;
+    DominatorTree *DT;
+    BasicBlock *loopHeader;
+    BasicBlock *loopPreheader;
+
+    // LoopBlocks contains all of the basic blocks of the loop, including the
+    // preheader of the loop, the body of the loop, and the exit blocks of the
+    // loop, in that order.
+    std::vector<BasicBlock*> LoopBlocks;
+    // NewBlocks contained cloned copy of basic blocks from LoopBlocks.
+    std::vector<BasicBlock*> NewBlocks;
+
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    explicit LoopUnswitch(bool Os = false) :
+      LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
+      currentLoop(nullptr), DT(nullptr), loopHeader(nullptr),
+      loopPreheader(nullptr) {
+        initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
+      }
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+    bool processCurrentLoop();
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG.
+    ///
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addRequiredID(LCSSAID);
+      AU.addPreservedID(LCSSAID);
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addPreserved<ScalarEvolutionWrapperPass>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+    }
+
+  private:
+
+    void releaseMemory() override {
+      BranchesInfo.forgetLoop(currentLoop);
+    }
+
+    void initLoopData() {
+      loopHeader = currentLoop->getHeader();
+      loopPreheader = currentLoop->getLoopPreheader();
+    }
+
+    /// Split all of the edges from inside the loop to their exit blocks.
+    /// Update the appropriate Phi nodes as we do so.
+    void SplitExitEdges(Loop *L,
+                        const SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+    bool TryTrivialLoopUnswitch(bool &Changed);
+
+    bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,
+                              TerminatorInst *TI = nullptr);
+    void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+                                  BasicBlock *ExitBlock, TerminatorInst *TI);
+    void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
+                                     TerminatorInst *TI);
+
+    void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+                                              Constant *Val, bool isEqual);
+
+    void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+                                        BasicBlock *TrueDest,
+                                        BasicBlock *FalseDest,
+                                        Instruction *InsertPt,
+                                        TerminatorInst *TI);
+
+    void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+  };
+}
+
+// Analyze loop. Check its size, calculate is it possible to unswitch
+// it. Returns true if we can unswitch this loop.
+bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
+                                AssumptionCache *AC) {
+
+  LoopPropsMapIt PropsIt;
+  bool Inserted;
+  std::tie(PropsIt, Inserted) =
+      LoopsProperties.insert(std::make_pair(L, LoopProperties()));
+
+  LoopProperties &Props = PropsIt->second;
+
+  if (Inserted) {
+    // New loop.
+
+    // Limit the number of instructions to avoid causing significant code
+    // expansion, and the number of basic blocks, to avoid loops with
+    // large numbers of branches which cause loop unswitching to go crazy.
+    // This is a very ad-hoc heuristic.
+
+    SmallPtrSet<const Value *, 32> EphValues;
+    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+    // FIXME: This is overly conservative because it does not take into
+    // consideration code simplification opportunities and code that can
+    // be shared by the resultant unswitched loops.
+    CodeMetrics Metrics;
+    for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
+         ++I)
+      Metrics.analyzeBasicBlock(*I, TTI, EphValues);
+
+    Props.SizeEstimation = Metrics.NumInsts;
+    Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
+    Props.WasUnswitchedCount = 0;
+    MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
+
+    if (Metrics.notDuplicatable) {
+      DEBUG(dbgs() << "NOT unswitching loop %"
+                   << L->getHeader()->getName() << ", contents cannot be "
+                   << "duplicated!\n");
+      return false;
+    }
+  }
+
+  // Be careful. This links are good only before new loop addition.
+  CurrentLoopProperties = &Props;
+  CurLoopInstructions = &Props.UnswitchedVals;
+
+  return true;
+}
+
+// Clean all data related to given loop.
+void LUAnalysisCache::forgetLoop(const Loop *L) {
+
+  LoopPropsMapIt LIt = LoopsProperties.find(L);
+
+  if (LIt != LoopsProperties.end()) {
+    LoopProperties &Props = LIt->second;
+    MaxSize += (Props.CanBeUnswitchedCount + Props.WasUnswitchedCount) *
+               Props.SizeEstimation;
+    LoopsProperties.erase(LIt);
+  }
+
+  CurrentLoopProperties = nullptr;
+  CurLoopInstructions = nullptr;
+}
+
+// Mark case value as unswitched.
+// Since SI instruction can be partly unswitched, in order to avoid
+// extra unswitching in cloned loops keep track all unswitched values.
+void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) {
+  (*CurLoopInstructions)[SI].insert(V);
+}
+
+// Check was this case value unswitched before or not.
+bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
+  return (*CurLoopInstructions)[SI].count(V);
+}
+
+bool LUAnalysisCache::CostAllowsUnswitching() {
+  return CurrentLoopProperties->CanBeUnswitchedCount > 0;
+}
+
+// Clone all loop-unswitch related loop properties.
+// Redistribute unswitching quotas.
+// Note, that new loop data is stored inside the VMap.
+void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
+                                const ValueToValueMapTy &VMap) {
+
+  LoopProperties &NewLoopProps = LoopsProperties[NewLoop];
+  LoopProperties &OldLoopProps = *CurrentLoopProperties;
+  UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals;
+
+  // Reallocate "can-be-unswitched quota"
+
+  --OldLoopProps.CanBeUnswitchedCount;
+  ++OldLoopProps.WasUnswitchedCount;
+  NewLoopProps.WasUnswitchedCount = 0;
+  unsigned Quota = OldLoopProps.CanBeUnswitchedCount;
+  NewLoopProps.CanBeUnswitchedCount = Quota / 2;
+  OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2;
+
+  NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation;
+
+  // Clone unswitched values info:
+  // for new loop switches we clone info about values that was
+  // already unswitched and has redundant successors.
+  for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) {
+    const SwitchInst *OldInst = I->first;
+    Value *NewI = VMap.lookup(OldInst);
+    const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI);
+    assert(NewInst && "All instructions that are in SrcBB must be in VMap.");
+
+    NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst];
+  }
+}
+
+char LoopUnswitch::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+                      false, false)
+
+Pass *llvm::createLoopUnswitchPass(bool Os) {
+  return new LoopUnswitch(Os);
+}
+
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant. Otherwise, return null.
+static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+
+  // We started analyze new instruction, increment scanned instructions counter.
+  ++TotalInsts;
+
+  // We can never unswitch on vector conditions.
+  if (Cond->getType()->isVectorTy())
+    return nullptr;
+
+  // Constants should be folded, not unswitched on!
+  if (isa<Constant>(Cond)) return nullptr;
+
+  // TODO: Handle: br (VARIANT|INVARIANT).
+
+  // Hoist simple values out.
+  if (L->makeLoopInvariant(Cond, Changed))
+    return Cond;
+
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
+    if (BO->getOpcode() == Instruction::And ||
+        BO->getOpcode() == Instruction::Or) {
+      // If either the left or right side is invariant, we can unswitch on this,
+      // which will cause the branch to go away in one loop and the condition to
+      // simplify in the other one.
+      if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed))
+        return LHS;
+      if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed))
+        return RHS;
+    }
+
+  return nullptr;
+}
+
+bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
+  if (skipOptnoneFunction(L))
+    return false;
+
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+      *L->getHeader()->getParent());
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  LPM = &LPM_Ref;
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  currentLoop = L;
+  Function *F = currentLoop->getHeader()->getParent();
+
+  EnabledPGO = F->getEntryCount().hasValue();
+
+  if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
+    BranchProbabilityInfo BPI(*F, *LI);
+    BFI.calculate(*L->getHeader()->getParent(), BPI, *LI);
+
+    // Use BranchProbability to compute a minimum frequency based on
+    // function entry baseline frequency. Loops with headers below this
+    // frequency are considered as cold.
+    const BranchProbability ColdProb(ColdnessThreshold, 100);
+    ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb;
+  }
+
+  bool Changed = false;
+  do {
+    assert(currentLoop->isLCSSAForm(*DT));
+    redoLoop = false;
+    Changed |= processCurrentLoop();
+  } while(redoLoop);
+
+  // FIXME: Reconstruct dom info, because it is not preserved properly.
+  if (Changed)
+    DT->recalculate(*F);
+  return Changed;
+}
+
+/// Do actual work and unswitch loop if possible and profitable.
+bool LoopUnswitch::processCurrentLoop() {
+  bool Changed = false;
+
+  initLoopData();
+
+  // If LoopSimplify was unable to form a preheader, don't do any unswitching.
+  if (!loopPreheader)
+    return false;
+
+  // Loops with indirectbr cannot be cloned.
+  if (!currentLoop->isSafeToClone())
+    return false;
+
+  // Without dedicated exits, splitting the exit edge may fail.
+  if (!currentLoop->hasDedicatedExits())
+    return false;
+
+  LLVMContext &Context = loopHeader->getContext();
+
+  // Analyze loop cost, and stop unswitching if loop content can not be duplicated.
+  if (!BranchesInfo.countLoop(
+          currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+                           *currentLoop->getHeader()->getParent()),
+          AC))
+    return false;
+
+  // Try trivial unswitch first before loop over other basic blocks in the loop.
+  if (TryTrivialLoopUnswitch(Changed)) {
+    return true;
+  }
+
+  // Do not unswitch loops containing convergent operations, as we might be
+  // making them control dependent on the unswitch value when they were not
+  // before.
+  // FIXME: This could be refined to only bail if the convergent operation is
+  // not already control-dependent on the unswitch value.
+  for (const auto BB : currentLoop->blocks()) {
+    for (auto &I : *BB) {
+      auto CS = CallSite(&I);
+      if (!CS) continue;
+      if (CS.hasFnAttr(Attribute::Convergent))
+        return false;
+    }
+  }
+
+  // Do not do non-trivial unswitch while optimizing for size.
+  // FIXME: Use Function::optForSize().
+  if (OptimizeForSize ||
+      loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
+    return false;
+
+  if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
+    // Compute the weighted frequency of the hottest block in the
+    // loop (loopHeader in this case since inner loops should be
+    // processed before outer loop). If it is less than ColdFrequency,
+    // we should not unswitch.
+    BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader);
+    if (LoopEntryFreq < ColdEntryFreq)
+      return false;
+  }
+
+  // Loop over all of the basic blocks in the loop.  If we find an interior
+  // block that is branching on a loop-invariant condition, we can unswitch this
+  // loop.
+  for (Loop::block_iterator I = currentLoop->block_begin(),
+         E = currentLoop->block_end(); I != E; ++I) {
+    TerminatorInst *TI = (*I)->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      // If this isn't branching on an invariant condition, we can't unswitch
+      // it.
+      if (BI->isConditional()) {
+        // See if this, or some part of it, is loop invariant.  If so, we can
+        // unswitch on it if we desire.
+        Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
+                                               currentLoop, Changed);
+        if (LoopCond &&
+            UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
+          ++NumBranches;
+          return true;
+        }
+      }
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+                                             currentLoop, Changed);
+      unsigned NumCases = SI->getNumCases();
+      if (LoopCond && NumCases) {
+        // Find a value to unswitch on:
+        // FIXME: this should chose the most expensive case!
+        // FIXME: scan for a case with a non-critical edge?
+        Constant *UnswitchVal = nullptr;
+
+        // Do not process same value again and again.
+        // At this point we have some cases already unswitched and
+        // some not yet unswitched. Let's find the first not yet unswitched one.
+        for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+             i != e; ++i) {
+          Constant *UnswitchValCandidate = i.getCaseValue();
+          if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+            UnswitchVal = UnswitchValCandidate;
+            break;
+          }
+        }
+
+        if (!UnswitchVal)
+          continue;
+
+        if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
+          ++NumSwitches;
+          return true;
+        }
+      }
+    }
+
+    // Scan the instructions to check for unswitchable values.
+    for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
+         BBI != E; ++BBI)
+      if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
+        Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+                                               currentLoop, Changed);
+        if (LoopCond && UnswitchIfProfitable(LoopCond,
+                                             ConstantInt::getTrue(Context))) {
+          ++NumSelects;
+          return true;
+        }
+      }
+  }
+  return Changed;
+}
+
+/// Check to see if all paths from BB exit the loop with no side effects
+/// (including infinite loops).
+///
+/// If true, we return true and set ExitBB to the block we
+/// exit through.
+///
+static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
+                                         BasicBlock *&ExitBB,
+                                         std::set<BasicBlock*> &Visited) {
+  if (!Visited.insert(BB).second) {
+    // Already visited. Without more analysis, this could indicate an infinite
+    // loop.
+    return false;
+  }
+  if (!L->contains(BB)) {
+    // Otherwise, this is a loop exit, this is fine so long as this is the
+    // first exit.
+    if (ExitBB) return false;
+    ExitBB = BB;
+    return true;
+  }
+
+  // Otherwise, this is an unvisited intra-loop node.  Check all successors.
+  for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
+    // Check to see if the successor is a trivial loop exit.
+    if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited))
+      return false;
+  }
+
+  // Okay, everything after this looks good, check to make sure that this block
+  // doesn't include any side effects.
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+    if (I->mayHaveSideEffects())
+      return false;
+
+  return true;
+}
+
+/// Return true if the specified block unconditionally leads to an exit from
+/// the specified loop, and has no side-effects in the process. If so, return
+/// the block that is exited to, otherwise return null.
+static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
+  std::set<BasicBlock*> Visited;
+  Visited.insert(L->getHeader());  // Branches to header make infinite loops.
+  BasicBlock *ExitBB = nullptr;
+  if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
+    return ExitBB;
+  return nullptr;
+}
+
+/// We have found that we can unswitch currentLoop when LoopCond == Val to
+/// simplify the loop.  If we decide that this is profitable,
+/// unswitch the loop, reprocess the pieces, then return true.
+bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
+                                        TerminatorInst *TI) {
+  // Check to see if it would be profitable to unswitch current loop.
+  if (!BranchesInfo.CostAllowsUnswitching()) {
+    DEBUG(dbgs() << "NOT unswitching loop %"
+                 << currentLoop->getHeader()->getName()
+                 << " at non-trivial condition '" << *Val
+                 << "' == " << *LoopCond << "\n"
+                 << ". Cost too high.\n");
+    return false;
+  }
+
+  UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);
+  return true;
+}
+
+/// Recursively clone the specified loop and all of its children,
+/// mapping the blocks with the specified map.
+static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
+                       LoopInfo *LI, LPPassManager *LPM) {
+  Loop &New = LPM->addLoop(PL);
+
+  // Add all of the blocks in L to the new loop.
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I)
+    if (LI->getLoopFor(*I) == L)
+      New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
+
+  // Add all of the subloops to the new loop.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    CloneLoop(*I, &New, VM, LI, LPM);
+
+  return &New;
+}
+
+static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst,
+                         bool Swapped) {
+  if (!SrcInst || !SrcInst->hasMetadata())
+    return;
+
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  SrcInst->getAllMetadata(MDs);
+  for (auto &MD : MDs) {
+    switch (MD.first) {
+    default:
+      break;
+    case LLVMContext::MD_prof:
+      if (Swapped && MD.second->getNumOperands() == 3 &&
+          isa<MDString>(MD.second->getOperand(0))) {
+        MDString *MDName = cast<MDString>(MD.second->getOperand(0));
+        if (MDName->getString() == "branch_weights") {
+          auto *ValT = cast_or_null<ConstantAsMetadata>(
+                           MD.second->getOperand(1))->getValue();
+          auto *ValF = cast_or_null<ConstantAsMetadata>(
+                           MD.second->getOperand(2))->getValue();
+          assert(ValT && ValF && "Invalid Operands of branch_weights");
+          auto NewMD =
+              MDBuilder(DstInst->getParent()->getContext())
+                  .createBranchWeights(cast<ConstantInt>(ValF)->getZExtValue(),
+                                       cast<ConstantInt>(ValT)->getZExtValue());
+          MD.second = NewMD;
+        }
+      }
+      // fallthrough.
+    case LLVMContext::MD_make_implicit:
+    case LLVMContext::MD_dbg:
+      DstInst->setMetadata(MD.first, MD.second);
+    }
+  }
+}
+
+/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst,
+/// otherwise branch to FalseDest. Insert the code immediately before InsertPt.
+void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+                                                  BasicBlock *TrueDest,
+                                                  BasicBlock *FalseDest,
+                                                  Instruction *InsertPt,
+                                                  TerminatorInst *TI) {
+  // Insert a conditional branch on LIC to the two preheaders.  The original
+  // code is the true version and the new code is the false version.
+  Value *BranchVal = LIC;
+  bool Swapped = false;
+  if (!isa<ConstantInt>(Val) ||
+      Val->getType() != Type::getInt1Ty(LIC->getContext()))
+    BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val);
+  else if (Val != ConstantInt::getTrue(Val->getContext())) {
+    // We want to enter the new loop when the condition is true.
+    std::swap(TrueDest, FalseDest);
+    Swapped = true;
+  }
+
+  // Insert the new branch.
+  BranchInst *BI = BranchInst::Create(TrueDest, FalseDest, BranchVal, InsertPt);
+  copyMetadata(BI, TI, Swapped);
+
+  // If either edge is critical, split it. This helps preserve LoopSimplify
+  // form for enclosing loops.
+  auto Options = CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA();
+  SplitCriticalEdge(BI, 0, Options);
+  SplitCriticalEdge(BI, 1, Options);
+}
+
+/// Given a loop that has a trivial unswitchable condition in it (a cond branch
+/// from its header block to its latch block, where the path through the loop
+/// that doesn't execute its body has no side-effects), unswitch it. This
+/// doesn't involve any code duplication, just moving the conditional branch
+/// outside of the loop and updating loop info.
+void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+                                            BasicBlock *ExitBlock,
+                                            TerminatorInst *TI) {
+  DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
+               << loopHeader->getName() << " [" << L->getBlocks().size()
+               << " blocks] in Function "
+               << L->getHeader()->getParent()->getName() << " on cond: " << *Val
+               << " == " << *Cond << "\n");
+
+  // First step, split the preheader, so that we know that there is a safe place
+  // to insert the conditional branch.  We will change loopPreheader to have a
+  // conditional branch on Cond.
+  BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, DT, LI);
+
+  // Now that we have a place to insert the conditional branch, create a place
+  // to branch to: this is the exit block out of the loop that we should
+  // short-circuit to.
+
+  // Split this block now, so that the loop maintains its exit block, and so
+  // that the jump from the preheader can execute the contents of the exit block
+  // without actually branching to it (the exit block should be dominated by the
+  // loop header, not the preheader).
+  assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
+  BasicBlock *NewExit = SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI);
+
+  // Okay, now we have a position to branch from and a position to branch to,
+  // insert the new conditional branch.
+  EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH,
+                                 loopPreheader->getTerminator(), TI);
+  LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L);
+  loopPreheader->getTerminator()->eraseFromParent();
+
+  // We need to reprocess this loop, it could be unswitched again.
+  redoLoop = true;
+
+  // Now that we know that the loop is never entered when this condition is a
+  // particular value, rewrite the loop with this info.  We know that this will
+  // at least eliminate the old branch.
+  RewriteLoopBodyWithConditionConstant(L, Cond, Val, false);
+  ++NumTrivial;
+}
+
+/// Check if the first non-constant condition starting from the loop header is
+/// a trivial unswitch condition: that is, a condition controls whether or not
+/// the loop does anything at all. If it is a trivial condition, unswitching
+/// produces no code duplications (equivalently, it produces a simpler loop and
+/// a new empty loop, which gets deleted). Therefore always unswitch trivial
+/// condition.
+bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
+  BasicBlock *CurrentBB = currentLoop->getHeader();
+  TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+  LLVMContext &Context = CurrentBB->getContext();
+
+  // If loop header has only one reachable successor (currently via an
+  // unconditional branch or constant foldable conditional branch, but
+  // should also consider adding constant foldable switch instruction in
+  // future), we should keep looking for trivial condition candidates in
+  // the successor as well. An alternative is to constant fold conditions
+  // and merge successors into loop header (then we only need to check header's
+  // terminator). The reason for not doing this in LoopUnswitch pass is that
+  // it could potentially break LoopPassManager's invariants. Folding dead
+  // branches could either eliminate the current loop or make other loops
+  // unreachable. LCSSA form might also not be preserved after deleting
+  // branches. The following code keeps traversing loop header's successors
+  // until it finds the trivial condition candidate (condition that is not a
+  // constant). Since unswitching generates branches with constant conditions,
+  // this scenario could be very common in practice.
+  SmallSet<BasicBlock*, 8> Visited;
+
+  while (true) {
+    // If we exit loop or reach a previous visited block, then
+    // we can not reach any trivial condition candidates (unfoldable
+    // branch instructions or switch instructions) and no unswitch
+    // can happen. Exit and return false.
+    if (!currentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
+      return false;
+
+    // Check if this loop will execute any side-effecting instructions (e.g.
+    // stores, calls, volatile loads) in the part of the loop that the code
+    // *would* execute. Check the header first.
+    for (Instruction &I : *CurrentBB)
+      if (I.mayHaveSideEffects())
+        return false;
+
+    // FIXME: add check for constant foldable switch instructions.
+    if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+      if (BI->isUnconditional()) {
+        CurrentBB = BI->getSuccessor(0);
+      } else if (BI->getCondition() == ConstantInt::getTrue(Context)) {
+        CurrentBB = BI->getSuccessor(0);
+      } else if (BI->getCondition() == ConstantInt::getFalse(Context)) {
+        CurrentBB = BI->getSuccessor(1);
+      } else {
+        // Found a trivial condition candidate: non-foldable conditional branch.
+        break;
+      }
+    } else {
+      break;
+    }
+
+    CurrentTerm = CurrentBB->getTerminator();
+  }
+
+  // CondVal is the condition that controls the trivial condition.
+  // LoopExitBB is the BasicBlock that loop exits when meets trivial condition.
+  Constant *CondVal = nullptr;
+  BasicBlock *LoopExitBB = nullptr;
+
+  if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+    // If this isn't branching on an invariant condition, we can't unswitch it.
+    if (!BI->isConditional())
+      return false;
+
+    Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
+                                           currentLoop, Changed);
+
+    // Unswitch only if the trivial condition itself is an LIV (not
+    // partial LIV which could occur in and/or)
+    if (!LoopCond || LoopCond != BI->getCondition())
+      return false;
+
+    // Check to see if a successor of the branch is guaranteed to
+    // exit through a unique exit block without having any
+    // side-effects.  If so, determine the value of Cond that causes
+    // it to do this.
+    if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+                                             BI->getSuccessor(0)))) {
+      CondVal = ConstantInt::getTrue(Context);
+    } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+                                                    BI->getSuccessor(1)))) {
+      CondVal = ConstantInt::getFalse(Context);
+    }
+
+    // If we didn't find a single unique LoopExit block, or if the loop exit
+    // block contains phi nodes, this isn't trivial.
+    if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+      return false;   // Can't handle this.
+
+    UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
+                             CurrentTerm);
+    ++NumBranches;
+    return true;
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+    // If this isn't switching on an invariant condition, we can't unswitch it.
+    Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+                                           currentLoop, Changed);
+
+    // Unswitch only if the trivial condition itself is an LIV (not
+    // partial LIV which could occur in and/or)
+    if (!LoopCond || LoopCond != SI->getCondition())
+      return false;
+
+    // Check to see if a successor of the switch is guaranteed to go to the
+    // latch block or exit through a one exit block without having any
+    // side-effects.  If so, determine the value of Cond that causes it to do
+    // this.
+    // Note that we can't trivially unswitch on the default case or
+    // on already unswitched cases.
+    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+         i != e; ++i) {
+      BasicBlock *LoopExitCandidate;
+      if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop,
+                                               i.getCaseSuccessor()))) {
+        // Okay, we found a trivial case, remember the value that is trivial.
+        ConstantInt *CaseVal = i.getCaseValue();
+
+        // Check that it was not unswitched before, since already unswitched
+        // trivial vals are looks trivial too.
+        if (BranchesInfo.isUnswitched(SI, CaseVal))
+          continue;
+        LoopExitBB = LoopExitCandidate;
+        CondVal = CaseVal;
+        break;
+      }
+    }
+
+    // If we didn't find a single unique LoopExit block, or if the loop exit
+    // block contains phi nodes, this isn't trivial.
+    if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+      return false;   // Can't handle this.
+
+    UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
+                             nullptr);
+    ++NumSwitches;
+    return true;
+  }
+  return false;
+}
+
+/// Split all of the edges from inside the loop to their exit blocks.
+/// Update the appropriate Phi nodes as we do so.
+void LoopUnswitch::SplitExitEdges(Loop *L,
+                               const SmallVectorImpl<BasicBlock *> &ExitBlocks){
+
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *ExitBlock = ExitBlocks[i];
+    SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
+                                       pred_end(ExitBlock));
+
+    // Although SplitBlockPredecessors doesn't preserve loop-simplify in
+    // general, if we call it on all predecessors of all exits then it does.
+    SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI,
+                           /*PreserveLCSSA*/ true);
+  }
+}
+
+/// We determined that the loop is profitable to unswitch when LIC equal Val.
+/// Split it into loop versions and test the condition outside of either loop.
+/// Return the loops created as Out1/Out2.
+void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
+                                               Loop *L, TerminatorInst *TI) {
+  Function *F = loopHeader->getParent();
+  DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
+        << loopHeader->getName() << " [" << L->getBlocks().size()
+        << " blocks] in Function " << F->getName()
+        << " when '" << *Val << "' == " << *LIC << "\n");
+
+  if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+    SEWP->getSE().forgetLoop(L);
+
+  LoopBlocks.clear();
+  NewBlocks.clear();
+
+  // First step, split the preheader and exit blocks, and add these blocks to
+  // the LoopBlocks list.
+  BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, DT, LI);
+  LoopBlocks.push_back(NewPreheader);
+
+  // We want the loop to come after the preheader, but before the exit blocks.
+  LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+
+  // Split all of the edges from inside the loop to their exit blocks.  Update
+  // the appropriate Phi nodes as we do so.
+  SplitExitEdges(L, ExitBlocks);
+
+  // The exit blocks may have been changed due to edge splitting, recompute.
+  ExitBlocks.clear();
+  L->getUniqueExitBlocks(ExitBlocks);
+
+  // Add exit blocks to the loop blocks.
+  LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end());
+
+  // Next step, clone all of the basic blocks that make up the loop (including
+  // the loop preheader and exit blocks), keeping track of the mapping between
+  // the instructions and blocks.
+  NewBlocks.reserve(LoopBlocks.size());
+  ValueToValueMapTy VMap;
+  for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
+    BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F);
+
+    NewBlocks.push_back(NewBB);
+    VMap[LoopBlocks[i]] = NewBB;  // Keep the BB mapping.
+    LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L);
+  }
+
+  // Splice the newly inserted blocks into the function right before the
+  // original preheader.
+  F->getBasicBlockList().splice(NewPreheader->getIterator(),
+                                F->getBasicBlockList(),
+                                NewBlocks[0]->getIterator(), F->end());
+
+  // FIXME: We could register any cloned assumptions instead of clearing the
+  // whole function's cache.
+  AC->clear();
+
+  // Now we create the new Loop object for the versioned loop.
+  Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
+
+  // Recalculate unswitching quota, inherit simplified switches info for NewBB,
+  // Probably clone more loop-unswitch related loop properties.
+  BranchesInfo.cloneData(NewLoop, L, VMap);
+
+  Loop *ParentLoop = L->getParentLoop();
+  if (ParentLoop) {
+    // Make sure to add the cloned preheader and exit blocks to the parent loop
+    // as well.
+    ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
+  }
+
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]);
+    // The new exit block should be in the same loop as the old one.
+    if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
+      ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
+
+    assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
+           "Exit block should have been split to have one successor!");
+    BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
+
+    // If the successor of the exit block had PHI nodes, add an entry for
+    // NewExit.
+    for (BasicBlock::iterator I = ExitSucc->begin();
+         PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+      Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]);
+      ValueToValueMapTy::iterator It = VMap.find(V);
+      if (It != VMap.end()) V = It->second;
+      PN->addIncoming(V, NewExit);
+    }
+
+    if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
+      PHINode *PN = PHINode::Create(LPad->getType(), 0, "",
+                                    &*ExitSucc->getFirstInsertionPt());
+
+      for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
+           I != E; ++I) {
+        BasicBlock *BB = *I;
+        LandingPadInst *LPI = BB->getLandingPadInst();
+        LPI->replaceAllUsesWith(PN);
+        PN->addIncoming(LPI, BB);
+      }
+    }
+  }
+
+  // Rewrite the code to refer to itself.
+  for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
+    for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+           E = NewBlocks[i]->end(); I != E; ++I)
+      RemapInstruction(&*I, VMap,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+
+  // Rewrite the original preheader to select between versions of the loop.
+  BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
+  assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
+         "Preheader splitting did not work correctly!");
+
+  // Emit the new branch that selects between the two versions of this loop.
+  EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
+                                 TI);
+  LPM->deleteSimpleAnalysisValue(OldBR, L);
+  OldBR->eraseFromParent();
+
+  LoopProcessWorklist.push_back(NewLoop);
+  redoLoop = true;
+
+  // Keep a WeakVH holding onto LIC.  If the first call to RewriteLoopBody
+  // deletes the instruction (for example by simplifying a PHI that feeds into
+  // the condition that we're unswitching on), we don't rewrite the second
+  // iteration.
+  WeakVH LICHandle(LIC);
+
+  // Now we rewrite the original code to know that the condition is true and the
+  // new code to know that the condition is false.
+  RewriteLoopBodyWithConditionConstant(L, LIC, Val, false);
+
+  // It's possible that simplifying one loop could cause the other to be
+  // changed to another value or a constant.  If its a constant, don't simplify
+  // it.
+  if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop &&
+      LICHandle && !isa<Constant>(LICHandle))
+    RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true);
+}
+
+/// Remove all instances of I from the worklist vector specified.
+static void RemoveFromWorklist(Instruction *I,
+                               std::vector<Instruction*> &Worklist) {
+
+  Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I),
+                 Worklist.end());
+}
+
+/// When we find that I really equals V, remove I from the
+/// program, replacing all uses with V and update the worklist.
+static void ReplaceUsesOfWith(Instruction *I, Value *V,
+                              std::vector<Instruction*> &Worklist,
+                              Loop *L, LPPassManager *LPM) {
+  DEBUG(dbgs() << "Replace with '" << *V << "': " << *I);
+
+  // Add uses to the worklist, which may be dead now.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+      Worklist.push_back(Use);
+
+  // Add users to the worklist which may be simplified now.
+  for (User *U : I->users())
+    Worklist.push_back(cast<Instruction>(U));
+  LPM->deleteSimpleAnalysisValue(I, L);
+  RemoveFromWorklist(I, Worklist);
+  I->replaceAllUsesWith(V);
+  I->eraseFromParent();
+  ++NumSimplify;
+}
+
+/// We know either that the value LIC has the value specified by Val in the
+/// specified loop, or we know it does NOT have that value.
+/// Rewrite any uses of LIC or of properties correlated to it.
+void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+                                                        Constant *Val,
+                                                        bool IsEqual) {
+  assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+  // FIXME: Support correlated properties, like:
+  //  for (...)
+  //    if (li1 < li2)
+  //      ...
+  //    if (li1 > li2)
+  //      ...
+
+  // FOLD boolean conditions (X|LIC), (X&LIC).  Fold conditional branches,
+  // selects, switches.
+  std::vector<Instruction*> Worklist;
+  LLVMContext &Context = Val->getContext();
+
+  // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
+  // in the loop with the appropriate one directly.
+  if (IsEqual || (isa<ConstantInt>(Val) &&
+      Val->getType()->isIntegerTy(1))) {
+    Value *Replacement;
+    if (IsEqual)
+      Replacement = Val;
+    else
+      Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()),
+                                     !cast<ConstantInt>(Val)->getZExtValue());
+
+    for (User *U : LIC->users()) {
+      Instruction *UI = dyn_cast<Instruction>(U);
+      if (!UI || !L->contains(UI))
+        continue;
+      Worklist.push_back(UI);
+    }
+
+    for (std::vector<Instruction*>::iterator UI = Worklist.begin(),
+         UE = Worklist.end(); UI != UE; ++UI)
+      (*UI)->replaceUsesOfWith(LIC, Replacement);
+
+    SimplifyCode(Worklist, L);
+    return;
+  }
+
+  // Otherwise, we don't know the precise value of LIC, but we do know that it
+  // is certainly NOT "Val".  As such, simplify any uses in the loop that we
+  // can.  This case occurs when we unswitch switch statements.
+  for (User *U : LIC->users()) {
+    Instruction *UI = dyn_cast<Instruction>(U);
+    if (!UI || !L->contains(UI))
+      continue;
+
+    Worklist.push_back(UI);
+
+    // TODO: We could do other simplifications, for example, turning
+    // 'icmp eq LIC, Val' -> false.
+
+    // If we know that LIC is not Val, use this info to simplify code.
+    SwitchInst *SI = dyn_cast<SwitchInst>(UI);
+    if (!SI || !isa<ConstantInt>(Val)) continue;
+
+    SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val));
+    // Default case is live for multiple values.
+    if (DeadCase == SI->case_default()) continue;
+
+    // Found a dead case value.  Don't remove PHI nodes in the
+    // successor if they become single-entry, those PHI nodes may
+    // be in the Users list.
+
+    BasicBlock *Switch = SI->getParent();
+    BasicBlock *SISucc = DeadCase.getCaseSuccessor();
+    BasicBlock *Latch = L->getLoopLatch();
+
+    BranchesInfo.setUnswitched(SI, Val);
+
+    if (!SI->findCaseDest(SISucc)) continue;  // Edge is critical.
+    // If the DeadCase successor dominates the loop latch, then the
+    // transformation isn't safe since it will delete the sole predecessor edge
+    // to the latch.
+    if (Latch && DT->dominates(SISucc, Latch))
+      continue;
+
+    // FIXME: This is a hack.  We need to keep the successor around
+    // and hooked up so as to preserve the loop structure, because
+    // trying to update it is complicated.  So instead we preserve the
+    // loop structure and put the block on a dead code path.
+    SplitEdge(Switch, SISucc, DT, LI);
+    // Compute the successors instead of relying on the return value
+    // of SplitEdge, since it may have split the switch successor
+    // after PHI nodes.
+    BasicBlock *NewSISucc = DeadCase.getCaseSuccessor();
+    BasicBlock *OldSISucc = *succ_begin(NewSISucc);
+    // Create an "unreachable" destination.
+    BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable",
+                                           Switch->getParent(),
+                                           OldSISucc);
+    new UnreachableInst(Context, Abort);
+    // Force the new case destination to branch to the "unreachable"
+    // block while maintaining a (dead) CFG edge to the old block.
+    NewSISucc->getTerminator()->eraseFromParent();
+    BranchInst::Create(Abort, OldSISucc,
+                       ConstantInt::getTrue(Context), NewSISucc);
+    // Release the PHI operands for this edge.
+    for (BasicBlock::iterator II = NewSISucc->begin();
+         PHINode *PN = dyn_cast<PHINode>(II); ++II)
+      PN->setIncomingValue(PN->getBasicBlockIndex(Switch),
+                           UndefValue::get(PN->getType()));
+    // Tell the domtree about the new block. We don't fully update the
+    // domtree here -- instead we force it to do a full recomputation
+    // after the pass is complete -- but we do need to inform it of
+    // new blocks.
+    DT->addNewBlock(Abort, NewSISucc);
+  }
+
+  SimplifyCode(Worklist, L);
+}
+
+/// Now that we have simplified some instructions in the loop, walk over it and
+/// constant prop, dce, and fold control flow where possible. Note that this is
+/// effectively a very simple loop-structure-aware optimizer. During processing
+/// of this loop, L could very well be deleted, so it must not be used.
+///
+/// FIXME: When the loop optimizer is more mature, separate this out to a new
+/// pass.
+///
+void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
+  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.back();
+    Worklist.pop_back();
+
+    // Simple DCE.
+    if (isInstructionTriviallyDead(I)) {
+      DEBUG(dbgs() << "Remove dead instruction '" << *I);
+
+      // Add uses to the worklist, which may be dead now.
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+        if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+          Worklist.push_back(Use);
+      LPM->deleteSimpleAnalysisValue(I, L);
+      RemoveFromWorklist(I, Worklist);
+      I->eraseFromParent();
+      ++NumSimplify;
+      continue;
+    }
+
+    // See if instruction simplification can hack this up.  This is common for
+    // things like "select false, X, Y" after unswitching made the condition be
+    // 'false'.  TODO: update the domtree properly so we can pass it here.
+    if (Value *V = SimplifyInstruction(I, DL))
+      if (LI->replacementPreservesLCSSAForm(I, V)) {
+        ReplaceUsesOfWith(I, V, Worklist, L, LPM);
+        continue;
+      }
+
+    // Special case hacks that appear commonly in unswitched code.
+    if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+      if (BI->isUnconditional()) {
+        // If BI's parent is the only pred of the successor, fold the two blocks
+        // together.
+        BasicBlock *Pred = BI->getParent();
+        BasicBlock *Succ = BI->getSuccessor(0);
+        BasicBlock *SinglePred = Succ->getSinglePredecessor();
+        if (!SinglePred) continue;  // Nothing to do.
+        assert(SinglePred == Pred && "CFG broken");
+
+        DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
+              << Succ->getName() << "\n");
+
+        // Resolve any single entry PHI nodes in Succ.
+        while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
+          ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM);
+
+        // If Succ has any successors with PHI nodes, update them to have
+        // entries coming from Pred instead of Succ.
+        Succ->replaceAllUsesWith(Pred);
+
+        // Move all of the successor contents from Succ to Pred.
+        Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(),
+                                   Succ->begin(), Succ->end());
+        LPM->deleteSimpleAnalysisValue(BI, L);
+        BI->eraseFromParent();
+        RemoveFromWorklist(BI, Worklist);
+
+        // Remove Succ from the loop tree.
+        LI->removeBlock(Succ);
+        LPM->deleteSimpleAnalysisValue(Succ, L);
+        Succ->eraseFromParent();
+        ++NumSimplify;
+        continue;
+      }
+
+      continue;
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
new file mode 100644
index 0000000..41511bc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -0,0 +1,148 @@
+//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers atomic intrinsics to non-atomic form for use in a known
+// non-preemptible environment.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loweratomic"
+
+static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
+  IRBuilder<> Builder(CXI);
+  Value *Ptr = CXI->getPointerOperand();
+  Value *Cmp = CXI->getCompareOperand();
+  Value *Val = CXI->getNewValOperand();
+
+  LoadInst *Orig = Builder.CreateLoad(Ptr);
+  Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
+  Value *Res = Builder.CreateSelect(Equal, Val, Orig);
+  Builder.CreateStore(Res, Ptr);
+
+  Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0);
+  Res = Builder.CreateInsertValue(Res, Equal, 1);
+
+  CXI->replaceAllUsesWith(Res);
+  CXI->eraseFromParent();
+  return true;
+}
+
+static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) {
+  IRBuilder<> Builder(RMWI);
+  Value *Ptr = RMWI->getPointerOperand();
+  Value *Val = RMWI->getValOperand();
+
+  LoadInst *Orig = Builder.CreateLoad(Ptr);
+  Value *Res = nullptr;
+
+  switch (RMWI->getOperation()) {
+  default: llvm_unreachable("Unexpected RMW operation");
+  case AtomicRMWInst::Xchg:
+    Res = Val;
+    break;
+  case AtomicRMWInst::Add:
+    Res = Builder.CreateAdd(Orig, Val);
+    break;
+  case AtomicRMWInst::Sub:
+    Res = Builder.CreateSub(Orig, Val);
+    break;
+  case AtomicRMWInst::And:
+    Res = Builder.CreateAnd(Orig, Val);
+    break;
+  case AtomicRMWInst::Nand:
+    Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val));
+    break;
+  case AtomicRMWInst::Or:
+    Res = Builder.CreateOr(Orig, Val);
+    break;
+  case AtomicRMWInst::Xor:
+    Res = Builder.CreateXor(Orig, Val);
+    break;
+  case AtomicRMWInst::Max:
+    Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+                               Val, Orig);
+    break;
+  case AtomicRMWInst::Min:
+    Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+                               Orig, Val);
+    break;
+  case AtomicRMWInst::UMax:
+    Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+                               Val, Orig);
+    break;
+  case AtomicRMWInst::UMin:
+    Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+                               Orig, Val);
+    break;
+  }
+  Builder.CreateStore(Res, Ptr);
+  RMWI->replaceAllUsesWith(Orig);
+  RMWI->eraseFromParent();
+  return true;
+}
+
+static bool LowerFenceInst(FenceInst *FI) {
+  FI->eraseFromParent();
+  return true;
+}
+
+static bool LowerLoadInst(LoadInst *LI) {
+  LI->setAtomic(NotAtomic);
+  return true;
+}
+
+static bool LowerStoreInst(StoreInst *SI) {
+  SI->setAtomic(NotAtomic);
+  return true;
+}
+
+namespace {
+  struct LowerAtomic : public BasicBlockPass {
+    static char ID;
+    LowerAtomic() : BasicBlockPass(ID) {
+      initializeLowerAtomicPass(*PassRegistry::getPassRegistry());
+    }
+    bool runOnBasicBlock(BasicBlock &BB) override {
+      if (skipOptnoneFunction(BB))
+        return false;
+      bool Changed = false;
+      for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) {
+        Instruction *Inst = &*DI++;
+        if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+          Changed |= LowerFenceInst(FI);
+        else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst))
+          Changed |= LowerAtomicCmpXchgInst(CXI);
+        else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst))
+          Changed |= LowerAtomicRMWInst(RMWI);
+        else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+          if (LI->isAtomic())
+            LowerLoadInst(LI);
+        } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+          if (SI->isAtomic())
+            LowerStoreInst(SI);
+        }
+      }
+      return Changed;
+    }
+  };
+}
+
+char LowerAtomic::ID = 0;
+INITIALIZE_PASS(LowerAtomic, "loweratomic",
+                "Lower atomic intrinsics to non-atomic form",
+                false, false)
+
+Pass *llvm::createLowerAtomicPass() { return new LowerAtomic(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
new file mode 100644
index 0000000..2ace902
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -0,0 +1,192 @@
+//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the 'expect' intrinsic to LLVM metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-expect-intrinsic"
+
+STATISTIC(ExpectIntrinsicsHandled,
+          "Number of 'expect' intrinsic instructions handled");
+
+static cl::opt<uint32_t>
+LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64),
+                   cl::desc("Weight of the branch likely to be taken (default = 64)"));
+static cl::opt<uint32_t>
+UnlikelyBranchWeight("unlikely-branch-weight", cl::Hidden, cl::init(4),
+                   cl::desc("Weight of the branch unlikely to be taken (default = 4)"));
+
+static bool handleSwitchExpect(SwitchInst &SI) {
+  CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
+  if (!CI)
+    return false;
+
+  Function *Fn = CI->getCalledFunction();
+  if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+    return false;
+
+  Value *ArgValue = CI->getArgOperand(0);
+  ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+  if (!ExpectedValue)
+    return false;
+
+  SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue);
+  unsigned n = SI.getNumCases(); // +1 for default case.
+  SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
+
+  if (Case == SI.case_default())
+    Weights[0] = LikelyBranchWeight;
+  else
+    Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
+
+  SI.setMetadata(LLVMContext::MD_prof,
+                 MDBuilder(CI->getContext()).createBranchWeights(Weights));
+
+  SI.setCondition(ArgValue);
+  return true;
+}
+
+static bool handleBranchExpect(BranchInst &BI) {
+  if (BI.isUnconditional())
+    return false;
+
+  // Handle non-optimized IR code like:
+  //   %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
+  //   %tobool = icmp ne i64 %expval, 0
+  //   br i1 %tobool, label %if.then, label %if.end
+  //
+  // Or the following simpler case:
+  //   %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
+  //   br i1 %expval, label %if.then, label %if.end
+
+  CallInst *CI;
+
+  ICmpInst *CmpI = dyn_cast<ICmpInst>(BI.getCondition());
+  if (!CmpI) {
+    CI = dyn_cast<CallInst>(BI.getCondition());
+  } else {
+    if (CmpI->getPredicate() != CmpInst::ICMP_NE)
+      return false;
+    CI = dyn_cast<CallInst>(CmpI->getOperand(0));
+  }
+
+  if (!CI)
+    return false;
+
+  Function *Fn = CI->getCalledFunction();
+  if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+    return false;
+
+  Value *ArgValue = CI->getArgOperand(0);
+  ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+  if (!ExpectedValue)
+    return false;
+
+  MDBuilder MDB(CI->getContext());
+  MDNode *Node;
+
+  // If expect value is equal to 1 it means that we are more likely to take
+  // branch 0, in other case more likely is branch 1.
+  if (ExpectedValue->isOne())
+    Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
+  else
+    Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
+
+  BI.setMetadata(LLVMContext::MD_prof, Node);
+
+  if (CmpI)
+    CmpI->setOperand(0, ArgValue);
+  else
+    BI.setCondition(ArgValue);
+  return true;
+}
+
+static bool lowerExpectIntrinsic(Function &F) {
+  bool Changed = false;
+
+  for (BasicBlock &BB : F) {
+    // Create "block_weights" metadata.
+    if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+      if (handleBranchExpect(*BI))
+        ExpectIntrinsicsHandled++;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
+      if (handleSwitchExpect(*SI))
+        ExpectIntrinsicsHandled++;
+    }
+
+    // Remove llvm.expect intrinsics.
+    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+      CallInst *CI = dyn_cast<CallInst>(BI++);
+      if (!CI)
+        continue;
+
+      Function *Fn = CI->getCalledFunction();
+      if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
+        Value *Exp = CI->getArgOperand(0);
+        CI->replaceAllUsesWith(Exp);
+        CI->eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F) {
+  if (lowerExpectIntrinsic(F))
+    return PreservedAnalyses::none();
+
+  return PreservedAnalyses::all();
+}
+
+namespace {
+/// \brief Legacy pass for lowering expect intrinsics out of the IR.
+///
+/// When this pass is run over a function it uses expect intrinsics which feed
+/// branches and switches to provide branch weight metadata for those
+/// terminators. It then removes the expect intrinsics from the IR so the rest
+/// of the optimizer can ignore them.
+class LowerExpectIntrinsic : public FunctionPass {
+public:
+  static char ID;
+  LowerExpectIntrinsic() : FunctionPass(ID) {
+    initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); }
+};
+}
+
+char LowerExpectIntrinsic::ID = 0;
+INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect",
+                "Lower 'expect' Intrinsics", false, false)
+
+FunctionPass *llvm::createLowerExpectIntrinsicPass() {
+  return new LowerExpectIntrinsic();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
new file mode 100644
index 0000000..6b43b0f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -0,0 +1,1304 @@
+//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs various transformations related to eliminating memcpy
+// calls, or transforming sets of stores into memset's.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "memcpyopt"
+
+STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemSetInfer, "Number of memsets inferred");
+STATISTIC(NumMoveToCpy,   "Number of memmoves converted to memcpy");
+STATISTIC(NumCpyToSet,    "Number of memcpys converted to memset");
+
+static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
+                                  bool &VariableIdxFound,
+                                  const DataLayout &DL) {
+  // Skip over the first indices.
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (unsigned i = 1; i != Idx; ++i, ++GTI)
+    /*skip along*/;
+
+  // Compute the offset implied by the rest of the indices.
+  int64_t Offset = 0;
+  for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
+    ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (!OpC)
+      return VariableIdxFound = true;
+    if (OpC->isZero()) continue;  // No offset.
+
+    // Handle struct indices, which add their field offset to the pointer.
+    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+      Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+      continue;
+    }
+
+    // Otherwise, we have a sequential type like an array or vector.  Multiply
+    // the index by the ElementSize.
+    uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
+    Offset += Size*OpC->getSExtValue();
+  }
+
+  return Offset;
+}
+
+/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and
+/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2
+/// might be &A[40]. In this case offset would be -8.
+static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
+                            const DataLayout &DL) {
+  Ptr1 = Ptr1->stripPointerCasts();
+  Ptr2 = Ptr2->stripPointerCasts();
+
+  // Handle the trivial case first.
+  if (Ptr1 == Ptr2) {
+    Offset = 0;
+    return true;
+  }
+
+  GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
+  GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);
+
+  bool VariableIdxFound = false;
+
+  // If one pointer is a GEP and the other isn't, then see if the GEP is a
+  // constant offset from the base, as in "P" and "gep P, 1".
+  if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) {
+    Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL);
+    return !VariableIdxFound;
+  }
+
+  if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) {
+    Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL);
+    return !VariableIdxFound;
+  }
+
+  // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
+  // base.  After that base, they may have some number of common (and
+  // potentially variable) indices.  After that they handle some constant
+  // offset, which determines their offset from each other.  At this point, we
+  // handle no other case.
+  if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
+    return false;
+
+  // Skip any common indices and track the GEP types.
+  unsigned Idx = 1;
+  for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
+    if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
+      break;
+
+  int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL);
+  int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL);
+  if (VariableIdxFound) return false;
+
+  Offset = Offset2-Offset1;
+  return true;
+}
+
+
+/// Represents a range of memset'd bytes with the ByteVal value.
+/// This allows us to analyze stores like:
+///   store 0 -> P+1
+///   store 0 -> P+0
+///   store 0 -> P+3
+///   store 0 -> P+2
+/// which sometimes happens with stores to arrays of structs etc.  When we see
+/// the first store, we make a range [1, 2).  The second store extends the range
+/// to [0, 2).  The third makes a new range [2, 3).  The fourth store joins the
+/// two ranges into [0, 3) which is memset'able.
+namespace {
+struct MemsetRange {
+  // Start/End - A semi range that describes the span that this range covers.
+  // The range is closed at the start and open at the end: [Start, End).
+  int64_t Start, End;
+
+  /// StartPtr - The getelementptr instruction that points to the start of the
+  /// range.
+  Value *StartPtr;
+
+  /// Alignment - The known alignment of the first store.
+  unsigned Alignment;
+
+  /// TheStores - The actual stores that make up this range.
+  SmallVector<Instruction*, 16> TheStores;
+
+  bool isProfitableToUseMemset(const DataLayout &DL) const;
+};
+} // end anon namespace
+
+bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
+  // If we found more than 4 stores to merge or 16 bytes, use memset.
+  if (TheStores.size() >= 4 || End-Start >= 16) return true;
+
+  // If there is nothing to merge, don't do anything.
+  if (TheStores.size() < 2) return false;
+
+  // If any of the stores are a memset, then it is always good to extend the
+  // memset.
+  for (Instruction *SI : TheStores)
+    if (!isa<StoreInst>(SI))
+      return true;
+
+  // Assume that the code generator is capable of merging pairs of stores
+  // together if it wants to.
+  if (TheStores.size() == 2) return false;
+
+  // If we have fewer than 8 stores, it can still be worthwhile to do this.
+  // For example, merging 4 i8 stores into an i32 store is useful almost always.
+  // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
+  // memset will be split into 2 32-bit stores anyway) and doing so can
+  // pessimize the llvm optimizer.
+  //
+  // Since we don't have perfect knowledge here, make some assumptions: assume
+  // the maximum GPR width is the same size as the largest legal integer
+  // size. If so, check to see whether we will end up actually reducing the
+  // number of stores used.
+  unsigned Bytes = unsigned(End-Start);
+  unsigned MaxIntSize = DL.getLargestLegalIntTypeSize();
+  if (MaxIntSize == 0)
+    MaxIntSize = 1;
+  unsigned NumPointerStores = Bytes / MaxIntSize;
+
+  // Assume the remaining bytes if any are done a byte at a time.
+  unsigned NumByteStores = Bytes % MaxIntSize;
+
+  // If we will reduce the # stores (according to this heuristic), do the
+  // transformation.  This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
+  // etc.
+  return TheStores.size() > NumPointerStores+NumByteStores;
+}
+
+
+namespace {
+class MemsetRanges {
+  /// A sorted list of the memset ranges.
+  SmallVector<MemsetRange, 8> Ranges;
+  typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
+  const DataLayout &DL;
+public:
+  MemsetRanges(const DataLayout &DL) : DL(DL) {}
+
+  typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator;
+  const_iterator begin() const { return Ranges.begin(); }
+  const_iterator end() const { return Ranges.end(); }
+  bool empty() const { return Ranges.empty(); }
+
+  void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+      addStore(OffsetFromFirst, SI);
+    else
+      addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
+  }
+
+  void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
+    int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+
+    addRange(OffsetFromFirst, StoreSize,
+             SI->getPointerOperand(), SI->getAlignment(), SI);
+  }
+
+  void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
+    int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+    addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
+  }
+
+  void addRange(int64_t Start, int64_t Size, Value *Ptr,
+                unsigned Alignment, Instruction *Inst);
+
+};
+
+} // end anon namespace
+
+
+/// Add a new store to the MemsetRanges data structure.  This adds a
+/// new range for the specified store at the specified offset, merging into
+/// existing ranges as appropriate.
+void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
+                            unsigned Alignment, Instruction *Inst) {
+  int64_t End = Start+Size;
+
+  range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start,
+    [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; });
+
+  // We now know that I == E, in which case we didn't find anything to merge
+  // with, or that Start <= I->End.  If End < I->Start or I == E, then we need
+  // to insert a new range.  Handle this now.
+  if (I == Ranges.end() || End < I->Start) {
+    MemsetRange &R = *Ranges.insert(I, MemsetRange());
+    R.Start        = Start;
+    R.End          = End;
+    R.StartPtr     = Ptr;
+    R.Alignment    = Alignment;
+    R.TheStores.push_back(Inst);
+    return;
+  }
+
+  // This store overlaps with I, add it.
+  I->TheStores.push_back(Inst);
+
+  // At this point, we may have an interval that completely contains our store.
+  // If so, just add it to the interval and return.
+  if (I->Start <= Start && I->End >= End)
+    return;
+
+  // Now we know that Start <= I->End and End >= I->Start so the range overlaps
+  // but is not entirely contained within the range.
+
+  // See if the range extends the start of the range.  In this case, it couldn't
+  // possibly cause it to join the prior range, because otherwise we would have
+  // stopped on *it*.
+  if (Start < I->Start) {
+    I->Start = Start;
+    I->StartPtr = Ptr;
+    I->Alignment = Alignment;
+  }
+
+  // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
+  // is in or right at the end of I), and that End >= I->Start.  Extend I out to
+  // End.
+  if (End > I->End) {
+    I->End = End;
+    range_iterator NextI = I;
+    while (++NextI != Ranges.end() && End >= NextI->Start) {
+      // Merge the range in.
+      I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
+      if (NextI->End > I->End)
+        I->End = NextI->End;
+      Ranges.erase(NextI);
+      NextI = I;
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                         MemCpyOpt Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+  class MemCpyOpt : public FunctionPass {
+    MemoryDependenceAnalysis *MD;
+    TargetLibraryInfo *TLI;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    MemCpyOpt() : FunctionPass(ID) {
+      initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
+      MD = nullptr;
+      TLI = nullptr;
+    }
+
+    bool runOnFunction(Function &F) override;
+
+  private:
+    // This transformation requires dominator postdominator info
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addRequired<AAResultsWrapperPass>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+      AU.addPreserved<MemoryDependenceAnalysis>();
+    }
+
+    // Helper functions
+    bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
+    bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
+    bool processMemCpy(MemCpyInst *M);
+    bool processMemMove(MemMoveInst *M);
+    bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
+                              uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
+    bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
+    bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
+    bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
+    bool processByValArgument(CallSite CS, unsigned ArgNo);
+    Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
+                                      Value *ByteVal);
+
+    bool iterateOnFunction(Function &F);
+  };
+
+  char MemCpyOpt::ID = 0;
+}
+
+/// The public interface to this file...
+FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
+
+INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
+                    false, false)
+
+/// When scanning forward over instructions, we look for some other patterns to
+/// fold away. In particular, this looks for stores to neighboring locations of
+/// memory. If it sees enough consecutive ones, it attempts to merge them
+/// together into a memcpy/memset.
+Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
+                                             Value *StartPtr, Value *ByteVal) {
+  const DataLayout &DL = StartInst->getModule()->getDataLayout();
+
+  // Okay, so we now have a single store that can be splatable.  Scan to find
+  // all subsequent stores of the same value to offset from the same pointer.
+  // Join these together into ranges, so we can decide whether contiguous blocks
+  // are stored.
+  MemsetRanges Ranges(DL);
+
+  BasicBlock::iterator BI(StartInst);
+  for (++BI; !isa<TerminatorInst>(BI); ++BI) {
+    if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
+      // If the instruction is readnone, ignore it, otherwise bail out.  We
+      // don't even allow readonly here because we don't want something like:
+      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+      if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+        break;
+      continue;
+    }
+
+    if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+      // If this is a store, see if we can merge it in.
+      if (!NextStore->isSimple()) break;
+
+      // Check to see if this stored value is of the same byte-splattable value.
+      if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
+        break;
+
+      // Check to see if this store is to a constant offset from the start ptr.
+      int64_t Offset;
+      if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset,
+                           DL))
+        break;
+
+      Ranges.addStore(Offset, NextStore);
+    } else {
+      MemSetInst *MSI = cast<MemSetInst>(BI);
+
+      if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
+          !isa<ConstantInt>(MSI->getLength()))
+        break;
+
+      // Check to see if this store is to a constant offset from the start ptr.
+      int64_t Offset;
+      if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL))
+        break;
+
+      Ranges.addMemSet(Offset, MSI);
+    }
+  }
+
+  // If we have no ranges, then we just had a single store with nothing that
+  // could be merged in.  This is a very common case of course.
+  if (Ranges.empty())
+    return nullptr;
+
+  // If we had at least one store that could be merged in, add the starting
+  // store as well.  We try to avoid this unless there is at least something
+  // interesting as a small compile-time optimization.
+  Ranges.addInst(0, StartInst);
+
+  // If we create any memsets, we put it right before the first instruction that
+  // isn't part of the memset block.  This ensure that the memset is dominated
+  // by any addressing instruction needed by the start of the block.
+  IRBuilder<> Builder(&*BI);
+
+  // Now that we have full information about ranges, loop over the ranges and
+  // emit memset's for anything big enough to be worthwhile.
+  Instruction *AMemSet = nullptr;
+  for (const MemsetRange &Range : Ranges) {
+
+    if (Range.TheStores.size() == 1) continue;
+
+    // If it is profitable to lower this range to memset, do so now.
+    if (!Range.isProfitableToUseMemset(DL))
+      continue;
+
+    // Otherwise, we do want to transform this!  Create a new memset.
+    // Get the starting pointer of the block.
+    StartPtr = Range.StartPtr;
+
+    // Determine alignment
+    unsigned Alignment = Range.Alignment;
+    if (Alignment == 0) {
+      Type *EltType =
+        cast<PointerType>(StartPtr->getType())->getElementType();
+      Alignment = DL.getABITypeAlignment(EltType);
+    }
+
+    AMemSet =
+      Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+
+    DEBUG(dbgs() << "Replace stores:\n";
+          for (Instruction *SI : Range.TheStores)
+            dbgs() << *SI << '\n';
+          dbgs() << "With: " << *AMemSet << '\n');
+
+    if (!Range.TheStores.empty())
+      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
+    // Zap all the stores.
+    for (Instruction *SI : Range.TheStores) {
+      MD->removeInstruction(SI);
+      SI->eraseFromParent();
+    }
+    ++NumMemSetInfer;
+  }
+
+  return AMemSet;
+}
+
+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+                                     const LoadInst *LI) {
+  unsigned StoreAlign = SI->getAlignment();
+  if (!StoreAlign)
+    StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+  unsigned LoadAlign = LI->getAlignment();
+  if (!LoadAlign)
+    LoadAlign = DL.getABITypeAlignment(LI->getType());
+
+  return std::min(StoreAlign, LoadAlign);
+}
+
+bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
+  if (!SI->isSimple()) return false;
+
+  // Avoid merging nontemporal stores since the resulting
+  // memcpy/memset would not be able to preserve the nontemporal hint.
+  // In theory we could teach how to propagate the !nontemporal metadata to
+  // memset calls. However, that change would force the backend to
+  // conservatively expand !nontemporal memset calls back to sequences of
+  // store instructions (effectively undoing the merging).
+  if (SI->getMetadata(LLVMContext::MD_nontemporal))
+    return false;
+
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+
+  // Load to store forwarding can be interpreted as memcpy.
+  if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
+    if (LI->isSimple() && LI->hasOneUse() &&
+        LI->getParent() == SI->getParent()) {
+
+      auto *T = LI->getType();
+      if (T->isAggregateType()) {
+        AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+        MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+        // We use alias analysis to check if an instruction may store to
+        // the memory we load from in between the load and the store. If
+        // such an instruction is found, we try to promote there instead
+        // of at the store position.
+        Instruction *P = SI;
+        for (BasicBlock::iterator I = ++LI->getIterator(), E = SI->getIterator();
+             I != E; ++I) {
+          if (!(AA.getModRefInfo(&*I, LoadLoc) & MRI_Mod))
+            continue;
+
+          // We found an instruction that may write to the loaded memory.
+          // We can try to promote at this position instead of the store
+          // position if nothing alias the store memory after this and the store
+          // destination is not in the range.
+          P = &*I;
+          for (; I != E; ++I) {
+            MemoryLocation StoreLoc = MemoryLocation::get(SI);
+            if (&*I == SI->getOperand(1) ||
+                AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {
+              P = nullptr;
+              break;
+            }
+          }
+
+          break;
+        }
+
+        // If a valid insertion position is found, then we can promote
+        // the load/store pair to a memcpy.
+        if (P) {
+          // If we load from memory that may alias the memory we store to,
+          // memmove must be used to preserve semantic. If not, memcpy can
+          // be used.
+          bool UseMemMove = false;
+          if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
+            UseMemMove = true;
+
+          unsigned Align = findCommonAlignment(DL, SI, LI);
+          uint64_t Size = DL.getTypeStoreSize(T);
+
+          IRBuilder<> Builder(P);
+          Instruction *M;
+          if (UseMemMove)
+            M = Builder.CreateMemMove(SI->getPointerOperand(),
+                                      LI->getPointerOperand(), Size,
+                                      Align, SI->isVolatile());
+          else
+            M = Builder.CreateMemCpy(SI->getPointerOperand(),
+                                     LI->getPointerOperand(), Size,
+                                     Align, SI->isVolatile());
+
+          DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI
+                       << " => " << *M << "\n");
+
+          MD->removeInstruction(SI);
+          SI->eraseFromParent();
+          MD->removeInstruction(LI);
+          LI->eraseFromParent();
+          ++NumMemCpyInstr;
+
+          // Make sure we do not invalidate the iterator.
+          BBI = M->getIterator();
+          return true;
+        }
+      }
+
+      // Detect cases where we're performing call slot forwarding, but
+      // happen to be using a load-store pair to implement it, rather than
+      // a memcpy.
+      MemDepResult ldep = MD->getDependency(LI);
+      CallInst *C = nullptr;
+      if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
+        C = dyn_cast<CallInst>(ldep.getInst());
+
+      if (C) {
+        // Check that nothing touches the dest of the "copy" between
+        // the call and the store.
+        AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+        MemoryLocation StoreLoc = MemoryLocation::get(SI);
+        for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator();
+             I != E; --I) {
+          if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {
+            C = nullptr;
+            break;
+          }
+        }
+      }
+
+      if (C) {
+        bool changed = performCallSlotOptzn(
+            LI, SI->getPointerOperand()->stripPointerCasts(),
+            LI->getPointerOperand()->stripPointerCasts(),
+            DL.getTypeStoreSize(SI->getOperand(0)->getType()),
+            findCommonAlignment(DL, SI, LI), C);
+        if (changed) {
+          MD->removeInstruction(SI);
+          SI->eraseFromParent();
+          MD->removeInstruction(LI);
+          LI->eraseFromParent();
+          ++NumMemCpyInstr;
+          return true;
+        }
+      }
+    }
+  }
+
+  // There are two cases that are interesting for this code to handle: memcpy
+  // and memset.  Right now we only handle memset.
+
+  // Ensure that the value being stored is something that can be memset'able a
+  // byte at a time like "0" or "-1" or any width, as well as things like
+  // 0xA0A0A0A0 and 0.0.
+  auto *V = SI->getOperand(0);
+  if (Value *ByteVal = isBytewiseValue(V)) {
+    if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
+                                              ByteVal)) {
+      BBI = I->getIterator(); // Don't invalidate iterator.
+      return true;
+    }
+
+    // If we have an aggregate, we try to promote it to memset regardless
+    // of opportunity for merging as it can expose optimization opportunities
+    // in subsequent passes.
+    auto *T = V->getType();
+    if (T->isAggregateType()) {
+      uint64_t Size = DL.getTypeStoreSize(T);
+      unsigned Align = SI->getAlignment();
+      if (!Align)
+        Align = DL.getABITypeAlignment(T);
+      IRBuilder<> Builder(SI);
+      auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal,
+                                     Size, Align, SI->isVolatile());
+
+      DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
+
+      MD->removeInstruction(SI);
+      SI->eraseFromParent();
+      NumMemSetInfer++;
+
+      // Make sure we do not invalidate the iterator.
+      BBI = M->getIterator();
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
+  // See if there is another memset or store neighboring this memset which
+  // allows us to widen out the memset to do a single larger store.
+  if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
+    if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
+                                              MSI->getValue())) {
+      BBI = I->getIterator(); // Don't invalidate iterator.
+      return true;
+    }
+  return false;
+}
+
+
+/// Takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a call slot optimization by having
+/// the call write its result directly into the destination of the memcpy.
+bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
+                                     Value *cpyDest, Value *cpySrc,
+                                     uint64_t cpyLen, unsigned cpyAlign,
+                                     CallInst *C) {
+  // The general transformation to keep in mind is
+  //
+  //   call @func(..., src, ...)
+  //   memcpy(dest, src, ...)
+  //
+  // ->
+  //
+  //   memcpy(dest, src, ...)
+  //   call @func(..., dest, ...)
+  //
+  // Since moving the memcpy is technically awkward, we additionally check that
+  // src only holds uninitialized values at the moment of the call, meaning that
+  // the memcpy can be discarded rather than moved.
+
+  // Deliberately get the source and destination with bitcasts stripped away,
+  // because we'll need to do type comparisons based on the underlying type.
+  CallSite CS(C);
+
+  // Require that src be an alloca.  This simplifies the reasoning considerably.
+  AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+  if (!srcAlloca)
+    return false;
+
+  ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
+  if (!srcArraySize)
+    return false;
+
+  const DataLayout &DL = cpy->getModule()->getDataLayout();
+  uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
+                     srcArraySize->getZExtValue();
+
+  if (cpyLen < srcSize)
+    return false;
+
+  // Check that accessing the first srcSize bytes of dest will not cause a
+  // trap.  Otherwise the transform is invalid since it might cause a trap
+  // to occur earlier than it otherwise would.
+  if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) {
+    // The destination is an alloca.  Check it is larger than srcSize.
+    ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
+    if (!destArraySize)
+      return false;
+
+    uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) *
+                        destArraySize->getZExtValue();
+
+    if (destSize < srcSize)
+      return false;
+  } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
+    if (A->getDereferenceableBytes() < srcSize) {
+      // If the destination is an sret parameter then only accesses that are
+      // outside of the returned struct type can trap.
+      if (!A->hasStructRetAttr())
+        return false;
+
+      Type *StructTy = cast<PointerType>(A->getType())->getElementType();
+      if (!StructTy->isSized()) {
+        // The call may never return and hence the copy-instruction may never
+        // be executed, and therefore it's not safe to say "the destination
+        // has at least <cpyLen> bytes, as implied by the copy-instruction",
+        return false;
+      }
+
+      uint64_t destSize = DL.getTypeAllocSize(StructTy);
+      if (destSize < srcSize)
+        return false;
+    }
+  } else {
+    return false;
+  }
+
+  // Check that dest points to memory that is at least as aligned as src.
+  unsigned srcAlign = srcAlloca->getAlignment();
+  if (!srcAlign)
+    srcAlign = DL.getABITypeAlignment(srcAlloca->getAllocatedType());
+  bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
+  // If dest is not aligned enough and we can't increase its alignment then
+  // bail out.
+  if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
+    return false;
+
+  // Check that src is not accessed except via the call and the memcpy.  This
+  // guarantees that it holds only undefined values when passed in (so the final
+  // memcpy can be dropped), that it is not read or written between the call and
+  // the memcpy, and that writing beyond the end of it is undefined.
+  SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(),
+                                   srcAlloca->user_end());
+  while (!srcUseList.empty()) {
+    User *U = srcUseList.pop_back_val();
+
+    if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
+      for (User *UU : U->users())
+        srcUseList.push_back(UU);
+      continue;
+    }
+    if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
+      if (!G->hasAllZeroIndices())
+        return false;
+
+      for (User *UU : U->users())
+        srcUseList.push_back(UU);
+      continue;
+    }
+    if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
+      if (IT->getIntrinsicID() == Intrinsic::lifetime_start ||
+          IT->getIntrinsicID() == Intrinsic::lifetime_end)
+        continue;
+
+    if (U != C && U != cpy)
+      return false;
+  }
+
+  // Check that src isn't captured by the called function since the
+  // transformation can cause aliasing issues in that case.
+  for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+    if (CS.getArgument(i) == cpySrc && !CS.doesNotCapture(i))
+      return false;
+
+  // Since we're changing the parameter to the callsite, we need to make sure
+  // that what would be the new parameter dominates the callsite.
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
+    if (!DT.dominates(cpyDestInst, C))
+      return false;
+
+  // In addition to knowing that the call does not access src in some
+  // unexpected manner, for example via a global, which we deduce from
+  // the use analysis, we also need to know that it does not sneakily
+  // access dest.  We rely on AA to figure this out for us.
+  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+  ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize);
+  // If necessary, perform additional analysis.
+  if (MR != MRI_NoModRef)
+    MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
+  if (MR != MRI_NoModRef)
+    return false;
+
+  // All the checks have passed, so do the transformation.
+  bool changedArgument = false;
+  for (unsigned i = 0; i < CS.arg_size(); ++i)
+    if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
+      Value *Dest = cpySrc->getType() == cpyDest->getType() ?  cpyDest
+        : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+                                      cpyDest->getName(), C);
+      changedArgument = true;
+      if (CS.getArgument(i)->getType() == Dest->getType())
+        CS.setArgument(i, Dest);
+      else
+        CS.setArgument(i, CastInst::CreatePointerCast(Dest,
+                          CS.getArgument(i)->getType(), Dest->getName(), C));
+    }
+
+  if (!changedArgument)
+    return false;
+
+  // If the destination wasn't sufficiently aligned then increase its alignment.
+  if (!isDestSufficientlyAligned) {
+    assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
+    cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+  }
+
+  // Drop any cached information about the call, because we may have changed
+  // its dependence information by changing its parameter.
+  MD->removeInstruction(C);
+
+  // Update AA metadata
+  // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
+  // handled here, but combineMetadata doesn't support them yet
+  unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+                         LLVMContext::MD_noalias,
+                         LLVMContext::MD_invariant_group};
+  combineMetadata(C, cpy, KnownIDs);
+
+  // Remove the memcpy.
+  MD->removeInstruction(cpy);
+  ++NumMemCpyInstr;
+
+  return true;
+}
+
+/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
+/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
+bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {
+  // We can only transforms memcpy's where the dest of one is the source of the
+  // other.
+  if (M->getSource() != MDep->getDest() || MDep->isVolatile())
+    return false;
+
+  // If dep instruction is reading from our current input, then it is a noop
+  // transfer and substituting the input won't change this instruction.  Just
+  // ignore the input and let someone else zap MDep.  This handles cases like:
+  //    memcpy(a <- a)
+  //    memcpy(b <- a)
+  if (M->getSource() == MDep->getSource())
+    return false;
+
+  // Second, the length of the memcpy's must be the same, or the preceding one
+  // must be larger than the following one.
+  ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+  ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
+  if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
+    return false;
+
+  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  // Verify that the copied-from memory doesn't change in between the two
+  // transfers.  For example, in:
+  //    memcpy(a <- b)
+  //    *b = 42;
+  //    memcpy(c <- a)
+  // It would be invalid to transform the second memcpy into memcpy(c <- b).
+  //
+  // TODO: If the code between M and MDep is transparent to the destination "c",
+  // then we could still perform the xform by moving M up to the first memcpy.
+  //
+  // NOTE: This is conservative, it will stop on any read from the source loc,
+  // not just the defining memcpy.
+  MemDepResult SourceDep =
+      MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
+                                   M->getIterator(), M->getParent());
+  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+    return false;
+
+  // If the dest of the second might alias the source of the first, then the
+  // source and dest might overlap.  We still want to eliminate the intermediate
+  // value, but we have to generate a memmove instead of memcpy.
+  bool UseMemMove = false;
+  if (!AA.isNoAlias(MemoryLocation::getForDest(M),
+                    MemoryLocation::getForSource(MDep)))
+    UseMemMove = true;
+
+  // If all checks passed, then we can transform M.
+
+  // Make sure to use the lesser of the alignment of the source and the dest
+  // since we're changing where we're reading from, but don't want to increase
+  // the alignment past what can be read from or written to.
+  // TODO: Is this worth it if we're creating a less aligned memcpy? For
+  // example we could be moving from movaps -> movq on x86.
+  unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
+
+  IRBuilder<> Builder(M);
+  if (UseMemMove)
+    Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+                          Align, M->isVolatile());
+  else
+    Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+                         Align, M->isVolatile());
+
+  // Remove the instruction we're replacing.
+  MD->removeInstruction(M);
+  M->eraseFromParent();
+  ++NumMemCpyInstr;
+  return true;
+}
+
+/// We've found that the (upward scanning) memory dependence of \p MemCpy is
+/// \p MemSet.  Try to simplify \p MemSet to only set the trailing bytes that
+/// weren't copied over by \p MemCpy.
+///
+/// In other words, transform:
+/// \code
+///   memset(dst, c, dst_size);
+///   memcpy(dst, src, src_size);
+/// \endcode
+/// into:
+/// \code
+///   memcpy(dst, src, src_size);
+///   memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
+/// \endcode
+bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+                                              MemSetInst *MemSet) {
+  // We can only transform memset/memcpy with the same destination.
+  if (MemSet->getDest() != MemCpy->getDest())
+    return false;
+
+  // Check that there are no other dependencies on the memset destination.
+  MemDepResult DstDepInfo =
+      MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false,
+                                   MemCpy->getIterator(), MemCpy->getParent());
+  if (DstDepInfo.getInst() != MemSet)
+    return false;
+
+  // Use the same i8* dest as the memcpy, killing the memset dest if different.
+  Value *Dest = MemCpy->getRawDest();
+  Value *DestSize = MemSet->getLength();
+  Value *SrcSize = MemCpy->getLength();
+
+  // By default, create an unaligned memset.
+  unsigned Align = 1;
+  // If Dest is aligned, and SrcSize is constant, use the minimum alignment
+  // of the sum.
+  const unsigned DestAlign =
+      std::max(MemSet->getAlignment(), MemCpy->getAlignment());
+  if (DestAlign > 1)
+    if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
+      Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
+
+  IRBuilder<> Builder(MemCpy);
+
+  // If the sizes have different types, zext the smaller one.
+  if (DestSize->getType() != SrcSize->getType()) {
+    if (DestSize->getType()->getIntegerBitWidth() >
+        SrcSize->getType()->getIntegerBitWidth())
+      SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
+    else
+      DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
+  }
+
+  Value *MemsetLen =
+      Builder.CreateSelect(Builder.CreateICmpULE(DestSize, SrcSize),
+                           ConstantInt::getNullValue(DestSize->getType()),
+                           Builder.CreateSub(DestSize, SrcSize));
+  Builder.CreateMemSet(Builder.CreateGEP(Dest, SrcSize), MemSet->getOperand(1),
+                       MemsetLen, Align);
+
+  MD->removeInstruction(MemSet);
+  MemSet->eraseFromParent();
+  return true;
+}
+
+/// Transform memcpy to memset when its source was just memset.
+/// In other words, turn:
+/// \code
+///   memset(dst1, c, dst1_size);
+///   memcpy(dst2, dst1, dst2_size);
+/// \endcode
+/// into:
+/// \code
+///   memset(dst1, c, dst1_size);
+///   memset(dst2, c, dst2_size);
+/// \endcode
+/// When dst2_size <= dst1_size.
+///
+/// The \p MemCpy must have a Constant length.
+bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
+                                           MemSetInst *MemSet) {
+  // This only makes sense on memcpy(..., memset(...), ...).
+  if (MemSet->getRawDest() != MemCpy->getRawSource())
+    return false;
+
+  ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+  ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
+  // Make sure the memcpy doesn't read any more than what the memset wrote.
+  // Don't worry about sizes larger than i64.
+  if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue())
+    return false;
+
+  IRBuilder<> Builder(MemCpy);
+  Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
+                       CopySize, MemCpy->getAlignment());
+  return true;
+}
+
+/// Perform simplification of memcpy's.  If we have memcpy A
+/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
+/// B to be a memcpy from X to Z (or potentially a memmove, depending on
+/// circumstances). This allows later passes to remove the first memcpy
+/// altogether.
+bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
+  // We can only optimize non-volatile memcpy's.
+  if (M->isVolatile()) return false;
+
+  // If the source and destination of the memcpy are the same, then zap it.
+  if (M->getSource() == M->getDest()) {
+    MD->removeInstruction(M);
+    M->eraseFromParent();
+    return false;
+  }
+
+  // If copying from a constant, try to turn the memcpy into a memset.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
+    if (GV->isConstant() && GV->hasDefinitiveInitializer())
+      if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
+        IRBuilder<> Builder(M);
+        Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
+                             M->getAlignment(), false);
+        MD->removeInstruction(M);
+        M->eraseFromParent();
+        ++NumCpyToSet;
+        return true;
+      }
+
+  MemDepResult DepInfo = MD->getDependency(M);
+
+  // Try to turn a partially redundant memset + memcpy into
+  // memcpy + smaller memset.  We don't need the memcpy size for this.
+  if (DepInfo.isClobber())
+    if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
+      if (processMemSetMemCpyDependence(M, MDep))
+        return true;
+
+  // The optimizations after this point require the memcpy size.
+  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+  if (!CopySize) return false;
+
+  // There are four possible optimizations we can do for memcpy:
+  //   a) memcpy-memcpy xform which exposes redundance for DSE.
+  //   b) call-memcpy xform for return slot optimization.
+  //   c) memcpy from freshly alloca'd space or space that has just started its
+  //      lifetime copies undefined data, and we can therefore eliminate the
+  //      memcpy in favor of the data that was already at the destination.
+  //   d) memcpy from a just-memset'd source can be turned into memset.
+  if (DepInfo.isClobber()) {
+    if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+      if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
+                               CopySize->getZExtValue(), M->getAlignment(),
+                               C)) {
+        MD->removeInstruction(M);
+        M->eraseFromParent();
+        return true;
+      }
+    }
+  }
+
+  MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
+  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
+      SrcLoc, true, M->getIterator(), M->getParent());
+
+  if (SrcDepInfo.isClobber()) {
+    if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
+      return processMemCpyMemCpyDependence(M, MDep);
+  } else if (SrcDepInfo.isDef()) {
+    Instruction *I = SrcDepInfo.getInst();
+    bool hasUndefContents = false;
+
+    if (isa<AllocaInst>(I)) {
+      hasUndefContents = true;
+    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+        if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+          if (LTSize->getZExtValue() >= CopySize->getZExtValue())
+            hasUndefContents = true;
+    }
+
+    if (hasUndefContents) {
+      MD->removeInstruction(M);
+      M->eraseFromParent();
+      ++NumMemCpyInstr;
+      return true;
+    }
+  }
+
+  if (SrcDepInfo.isClobber())
+    if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+      if (performMemCpyToMemSetOptzn(M, MDep)) {
+        MD->removeInstruction(M);
+        M->eraseFromParent();
+        ++NumCpyToSet;
+        return true;
+      }
+
+  return false;
+}
+
+/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
+/// not to alias.
+bool MemCpyOpt::processMemMove(MemMoveInst *M) {
+  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  if (!TLI->has(LibFunc::memmove))
+    return false;
+
+  // See if the pointers alias.
+  if (!AA.isNoAlias(MemoryLocation::getForDest(M),
+                    MemoryLocation::getForSource(M)))
+    return false;
+
+  DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");
+
+  // If not, then we know we can transform this.
+  Type *ArgTys[3] = { M->getRawDest()->getType(),
+                      M->getRawSource()->getType(),
+                      M->getLength()->getType() };
+  M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
+                                                 Intrinsic::memcpy, ArgTys));
+
+  // MemDep may have over conservative information about this instruction, just
+  // conservatively flush it from the cache.
+  MD->removeInstruction(M);
+
+  ++NumMoveToCpy;
+  return true;
+}
+
+/// This is called on every byval argument in call sites.
+bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
+  const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();
+  // Find out what feeds this byval argument.
+  Value *ByValArg = CS.getArgument(ArgNo);
+  Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
+  uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
+  MemDepResult DepInfo = MD->getPointerDependencyFrom(
+      MemoryLocation(ByValArg, ByValSize), true,
+      CS.getInstruction()->getIterator(), CS.getInstruction()->getParent());
+  if (!DepInfo.isClobber())
+    return false;
+
+  // If the byval argument isn't fed by a memcpy, ignore it.  If it is fed by
+  // a memcpy, see if we can byval from the source of the memcpy instead of the
+  // result.
+  MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
+  if (!MDep || MDep->isVolatile() ||
+      ByValArg->stripPointerCasts() != MDep->getDest())
+    return false;
+
+  // The length of the memcpy must be larger or equal to the size of the byval.
+  ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+  if (!C1 || C1->getValue().getZExtValue() < ByValSize)
+    return false;
+
+  // Get the alignment of the byval.  If the call doesn't specify the alignment,
+  // then it is some target specific value that we can't know.
+  unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
+  if (ByValAlign == 0) return false;
+
+  // If it is greater than the memcpy, then we check to see if we can force the
+  // source of the memcpy to the alignment we need.  If we fail, we bail out.
+  AssumptionCache &AC =
+      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+          *CS->getParent()->getParent());
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  if (MDep->getAlignment() < ByValAlign &&
+      getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
+                                 CS.getInstruction(), &AC, &DT) < ByValAlign)
+    return false;
+
+  // Verify that the copied-from memory doesn't change in between the memcpy and
+  // the byval call.
+  //    memcpy(a <- b)
+  //    *b = 42;
+  //    foo(*a)
+  // It would be invalid to transform the second memcpy into foo(*b).
+  //
+  // NOTE: This is conservative, it will stop on any read from the source loc,
+  // not just the defining memcpy.
+  MemDepResult SourceDep = MD->getPointerDependencyFrom(
+      MemoryLocation::getForSource(MDep), false,
+      CS.getInstruction()->getIterator(), MDep->getParent());
+  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+    return false;
+
+  Value *TmpCast = MDep->getSource();
+  if (MDep->getSource()->getType() != ByValArg->getType())
+    TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
+                              "tmpcast", CS.getInstruction());
+
+  DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n"
+               << "  " << *MDep << "\n"
+               << "  " << *CS.getInstruction() << "\n");
+
+  // Otherwise we're good!  Update the byval argument.
+  CS.setArgument(ArgNo, TmpCast);
+  ++NumMemCpyInstr;
+  return true;
+}
+
+/// Executes one iteration of MemCpyOpt.
+bool MemCpyOpt::iterateOnFunction(Function &F) {
+  bool MadeChange = false;
+
+  // Walk all instruction in the function.
+  for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
+    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+      // Avoid invalidating the iterator.
+      Instruction *I = &*BI++;
+
+      bool RepeatInstruction = false;
+
+      if (StoreInst *SI = dyn_cast<StoreInst>(I))
+        MadeChange |= processStore(SI, BI);
+      else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
+        RepeatInstruction = processMemSet(M, BI);
+      else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
+        RepeatInstruction = processMemCpy(M);
+      else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
+        RepeatInstruction = processMemMove(M);
+      else if (auto CS = CallSite(I)) {
+        for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+          if (CS.isByValArgument(i))
+            MadeChange |= processByValArgument(CS, i);
+      }
+
+      // Reprocess the instruction if desired.
+      if (RepeatInstruction) {
+        if (BI != BB->begin()) --BI;
+        MadeChange = true;
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+/// This is the main transformation entry point for a function.
+bool MemCpyOpt::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  bool MadeChange = false;
+  MD = &getAnalysis<MemoryDependenceAnalysis>();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+  // If we don't have at least memset and memcpy, there is little point of doing
+  // anything here.  These are required by a freestanding implementation, so if
+  // even they are disabled, there is no point in trying hard.
+  if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy))
+    return false;
+
+  while (1) {
+    if (!iterateOnFunction(F))
+      break;
+    MadeChange = true;
+  }
+
+  MD = nullptr;
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
new file mode 100644
index 0000000..c812d61
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -0,0 +1,587 @@
+//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//! \file
+//! \brief This pass performs merges of loads and stores on both sides of a
+//  diamond (hammock). It hoists the loads and sinks the stores.
+//
+// The algorithm iteratively hoists two loads to the same address out of a
+// diamond (hammock) and merges them into a single load in the header. Similar
+// it sinks and merges two stores to the tail block (footer). The algorithm
+// iterates over the instructions of one side of the diamond and attempts to
+// find a matching load/store on the other side. It hoists / sinks when it
+// thinks it safe to do so.  This optimization helps with eg. hiding load
+// latencies, triggering if-conversion, and reducing static code size.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Example:
+// Diamond shaped code before merge:
+//
+//            header:
+//                     br %cond, label %if.then, label %if.else
+//                        +                    +
+//                       +                      +
+//                      +                        +
+//            if.then:                         if.else:
+//               %lt = load %addr_l               %le = load %addr_l
+//               <use %lt>                        <use %le>
+//               <...>                            <...>
+//               store %st, %addr_s               store %se, %addr_s
+//               br label %if.end                 br label %if.end
+//                     +                         +
+//                      +                       +
+//                       +                     +
+//            if.end ("footer"):
+//                     <...>
+//
+// Diamond shaped code after merge:
+//
+//            header:
+//                     %l = load %addr_l
+//                     br %cond, label %if.then, label %if.else
+//                        +                    +
+//                       +                      +
+//                      +                        +
+//            if.then:                         if.else:
+//               <use %l>                         <use %l>
+//               <...>                            <...>
+//               br label %if.end                 br label %if.end
+//                      +                        +
+//                       +                      +
+//                        +                    +
+//            if.end ("footer"):
+//                     %s.sink = phi [%st, if.then], [%se, if.else]
+//                     <...>
+//                     store %s.sink, %addr_s
+//                     <...>
+//
+//
+//===----------------------- TODO -----------------------------------------===//
+//
+// 1) Generalize to regions other than diamonds
+// 2) Be more aggressive merging memory operations
+// Note that both changes require register pressure control
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mldst-motion"
+
+//===----------------------------------------------------------------------===//
+//                         MergedLoadStoreMotion Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class MergedLoadStoreMotion : public FunctionPass {
+  AliasAnalysis *AA;
+  MemoryDependenceAnalysis *MD;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  MergedLoadStoreMotion()
+      : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) {
+    initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  // This transformation requires dominator postdominator info
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<MemoryDependenceAnalysis>();
+  }
+
+  // Helper routines
+
+  ///
+  /// \brief Remove instruction from parent and update memory dependence
+  /// analysis.
+  ///
+  void removeInstruction(Instruction *Inst);
+  BasicBlock *getDiamondTail(BasicBlock *BB);
+  bool isDiamondHead(BasicBlock *BB);
+  // Routines for hoisting loads
+  bool isLoadHoistBarrierInRange(const Instruction& Start,
+                                 const Instruction& End,
+                                 LoadInst* LI);
+  LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI);
+  void hoistInstruction(BasicBlock *BB, Instruction *HoistCand,
+                        Instruction *ElseInst);
+  bool isSafeToHoist(Instruction *I) const;
+  bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst);
+  bool mergeLoads(BasicBlock *BB);
+  // Routines for sinking stores
+  StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
+  PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
+  bool isStoreSinkBarrierInRange(const Instruction &Start,
+                                 const Instruction &End, MemoryLocation Loc);
+  bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
+  bool mergeStores(BasicBlock *BB);
+  // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
+  // where Size0 and Size1 are the #instructions on the two sides of
+  // the diamond. The constant chosen here is arbitrary. Compiler Time
+  // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
+  const int MagicCompileTimeControl;
+};
+
+char MergedLoadStoreMotion::ID = 0;
+} // anonymous namespace
+
+///
+/// \brief createMergedLoadStoreMotionPass - The public interface to this file.
+///
+FunctionPass *llvm::createMergedLoadStoreMotionPass() {
+  return new MergedLoadStoreMotion();
+}
+
+INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion",
+                      "MergedLoadStoreMotion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion",
+                    "MergedLoadStoreMotion", false, false)
+
+///
+/// \brief Remove instruction from parent and update memory dependence analysis.
+///
+void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) {
+  // Notify the memory dependence analysis.
+  if (MD) {
+    MD->removeInstruction(Inst);
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+      MD->invalidateCachedPointerInfo(LI->getPointerOperand());
+    if (Inst->getType()->getScalarType()->isPointerTy()) {
+      MD->invalidateCachedPointerInfo(Inst);
+    }
+  }
+  Inst->eraseFromParent();
+}
+
+///
+/// \brief Return tail block of a diamond.
+///
+BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
+  assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
+  BranchInst *BI = (BranchInst *)(BB->getTerminator());
+  BasicBlock *Succ0 = BI->getSuccessor(0);
+  BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0);
+  return Tail;
+}
+
+///
+/// \brief True when BB is the head of a diamond (hammock)
+///
+bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
+  if (!BB)
+    return false;
+  if (!isa<BranchInst>(BB->getTerminator()))
+    return false;
+  if (BB->getTerminator()->getNumSuccessors() != 2)
+    return false;
+
+  BranchInst *BI = (BranchInst *)(BB->getTerminator());
+  BasicBlock *Succ0 = BI->getSuccessor(0);
+  BasicBlock *Succ1 = BI->getSuccessor(1);
+
+  if (!Succ0->getSinglePredecessor() ||
+      Succ0->getTerminator()->getNumSuccessors() != 1)
+    return false;
+  if (!Succ1->getSinglePredecessor() ||
+      Succ1->getTerminator()->getNumSuccessors() != 1)
+    return false;
+
+  BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0);
+  // Ignore triangles.
+  if (Succ1->getTerminator()->getSuccessor(0) != Tail)
+    return false;
+  return true;
+}
+
+///
+/// \brief True when instruction is a hoist barrier for a load
+///
+/// Whenever an instruction could possibly modify the value
+/// being loaded or protect against the load from happening
+/// it is considered a hoist barrier.
+///
+bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, 
+                                                      const Instruction& End,
+                                                      LoadInst* LI) {
+  MemoryLocation Loc = MemoryLocation::get(LI);
+  return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod);
+}
+
+///
+/// \brief Decide if a load can be hoisted
+///
+/// When there is a load in \p BB to the same address as \p LI
+/// and it can be hoisted from \p BB, return that load.
+/// Otherwise return Null.
+///
+LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1,
+                                                   LoadInst *Load0) {
+
+  for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE;
+       ++BBI) {
+    Instruction *Inst = &*BBI;
+
+    // Only merge and hoist loads when their result in used only in BB
+    if (!isa<LoadInst>(Inst) || Inst->isUsedOutsideOfBlock(BB1))
+      continue;
+
+    LoadInst *Load1 = dyn_cast<LoadInst>(Inst);
+    BasicBlock *BB0 = Load0->getParent();
+
+    MemoryLocation Loc0 = MemoryLocation::get(Load0);
+    MemoryLocation Loc1 = MemoryLocation::get(Load1);
+    if (AA->isMustAlias(Loc0, Loc1) && Load0->isSameOperationAs(Load1) &&
+        !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1) &&
+        !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0)) {
+      return Load1;
+    }
+  }
+  return nullptr;
+}
+
+///
+/// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into
+/// \p BB
+///
+/// BB is the head of a diamond
+///
+void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
+                                             Instruction *HoistCand,
+                                             Instruction *ElseInst) {
+  DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump();
+        dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n";
+        dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n");
+  // Hoist the instruction.
+  assert(HoistCand->getParent() != BB);
+
+  // Intersect optional metadata.
+  HoistCand->intersectOptionalDataWith(ElseInst);
+  HoistCand->dropUnknownNonDebugMetadata();
+
+  // Prepend point for instruction insert
+  Instruction *HoistPt = BB->getTerminator();
+
+  // Merged instruction
+  Instruction *HoistedInst = HoistCand->clone();
+
+  // Hoist instruction.
+  HoistedInst->insertBefore(HoistPt);
+
+  HoistCand->replaceAllUsesWith(HoistedInst);
+  removeInstruction(HoistCand);
+  // Replace the else block instruction.
+  ElseInst->replaceAllUsesWith(HoistedInst);
+  removeInstruction(ElseInst);
+}
+
+///
+/// \brief Return true if no operand of \p I is defined in I's parent block
+///
+bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const {
+  BasicBlock *Parent = I->getParent();
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+    Instruction *Instr = dyn_cast<Instruction>(I->getOperand(i));
+    if (Instr && Instr->getParent() == Parent)
+      return false;
+  }
+  return true;
+}
+
+///
+/// \brief Merge two equivalent loads and GEPs and hoist into diamond head
+///
+bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0,
+                                      LoadInst *L1) {
+  // Only one definition?
+  Instruction *A0 = dyn_cast<Instruction>(L0->getPointerOperand());
+  Instruction *A1 = dyn_cast<Instruction>(L1->getPointerOperand());
+  if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) &&
+      A0->hasOneUse() && (A0->getParent() == L0->getParent()) &&
+      A1->hasOneUse() && (A1->getParent() == L1->getParent()) &&
+      isa<GetElementPtrInst>(A0)) {
+    DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump();
+          dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n";
+          dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n");
+    hoistInstruction(BB, A0, A1);
+    hoistInstruction(BB, L0, L1);
+    return true;
+  } else
+    return false;
+}
+
+///
+/// \brief Try to hoist two loads to same address into diamond header
+///
+/// Starting from a diamond head block, iterate over the instructions in one
+/// successor block and try to match a load in the second successor.
+///
+bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
+  bool MergedLoads = false;
+  assert(isDiamondHead(BB));
+  BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+  BasicBlock *Succ0 = BI->getSuccessor(0);
+  BasicBlock *Succ1 = BI->getSuccessor(1);
+  // #Instructions in Succ1 for Compile Time Control
+  int Size1 = Succ1->size();
+  int NLoads = 0;
+  for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end();
+       BBI != BBE;) {
+    Instruction *I = &*BBI;
+    ++BBI;
+
+    // Only move non-simple (atomic, volatile) loads.
+    LoadInst *L0 = dyn_cast<LoadInst>(I);
+    if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0))
+      continue;
+
+    ++NLoads;
+    if (NLoads * Size1 >= MagicCompileTimeControl)
+      break;
+    if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) {
+      bool Res = hoistLoad(BB, L0, L1);
+      MergedLoads |= Res;
+      // Don't attempt to hoist above loads that had not been hoisted.
+      if (!Res)
+        break;
+    }
+  }
+  return MergedLoads;
+}
+
+///
+/// \brief True when instruction is a sink barrier for a store
+/// located in Loc
+///
+/// Whenever an instruction could possibly read or modify the
+/// value being stored or protect against the store from
+/// happening it is considered a sink barrier.
+///
+bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
+                                                      const Instruction &End,
+                                                      MemoryLocation Loc) {
+  return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef);
+}
+
+///
+/// \brief Check if \p BB contains a store to the same address as \p SI
+///
+/// \return The store in \p  when it is safe to sink. Otherwise return Null.
+///
+StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
+                                                   StoreInst *Store0) {
+  DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+  BasicBlock *BB0 = Store0->getParent();
+  for (BasicBlock::reverse_iterator RBI = BB1->rbegin(), RBE = BB1->rend();
+       RBI != RBE; ++RBI) {
+    Instruction *Inst = &*RBI;
+
+    if (!isa<StoreInst>(Inst))
+       continue;
+
+    StoreInst *Store1 = cast<StoreInst>(Inst);
+
+    MemoryLocation Loc0 = MemoryLocation::get(Store0);
+    MemoryLocation Loc1 = MemoryLocation::get(Store1);
+    if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
+      !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store1))),
+                                 BB1->back(), Loc1) &&
+      !isStoreSinkBarrierInRange(*(std::next(BasicBlock::iterator(Store0))),
+                                 BB0->back(), Loc0)) {
+      return Store1;
+    }
+  }
+  return nullptr;
+}
+
+///
+/// \brief Create a PHI node in BB for the operands of S0 and S1
+///
+PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
+                                              StoreInst *S1) {
+  // Create a phi if the values mismatch.
+  PHINode *NewPN = nullptr;
+  Value *Opd1 = S0->getValueOperand();
+  Value *Opd2 = S1->getValueOperand();
+  if (Opd1 != Opd2) {
+    NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
+                            &BB->front());
+    NewPN->addIncoming(Opd1, S0->getParent());
+    NewPN->addIncoming(Opd2, S1->getParent());
+    if (MD && NewPN->getType()->getScalarType()->isPointerTy())
+      MD->invalidateCachedPointerInfo(NewPN);
+  }
+  return NewPN;
+}
+
+///
+/// \brief Merge two stores to same address and sink into \p BB
+///
+/// Also sinks GEP instruction computing the store address
+///
+bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
+                                      StoreInst *S1) {
+  // Only one definition?
+  Instruction *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+  Instruction *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+  if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
+      (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
+      (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
+    DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
+          dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
+          dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
+    // Hoist the instruction.
+    BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
+    // Intersect optional metadata.
+    S0->intersectOptionalDataWith(S1);
+    S0->dropUnknownNonDebugMetadata();
+
+    // Create the new store to be inserted at the join point.
+    StoreInst *SNew = (StoreInst *)(S0->clone());
+    Instruction *ANew = A0->clone();
+    SNew->insertBefore(&*InsertPt);
+    ANew->insertBefore(SNew);
+
+    assert(S0->getParent() == A0->getParent());
+    assert(S1->getParent() == A1->getParent());
+
+    PHINode *NewPN = getPHIOperand(BB, S0, S1);
+    // New PHI operand? Use it.
+    if (NewPN)
+      SNew->setOperand(0, NewPN);
+    removeInstruction(S0);
+    removeInstruction(S1);
+    A0->replaceAllUsesWith(ANew);
+    removeInstruction(A0);
+    A1->replaceAllUsesWith(ANew);
+    removeInstruction(A1);
+    return true;
+  }
+  return false;
+}
+
+///
+/// \brief True when two stores are equivalent and can sink into the footer
+///
+/// Starting from a diamond tail block, iterate over the instructions in one
+/// predecessor block and try to match a store in the second predecessor.
+///
+bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
+
+  bool MergedStores = false;
+  assert(T && "Footer of a diamond cannot be empty");
+
+  pred_iterator PI = pred_begin(T), E = pred_end(T);
+  assert(PI != E);
+  BasicBlock *Pred0 = *PI;
+  ++PI;
+  BasicBlock *Pred1 = *PI;
+  ++PI;
+  // tail block  of a diamond/hammock?
+  if (Pred0 == Pred1)
+    return false; // No.
+  if (PI != E)
+    return false; // No. More than 2 predecessors.
+
+  // #Instructions in Succ1 for Compile Time Control
+  int Size1 = Pred1->size();
+  int NStores = 0;
+
+  for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
+       RBI != RBE;) {
+
+    Instruction *I = &*RBI;
+    ++RBI;
+
+    // Sink move non-simple (atomic, volatile) stores
+    if (!isa<StoreInst>(I))
+      continue;
+    StoreInst *S0 = (StoreInst *)I;
+    if (!S0->isSimple())
+      continue;
+
+    ++NStores;
+    if (NStores * Size1 >= MagicCompileTimeControl)
+      break;
+    if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) {
+      bool Res = sinkStore(T, S0, S1);
+      MergedStores |= Res;
+      // Don't attempt to sink below stores that had to stick around
+      // But after removal of a store and some of its feeding
+      // instruction search again from the beginning since the iterator
+      // is likely stale at this point.
+      if (!Res)
+        break;
+      else {
+        RBI = Pred0->rbegin();
+        RBE = Pred0->rend();
+        DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
+      }
+    }
+  }
+  return MergedStores;
+}
+
+///
+/// \brief Run the transformation for each function
+///
+bool MergedLoadStoreMotion::runOnFunction(Function &F) {
+  MD = getAnalysisIfAvailable<MemoryDependenceAnalysis>();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  bool Changed = false;
+  DEBUG(dbgs() << "Instruction Merger\n");
+
+  // Merge unconditional branches, allowing PRE to catch more
+  // optimization opportunities.
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
+    BasicBlock *BB = &*FI++;
+
+    // Hoist equivalent loads and sink stores
+    // outside diamonds when possible
+    if (isDiamondHead(BB)) {
+      Changed |= mergeLoads(BB);
+      Changed |= mergeStores(getDiamondTail(BB));
+    }
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
new file mode 100644
index 0000000..c8f885e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -0,0 +1,577 @@
+//===- NaryReassociate.cpp - Reassociate n-ary expressions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates n-ary add expressions and eliminates the redundancy
+// exposed by the reassociation.
+//
+// A motivating example:
+//
+//   void foo(int a, int b) {
+//     bar(a + b);
+//     bar((a + 2) + b);
+//   }
+//
+// An ideal compiler should reassociate (a + 2) + b to (a + b) + 2 and simplify
+// the above code to
+//
+//   int t = a + b;
+//   bar(t);
+//   bar(t + 2);
+//
+// However, the Reassociate pass is unable to do that because it processes each
+// instruction individually and believes (a + 2) + b is the best form according
+// to its rank system.
+//
+// To address this limitation, NaryReassociate reassociates an expression in a
+// form that reuses existing instructions. As a result, NaryReassociate can
+// reassociate (a + 2) + b in the example to (a + b) + 2 because it detects that
+// (a + b) is computed before.
+//
+// NaryReassociate works as follows. For every instruction in the form of (a +
+// b) + c, it checks whether a + c or b + c is already computed by a dominating
+// instruction. If so, it then reassociates (a + b) + c into (a + c) + b or (b +
+// c) + a and removes the redundancy accordingly. To efficiently look up whether
+// an expression is computed before, we store each instruction seen and its SCEV
+// into an SCEV-to-instruction map.
+//
+// Although the algorithm pattern-matches only ternary additions, it
+// automatically handles many >3-ary expressions by walking through the function
+// in the depth-first order. For example, given
+//
+//   (a + c) + d
+//   ((a + b) + c) + d
+//
+// NaryReassociate first rewrites (a + b) + c to (a + c) + b, and then rewrites
+// ((a + c) + b) + d into ((a + c) + d) + b.
+//
+// Finally, the above dominator-based algorithm may need to be run multiple
+// iterations before emitting optimal code. One source of this need is that we
+// only split an operand when it is used only once. The above algorithm can
+// eliminate an instruction and decrease the usage count of its operands. As a
+// result, an instruction that previously had multiple uses may become a
+// single-use instruction and thus eligible for split consideration. For
+// example,
+//
+//   ac = a + c
+//   ab = a + b
+//   abc = ab + c
+//   ab2 = ab + b
+//   ab2c = ab2 + c
+//
+// In the first iteration, we cannot reassociate abc to ac+b because ab is used
+// twice. However, we can reassociate ab2c to abc+b in the first iteration. As a
+// result, ab2 becomes dead and ab will be used only once in the second
+// iteration.
+//
+// Limitations and TODO items:
+//
+// 1) We only considers n-ary adds and muls for now. This should be extended
+// and generalized.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "nary-reassociate"
+
+namespace {
+class NaryReassociate : public FunctionPass {
+public:
+  static char ID;
+
+  NaryReassociate(): FunctionPass(ID) {
+    initializeNaryReassociatePass(*PassRegistry::getPassRegistry());
+  }
+
+  bool doInitialization(Module &M) override {
+    DL = &M.getDataLayout();
+    return false;
+  }
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<ScalarEvolutionWrapperPass>();
+    AU.addPreserved<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+private:
+  // Runs only one iteration of the dominator-based algorithm. See the header
+  // comments for why we need multiple iterations.
+  bool doOneIteration(Function &F);
+
+  // Reassociates I for better CSE.
+  Instruction *tryReassociate(Instruction *I);
+
+  // Reassociate GEP for better CSE.
+  Instruction *tryReassociateGEP(GetElementPtrInst *GEP);
+  // Try splitting GEP at the I-th index and see whether either part can be
+  // CSE'ed. This is a helper function for tryReassociateGEP.
+  //
+  // \p IndexedType The element type indexed by GEP's I-th index. This is
+  //                equivalent to
+  //                  GEP->getIndexedType(GEP->getPointerOperand(), 0-th index,
+  //                                      ..., i-th index).
+  GetElementPtrInst *tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+                                              unsigned I, Type *IndexedType);
+  // Given GEP's I-th index = LHS + RHS, see whether &Base[..][LHS][..] or
+  // &Base[..][RHS][..] can be CSE'ed and rewrite GEP accordingly.
+  GetElementPtrInst *tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+                                              unsigned I, Value *LHS,
+                                              Value *RHS, Type *IndexedType);
+
+  // Reassociate binary operators for better CSE.
+  Instruction *tryReassociateBinaryOp(BinaryOperator *I);
+
+  // A helper function for tryReassociateBinaryOp. LHS and RHS are explicitly
+  // passed.
+  Instruction *tryReassociateBinaryOp(Value *LHS, Value *RHS,
+                                      BinaryOperator *I);
+  // Rewrites I to (LHS op RHS) if LHS is computed already.
+  Instruction *tryReassociatedBinaryOp(const SCEV *LHS, Value *RHS,
+                                       BinaryOperator *I);
+
+  // Tries to match Op1 and Op2 by using V.
+  bool matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, Value *&Op2);
+
+  // Gets SCEV for (LHS op RHS).
+  const SCEV *getBinarySCEV(BinaryOperator *I, const SCEV *LHS,
+                            const SCEV *RHS);
+
+  // Returns the closest dominator of \c Dominatee that computes
+  // \c CandidateExpr. Returns null if not found.
+  Instruction *findClosestMatchingDominator(const SCEV *CandidateExpr,
+                                            Instruction *Dominatee);
+  // GetElementPtrInst implicitly sign-extends an index if the index is shorter
+  // than the pointer size. This function returns whether Index is shorter than
+  // GEP's pointer size, i.e., whether Index needs to be sign-extended in order
+  // to be an index of GEP.
+  bool requiresSignExtension(Value *Index, GetElementPtrInst *GEP);
+
+  AssumptionCache *AC;
+  const DataLayout *DL;
+  DominatorTree *DT;
+  ScalarEvolution *SE;
+  TargetLibraryInfo *TLI;
+  TargetTransformInfo *TTI;
+  // A lookup table quickly telling which instructions compute the given SCEV.
+  // Note that there can be multiple instructions at different locations
+  // computing to the same SCEV, so we map a SCEV to an instruction list.  For
+  // example,
+  //
+  //   if (p1)
+  //     foo(a + b);
+  //   if (p2)
+  //     bar(a + b);
+  DenseMap<const SCEV *, SmallVector<WeakVH, 2>> SeenExprs;
+};
+} // anonymous namespace
+
+char NaryReassociate::ID = 0;
+INITIALIZE_PASS_BEGIN(NaryReassociate, "nary-reassociate", "Nary reassociation",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(NaryReassociate, "nary-reassociate", "Nary reassociation",
+                    false, false)
+
+FunctionPass *llvm::createNaryReassociatePass() {
+  return new NaryReassociate();
+}
+
+bool NaryReassociate::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  bool Changed = false, ChangedInThisIteration;
+  do {
+    ChangedInThisIteration = doOneIteration(F);
+    Changed |= ChangedInThisIteration;
+  } while (ChangedInThisIteration);
+  return Changed;
+}
+
+// Whitelist the instruction types NaryReassociate handles for now.
+static bool isPotentiallyNaryReassociable(Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::GetElementPtr:
+  case Instruction::Mul:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool NaryReassociate::doOneIteration(Function &F) {
+  bool Changed = false;
+  SeenExprs.clear();
+  // Process the basic blocks in pre-order of the dominator tree. This order
+  // ensures that all bases of a candidate are in Candidates when we process it.
+  for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT);
+       Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) {
+    BasicBlock *BB = Node->getBlock();
+    for (auto I = BB->begin(); I != BB->end(); ++I) {
+      if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(&*I)) {
+        const SCEV *OldSCEV = SE->getSCEV(&*I);
+        if (Instruction *NewI = tryReassociate(&*I)) {
+          Changed = true;
+          SE->forgetValue(&*I);
+          I->replaceAllUsesWith(NewI);
+          // If SeenExprs constains I's WeakVH, that entry will be replaced with
+          // nullptr.
+          RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI);
+          I = NewI->getIterator();
+        }
+        // Add the rewritten instruction to SeenExprs; the original instruction
+        // is deleted.
+        const SCEV *NewSCEV = SE->getSCEV(&*I);
+        SeenExprs[NewSCEV].push_back(WeakVH(&*I));
+        // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
+        // is equivalent to I. However, ScalarEvolution::getSCEV may
+        // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose
+        // we reassociate
+        //   I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4
+        // to
+        //   NewI = &a[sext(i)] + sext(j).
+        //
+        // ScalarEvolution computes
+        //   getSCEV(I)    = a + 4 * sext(i + j)
+        //   getSCEV(newI) = a + 4 * sext(i) + 4 * sext(j)
+        // which are different SCEVs.
+        //
+        // To alleviate this issue of ScalarEvolution not always capturing
+        // equivalence, we add I to SeenExprs[OldSCEV] as well so that we can
+        // map both SCEV before and after tryReassociate(I) to I.
+        //
+        // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll.
+        if (NewSCEV != OldSCEV)
+          SeenExprs[OldSCEV].push_back(WeakVH(&*I));
+      }
+    }
+  }
+  return Changed;
+}
+
+Instruction *NaryReassociate::tryReassociate(Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Mul:
+    return tryReassociateBinaryOp(cast<BinaryOperator>(I));
+  case Instruction::GetElementPtr:
+    return tryReassociateGEP(cast<GetElementPtrInst>(I));
+  default:
+    llvm_unreachable("should be filtered out by isPotentiallyNaryReassociable");
+  }
+}
+
+// FIXME: extract this method into TTI->getGEPCost.
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+                          const TargetTransformInfo *TTI,
+                          const DataLayout *DL) {
+  GlobalVariable *BaseGV = nullptr;
+  int64_t BaseOffset = 0;
+  bool HasBaseReg = false;
+  int64_t Scale = 0;
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand()))
+    BaseGV = GV;
+  else
+    HasBaseReg = true;
+
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+      if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) {
+        BaseOffset += ConstIdx->getSExtValue() * ElementSize;
+      } else {
+        // Needs scale register.
+        if (Scale != 0) {
+          // No addressing mode takes two scale registers.
+          return false;
+        }
+        Scale = ElementSize;
+      }
+    } else {
+      StructType *STy = cast<StructType>(*GTI);
+      uint64_t Field = cast<ConstantInt>(*I)->getZExtValue();
+      BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field);
+    }
+  }
+
+  unsigned AddrSpace = GEP->getPointerAddressSpace();
+  return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV,
+                                    BaseOffset, HasBaseReg, Scale, AddrSpace);
+}
+
+Instruction *NaryReassociate::tryReassociateGEP(GetElementPtrInst *GEP) {
+  // Not worth reassociating GEP if it is foldable.
+  if (isGEPFoldable(GEP, TTI, DL))
+    return nullptr;
+
+  gep_type_iterator GTI = gep_type_begin(*GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
+    if (isa<SequentialType>(*GTI++)) {
+      if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I - 1, *GTI)) {
+        return NewGEP;
+      }
+    }
+  }
+  return nullptr;
+}
+
+bool NaryReassociate::requiresSignExtension(Value *Index,
+                                            GetElementPtrInst *GEP) {
+  unsigned PointerSizeInBits =
+      DL->getPointerSizeInBits(GEP->getType()->getPointerAddressSpace());
+  return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits;
+}
+
+GetElementPtrInst *
+NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I,
+                                          Type *IndexedType) {
+  Value *IndexToSplit = GEP->getOperand(I + 1);
+  if (SExtInst *SExt = dyn_cast<SExtInst>(IndexToSplit)) {
+    IndexToSplit = SExt->getOperand(0);
+  } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) {
+    // zext can be treated as sext if the source is non-negative.
+    if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT))
+      IndexToSplit = ZExt->getOperand(0);
+  }
+
+  if (AddOperator *AO = dyn_cast<AddOperator>(IndexToSplit)) {
+    // If the I-th index needs sext and the underlying add is not equipped with
+    // nsw, we cannot split the add because
+    //   sext(LHS + RHS) != sext(LHS) + sext(RHS).
+    if (requiresSignExtension(IndexToSplit, GEP) &&
+        computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) !=
+            OverflowResult::NeverOverflows)
+      return nullptr;
+
+    Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
+    // IndexToSplit = LHS + RHS.
+    if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+      return NewGEP;
+    // Symmetrically, try IndexToSplit = RHS + LHS.
+    if (LHS != RHS) {
+      if (auto *NewGEP =
+              tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+        return NewGEP;
+    }
+  }
+  return nullptr;
+}
+
+GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex(
+    GetElementPtrInst *GEP, unsigned I, Value *LHS, Value *RHS,
+    Type *IndexedType) {
+  // Look for GEP's closest dominator that has the same SCEV as GEP except that
+  // the I-th index is replaced with LHS.
+  SmallVector<const SCEV *, 4> IndexExprs;
+  for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
+    IndexExprs.push_back(SE->getSCEV(*Index));
+  // Replace the I-th index with LHS.
+  IndexExprs[I] = SE->getSCEV(LHS);
+  if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) &&
+      DL->getTypeSizeInBits(LHS->getType()) <
+          DL->getTypeSizeInBits(GEP->getOperand(I)->getType())) {
+    // Zero-extend LHS if it is non-negative. InstCombine canonicalizes sext to
+    // zext if the source operand is proved non-negative. We should do that
+    // consistently so that CandidateExpr more likely appears before. See
+    // @reassociate_gep_assume for an example of this canonicalization.
+    IndexExprs[I] =
+        SE->getZeroExtendExpr(IndexExprs[I], GEP->getOperand(I)->getType());
+  }
+  const SCEV *CandidateExpr = SE->getGEPExpr(
+      GEP->getSourceElementType(), SE->getSCEV(GEP->getPointerOperand()),
+      IndexExprs, GEP->isInBounds());
+
+  Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP);
+  if (Candidate == nullptr)
+    return nullptr;
+
+  IRBuilder<> Builder(GEP);
+  // Candidate does not necessarily have the same pointer type as GEP. Use
+  // bitcast or pointer cast to make sure they have the same type, so that the
+  // later RAUW doesn't complain.
+  Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType());
+  assert(Candidate->getType() == GEP->getType());
+
+  // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)
+  uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType);
+  Type *ElementType = GEP->getType()->getElementType();
+  uint64_t ElementSize = DL->getTypeAllocSize(ElementType);
+  // Another less rare case: because I is not necessarily the last index of the
+  // GEP, the size of the type at the I-th index (IndexedSize) is not
+  // necessarily divisible by ElementSize. For example,
+  //
+  // #pragma pack(1)
+  // struct S {
+  //   int a[3];
+  //   int64 b[8];
+  // };
+  // #pragma pack()
+  //
+  // sizeof(S) = 100 is indivisible by sizeof(int64) = 8.
+  //
+  // TODO: bail out on this case for now. We could emit uglygep.
+  if (IndexedSize % ElementSize != 0)
+    return nullptr;
+
+  // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0])));
+  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+  if (RHS->getType() != IntPtrTy)
+    RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy);
+  if (IndexedSize != ElementSize) {
+    RHS = Builder.CreateMul(
+        RHS, ConstantInt::get(IntPtrTy, IndexedSize / ElementSize));
+  }
+  GetElementPtrInst *NewGEP =
+      cast<GetElementPtrInst>(Builder.CreateGEP(Candidate, RHS));
+  NewGEP->setIsInBounds(GEP->isInBounds());
+  NewGEP->takeName(GEP);
+  return NewGEP;
+}
+
+Instruction *NaryReassociate::tryReassociateBinaryOp(BinaryOperator *I) {
+  Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+  if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))
+    return NewI;
+  if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I))
+    return NewI;
+  return nullptr;
+}
+
+Instruction *NaryReassociate::tryReassociateBinaryOp(Value *LHS, Value *RHS,
+                                                     BinaryOperator *I) {
+  Value *A = nullptr, *B = nullptr;
+  // To be conservative, we reassociate I only when it is the only user of (A op
+  // B).
+  if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) {
+    // I = (A op B) op RHS
+    //   = (A op RHS) op B or (B op RHS) op A
+    const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
+    const SCEV *RHSExpr = SE->getSCEV(RHS);
+    if (BExpr != RHSExpr) {
+      if (auto *NewI =
+              tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+        return NewI;
+    }
+    if (AExpr != RHSExpr) {
+      if (auto *NewI =
+              tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+        return NewI;
+    }
+  }
+  return nullptr;
+}
+
+Instruction *NaryReassociate::tryReassociatedBinaryOp(const SCEV *LHSExpr,
+                                                      Value *RHS,
+                                                      BinaryOperator *I) {
+  // Look for the closest dominator LHS of I that computes LHSExpr, and replace
+  // I with LHS op RHS.
+  auto *LHS = findClosestMatchingDominator(LHSExpr, I);
+  if (LHS == nullptr)
+    return nullptr;
+
+  Instruction *NewI = nullptr;
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+    NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I);
+    break;
+  case Instruction::Mul:
+    NewI = BinaryOperator::CreateMul(LHS, RHS, "", I);
+    break;
+  default:
+    llvm_unreachable("Unexpected instruction.");
+  }
+  NewI->takeName(I);
+  return NewI;
+}
+
+bool NaryReassociate::matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1,
+                                     Value *&Op2) {
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+    return match(V, m_Add(m_Value(Op1), m_Value(Op2)));
+  case Instruction::Mul:
+    return match(V, m_Mul(m_Value(Op1), m_Value(Op2)));
+  default:
+    llvm_unreachable("Unexpected instruction.");
+  }
+  return false;
+}
+
+const SCEV *NaryReassociate::getBinarySCEV(BinaryOperator *I, const SCEV *LHS,
+                                           const SCEV *RHS) {
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+    return SE->getAddExpr(LHS, RHS);
+  case Instruction::Mul:
+    return SE->getMulExpr(LHS, RHS);
+  default:
+    llvm_unreachable("Unexpected instruction.");
+  }
+  return nullptr;
+}
+
+Instruction *
+NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr,
+                                              Instruction *Dominatee) {
+  auto Pos = SeenExprs.find(CandidateExpr);
+  if (Pos == SeenExprs.end())
+    return nullptr;
+
+  auto &Candidates = Pos->second;
+  // Because we process the basic blocks in pre-order of the dominator tree, a
+  // candidate that doesn't dominate the current instruction won't dominate any
+  // future instruction either. Therefore, we pop it out of the stack. This
+  // optimization makes the algorithm O(n).
+  while (!Candidates.empty()) {
+    // Candidates stores WeakVHs, so a candidate can be nullptr if it's removed
+    // during rewriting.
+    if (Value *Candidate = Candidates.back()) {
+      Instruction *CandidateInstruction = cast<Instruction>(Candidate);
+      if (DT->dominates(CandidateInstruction, Dominatee))
+        return CandidateInstruction;
+    }
+    Candidates.pop_back();
+  }
+  return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
new file mode 100644
index 0000000..9f26f78
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -0,0 +1,163 @@
+//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to partially inline the fast path of well-known library
+// functions, such as using square-root instructions for cases where sqrt()
+// does not need to set errno.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "partially-inline-libcalls"
+
+namespace {
+  class PartiallyInlineLibCalls : public FunctionPass {
+  public:
+    static char ID;
+
+    PartiallyInlineLibCalls() :
+      FunctionPass(ID) {
+      initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override;
+    bool runOnFunction(Function &F) override;
+
+  private:
+    /// Optimize calls to sqrt.
+    bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
+                      BasicBlock &CurrBB, Function::iterator &BB);
+  };
+
+  char PartiallyInlineLibCalls::ID = 0;
+}
+
+INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls",
+                "Partially inline calls to library functions", false, false)
+
+void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
+  FunctionPass::getAnalysisUsage(AU);
+}
+
+bool PartiallyInlineLibCalls::runOnFunction(Function &F) {
+  bool Changed = false;
+  Function::iterator CurrBB;
+  TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  const TargetTransformInfo *TTI =
+      &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
+    CurrBB = BB++;
+
+    for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
+         II != IE; ++II) {
+      CallInst *Call = dyn_cast<CallInst>(&*II);
+      Function *CalledFunc;
+
+      if (!Call || !(CalledFunc = Call->getCalledFunction()))
+        continue;
+
+      // Skip if function either has local linkage or is not a known library
+      // function.
+      LibFunc::Func LibFunc;
+      if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
+          !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
+        continue;
+
+      switch (LibFunc) {
+      case LibFunc::sqrtf:
+      case LibFunc::sqrt:
+        if (TTI->haveFastSqrt(Call->getType()) &&
+            optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+          break;
+        continue;
+      default:
+        continue;
+      }
+
+      Changed = true;
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
+                                           Function *CalledFunc,
+                                           BasicBlock &CurrBB,
+                                           Function::iterator &BB) {
+  // There is no need to change the IR, since backend will emit sqrt
+  // instruction if the call has already been marked read-only.
+  if (Call->onlyReadsMemory())
+    return false;
+
+  // The call must have the expected result type.
+  if (!Call->getType()->isFloatingPointTy())
+    return false;
+
+  // Do the following transformation:
+  //
+  // (before)
+  // dst = sqrt(src)
+  //
+  // (after)
+  // v0 = sqrt_noreadmem(src) # native sqrt instruction.
+  // if (v0 is a NaN)
+  //   v1 = sqrt(src)         # library call.
+  // dst = phi(v0, v1)
+  //
+
+  // Move all instructions following Call to newly created block JoinBB.
+  // Create phi and replace all uses.
+  BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
+  IRBuilder<> Builder(JoinBB, JoinBB->begin());
+  PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
+  Call->replaceAllUsesWith(Phi);
+
+  // Create basic block LibCallBB and insert a call to library function sqrt.
+  BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
+                                             CurrBB.getParent(), JoinBB);
+  Builder.SetInsertPoint(LibCallBB);
+  Instruction *LibCall = Call->clone();
+  Builder.Insert(LibCall);
+  Builder.CreateBr(JoinBB);
+
+  // Add attribute "readnone" so that backend can use a native sqrt instruction
+  // for this call. Insert a FP compare instruction and a conditional branch
+  // at the end of CurrBB.
+  Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+  CurrBB.getTerminator()->eraseFromParent();
+  Builder.SetInsertPoint(&CurrBB);
+  Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
+  Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
+
+  // Add phi operands.
+  Phi->addIncoming(Call, &CurrBB);
+  Phi->addIncoming(LibCall, LibCallBB);
+
+  BB = JoinBB->getIterator();
+  return true;
+}
+
+FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
+  return new PartiallyInlineLibCalls();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
new file mode 100644
index 0000000..b56b355
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -0,0 +1,953 @@
+//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Place garbage collection safepoints at appropriate locations in the IR. This
+// does not make relocation semantics or variable liveness explicit.  That's
+// done by RewriteStatepointsForGC.
+//
+// Terminology:
+// - A call is said to be "parseable" if there is a stack map generated for the
+// return PC of the call.  A runtime can determine where values listed in the
+// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located
+// on the stack when the code is suspended inside such a call.  Every parse
+// point is represented by a call wrapped in an gc.statepoint intrinsic.
+// - A "poll" is an explicit check in the generated code to determine if the
+// runtime needs the generated code to cooperate by calling a helper routine
+// and thus suspending its execution at a known state. The call to the helper
+// routine will be parseable.  The (gc & runtime specific) logic of a poll is
+// assumed to be provided in a function of the name "gc.safepoint_poll".
+//
+// We aim to insert polls such that running code can quickly be brought to a
+// well defined state for inspection by the collector.  In the current
+// implementation, this is done via the insertion of poll sites at method entry
+// and the backedge of most loops.  We try to avoid inserting more polls than
+// are necessary to ensure a finite period between poll sites.  This is not
+// because the poll itself is expensive in the generated code; it's not.  Polls
+// do tend to impact the optimizer itself in negative ways; we'd like to avoid
+// perturbing the optimization of the method as much as we can.
+//
+// We also need to make most call sites parseable.  The callee might execute a
+// poll (or otherwise be inspected by the GC).  If so, the entire stack
+// (including the suspended frame of the current method) must be parseable.
+//
+// This pass will insert:
+// - Call parse points ("call safepoints") for any call which may need to
+// reach a safepoint during the execution of the callee function.
+// - Backedge safepoint polls and entry safepoint polls to ensure that
+// executing code reaches a safepoint poll in a finite amount of time.
+//
+// We do not currently support return statepoints, but adding them would not
+// be hard.  They are not required for correctness - entry safepoints are an
+// alternative - but some GCs may prefer them.  Patches welcome.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#define DEBUG_TYPE "safepoint-placement"
+STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
+STATISTIC(NumCallSafepoints, "Number of call safepoints inserted");
+STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
+
+STATISTIC(CallInLoop, "Number of loops w/o safepoints due to calls in loop");
+STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution");
+
+using namespace llvm;
+
+// Ignore opportunities to avoid placing safepoints on backedges, useful for
+// validation
+static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
+                                  cl::init(false));
+
+/// How narrow does the trip count of a loop have to be to have to be considered
+/// "counted"?  Counted loops do not get safepoints at backedges.
+static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width",
+                                         cl::Hidden, cl::init(32));
+
+// If true, split the backedge of a loop when placing the safepoint, otherwise
+// split the latch block itself.  Both are useful to support for
+// experimentation, but in practice, it looks like splitting the backedge
+// optimizes better.
+static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
+                                   cl::init(false));
+
+// Print tracing output
+static cl::opt<bool> TraceLSP("spp-trace", cl::Hidden, cl::init(false));
+
+namespace {
+
+/// An analysis pass whose purpose is to identify each of the backedges in
+/// the function which require a safepoint poll to be inserted.
+struct PlaceBackedgeSafepointsImpl : public FunctionPass {
+  static char ID;
+
+  /// The output of the pass - gives a list of each backedge (described by
+  /// pointing at the branch) which need a poll inserted.
+  std::vector<TerminatorInst *> PollLocations;
+
+  /// True unless we're running spp-no-calls in which case we need to disable
+  /// the call-dependent placement opts.
+  bool CallSafepointsEnabled;
+
+  ScalarEvolution *SE = nullptr;
+  DominatorTree *DT = nullptr;
+  LoopInfo *LI = nullptr;
+
+  PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
+      : FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) {
+    initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnLoop(Loop *);
+  void runOnLoopAndSubLoops(Loop *L) {
+    // Visit all the subloops
+    for (auto I = L->begin(), E = L->end(); I != E; I++)
+      runOnLoopAndSubLoops(*I);
+    runOnLoop(L);
+  }
+
+  bool runOnFunction(Function &F) override {
+    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    for (auto I = LI->begin(), E = LI->end(); I != E; I++) {
+      runOnLoopAndSubLoops(*I);
+    }
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    // We no longer modify the IR at all in this pass.  Thus all
+    // analysis are preserved.
+    AU.setPreservesAll();
+  }
+};
+}
+
+static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false));
+
+namespace {
+struct PlaceSafepoints : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+
+  PlaceSafepoints() : FunctionPass(ID) {
+    initializePlaceSafepointsPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // We modify the graph wholesale (inlining, block insertion, etc).  We
+    // preserve nothing at the moment.  We could potentially preserve dom tree
+    // if that was worth doing
+  }
+};
+}
+
+// Insert a safepoint poll immediately before the given instruction.  Does
+// not handle the parsability of state at the runtime call, that's the
+// callers job.
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+                    std::vector<CallSite> &ParsePointsNeeded /*rval*/);
+
+static bool needsStatepoint(const CallSite &CS) {
+  if (callsGCLeafFunction(CS))
+    return false;
+  if (CS.isCall()) {
+    CallInst *call = cast<CallInst>(CS.getInstruction());
+    if (call->isInlineAsm())
+      return false;
+  }
+  if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS)) {
+    return false;
+  }
+  return true;
+}
+
+static Value *ReplaceWithStatepoint(const CallSite &CS);
+
+/// Returns true if this loop is known to contain a call safepoint which
+/// must unconditionally execute on any iteration of the loop which returns
+/// to the loop header via an edge from Pred.  Returns a conservative correct
+/// answer; i.e. false is always valid.
+static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
+                                               BasicBlock *Pred,
+                                               DominatorTree &DT) {
+  // In general, we're looking for any cut of the graph which ensures
+  // there's a call safepoint along every edge between Header and Pred.
+  // For the moment, we look only for the 'cuts' that consist of a single call
+  // instruction in a block which is dominated by the Header and dominates the
+  // loop latch (Pred) block.  Somewhat surprisingly, walking the entire chain
+  // of such dominating blocks gets substantially more occurrences than just
+  // checking the Pred and Header blocks themselves.  This may be due to the
+  // density of loop exit conditions caused by range and null checks.
+  // TODO: structure this as an analysis pass, cache the result for subloops,
+  // avoid dom tree recalculations
+  assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?");
+
+  BasicBlock *Current = Pred;
+  while (true) {
+    for (Instruction &I : *Current) {
+      if (auto CS = CallSite(&I))
+        // Note: Technically, needing a safepoint isn't quite the right
+        // condition here.  We should instead be checking if the target method
+        // has an
+        // unconditional poll. In practice, this is only a theoretical concern
+        // since we don't have any methods with conditional-only safepoint
+        // polls.
+        if (needsStatepoint(CS))
+          return true;
+    }
+
+    if (Current == Header)
+      break;
+    Current = DT.getNode(Current)->getIDom()->getBlock();
+  }
+
+  return false;
+}
+
+/// Returns true if this loop is known to terminate in a finite number of
+/// iterations.  Note that this function may return false for a loop which
+/// does actual terminate in a finite constant number of iterations due to
+/// conservatism in the analysis.
+static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
+                                    BasicBlock *Pred) {
+  // A conservative bound on the loop as a whole.
+  const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L);
+  if (MaxTrips != SE->getCouldNotCompute() &&
+      SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(
+          CountedLoopTripWidth))
+    return true;
+
+  // If this is a conditional branch to the header with the alternate path
+  // being outside the loop, we can ask questions about the execution frequency
+  // of the exit block.
+  if (L->isLoopExiting(Pred)) {
+    // This returns an exact expression only.  TODO: We really only need an
+    // upper bound here, but SE doesn't expose that.
+    const SCEV *MaxExec = SE->getExitCount(L, Pred);
+    if (MaxExec != SE->getCouldNotCompute() &&
+        SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(
+            CountedLoopTripWidth))
+        return true;
+  }
+
+  return /* not finite */ false;
+}
+
+static void scanOneBB(Instruction *start, Instruction *end,
+                      std::vector<CallInst *> &calls,
+                      std::set<BasicBlock *> &seen,
+                      std::vector<BasicBlock *> &worklist) {
+  for (BasicBlock::iterator itr(start);
+       itr != start->getParent()->end() && itr != BasicBlock::iterator(end);
+       itr++) {
+    if (CallInst *CI = dyn_cast<CallInst>(&*itr)) {
+      calls.push_back(CI);
+    }
+    // FIXME: This code does not handle invokes
+    assert(!dyn_cast<InvokeInst>(&*itr) &&
+           "support for invokes in poll code needed");
+    // Only add the successor blocks if we reach the terminator instruction
+    // without encountering end first
+    if (itr->isTerminator()) {
+      BasicBlock *BB = itr->getParent();
+      for (BasicBlock *Succ : successors(BB)) {
+        if (seen.count(Succ) == 0) {
+          worklist.push_back(Succ);
+          seen.insert(Succ);
+        }
+      }
+    }
+  }
+}
+static void scanInlinedCode(Instruction *start, Instruction *end,
+                            std::vector<CallInst *> &calls,
+                            std::set<BasicBlock *> &seen) {
+  calls.clear();
+  std::vector<BasicBlock *> worklist;
+  seen.insert(start->getParent());
+  scanOneBB(start, end, calls, seen, worklist);
+  while (!worklist.empty()) {
+    BasicBlock *BB = worklist.back();
+    worklist.pop_back();
+    scanOneBB(&*BB->begin(), end, calls, seen, worklist);
+  }
+}
+
+bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
+  // Loop through all loop latches (branches controlling backedges).  We need
+  // to place a safepoint on every backedge (potentially).
+  // Note: In common usage, there will be only one edge due to LoopSimplify
+  // having run sometime earlier in the pipeline, but this code must be correct
+  // w.r.t. loops with multiple backedges.
+  BasicBlock *header = L->getHeader();
+  SmallVector<BasicBlock*, 16> LoopLatches;
+  L->getLoopLatches(LoopLatches);
+  for (BasicBlock *pred : LoopLatches) {
+    assert(L->contains(pred));
+
+    // Make a policy decision about whether this loop needs a safepoint or
+    // not.  Note that this is about unburdening the optimizer in loops, not
+    // avoiding the runtime cost of the actual safepoint.
+    if (!AllBackedges) {
+      if (mustBeFiniteCountedLoop(L, SE, pred)) {
+        if (TraceLSP)
+          errs() << "skipping safepoint placement in finite loop\n";
+        FiniteExecution++;
+        continue;
+      }
+      if (CallSafepointsEnabled &&
+          containsUnconditionalCallSafepoint(L, header, pred, *DT)) {
+        // Note: This is only semantically legal since we won't do any further
+        // IPO or inlining before the actual call insertion..  If we hadn't, we
+        // might latter loose this call safepoint.
+        if (TraceLSP)
+          errs() << "skipping safepoint placement due to unconditional call\n";
+        CallInLoop++;
+        continue;
+      }
+    }
+
+    // TODO: We can create an inner loop which runs a finite number of
+    // iterations with an outer loop which contains a safepoint.  This would
+    // not help runtime performance that much, but it might help our ability to
+    // optimize the inner loop.
+
+    // Safepoint insertion would involve creating a new basic block (as the
+    // target of the current backedge) which does the safepoint (of all live
+    // variables) and branches to the true header
+    TerminatorInst *term = pred->getTerminator();
+
+    if (TraceLSP) {
+      errs() << "[LSP] terminator instruction: ";
+      term->dump();
+    }
+
+    PollLocations.push_back(term);
+  }
+
+  return false;
+}
+
+/// Returns true if an entry safepoint is not required before this callsite in
+/// the caller function.
+static bool doesNotRequireEntrySafepointBefore(const CallSite &CS) {
+  Instruction *Inst = CS.getInstruction();
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::experimental_gc_statepoint:
+    case Intrinsic::experimental_patchpoint_void:
+    case Intrinsic::experimental_patchpoint_i64:
+      // The can wrap an actual call which may grow the stack by an unbounded
+      // amount or run forever.
+      return false;
+    default:
+      // Most LLVM intrinsics are things which do not expand to actual calls, or
+      // at least if they do, are leaf functions that cause only finite stack
+      // growth.  In particular, the optimizer likes to form things like memsets
+      // out of stores in the original IR.  Another important example is
+      // llvm.localescape which must occur in the entry block.  Inserting a
+      // safepoint before it is not legal since it could push the localescape
+      // out of the entry block.
+      return true;
+    }
+  }
+  return false;
+}
+
+static Instruction *findLocationForEntrySafepoint(Function &F,
+                                                  DominatorTree &DT) {
+
+  // Conceptually, this poll needs to be on method entry, but in
+  // practice, we place it as late in the entry block as possible.  We
+  // can place it as late as we want as long as it dominates all calls
+  // that can grow the stack.  This, combined with backedge polls,
+  // give us all the progress guarantees we need.
+
+  // hasNextInstruction and nextInstruction are used to iterate
+  // through a "straight line" execution sequence.
+
+  auto hasNextInstruction = [](Instruction *I) {
+    if (!I->isTerminator()) {
+      return true;
+    }
+    BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
+    return nextBB && (nextBB->getUniquePredecessor() != nullptr);
+  };
+
+  auto nextInstruction = [&hasNextInstruction](Instruction *I) {
+    assert(hasNextInstruction(I) &&
+           "first check if there is a next instruction!");
+    if (I->isTerminator()) {
+      return &I->getParent()->getUniqueSuccessor()->front();
+    } else {
+      return &*++I->getIterator();
+    }
+  };
+
+  Instruction *cursor = nullptr;
+  for (cursor = &F.getEntryBlock().front(); hasNextInstruction(cursor);
+       cursor = nextInstruction(cursor)) {
+
+    // We need to ensure a safepoint poll occurs before any 'real' call.  The
+    // easiest way to ensure finite execution between safepoints in the face of
+    // recursive and mutually recursive functions is to enforce that each take
+    // a safepoint.  Additionally, we need to ensure a poll before any call
+    // which can grow the stack by an unbounded amount.  This isn't required
+    // for GC semantics per se, but is a common requirement for languages
+    // which detect stack overflow via guard pages and then throw exceptions.
+    if (auto CS = CallSite(cursor)) {
+      if (doesNotRequireEntrySafepointBefore(CS))
+        continue;
+      break;
+    }
+  }
+
+  assert((hasNextInstruction(cursor) || cursor->isTerminator()) &&
+         "either we stopped because of a call, or because of terminator");
+
+  return cursor;
+}
+
+/// Identify the list of call sites which need to be have parseable state
+static void findCallSafepoints(Function &F,
+                               std::vector<CallSite> &Found /*rval*/) {
+  assert(Found.empty() && "must be empty!");
+  for (Instruction &I : instructions(F)) {
+    Instruction *inst = &I;
+    if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) {
+      CallSite CS(inst);
+
+      // No safepoint needed or wanted
+      if (!needsStatepoint(CS)) {
+        continue;
+      }
+
+      Found.push_back(CS);
+    }
+  }
+}
+
+/// Implement a unique function which doesn't require we sort the input
+/// vector.  Doing so has the effect of changing the output of a couple of
+/// tests in ways which make them less useful in testing fused safepoints.
+template <typename T> static void unique_unsorted(std::vector<T> &vec) {
+  std::set<T> seen;
+  std::vector<T> tmp;
+  vec.reserve(vec.size());
+  std::swap(tmp, vec);
+  for (auto V : tmp) {
+    if (seen.insert(V).second) {
+      vec.push_back(V);
+    }
+  }
+}
+
+static const char *const GCSafepointPollName = "gc.safepoint_poll";
+
+static bool isGCSafepointPoll(Function &F) {
+  return F.getName().equals(GCSafepointPollName);
+}
+
+/// Returns true if this function should be rewritten to include safepoint
+/// polls and parseable call sites.  The main point of this function is to be
+/// an extension point for custom logic.
+static bool shouldRewriteFunction(Function &F) {
+  // TODO: This should check the GCStrategy
+  if (F.hasGC()) {
+    const auto &FunctionGCName = F.getGC();
+    const StringRef StatepointExampleName("statepoint-example");
+    const StringRef CoreCLRName("coreclr");
+    return (StatepointExampleName == FunctionGCName) ||
+           (CoreCLRName == FunctionGCName);
+  } else
+    return false;
+}
+
+// TODO: These should become properties of the GCStrategy, possibly with
+// command line overrides.
+static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
+static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
+static bool enableCallSafepoints(Function &F) { return !NoCall; }
+
+// Normalize basic block to make it ready to be target of invoke statepoint.
+// Ensure that 'BB' does not have phi nodes. It may require spliting it.
+static BasicBlock *normalizeForInvokeSafepoint(BasicBlock *BB,
+                                               BasicBlock *InvokeParent) {
+  BasicBlock *ret = BB;
+
+  if (!BB->getUniquePredecessor()) {
+    ret = SplitBlockPredecessors(BB, InvokeParent, "");
+  }
+
+  // Now that 'ret' has unique predecessor we can safely remove all phi nodes
+  // from it
+  FoldSingleEntryPHINodes(ret);
+  assert(!isa<PHINode>(ret->begin()));
+
+  return ret;
+}
+
+bool PlaceSafepoints::runOnFunction(Function &F) {
+  if (F.isDeclaration() || F.empty()) {
+    // This is a declaration, nothing to do.  Must exit early to avoid crash in
+    // dom tree calculation
+    return false;
+  }
+
+  if (isGCSafepointPoll(F)) {
+    // Given we're inlining this inside of safepoint poll insertion, this
+    // doesn't make any sense.  Note that we do make any contained calls
+    // parseable after we inline a poll.
+    return false;
+  }
+
+  if (!shouldRewriteFunction(F))
+    return false;
+
+  bool modified = false;
+
+  // In various bits below, we rely on the fact that uses are reachable from
+  // defs.  When there are basic blocks unreachable from the entry, dominance
+  // and reachablity queries return non-sensical results.  Thus, we preprocess
+  // the function to ensure these properties hold.
+  modified |= removeUnreachableBlocks(F);
+
+  // STEP 1 - Insert the safepoint polling locations.  We do not need to
+  // actually insert parse points yet.  That will be done for all polls and
+  // calls in a single pass.
+
+  DominatorTree DT;
+  DT.recalculate(F);
+
+  SmallVector<Instruction *, 16> PollsNeeded;
+  std::vector<CallSite> ParsePointNeeded;
+
+  if (enableBackedgeSafepoints(F)) {
+    // Construct a pass manager to run the LoopPass backedge logic.  We
+    // need the pass manager to handle scheduling all the loop passes
+    // appropriately.  Doing this by hand is painful and just not worth messing
+    // with for the moment.
+    legacy::FunctionPassManager FPM(F.getParent());
+    bool CanAssumeCallSafepoints = enableCallSafepoints(F);
+    PlaceBackedgeSafepointsImpl *PBS =
+      new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
+    FPM.add(PBS);
+    FPM.run(F);
+
+    // We preserve dominance information when inserting the poll, otherwise
+    // we'd have to recalculate this on every insert
+    DT.recalculate(F);
+
+    auto &PollLocations = PBS->PollLocations;
+
+    auto OrderByBBName = [](Instruction *a, Instruction *b) {
+      return a->getParent()->getName() < b->getParent()->getName();
+    };
+    // We need the order of list to be stable so that naming ends up stable
+    // when we split edges.  This makes test cases much easier to write.
+    std::sort(PollLocations.begin(), PollLocations.end(), OrderByBBName);
+
+    // We can sometimes end up with duplicate poll locations.  This happens if
+    // a single loop is visited more than once.   The fact this happens seems
+    // wrong, but it does happen for the split-backedge.ll test case.
+    PollLocations.erase(std::unique(PollLocations.begin(),
+                                    PollLocations.end()),
+                        PollLocations.end());
+
+    // Insert a poll at each point the analysis pass identified
+    // The poll location must be the terminator of a loop latch block.
+    for (TerminatorInst *Term : PollLocations) {
+      // We are inserting a poll, the function is modified
+      modified = true;
+
+      if (SplitBackedge) {
+        // Split the backedge of the loop and insert the poll within that new
+        // basic block.  This creates a loop with two latches per original
+        // latch (which is non-ideal), but this appears to be easier to
+        // optimize in practice than inserting the poll immediately before the
+        // latch test.
+
+        // Since this is a latch, at least one of the successors must dominate
+        // it. Its possible that we have a) duplicate edges to the same header
+        // and b) edges to distinct loop headers.  We need to insert pools on
+        // each.
+        SetVector<BasicBlock *> Headers;
+        for (unsigned i = 0; i < Term->getNumSuccessors(); i++) {
+          BasicBlock *Succ = Term->getSuccessor(i);
+          if (DT.dominates(Succ, Term->getParent())) {
+            Headers.insert(Succ);
+          }
+        }
+        assert(!Headers.empty() && "poll location is not a loop latch?");
+
+        // The split loop structure here is so that we only need to recalculate
+        // the dominator tree once.  Alternatively, we could just keep it up to
+        // date and use a more natural merged loop.
+        SetVector<BasicBlock *> SplitBackedges;
+        for (BasicBlock *Header : Headers) {
+          BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, &DT);
+          PollsNeeded.push_back(NewBB->getTerminator());
+          NumBackedgeSafepoints++;
+        }
+      } else {
+        // Split the latch block itself, right before the terminator.
+        PollsNeeded.push_back(Term);
+        NumBackedgeSafepoints++;
+      }
+    }
+  }
+
+  if (enableEntrySafepoints(F)) {
+    Instruction *Location = findLocationForEntrySafepoint(F, DT);
+    if (!Location) {
+      // policy choice not to insert?
+    } else {
+      PollsNeeded.push_back(Location);
+      modified = true;
+      NumEntrySafepoints++;
+    }
+  }
+
+  // Now that we've identified all the needed safepoint poll locations, insert
+  // safepoint polls themselves.
+  for (Instruction *PollLocation : PollsNeeded) {
+    std::vector<CallSite> RuntimeCalls;
+    InsertSafepointPoll(PollLocation, RuntimeCalls);
+    ParsePointNeeded.insert(ParsePointNeeded.end(), RuntimeCalls.begin(),
+                            RuntimeCalls.end());
+  }
+  PollsNeeded.clear(); // make sure we don't accidentally use
+  // The dominator tree has been invalidated by the inlining performed in the
+  // above loop.  TODO: Teach the inliner how to update the dom tree?
+  DT.recalculate(F);
+
+  if (enableCallSafepoints(F)) {
+    std::vector<CallSite> Calls;
+    findCallSafepoints(F, Calls);
+    NumCallSafepoints += Calls.size();
+    ParsePointNeeded.insert(ParsePointNeeded.end(), Calls.begin(), Calls.end());
+  }
+
+  // Unique the vectors since we can end up with duplicates if we scan the call
+  // site for call safepoints after we add it for entry or backedge.  The
+  // only reason we need tracking at all is that some functions might have
+  // polls but not call safepoints and thus we might miss marking the runtime
+  // calls for the polls. (This is useful in test cases!)
+  unique_unsorted(ParsePointNeeded);
+
+  // Any parse point (no matter what source) will be handled here
+
+  // We're about to start modifying the function
+  if (!ParsePointNeeded.empty())
+    modified = true;
+
+  // Now run through and insert the safepoints, but do _NOT_ update or remove
+  // any existing uses.  We have references to live variables that need to
+  // survive to the last iteration of this loop.
+  std::vector<Value *> Results;
+  Results.reserve(ParsePointNeeded.size());
+  for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
+    CallSite &CS = ParsePointNeeded[i];
+
+    // For invoke statepoints we need to remove all phi nodes at the normal
+    // destination block.
+    // Reason for this is that we can place gc_result only after last phi node
+    // in basic block. We will get malformed code after RAUW for the
+    // gc_result if one of this phi nodes uses result from the invoke.
+    if (InvokeInst *Invoke = dyn_cast<InvokeInst>(CS.getInstruction())) {
+      normalizeForInvokeSafepoint(Invoke->getNormalDest(),
+                                  Invoke->getParent());
+    }
+
+    Value *GCResult = ReplaceWithStatepoint(CS);
+    Results.push_back(GCResult);
+  }
+  assert(Results.size() == ParsePointNeeded.size());
+
+  // Adjust all users of the old call sites to use the new ones instead
+  for (size_t i = 0; i < ParsePointNeeded.size(); i++) {
+    CallSite &CS = ParsePointNeeded[i];
+    Value *GCResult = Results[i];
+    if (GCResult) {
+      // Can not RAUW for the invoke gc result in case of phi nodes preset.
+      assert(CS.isCall() || !isa<PHINode>(cast<Instruction>(GCResult)->getParent()->begin()));
+
+      // Replace all uses with the new call
+      CS.getInstruction()->replaceAllUsesWith(GCResult);
+    }
+
+    // Now that we've handled all uses, remove the original call itself
+    // Note: The insert point can't be the deleted instruction!
+    CS.getInstruction()->eraseFromParent();
+  }
+  return modified;
+}
+
+char PlaceBackedgeSafepointsImpl::ID = 0;
+char PlaceSafepoints::ID = 0;
+
+FunctionPass *llvm::createPlaceSafepointsPass() {
+  return new PlaceSafepoints();
+}
+
+INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,
+                      "place-backedge-safepoints-impl",
+                      "Place Backedge Safepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl,
+                    "place-backedge-safepoints-impl",
+                    "Place Backedge Safepoints", false, false)
+
+INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+                      false, false)
+INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+                    false, false)
+
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+                    std::vector<CallSite> &ParsePointsNeeded /*rval*/) {
+  BasicBlock *OrigBB = InsertBefore->getParent();
+  Module *M = InsertBefore->getModule();
+  assert(M && "must be part of a module");
+
+  // Inline the safepoint poll implementation - this will get all the branch,
+  // control flow, etc..  Most importantly, it will introduce the actual slow
+  // path call - where we need to insert a safepoint (parsepoint).
+
+  auto *F = M->getFunction(GCSafepointPollName);
+  assert(F && "gc.safepoint_poll function is missing");
+  assert(F->getType()->getElementType() ==
+         FunctionType::get(Type::getVoidTy(M->getContext()), false) &&
+         "gc.safepoint_poll declared with wrong type");
+  assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
+  CallInst *PollCall = CallInst::Create(F, "", InsertBefore);
+
+  // Record some information about the call site we're replacing
+  BasicBlock::iterator before(PollCall), after(PollCall);
+  bool isBegin(false);
+  if (before == OrigBB->begin()) {
+    isBegin = true;
+  } else {
+    before--;
+  }
+  after++;
+  assert(after != OrigBB->end() && "must have successor");
+
+  // do the actual inlining
+  InlineFunctionInfo IFI;
+  bool InlineStatus = InlineFunction(PollCall, IFI);
+  assert(InlineStatus && "inline must succeed");
+  (void)InlineStatus; // suppress warning in release-asserts
+
+  // Check post conditions
+  assert(IFI.StaticAllocas.empty() && "can't have allocs");
+
+  std::vector<CallInst *> calls; // new calls
+  std::set<BasicBlock *> BBs;    // new BBs + insertee
+  // Include only the newly inserted instructions, Note: begin may not be valid
+  // if we inserted to the beginning of the basic block
+  BasicBlock::iterator start;
+  if (isBegin) {
+    start = OrigBB->begin();
+  } else {
+    start = before;
+    start++;
+  }
+
+  // If your poll function includes an unreachable at the end, that's not
+  // valid.  Bugpoint likes to create this, so check for it.
+  assert(isPotentiallyReachable(&*start, &*after, nullptr, nullptr) &&
+         "malformed poll function");
+
+  scanInlinedCode(&*(start), &*(after), calls, BBs);
+  assert(!calls.empty() && "slow path not found for safepoint poll");
+
+  // Record the fact we need a parsable state at the runtime call contained in
+  // the poll function.  This is required so that the runtime knows how to
+  // parse the last frame when we actually take  the safepoint (i.e. execute
+  // the slow path)
+  assert(ParsePointsNeeded.empty());
+  for (size_t i = 0; i < calls.size(); i++) {
+
+    // No safepoint needed or wanted
+    if (!needsStatepoint(calls[i])) {
+      continue;
+    }
+
+    // These are likely runtime calls.  Should we assert that via calling
+    // convention or something?
+    ParsePointsNeeded.push_back(CallSite(calls[i]));
+  }
+  assert(ParsePointsNeeded.size() <= calls.size());
+}
+
+/// Replaces the given call site (Call or Invoke) with a gc.statepoint
+/// intrinsic with an empty deoptimization arguments list.  This does
+/// NOT do explicit relocation for GC support.
+static Value *ReplaceWithStatepoint(const CallSite &CS /* to replace */) {
+  assert(CS.getInstruction()->getModule() && "must be set");
+
+  // TODO: technically, a pass is not allowed to get functions from within a
+  // function pass since it might trigger a new function addition.  Refactor
+  // this logic out to the initialization of the pass.  Doesn't appear to
+  // matter in practice.
+
+  // Then go ahead and use the builder do actually do the inserts.  We insert
+  // immediately before the previous instruction under the assumption that all
+  // arguments will be available here.  We can't insert afterwards since we may
+  // be replacing a terminator.
+  IRBuilder<> Builder(CS.getInstruction());
+
+  // Note: The gc args are not filled in at this time, that's handled by
+  // RewriteStatepointsForGC (which is currently under review).
+
+  // Create the statepoint given all the arguments
+  Instruction *Token = nullptr;
+
+  uint64_t ID;
+  uint32_t NumPatchBytes;
+
+  AttributeSet OriginalAttrs = CS.getAttributes();
+  Attribute AttrID =
+      OriginalAttrs.getAttribute(AttributeSet::FunctionIndex, "statepoint-id");
+  Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute(
+      AttributeSet::FunctionIndex, "statepoint-num-patch-bytes");
+
+  AttrBuilder AttrsToRemove;
+  bool HasID = AttrID.isStringAttribute() &&
+               !AttrID.getValueAsString().getAsInteger(10, ID);
+
+  if (HasID)
+    AttrsToRemove.addAttribute("statepoint-id");
+  else
+    ID = 0xABCDEF00;
+
+  bool HasNumPatchBytes =
+      AttrNumPatchBytes.isStringAttribute() &&
+      !AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes);
+
+  if (HasNumPatchBytes)
+    AttrsToRemove.addAttribute("statepoint-num-patch-bytes");
+  else
+    NumPatchBytes = 0;
+
+  OriginalAttrs = OriginalAttrs.removeAttributes(
+      CS.getInstruction()->getContext(), AttributeSet::FunctionIndex,
+      AttrsToRemove);
+
+  if (CS.isCall()) {
+    CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
+    CallInst *Call = Builder.CreateGCStatepointCall(
+        ID, NumPatchBytes, CS.getCalledValue(),
+        makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None,
+        "safepoint_token");
+    Call->setTailCall(ToReplace->isTailCall());
+    Call->setCallingConv(ToReplace->getCallingConv());
+
+    // In case if we can handle this set of attributes - set up function
+    // attributes directly on statepoint and return attributes later for
+    // gc_result intrinsic.
+    Call->setAttributes(OriginalAttrs.getFnAttributes());
+
+    Token = Call;
+
+    // Put the following gc_result and gc_relocate calls immediately after
+    // the old call (which we're about to delete).
+    assert(ToReplace->getNextNode() && "not a terminator, must have next");
+    Builder.SetInsertPoint(ToReplace->getNextNode());
+    Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc());
+  } else if (CS.isInvoke()) {
+    InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction());
+
+    // Insert the new invoke into the old block.  We'll remove the old one in a
+    // moment at which point this will become the new terminator for the
+    // original block.
+    Builder.SetInsertPoint(ToReplace->getParent());
+    InvokeInst *Invoke = Builder.CreateGCStatepointInvoke(
+        ID, NumPatchBytes, CS.getCalledValue(), ToReplace->getNormalDest(),
+        ToReplace->getUnwindDest(), makeArrayRef(CS.arg_begin(), CS.arg_end()),
+        None, None, "safepoint_token");
+
+    Invoke->setCallingConv(ToReplace->getCallingConv());
+
+    // In case if we can handle this set of attributes - set up function
+    // attributes directly on statepoint and return attributes later for
+    // gc_result intrinsic.
+    Invoke->setAttributes(OriginalAttrs.getFnAttributes());
+
+    Token = Invoke;
+
+    // We'll insert the gc.result into the normal block
+    BasicBlock *NormalDest = ToReplace->getNormalDest();
+    // Can not insert gc.result in case of phi nodes preset.
+    // Should have removed this cases prior to running this function
+    assert(!isa<PHINode>(NormalDest->begin()));
+    Instruction *IP = &*(NormalDest->getFirstInsertionPt());
+    Builder.SetInsertPoint(IP);
+  } else {
+    llvm_unreachable("unexpect type of CallSite");
+  }
+  assert(Token);
+
+  // Handle the return value of the original call - update all uses to use a
+  // gc_result hanging off the statepoint node we just inserted
+
+  // Only add the gc_result iff there is actually a used result
+  if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) {
+    std::string TakenName =
+        CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
+    CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), TakenName);
+    GCResult->setAttributes(OriginalAttrs.getRetAttributes());
+    return GCResult;
+  } else {
+    // No return value for the call.
+    return nullptr;
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
new file mode 100644
index 0000000..bcadd4e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -0,0 +1,2306 @@
+//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates commutative expressions in an order that is designed
+// to promote better constant propagation, GCSE, LICM, PRE, etc.
+//
+// For example: 4 + (x + 5) -> x + (4 + 5)
+//
+// In the implementation of this algorithm, constants are assigned rank = 0,
+// function arguments are rank = 1, and other values are assigned ranks
+// corresponding to the reverse post order traversal of current function
+// (starting at 2), which effectively gives values in deep loops higher rank
+// than values not in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "reassociate"
+
+STATISTIC(NumChanged, "Number of insts reassociated");
+STATISTIC(NumAnnihil, "Number of expr tree annihilated");
+STATISTIC(NumFactor , "Number of multiplies factored");
+
+namespace {
+  struct ValueEntry {
+    unsigned Rank;
+    Value *Op;
+    ValueEntry(unsigned R, Value *O) : Rank(R), Op(O) {}
+  };
+  inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) {
+    return LHS.Rank > RHS.Rank;   // Sort so that highest rank goes to start.
+  }
+}
+
+#ifndef NDEBUG
+/// Print out the expression identified in the Ops list.
+///
+static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
+  Module *M = I->getModule();
+  dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
+       << *Ops[0].Op->getType() << '\t';
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    dbgs() << "[ ";
+    Ops[i].Op->printAsOperand(dbgs(), false, M);
+    dbgs() << ", #" << Ops[i].Rank << "] ";
+  }
+}
+#endif
+
+namespace {
+  /// \brief Utility class representing a base and exponent pair which form one
+  /// factor of some product.
+  struct Factor {
+    Value *Base;
+    unsigned Power;
+
+    Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {}
+
+    /// \brief Sort factors in descending order by their power.
+    struct PowerDescendingSorter {
+      bool operator()(const Factor &LHS, const Factor &RHS) {
+        return LHS.Power > RHS.Power;
+      }
+    };
+
+    /// \brief Compare factors for equal powers.
+    struct PowerEqual {
+      bool operator()(const Factor &LHS, const Factor &RHS) {
+        return LHS.Power == RHS.Power;
+      }
+    };
+  };
+  
+  /// Utility class representing a non-constant Xor-operand. We classify
+  /// non-constant Xor-Operands into two categories:
+  ///  C1) The operand is in the form "X & C", where C is a constant and C != ~0
+  ///  C2)
+  ///    C2.1) The operand is in the form of "X | C", where C is a non-zero
+  ///          constant.
+  ///    C2.2) Any operand E which doesn't fall into C1 and C2.1, we view this
+  ///          operand as "E | 0"
+  class XorOpnd {
+  public:
+    XorOpnd(Value *V);
+
+    bool isInvalid() const { return SymbolicPart == nullptr; }
+    bool isOrExpr() const { return isOr; }
+    Value *getValue() const { return OrigVal; }
+    Value *getSymbolicPart() const { return SymbolicPart; }
+    unsigned getSymbolicRank() const { return SymbolicRank; }
+    const APInt &getConstPart() const { return ConstPart; }
+
+    void Invalidate() { SymbolicPart = OrigVal = nullptr; }
+    void setSymbolicRank(unsigned R) { SymbolicRank = R; }
+
+    // Sort the XorOpnd-Pointer in ascending order of symbolic-value-rank.
+    // The purpose is twofold:
+    // 1) Cluster together the operands sharing the same symbolic-value.
+    // 2) Operand having smaller symbolic-value-rank is permuted earlier, which 
+    //   could potentially shorten crital path, and expose more loop-invariants.
+    //   Note that values' rank are basically defined in RPO order (FIXME). 
+    //   So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier 
+    //   than Y which is defined earlier than Z. Permute "x | 1", "Y & 2",
+    //   "z" in the order of X-Y-Z is better than any other orders.
+    struct PtrSortFunctor {
+      bool operator()(XorOpnd * const &LHS, XorOpnd * const &RHS) {
+        return LHS->getSymbolicRank() < RHS->getSymbolicRank();
+      }
+    };
+  private:
+    Value *OrigVal;
+    Value *SymbolicPart;
+    APInt ConstPart;
+    unsigned SymbolicRank;
+    bool isOr;
+  };
+}
+
+namespace {
+  class Reassociate : public FunctionPass {
+    DenseMap<BasicBlock*, unsigned> RankMap;
+    DenseMap<AssertingVH<Value>, unsigned> ValueRankMap;
+    SetVector<AssertingVH<Instruction> > RedoInsts;
+    bool MadeChange;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    Reassociate() : FunctionPass(ID) {
+      initializeReassociatePass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+    }
+  private:
+    void BuildRankMap(Function &F);
+    unsigned getRank(Value *V);
+    void canonicalizeOperands(Instruction *I);
+    void ReassociateExpression(BinaryOperator *I);
+    void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
+    Value *OptimizeExpression(BinaryOperator *I,
+                              SmallVectorImpl<ValueEntry> &Ops);
+    Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
+    Value *OptimizeXor(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
+    bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt &ConstOpnd,
+                        Value *&Res);
+    bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
+                        APInt &ConstOpnd, Value *&Res);
+    bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+                                SmallVectorImpl<Factor> &Factors);
+    Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+                                   SmallVectorImpl<Factor> &Factors);
+    Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
+    Value *RemoveFactorFromExpression(Value *V, Value *Factor);
+    void EraseInst(Instruction *I);
+    void RecursivelyEraseDeadInsts(Instruction *I,
+                                   SetVector<AssertingVH<Instruction>> &Insts);
+    void OptimizeInst(Instruction *I);
+    Instruction *canonicalizeNegConstExpr(Instruction *I);
+  };
+}
+
+XorOpnd::XorOpnd(Value *V) {
+  assert(!isa<ConstantInt>(V) && "No ConstantInt");
+  OrigVal = V;
+  Instruction *I = dyn_cast<Instruction>(V);
+  SymbolicRank = 0;
+
+  if (I && (I->getOpcode() == Instruction::Or ||
+            I->getOpcode() == Instruction::And)) {
+    Value *V0 = I->getOperand(0);
+    Value *V1 = I->getOperand(1);
+    if (isa<ConstantInt>(V0))
+      std::swap(V0, V1);
+
+    if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) {
+      ConstPart = C->getValue();
+      SymbolicPart = V0;
+      isOr = (I->getOpcode() == Instruction::Or);
+      return;
+    }
+  }
+
+  // view the operand as "V | 0"
+  SymbolicPart = V;
+  ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth());
+  isOr = true;
+}
+
+char Reassociate::ID = 0;
+INITIALIZE_PASS(Reassociate, "reassociate",
+                "Reassociate expressions", false, false)
+
+// Public interface to the Reassociate pass
+FunctionPass *llvm::createReassociatePass() { return new Reassociate(); }
+
+/// Return true if V is an instruction of the specified opcode and if it
+/// only has one use.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
+  if (V->hasOneUse() && isa<Instruction>(V) &&
+      cast<Instruction>(V)->getOpcode() == Opcode &&
+      (!isa<FPMathOperator>(V) ||
+       cast<Instruction>(V)->hasUnsafeAlgebra()))
+    return cast<BinaryOperator>(V);
+  return nullptr;
+}
+
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
+                                        unsigned Opcode2) {
+  if (V->hasOneUse() && isa<Instruction>(V) &&
+      (cast<Instruction>(V)->getOpcode() == Opcode1 ||
+       cast<Instruction>(V)->getOpcode() == Opcode2) &&
+      (!isa<FPMathOperator>(V) ||
+       cast<Instruction>(V)->hasUnsafeAlgebra()))
+    return cast<BinaryOperator>(V);
+  return nullptr;
+}
+
+void Reassociate::BuildRankMap(Function &F) {
+  unsigned i = 2;
+
+  // Assign distinct ranks to function arguments.
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
+    ValueRankMap[&*I] = ++i;
+    DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n");
+  }
+
+  ReversePostOrderTraversal<Function*> RPOT(&F);
+  for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
+         E = RPOT.end(); I != E; ++I) {
+    BasicBlock *BB = *I;
+    unsigned BBRank = RankMap[BB] = ++i << 16;
+
+    // Walk the basic block, adding precomputed ranks for any instructions that
+    // we cannot move.  This ensures that the ranks for these instructions are
+    // all different in the block.
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+      if (mayBeMemoryDependent(*I))
+        ValueRankMap[&*I] = ++BBRank;
+  }
+}
+
+unsigned Reassociate::getRank(Value *V) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) {
+    if (isa<Argument>(V)) return ValueRankMap[V];   // Function argument.
+    return 0;  // Otherwise it's a global or constant, rank 0.
+  }
+
+  if (unsigned Rank = ValueRankMap[I])
+    return Rank;    // Rank already known?
+
+  // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that
+  // we can reassociate expressions for code motion!  Since we do not recurse
+  // for PHI nodes, we cannot have infinite recursion here, because there
+  // cannot be loops in the value graph that do not go through PHI nodes.
+  unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
+  for (unsigned i = 0, e = I->getNumOperands();
+       i != e && Rank != MaxRank; ++i)
+    Rank = std::max(Rank, getRank(I->getOperand(i)));
+
+  // If this is a not or neg instruction, do not count it for rank.  This
+  // assures us that X and ~X will have the same rank.
+  if  (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
+       !BinaryOperator::isFNeg(I))
+    ++Rank;
+
+  DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
+
+  return ValueRankMap[I] = Rank;
+}
+
+// Canonicalize constants to RHS.  Otherwise, sort the operands by rank.
+void Reassociate::canonicalizeOperands(Instruction *I) {
+  assert(isa<BinaryOperator>(I) && "Expected binary operator.");
+  assert(I->isCommutative() && "Expected commutative operator.");
+
+  Value *LHS = I->getOperand(0);
+  Value *RHS = I->getOperand(1);
+  unsigned LHSRank = getRank(LHS);
+  unsigned RHSRank = getRank(RHS);
+
+  if (isa<Constant>(RHS))
+    return;
+
+  if (isa<Constant>(LHS) || RHSRank < LHSRank)
+    cast<BinaryOperator>(I)->swapOperands();
+}
+
+static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
+                                 Instruction *InsertBefore, Value *FlagsOp) {
+  if (S1->getType()->isIntOrIntVectorTy())
+    return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
+  else {
+    BinaryOperator *Res =
+        BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
+    Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+    return Res;
+  }
+}
+
+static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
+                                 Instruction *InsertBefore, Value *FlagsOp) {
+  if (S1->getType()->isIntOrIntVectorTy())
+    return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
+  else {
+    BinaryOperator *Res =
+      BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
+    Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+    return Res;
+  }
+}
+
+static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
+                                 Instruction *InsertBefore, Value *FlagsOp) {
+  if (S1->getType()->isIntOrIntVectorTy())
+    return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
+  else {
+    BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore);
+    Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+    return Res;
+  }
+}
+
+/// Replace 0-X with X*-1.
+static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
+  Type *Ty = Neg->getType();
+  Constant *NegOne = Ty->isIntOrIntVectorTy() ?
+    ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0);
+
+  BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg);
+  Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op.
+  Res->takeName(Neg);
+  Neg->replaceAllUsesWith(Res);
+  Res->setDebugLoc(Neg->getDebugLoc());
+  return Res;
+}
+
+/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
+/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
+/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
+/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
+/// even x in Bitwidth-bit arithmetic.
+static unsigned CarmichaelShift(unsigned Bitwidth) {
+  if (Bitwidth < 3)
+    return Bitwidth - 1;
+  return Bitwidth - 2;
+}
+
+/// Add the extra weight 'RHS' to the existing weight 'LHS',
+/// reducing the combined weight using any special properties of the operation.
+/// The existing weight LHS represents the computation X op X op ... op X where
+/// X occurs LHS times.  The combined weight represents  X op X op ... op X with
+/// X occurring LHS + RHS times.  If op is "Xor" for example then the combined
+/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
+/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
+static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
+  // If we were working with infinite precision arithmetic then the combined
+  // weight would be LHS + RHS.  But we are using finite precision arithmetic,
+  // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
+  // for nilpotent operations and addition, but not for idempotent operations
+  // and multiplication), so it is important to correctly reduce the combined
+  // weight back into range if wrapping would be wrong.
+
+  // If RHS is zero then the weight didn't change.
+  if (RHS.isMinValue())
+    return;
+  // If LHS is zero then the combined weight is RHS.
+  if (LHS.isMinValue()) {
+    LHS = RHS;
+    return;
+  }
+  // From this point on we know that neither LHS nor RHS is zero.
+
+  if (Instruction::isIdempotent(Opcode)) {
+    // Idempotent means X op X === X, so any non-zero weight is equivalent to a
+    // weight of 1.  Keeping weights at zero or one also means that wrapping is
+    // not a problem.
+    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+    return; // Return a weight of 1.
+  }
+  if (Instruction::isNilpotent(Opcode)) {
+    // Nilpotent means X op X === 0, so reduce weights modulo 2.
+    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+    LHS = 0; // 1 + 1 === 0 modulo 2.
+    return;
+  }
+  if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
+    // TODO: Reduce the weight by exploiting nsw/nuw?
+    LHS += RHS;
+    return;
+  }
+
+  assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
+         "Unknown associative operation!");
+  unsigned Bitwidth = LHS.getBitWidth();
+  // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
+  // can be replaced with W-CM.  That's because x^W=x^(W-CM) for every Bitwidth
+  // bit number x, since either x is odd in which case x^CM = 1, or x is even in
+  // which case both x^W and x^(W - CM) are zero.  By subtracting off multiples
+  // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
+  // which by a happy accident means that they can always be represented using
+  // Bitwidth bits.
+  // TODO: Reduce the weight by exploiting nsw/nuw?  (Could do much better than
+  // the Carmichael number).
+  if (Bitwidth > 3) {
+    /// CM - The value of Carmichael's lambda function.
+    APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
+    // Any weight W >= Threshold can be replaced with W - CM.
+    APInt Threshold = CM + Bitwidth;
+    assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
+    // For Bitwidth 4 or more the following sum does not overflow.
+    LHS += RHS;
+    while (LHS.uge(Threshold))
+      LHS -= CM;
+  } else {
+    // To avoid problems with overflow do everything the same as above but using
+    // a larger type.
+    unsigned CM = 1U << CarmichaelShift(Bitwidth);
+    unsigned Threshold = CM + Bitwidth;
+    assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
+           "Weights not reduced!");
+    unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
+    while (Total >= Threshold)
+      Total -= CM;
+    LHS = Total;
+  }
+}
+
+typedef std::pair<Value*, APInt> RepeatedValue;
+
+/// Given an associative binary expression, return the leaf
+/// nodes in Ops along with their weights (how many times the leaf occurs).  The
+/// original expression is the same as
+///   (Ops[0].first op Ops[0].first op ... Ops[0].first)  <- Ops[0].second times
+/// op
+///   (Ops[1].first op Ops[1].first op ... Ops[1].first)  <- Ops[1].second times
+/// op
+///   ...
+/// op
+///   (Ops[N].first op Ops[N].first op ... Ops[N].first)  <- Ops[N].second times
+///
+/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
+///
+/// This routine may modify the function, in which case it returns 'true'.  The
+/// changes it makes may well be destructive, changing the value computed by 'I'
+/// to something completely different.  Thus if the routine returns 'true' then
+/// you MUST either replace I with a new expression computed from the Ops array,
+/// or use RewriteExprTree to put the values back in.
+///
+/// A leaf node is either not a binary operation of the same kind as the root
+/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
+/// opcode), or is the same kind of binary operator but has a use which either
+/// does not belong to the expression, or does belong to the expression but is
+/// a leaf node.  Every leaf node has at least one use that is a non-leaf node
+/// of the expression, while for non-leaf nodes (except for the root 'I') every
+/// use is a non-leaf node of the expression.
+///
+/// For example:
+///           expression graph        node names
+///
+///                     +        |        I
+///                    / \       |
+///                   +   +      |      A,  B
+///                  / \ / \     |
+///                 *   +   *    |    C,  D,  E
+///                / \ / \ / \   |
+///                   +   *      |      F,  G
+///
+/// The leaf nodes are C, E, F and G.  The Ops array will contain (maybe not in
+/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
+///
+/// The expression is maximal: if some instruction is a binary operator of the
+/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
+/// then the instruction also belongs to the expression, is not a leaf node of
+/// it, and its operands also belong to the expression (but may be leaf nodes).
+///
+/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
+/// order to ensure that every non-root node in the expression has *exactly one*
+/// use by a non-leaf node of the expression.  This destruction means that the
+/// caller MUST either replace 'I' with a new expression or use something like
+/// RewriteExprTree to put the values back in if the routine indicates that it
+/// made a change by returning 'true'.
+///
+/// In the above example either the right operand of A or the left operand of B
+/// will be replaced by undef.  If it is B's operand then this gives:
+///
+///                     +        |        I
+///                    / \       |
+///                   +   +      |      A,  B - operand of B replaced with undef
+///                  / \   \     |
+///                 *   +   *    |    C,  D,  E
+///                / \ / \ / \   |
+///                   +   *      |      F,  G
+///
+/// Note that such undef operands can only be reached by passing through 'I'.
+/// For example, if you visit operands recursively starting from a leaf node
+/// then you will never see such an undef operand unless you get back to 'I',
+/// which requires passing through a phi node.
+///
+/// Note that this routine may also mutate binary operators of the wrong type
+/// that have all uses inside the expression (i.e. only used by non-leaf nodes
+/// of the expression) if it can turn them into binary operators of the right
+/// type and thus make the expression bigger.
+
+static bool LinearizeExprTree(BinaryOperator *I,
+                              SmallVectorImpl<RepeatedValue> &Ops) {
+  DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
+  unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
+  unsigned Opcode = I->getOpcode();
+  assert(I->isAssociative() && I->isCommutative() &&
+         "Expected an associative and commutative operation!");
+
+  // Visit all operands of the expression, keeping track of their weight (the
+  // number of paths from the expression root to the operand, or if you like
+  // the number of times that operand occurs in the linearized expression).
+  // For example, if I = X + A, where X = A + B, then I, X and B have weight 1
+  // while A has weight two.
+
+  // Worklist of non-leaf nodes (their operands are in the expression too) along
+  // with their weights, representing a certain number of paths to the operator.
+  // If an operator occurs in the worklist multiple times then we found multiple
+  // ways to get to it.
+  SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight)
+  Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
+  bool Changed = false;
+
+  // Leaves of the expression are values that either aren't the right kind of
+  // operation (eg: a constant, or a multiply in an add tree), or are, but have
+  // some uses that are not inside the expression.  For example, in I = X + X,
+  // X = A + B, the value X has two uses (by I) that are in the expression.  If
+  // X has any other uses, for example in a return instruction, then we consider
+  // X to be a leaf, and won't analyze it further.  When we first visit a value,
+  // if it has more than one use then at first we conservatively consider it to
+  // be a leaf.  Later, as the expression is explored, we may discover some more
+  // uses of the value from inside the expression.  If all uses turn out to be
+  // from within the expression (and the value is a binary operator of the right
+  // kind) then the value is no longer considered to be a leaf, and its operands
+  // are explored.
+
+  // Leaves - Keeps track of the set of putative leaves as well as the number of
+  // paths to each leaf seen so far.
+  typedef DenseMap<Value*, APInt> LeafMap;
+  LeafMap Leaves; // Leaf -> Total weight so far.
+  SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order.
+
+#ifndef NDEBUG
+  SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme.
+#endif
+  while (!Worklist.empty()) {
+    std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val();
+    I = P.first; // We examine the operands of this binary operator.
+
+    for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
+      Value *Op = I->getOperand(OpIdx);
+      APInt Weight = P.second; // Number of paths to this operand.
+      DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
+      assert(!Op->use_empty() && "No uses, so how did we get to it?!");
+
+      // If this is a binary operation of the right kind with only one use then
+      // add its operands to the expression.
+      if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+        assert(Visited.insert(Op).second && "Not first visit!");
+        DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
+        Worklist.push_back(std::make_pair(BO, Weight));
+        continue;
+      }
+
+      // Appears to be a leaf.  Is the operand already in the set of leaves?
+      LeafMap::iterator It = Leaves.find(Op);
+      if (It == Leaves.end()) {
+        // Not in the leaf map.  Must be the first time we saw this operand.
+        assert(Visited.insert(Op).second && "Not first visit!");
+        if (!Op->hasOneUse()) {
+          // This value has uses not accounted for by the expression, so it is
+          // not safe to modify.  Mark it as being a leaf.
+          DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
+          LeafOrder.push_back(Op);
+          Leaves[Op] = Weight;
+          continue;
+        }
+        // No uses outside the expression, try morphing it.
+      } else if (It != Leaves.end()) {
+        // Already in the leaf map.
+        assert(Visited.count(Op) && "In leaf map but not visited!");
+
+        // Update the number of paths to the leaf.
+        IncorporateWeight(It->second, Weight, Opcode);
+
+#if 0   // TODO: Re-enable once PR13021 is fixed.
+        // The leaf already has one use from inside the expression.  As we want
+        // exactly one such use, drop this new use of the leaf.
+        assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
+        I->setOperand(OpIdx, UndefValue::get(I->getType()));
+        Changed = true;
+
+        // If the leaf is a binary operation of the right kind and we now see
+        // that its multiple original uses were in fact all by nodes belonging
+        // to the expression, then no longer consider it to be a leaf and add
+        // its operands to the expression.
+        if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+          DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
+          Worklist.push_back(std::make_pair(BO, It->second));
+          Leaves.erase(It);
+          continue;
+        }
+#endif
+
+        // If we still have uses that are not accounted for by the expression
+        // then it is not safe to modify the value.
+        if (!Op->hasOneUse())
+          continue;
+
+        // No uses outside the expression, try morphing it.
+        Weight = It->second;
+        Leaves.erase(It); // Since the value may be morphed below.
+      }
+
+      // At this point we have a value which, first of all, is not a binary
+      // expression of the right kind, and secondly, is only used inside the
+      // expression.  This means that it can safely be modified.  See if we
+      // can usefully morph it into an expression of the right kind.
+      assert((!isa<Instruction>(Op) ||
+              cast<Instruction>(Op)->getOpcode() != Opcode
+              || (isa<FPMathOperator>(Op) &&
+                  !cast<Instruction>(Op)->hasUnsafeAlgebra())) &&
+             "Should have been handled above!");
+      assert(Op->hasOneUse() && "Has uses outside the expression tree!");
+
+      // If this is a multiply expression, turn any internal negations into
+      // multiplies by -1 so they can be reassociated.
+      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
+        if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
+            (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
+          DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+          BO = LowerNegateToMultiply(BO);
+          DEBUG(dbgs() << *BO << '\n');
+          Worklist.push_back(std::make_pair(BO, Weight));
+          Changed = true;
+          continue;
+        }
+
+      // Failed to morph into an expression of the right type.  This really is
+      // a leaf.
+      DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
+      assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
+      LeafOrder.push_back(Op);
+      Leaves[Op] = Weight;
+    }
+  }
+
+  // The leaves, repeated according to their weights, represent the linearized
+  // form of the expression.
+  for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
+    Value *V = LeafOrder[i];
+    LeafMap::iterator It = Leaves.find(V);
+    if (It == Leaves.end())
+      // Node initially thought to be a leaf wasn't.
+      continue;
+    assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
+    APInt Weight = It->second;
+    if (Weight.isMinValue())
+      // Leaf already output or weight reduction eliminated it.
+      continue;
+    // Ensure the leaf is only output once.
+    It->second = 0;
+    Ops.push_back(std::make_pair(V, Weight));
+  }
+
+  // For nilpotent operations or addition there may be no operands, for example
+  // because the expression was "X xor X" or consisted of 2^Bitwidth additions:
+  // in both cases the weight reduces to 0 causing the value to be skipped.
+  if (Ops.empty()) {
+    Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
+    assert(Identity && "Associative operation without identity!");
+    Ops.emplace_back(Identity, APInt(Bitwidth, 1));
+  }
+
+  return Changed;
+}
+
+/// Now that the operands for this expression tree are
+/// linearized and optimized, emit them in-order.
+void Reassociate::RewriteExprTree(BinaryOperator *I,
+                                  SmallVectorImpl<ValueEntry> &Ops) {
+  assert(Ops.size() > 1 && "Single values should be used directly!");
+
+  // Since our optimizations should never increase the number of operations, the
+  // new expression can usually be written reusing the existing binary operators
+  // from the original expression tree, without creating any new instructions,
+  // though the rewritten expression may have a completely different topology.
+  // We take care to not change anything if the new expression will be the same
+  // as the original.  If more than trivial changes (like commuting operands)
+  // were made then we are obliged to clear out any optional subclass data like
+  // nsw flags.
+
+  /// NodesToRewrite - Nodes from the original expression available for writing
+  /// the new expression into.
+  SmallVector<BinaryOperator*, 8> NodesToRewrite;
+  unsigned Opcode = I->getOpcode();
+  BinaryOperator *Op = I;
+
+  /// NotRewritable - The operands being written will be the leaves of the new
+  /// expression and must not be used as inner nodes (via NodesToRewrite) by
+  /// mistake.  Inner nodes are always reassociable, and usually leaves are not
+  /// (if they were they would have been incorporated into the expression and so
+  /// would not be leaves), so most of the time there is no danger of this.  But
+  /// in rare cases a leaf may become reassociable if an optimization kills uses
+  /// of it, or it may momentarily become reassociable during rewriting (below)
+  /// due it being removed as an operand of one of its uses.  Ensure that misuse
+  /// of leaf nodes as inner nodes cannot occur by remembering all of the future
+  /// leaves and refusing to reuse any of them as inner nodes.
+  SmallPtrSet<Value*, 8> NotRewritable;
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    NotRewritable.insert(Ops[i].Op);
+
+  // ExpressionChanged - Non-null if the rewritten expression differs from the
+  // original in some non-trivial way, requiring the clearing of optional flags.
+  // Flags are cleared from the operator in ExpressionChanged up to I inclusive.
+  BinaryOperator *ExpressionChanged = nullptr;
+  for (unsigned i = 0; ; ++i) {
+    // The last operation (which comes earliest in the IR) is special as both
+    // operands will come from Ops, rather than just one with the other being
+    // a subexpression.
+    if (i+2 == Ops.size()) {
+      Value *NewLHS = Ops[i].Op;
+      Value *NewRHS = Ops[i+1].Op;
+      Value *OldLHS = Op->getOperand(0);
+      Value *OldRHS = Op->getOperand(1);
+
+      if (NewLHS == OldLHS && NewRHS == OldRHS)
+        // Nothing changed, leave it alone.
+        break;
+
+      if (NewLHS == OldRHS && NewRHS == OldLHS) {
+        // The order of the operands was reversed.  Swap them.
+        DEBUG(dbgs() << "RA: " << *Op << '\n');
+        Op->swapOperands();
+        DEBUG(dbgs() << "TO: " << *Op << '\n');
+        MadeChange = true;
+        ++NumChanged;
+        break;
+      }
+
+      // The new operation differs non-trivially from the original. Overwrite
+      // the old operands with the new ones.
+      DEBUG(dbgs() << "RA: " << *Op << '\n');
+      if (NewLHS != OldLHS) {
+        BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
+        if (BO && !NotRewritable.count(BO))
+          NodesToRewrite.push_back(BO);
+        Op->setOperand(0, NewLHS);
+      }
+      if (NewRHS != OldRHS) {
+        BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
+        if (BO && !NotRewritable.count(BO))
+          NodesToRewrite.push_back(BO);
+        Op->setOperand(1, NewRHS);
+      }
+      DEBUG(dbgs() << "TO: " << *Op << '\n');
+
+      ExpressionChanged = Op;
+      MadeChange = true;
+      ++NumChanged;
+
+      break;
+    }
+
+    // Not the last operation.  The left-hand side will be a sub-expression
+    // while the right-hand side will be the current element of Ops.
+    Value *NewRHS = Ops[i].Op;
+    if (NewRHS != Op->getOperand(1)) {
+      DEBUG(dbgs() << "RA: " << *Op << '\n');
+      if (NewRHS == Op->getOperand(0)) {
+        // The new right-hand side was already present as the left operand.  If
+        // we are lucky then swapping the operands will sort out both of them.
+        Op->swapOperands();
+      } else {
+        // Overwrite with the new right-hand side.
+        BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
+        if (BO && !NotRewritable.count(BO))
+          NodesToRewrite.push_back(BO);
+        Op->setOperand(1, NewRHS);
+        ExpressionChanged = Op;
+      }
+      DEBUG(dbgs() << "TO: " << *Op << '\n');
+      MadeChange = true;
+      ++NumChanged;
+    }
+
+    // Now deal with the left-hand side.  If this is already an operation node
+    // from the original expression then just rewrite the rest of the expression
+    // into it.
+    BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
+    if (BO && !NotRewritable.count(BO)) {
+      Op = BO;
+      continue;
+    }
+
+    // Otherwise, grab a spare node from the original expression and use that as
+    // the left-hand side.  If there are no nodes left then the optimizers made
+    // an expression with more nodes than the original!  This usually means that
+    // they did something stupid but it might mean that the problem was just too
+    // hard (finding the mimimal number of multiplications needed to realize a
+    // multiplication expression is NP-complete).  Whatever the reason, smart or
+    // stupid, create a new node if there are none left.
+    BinaryOperator *NewOp;
+    if (NodesToRewrite.empty()) {
+      Constant *Undef = UndefValue::get(I->getType());
+      NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
+                                     Undef, Undef, "", I);
+      if (NewOp->getType()->isFPOrFPVectorTy())
+        NewOp->setFastMathFlags(I->getFastMathFlags());
+    } else {
+      NewOp = NodesToRewrite.pop_back_val();
+    }
+
+    DEBUG(dbgs() << "RA: " << *Op << '\n');
+    Op->setOperand(0, NewOp);
+    DEBUG(dbgs() << "TO: " << *Op << '\n');
+    ExpressionChanged = Op;
+    MadeChange = true;
+    ++NumChanged;
+    Op = NewOp;
+  }
+
+  // If the expression changed non-trivially then clear out all subclass data
+  // starting from the operator specified in ExpressionChanged, and compactify
+  // the operators to just before the expression root to guarantee that the
+  // expression tree is dominated by all of Ops.
+  if (ExpressionChanged)
+    do {
+      // Preserve FastMathFlags.
+      if (isa<FPMathOperator>(I)) {
+        FastMathFlags Flags = I->getFastMathFlags();
+        ExpressionChanged->clearSubclassOptionalData();
+        ExpressionChanged->setFastMathFlags(Flags);
+      } else
+        ExpressionChanged->clearSubclassOptionalData();
+
+      if (ExpressionChanged == I)
+        break;
+      ExpressionChanged->moveBefore(I);
+      ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
+    } while (1);
+
+  // Throw away any left over nodes from the original expression.
+  for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
+    RedoInsts.insert(NodesToRewrite[i]);
+}
+
+/// Insert instructions before the instruction pointed to by BI,
+/// that computes the negative version of the value specified.  The negative
+/// version of the value is returned, and BI is left pointing at the instruction
+/// that should be processed next by the reassociation pass.
+/// Also add intermediate instructions to the redo list that are modified while
+/// pushing the negates through adds.  These will be revisited to see if
+/// additional opportunities have been exposed.
+static Value *NegateValue(Value *V, Instruction *BI,
+                          SetVector<AssertingVH<Instruction>> &ToRedo) {
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    if (C->getType()->isFPOrFPVectorTy()) {
+      return ConstantExpr::getFNeg(C);
+    }
+    return ConstantExpr::getNeg(C);
+  }
+
+
+  // We are trying to expose opportunity for reassociation.  One of the things
+  // that we want to do to achieve this is to push a negation as deep into an
+  // expression chain as possible, to expose the add instructions.  In practice,
+  // this means that we turn this:
+  //   X = -(A+12+C+D)   into    X = -A + -12 + -C + -D = -12 + -A + -C + -D
+  // so that later, a: Y = 12+X could get reassociated with the -12 to eliminate
+  // the constants.  We assume that instcombine will clean up the mess later if
+  // we introduce tons of unnecessary negation instructions.
+  //
+  if (BinaryOperator *I =
+          isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
+    // Push the negates through the add.
+    I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
+    I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
+    if (I->getOpcode() == Instruction::Add) {
+      I->setHasNoUnsignedWrap(false);
+      I->setHasNoSignedWrap(false);
+    }
+
+    // We must move the add instruction here, because the neg instructions do
+    // not dominate the old add instruction in general.  By moving it, we are
+    // assured that the neg instructions we just inserted dominate the
+    // instruction we are about to insert after them.
+    //
+    I->moveBefore(BI);
+    I->setName(I->getName()+".neg");
+
+    // Add the intermediate negates to the redo list as processing them later
+    // could expose more reassociating opportunities.
+    ToRedo.insert(I);
+    return I;
+  }
+
+  // Okay, we need to materialize a negated version of V with an instruction.
+  // Scan the use lists of V to see if we have one already.
+  for (User *U : V->users()) {
+    if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U))
+      continue;
+
+    // We found one!  Now we have to make sure that the definition dominates
+    // this use.  We do this by moving it to the entry block (if it is a
+    // non-instruction value) or right after the definition.  These negates will
+    // be zapped by reassociate later, so we don't need much finesse here.
+    BinaryOperator *TheNeg = cast<BinaryOperator>(U);
+
+    // Verify that the negate is in this function, V might be a constant expr.
+    if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
+      continue;
+
+    BasicBlock::iterator InsertPt;
+    if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
+      if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
+        InsertPt = II->getNormalDest()->begin();
+      } else {
+        InsertPt = ++InstInput->getIterator();
+      }
+      while (isa<PHINode>(InsertPt)) ++InsertPt;
+    } else {
+      InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
+    }
+    TheNeg->moveBefore(&*InsertPt);
+    if (TheNeg->getOpcode() == Instruction::Sub) {
+      TheNeg->setHasNoUnsignedWrap(false);
+      TheNeg->setHasNoSignedWrap(false);
+    } else {
+      TheNeg->andIRFlags(BI);
+    }
+    ToRedo.insert(TheNeg);
+    return TheNeg;
+  }
+
+  // Insert a 'neg' instruction that subtracts the value from zero to get the
+  // negation.
+  BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
+  ToRedo.insert(NewNeg);
+  return NewNeg;
+}
+
+/// Return true if we should break up this subtract of X-Y into (X + -Y).
+static bool ShouldBreakUpSubtract(Instruction *Sub) {
+  // If this is a negation, we can't split it up!
+  if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub))
+    return false;
+
+  // Don't breakup X - undef.
+  if (isa<UndefValue>(Sub->getOperand(1)))
+    return false;
+
+  // Don't bother to break this up unless either the LHS is an associable add or
+  // subtract or if this is only used by one.
+  Value *V0 = Sub->getOperand(0);
+  if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
+      isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
+    return true;
+  Value *V1 = Sub->getOperand(1);
+  if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
+      isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
+    return true;
+  Value *VB = Sub->user_back();
+  if (Sub->hasOneUse() &&
+      (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
+       isReassociableOp(VB, Instruction::Sub, Instruction::FSub)))
+    return true;
+
+  return false;
+}
+
+/// If we have (X-Y), and if either X is an add, or if this is only used by an
+/// add, transform this into (X+(0-Y)) to promote better reassociation.
+static BinaryOperator *
+BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
+  // Convert a subtract into an add and a neg instruction. This allows sub
+  // instructions to be commuted with other add instructions.
+  //
+  // Calculate the negative value of Operand 1 of the sub instruction,
+  // and set it as the RHS of the add instruction we just made.
+  //
+  Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);
+  BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
+  Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
+  Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
+  New->takeName(Sub);
+
+  // Everyone now refers to the add instruction.
+  Sub->replaceAllUsesWith(New);
+  New->setDebugLoc(Sub->getDebugLoc());
+
+  DEBUG(dbgs() << "Negated: " << *New << '\n');
+  return New;
+}
+
+/// If this is a shift of a reassociable multiply or is used by one, change
+/// this into a multiply by a constant to assist with further reassociation.
+static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
+  Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
+  MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
+
+  BinaryOperator *Mul =
+    BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
+  Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
+  Mul->takeName(Shl);
+
+  // Everyone now refers to the mul instruction.
+  Shl->replaceAllUsesWith(Mul);
+  Mul->setDebugLoc(Shl->getDebugLoc());
+
+  // We can safely preserve the nuw flag in all cases.  It's also safe to turn a
+  // nuw nsw shl into a nuw nsw mul.  However, nsw in isolation requires special
+  // handling.
+  bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
+  bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
+  if (NSW && NUW)
+    Mul->setHasNoSignedWrap(true);
+  Mul->setHasNoUnsignedWrap(NUW);
+  return Mul;
+}
+
+/// Scan backwards and forwards among values with the same rank as element i
+/// to see if X exists.  If X does not exist, return i.  This is useful when
+/// scanning for 'x' when we see '-x' because they both get the same rank.
+static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
+                                  Value *X) {
+  unsigned XRank = Ops[i].Rank;
+  unsigned e = Ops.size();
+  for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
+    if (Ops[j].Op == X)
+      return j;
+    if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+      if (Instruction *I2 = dyn_cast<Instruction>(X))
+        if (I1->isIdenticalTo(I2))
+          return j;
+  }
+  // Scan backwards.
+  for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) {
+    if (Ops[j].Op == X)
+      return j;
+    if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+      if (Instruction *I2 = dyn_cast<Instruction>(X))
+        if (I1->isIdenticalTo(I2))
+          return j;
+  }
+  return i;
+}
+
+/// Emit a tree of add instructions, summing Ops together
+/// and returning the result.  Insert the tree before I.
+static Value *EmitAddTreeOfValues(Instruction *I,
+                                  SmallVectorImpl<WeakVH> &Ops){
+  if (Ops.size() == 1) return Ops.back();
+
+  Value *V1 = Ops.back();
+  Ops.pop_back();
+  Value *V2 = EmitAddTreeOfValues(I, Ops);
+  return CreateAdd(V2, V1, "tmp", I, I);
+}
+
+/// If V is an expression tree that is a multiplication sequence,
+/// and if this sequence contains a multiply by Factor,
+/// remove Factor from the tree and return the new tree.
+Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
+  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+  if (!BO)
+    return nullptr;
+
+  SmallVector<RepeatedValue, 8> Tree;
+  MadeChange |= LinearizeExprTree(BO, Tree);
+  SmallVector<ValueEntry, 8> Factors;
+  Factors.reserve(Tree.size());
+  for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+    RepeatedValue E = Tree[i];
+    Factors.append(E.second.getZExtValue(),
+                   ValueEntry(getRank(E.first), E.first));
+  }
+
+  bool FoundFactor = false;
+  bool NeedsNegate = false;
+  for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+    if (Factors[i].Op == Factor) {
+      FoundFactor = true;
+      Factors.erase(Factors.begin()+i);
+      break;
+    }
+
+    // If this is a negative version of this factor, remove it.
+    if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
+      if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
+        if (FC1->getValue() == -FC2->getValue()) {
+          FoundFactor = NeedsNegate = true;
+          Factors.erase(Factors.begin()+i);
+          break;
+        }
+    } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
+      if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
+        APFloat F1(FC1->getValueAPF());
+        APFloat F2(FC2->getValueAPF());
+        F2.changeSign();
+        if (F1.compare(F2) == APFloat::cmpEqual) {
+          FoundFactor = NeedsNegate = true;
+          Factors.erase(Factors.begin() + i);
+          break;
+        }
+      }
+    }
+  }
+
+  if (!FoundFactor) {
+    // Make sure to restore the operands to the expression tree.
+    RewriteExprTree(BO, Factors);
+    return nullptr;
+  }
+
+  BasicBlock::iterator InsertPt = ++BO->getIterator();
+
+  // If this was just a single multiply, remove the multiply and return the only
+  // remaining operand.
+  if (Factors.size() == 1) {
+    RedoInsts.insert(BO);
+    V = Factors[0].Op;
+  } else {
+    RewriteExprTree(BO, Factors);
+    V = BO;
+  }
+
+  if (NeedsNegate)
+    V = CreateNeg(V, "neg", &*InsertPt, BO);
+
+  return V;
+}
+
+/// If V is a single-use multiply, recursively add its operands as factors,
+/// otherwise add V to the list of factors.
+///
+/// Ops is the top-level list of add operands we're trying to factor.
+static void FindSingleUseMultiplyFactors(Value *V,
+                                         SmallVectorImpl<Value*> &Factors,
+                                       const SmallVectorImpl<ValueEntry> &Ops) {
+  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+  if (!BO) {
+    Factors.push_back(V);
+    return;
+  }
+
+  // Otherwise, add the LHS and RHS to the list of factors.
+  FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops);
+  FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops);
+}
+
+/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
+/// This optimizes based on identities.  If it can be reduced to a single Value,
+/// it is returned, otherwise the Ops list is mutated as necessary.
+static Value *OptimizeAndOrXor(unsigned Opcode,
+                               SmallVectorImpl<ValueEntry> &Ops) {
+  // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
+  // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    // First, check for X and ~X in the operand list.
+    assert(i < Ops.size());
+    if (BinaryOperator::isNot(Ops[i].Op)) {    // Cannot occur for ^.
+      Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
+      unsigned FoundX = FindInOperandList(Ops, i, X);
+      if (FoundX != i) {
+        if (Opcode == Instruction::And)   // ...&X&~X = 0
+          return Constant::getNullValue(X->getType());
+
+        if (Opcode == Instruction::Or)    // ...|X|~X = -1
+          return Constant::getAllOnesValue(X->getType());
+      }
+    }
+
+    // Next, check for duplicate pairs of values, which we assume are next to
+    // each other, due to our sorting criteria.
+    assert(i < Ops.size());
+    if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
+      if (Opcode == Instruction::And || Opcode == Instruction::Or) {
+        // Drop duplicate values for And and Or.
+        Ops.erase(Ops.begin()+i);
+        --i; --e;
+        ++NumAnnihil;
+        continue;
+      }
+
+      // Drop pairs of values for Xor.
+      assert(Opcode == Instruction::Xor);
+      if (e == 2)
+        return Constant::getNullValue(Ops[0].Op->getType());
+
+      // Y ^ X^X -> Y
+      Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+      i -= 1; e -= 2;
+      ++NumAnnihil;
+    }
+  }
+  return nullptr;
+}
+
+/// Helper function of CombineXorOpnd(). It creates a bitwise-and
+/// instruction with the given two operands, and return the resulting
+/// instruction. There are two special cases: 1) if the constant operand is 0,
+/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
+/// be returned.
+static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, 
+                             const APInt &ConstOpnd) {
+  if (ConstOpnd != 0) {
+    if (!ConstOpnd.isAllOnesValue()) {
+      LLVMContext &Ctx = Opnd->getType()->getContext();
+      Instruction *I;
+      I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd),
+                                    "and.ra", InsertBefore);
+      I->setDebugLoc(InsertBefore->getDebugLoc());
+      return I;
+    }
+    return Opnd;
+  }
+  return nullptr;
+}
+
+// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
+// into "R ^ C", where C would be 0, and R is a symbolic value.
+//
+// If it was successful, true is returned, and the "R" and "C" is returned
+// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
+// and both "Res" and "ConstOpnd" remain unchanged.
+//  
+bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+                                 APInt &ConstOpnd, Value *&Res) {
+  // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2 
+  //                       = ((x | c1) ^ c1) ^ (c1 ^ c2)
+  //                       = (x & ~c1) ^ (c1 ^ c2)
+  // It is useful only when c1 == c2.
+  if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) {
+    if (!Opnd1->getValue()->hasOneUse())
+      return false;
+
+    const APInt &C1 = Opnd1->getConstPart();
+    if (C1 != ConstOpnd)
+      return false;
+
+    Value *X = Opnd1->getSymbolicPart();
+    Res = createAndInstr(I, X, ~C1);
+    // ConstOpnd was C2, now C1 ^ C2.
+    ConstOpnd ^= C1;
+
+    if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+      RedoInsts.insert(T);
+    return true;
+  }
+  return false;
+}
+
+                           
+// Helper function of OptimizeXor(). It tries to simplify
+// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
+// symbolic value. 
+// 
+// If it was successful, true is returned, and the "R" and "C" is returned 
+// via "Res" and "ConstOpnd", respectively (If the entire expression is
+// evaluated to a constant, the Res is set to NULL); otherwise, false is
+// returned, and both "Res" and "ConstOpnd" remain unchanged.
+bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
+                                 APInt &ConstOpnd, Value *&Res) {
+  Value *X = Opnd1->getSymbolicPart();
+  if (X != Opnd2->getSymbolicPart())
+    return false;
+
+  // This many instruction become dead.(At least "Opnd1 ^ Opnd2" will die.)
+  int DeadInstNum = 1;
+  if (Opnd1->getValue()->hasOneUse())
+    DeadInstNum++;
+  if (Opnd2->getValue()->hasOneUse())
+    DeadInstNum++;
+
+  // Xor-Rule 2:
+  //  (x | c1) ^ (x & c2)
+  //   = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
+  //   = (x & ~c1) ^ (x & c2) ^ c1               // Xor-Rule 1
+  //   = (x & c3) ^ c1, where c3 = ~c1 ^ c2      // Xor-rule 3
+  //
+  if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
+    if (Opnd2->isOrExpr())
+      std::swap(Opnd1, Opnd2);
+
+    const APInt &C1 = Opnd1->getConstPart();
+    const APInt &C2 = Opnd2->getConstPart();
+    APInt C3((~C1) ^ C2);
+
+    // Do not increase code size!
+    if (C3 != 0 && !C3.isAllOnesValue()) {
+      int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+      if (NewInstNum > DeadInstNum)
+        return false;
+    }
+
+    Res = createAndInstr(I, X, C3);
+    ConstOpnd ^= C1;
+
+  } else if (Opnd1->isOrExpr()) {
+    // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
+    //
+    const APInt &C1 = Opnd1->getConstPart();
+    const APInt &C2 = Opnd2->getConstPart();
+    APInt C3 = C1 ^ C2;
+    
+    // Do not increase code size
+    if (C3 != 0 && !C3.isAllOnesValue()) {
+      int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+      if (NewInstNum > DeadInstNum)
+        return false;
+    }
+
+    Res = createAndInstr(I, X, C3);
+    ConstOpnd ^= C3;
+  } else {
+    // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
+    //
+    const APInt &C1 = Opnd1->getConstPart();
+    const APInt &C2 = Opnd2->getConstPart();
+    APInt C3 = C1 ^ C2;
+    Res = createAndInstr(I, X, C3);
+  }
+
+  // Put the original operands in the Redo list; hope they will be deleted
+  // as dead code.
+  if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+    RedoInsts.insert(T);
+  if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
+    RedoInsts.insert(T);
+
+  return true;
+}
+
+/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
+/// to a single Value, it is returned, otherwise the Ops list is mutated as
+/// necessary.
+Value *Reassociate::OptimizeXor(Instruction *I,
+                                SmallVectorImpl<ValueEntry> &Ops) {
+  if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
+    return V;
+      
+  if (Ops.size() == 1)
+    return nullptr;
+
+  SmallVector<XorOpnd, 8> Opnds;
+  SmallVector<XorOpnd*, 8> OpndPtrs;
+  Type *Ty = Ops[0].Op->getType();
+  APInt ConstOpnd(Ty->getIntegerBitWidth(), 0);
+
+  // Step 1: Convert ValueEntry to XorOpnd
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    Value *V = Ops[i].Op;
+    if (!isa<ConstantInt>(V)) {
+      XorOpnd O(V);
+      O.setSymbolicRank(getRank(O.getSymbolicPart()));
+      Opnds.push_back(O);
+    } else
+      ConstOpnd ^= cast<ConstantInt>(V)->getValue();
+  }
+
+  // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds".
+  //  It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate
+  //  the "OpndPtrs" as well. For the similar reason, do not fuse this loop
+  //  with the previous loop --- the iterator of the "Opnds" may be invalidated
+  //  when new elements are added to the vector.
+  for (unsigned i = 0, e = Opnds.size(); i != e; ++i)
+    OpndPtrs.push_back(&Opnds[i]);
+
+  // Step 2: Sort the Xor-Operands in a way such that the operands containing
+  //  the same symbolic value cluster together. For instance, the input operand
+  //  sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
+  //  ("x | 123", "x & 789", "y & 456").
+  std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor());
+
+  // Step 3: Combine adjacent operands
+  XorOpnd *PrevOpnd = nullptr;
+  bool Changed = false;
+  for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
+    XorOpnd *CurrOpnd = OpndPtrs[i];
+    // The combined value
+    Value *CV;
+
+    // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
+    if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+      Changed = true;
+      if (CV)
+        *CurrOpnd = XorOpnd(CV);
+      else {
+        CurrOpnd->Invalidate();
+        continue;
+      }
+    }
+
+    if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
+      PrevOpnd = CurrOpnd;
+      continue;
+    }
+
+    // step 3.2: When previous and current operands share the same symbolic
+    //  value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd" 
+    //    
+    if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
+      // Remove previous operand
+      PrevOpnd->Invalidate();
+      if (CV) {
+        *CurrOpnd = XorOpnd(CV);
+        PrevOpnd = CurrOpnd;
+      } else {
+        CurrOpnd->Invalidate();
+        PrevOpnd = nullptr;
+      }
+      Changed = true;
+    }
+  }
+
+  // Step 4: Reassemble the Ops
+  if (Changed) {
+    Ops.clear();
+    for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
+      XorOpnd &O = Opnds[i];
+      if (O.isInvalid())
+        continue;
+      ValueEntry VE(getRank(O.getValue()), O.getValue());
+      Ops.push_back(VE);
+    }
+    if (ConstOpnd != 0) {
+      Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd);
+      ValueEntry VE(getRank(C), C);
+      Ops.push_back(VE);
+    }
+    int Sz = Ops.size();
+    if (Sz == 1)
+      return Ops.back().Op;
+    else if (Sz == 0) {
+      assert(ConstOpnd == 0);
+      return ConstantInt::get(Ty->getContext(), ConstOpnd);
+    }
+  }
+
+  return nullptr;
+}
+
+/// Optimize a series of operands to an 'add' instruction.  This
+/// optimizes based on identities.  If it can be reduced to a single Value, it
+/// is returned, otherwise the Ops list is mutated as necessary.
+Value *Reassociate::OptimizeAdd(Instruction *I,
+                                SmallVectorImpl<ValueEntry> &Ops) {
+  // Scan the operand lists looking for X and -X pairs.  If we find any, we
+  // can simplify expressions like X+-X == 0 and X+~X ==-1.  While we're at it,
+  // scan for any
+  // duplicates.  We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
+
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    Value *TheOp = Ops[i].Op;
+    // Check to see if we've seen this operand before.  If so, we factor all
+    // instances of the operand together.  Due to our sorting criteria, we know
+    // that these need to be next to each other in the vector.
+    if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
+      // Rescan the list, remove all instances of this operand from the expr.
+      unsigned NumFound = 0;
+      do {
+        Ops.erase(Ops.begin()+i);
+        ++NumFound;
+      } while (i != Ops.size() && Ops[i].Op == TheOp);
+
+      DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
+      ++NumFactor;
+
+      // Insert a new multiply.
+      Type *Ty = TheOp->getType();
+      Constant *C = Ty->isIntOrIntVectorTy() ?
+        ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
+      Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
+
+      // Now that we have inserted a multiply, optimize it. This allows us to
+      // handle cases that require multiple factoring steps, such as this:
+      // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
+      RedoInsts.insert(Mul);
+
+      // If every add operand was a duplicate, return the multiply.
+      if (Ops.empty())
+        return Mul;
+
+      // Otherwise, we had some input that didn't have the dupe, such as
+      // "A + A + B" -> "A*2 + B".  Add the new multiply to the list of
+      // things being added by this operation.
+      Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
+
+      --i;
+      e = Ops.size();
+      continue;
+    }
+
+    // Check for X and -X or X and ~X in the operand list.
+    if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) &&
+        !BinaryOperator::isNot(TheOp))
+      continue;
+
+    Value *X = nullptr;
+    if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))
+      X = BinaryOperator::getNegArgument(TheOp);
+    else if (BinaryOperator::isNot(TheOp))
+      X = BinaryOperator::getNotArgument(TheOp);
+
+    unsigned FoundX = FindInOperandList(Ops, i, X);
+    if (FoundX == i)
+      continue;
+
+    // Remove X and -X from the operand list.
+    if (Ops.size() == 2 &&
+        (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)))
+      return Constant::getNullValue(X->getType());
+
+    // Remove X and ~X from the operand list.
+    if (Ops.size() == 2 && BinaryOperator::isNot(TheOp))
+      return Constant::getAllOnesValue(X->getType());
+
+    Ops.erase(Ops.begin()+i);
+    if (i < FoundX)
+      --FoundX;
+    else
+      --i;   // Need to back up an extra one.
+    Ops.erase(Ops.begin()+FoundX);
+    ++NumAnnihil;
+    --i;     // Revisit element.
+    e -= 2;  // Removed two elements.
+
+    // if X and ~X we append -1 to the operand list.
+    if (BinaryOperator::isNot(TheOp)) {
+      Value *V = Constant::getAllOnesValue(X->getType());
+      Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
+      e += 1;
+    }
+  }
+
+  // Scan the operand list, checking to see if there are any common factors
+  // between operands.  Consider something like A*A+A*B*C+D.  We would like to
+  // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
+  // To efficiently find this, we count the number of times a factor occurs
+  // for any ADD operands that are MULs.
+  DenseMap<Value*, unsigned> FactorOccurrences;
+
+  // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
+  // where they are actually the same multiply.
+  unsigned MaxOcc = 0;
+  Value *MaxOccVal = nullptr;
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    BinaryOperator *BOp =
+        isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+    if (!BOp)
+      continue;
+
+    // Compute all of the factors of this added value.
+    SmallVector<Value*, 8> Factors;
+    FindSingleUseMultiplyFactors(BOp, Factors, Ops);
+    assert(Factors.size() > 1 && "Bad linearize!");
+
+    // Add one to FactorOccurrences for each unique factor in this op.
+    SmallPtrSet<Value*, 8> Duplicates;
+    for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+      Value *Factor = Factors[i];
+      if (!Duplicates.insert(Factor).second)
+        continue;
+
+      unsigned Occ = ++FactorOccurrences[Factor];
+      if (Occ > MaxOcc) {
+        MaxOcc = Occ;
+        MaxOccVal = Factor;
+      }
+
+      // If Factor is a negative constant, add the negated value as a factor
+      // because we can percolate the negate out.  Watch for minint, which
+      // cannot be positivified.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
+        if (CI->isNegative() && !CI->isMinValue(true)) {
+          Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
+          assert(!Duplicates.count(Factor) &&
+                 "Shouldn't have two constant factors, missed a canonicalize");
+          unsigned Occ = ++FactorOccurrences[Factor];
+          if (Occ > MaxOcc) {
+            MaxOcc = Occ;
+            MaxOccVal = Factor;
+          }
+        }
+      } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
+        if (CF->isNegative()) {
+          APFloat F(CF->getValueAPF());
+          F.changeSign();
+          Factor = ConstantFP::get(CF->getContext(), F);
+          assert(!Duplicates.count(Factor) &&
+                 "Shouldn't have two constant factors, missed a canonicalize");
+          unsigned Occ = ++FactorOccurrences[Factor];
+          if (Occ > MaxOcc) {
+            MaxOcc = Occ;
+            MaxOccVal = Factor;
+          }
+        }
+      }
+    }
+  }
+
+  // If any factor occurred more than one time, we can pull it out.
+  if (MaxOcc > 1) {
+    DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
+    ++NumFactor;
+
+    // Create a new instruction that uses the MaxOccVal twice.  If we don't do
+    // this, we could otherwise run into situations where removing a factor
+    // from an expression will drop a use of maxocc, and this can cause
+    // RemoveFactorFromExpression on successive values to behave differently.
+    Instruction *DummyInst =
+        I->getType()->isIntOrIntVectorTy()
+            ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
+            : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
+
+    SmallVector<WeakVH, 4> NewMulOps;
+    for (unsigned i = 0; i != Ops.size(); ++i) {
+      // Only try to remove factors from expressions we're allowed to.
+      BinaryOperator *BOp =
+          isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+      if (!BOp)
+        continue;
+
+      if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
+        // The factorized operand may occur several times.  Convert them all in
+        // one fell swoop.
+        for (unsigned j = Ops.size(); j != i;) {
+          --j;
+          if (Ops[j].Op == Ops[i].Op) {
+            NewMulOps.push_back(V);
+            Ops.erase(Ops.begin()+j);
+          }
+        }
+        --i;
+      }
+    }
+
+    // No need for extra uses anymore.
+    delete DummyInst;
+
+    unsigned NumAddedValues = NewMulOps.size();
+    Value *V = EmitAddTreeOfValues(I, NewMulOps);
+
+    // Now that we have inserted the add tree, optimize it. This allows us to
+    // handle cases that require multiple factoring steps, such as this:
+    // A*A*B + A*A*C   -->   A*(A*B+A*C)   -->   A*(A*(B+C))
+    assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
+    (void)NumAddedValues;
+    if (Instruction *VI = dyn_cast<Instruction>(V))
+      RedoInsts.insert(VI);
+
+    // Create the multiply.
+    Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I);
+
+    // Rerun associate on the multiply in case the inner expression turned into
+    // a multiply.  We want to make sure that we keep things in canonical form.
+    RedoInsts.insert(V2);
+
+    // If every add operand included the factor (e.g. "A*B + A*C"), then the
+    // entire result expression is just the multiply "A*(B+C)".
+    if (Ops.empty())
+      return V2;
+
+    // Otherwise, we had some input that didn't have the factor, such as
+    // "A*B + A*C + D" -> "A*(B+C) + D".  Add the new multiply to the list of
+    // things being added by this operation.
+    Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
+  }
+
+  return nullptr;
+}
+
+/// \brief Build up a vector of value/power pairs factoring a product.
+///
+/// Given a series of multiplication operands, build a vector of factors and
+/// the powers each is raised to when forming the final product. Sort them in
+/// the order of descending power.
+///
+///      (x*x)          -> [(x, 2)]
+///     ((x*x)*x)       -> [(x, 3)]
+///   ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
+///
+/// \returns Whether any factors have a power greater than one.
+bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+                                         SmallVectorImpl<Factor> &Factors) {
+  // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
+  // Compute the sum of powers of simplifiable factors.
+  unsigned FactorPowerSum = 0;
+  for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) {
+    Value *Op = Ops[Idx-1].Op;
+
+    // Count the number of occurrences of this value.
+    unsigned Count = 1;
+    for (; Idx < Size && Ops[Idx].Op == Op; ++Idx)
+      ++Count;
+    // Track for simplification all factors which occur 2 or more times.
+    if (Count > 1)
+      FactorPowerSum += Count;
+  }
+
+  // We can only simplify factors if the sum of the powers of our simplifiable
+  // factors is 4 or higher. When that is the case, we will *always* have
+  // a simplification. This is an important invariant to prevent cyclicly
+  // trying to simplify already minimal formations.
+  if (FactorPowerSum < 4)
+    return false;
+
+  // Now gather the simplifiable factors, removing them from Ops.
+  FactorPowerSum = 0;
+  for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) {
+    Value *Op = Ops[Idx-1].Op;
+
+    // Count the number of occurrences of this value.
+    unsigned Count = 1;
+    for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
+      ++Count;
+    if (Count == 1)
+      continue;
+    // Move an even number of occurrences to Factors.
+    Count &= ~1U;
+    Idx -= Count;
+    FactorPowerSum += Count;
+    Factors.push_back(Factor(Op, Count));
+    Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
+  }
+
+  // None of the adjustments above should have reduced the sum of factor powers
+  // below our mininum of '4'.
+  assert(FactorPowerSum >= 4);
+
+  std::stable_sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter());
+  return true;
+}
+
+/// \brief Build a tree of multiplies, computing the product of Ops.
+static Value *buildMultiplyTree(IRBuilder<> &Builder,
+                                SmallVectorImpl<Value*> &Ops) {
+  if (Ops.size() == 1)
+    return Ops.back();
+
+  Value *LHS = Ops.pop_back_val();
+  do {
+    if (LHS->getType()->isIntOrIntVectorTy())
+      LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
+    else
+      LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
+  } while (!Ops.empty());
+
+  return LHS;
+}
+
+/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
+///
+/// Given a vector of values raised to various powers, where no two values are
+/// equal and the powers are sorted in decreasing order, compute the minimal
+/// DAG of multiplies to compute the final product, and return that product
+/// value.
+Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+                                            SmallVectorImpl<Factor> &Factors) {
+  assert(Factors[0].Power);
+  SmallVector<Value *, 4> OuterProduct;
+  for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
+       Idx < Size && Factors[Idx].Power > 0; ++Idx) {
+    if (Factors[Idx].Power != Factors[LastIdx].Power) {
+      LastIdx = Idx;
+      continue;
+    }
+
+    // We want to multiply across all the factors with the same power so that
+    // we can raise them to that power as a single entity. Build a mini tree
+    // for that.
+    SmallVector<Value *, 4> InnerProduct;
+    InnerProduct.push_back(Factors[LastIdx].Base);
+    do {
+      InnerProduct.push_back(Factors[Idx].Base);
+      ++Idx;
+    } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
+
+    // Reset the base value of the first factor to the new expression tree.
+    // We'll remove all the factors with the same power in a second pass.
+    Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
+    if (Instruction *MI = dyn_cast<Instruction>(M))
+      RedoInsts.insert(MI);
+
+    LastIdx = Idx;
+  }
+  // Unique factors with equal powers -- we've folded them into the first one's
+  // base.
+  Factors.erase(std::unique(Factors.begin(), Factors.end(),
+                            Factor::PowerEqual()),
+                Factors.end());
+
+  // Iteratively collect the base of each factor with an add power into the
+  // outer product, and halve each power in preparation for squaring the
+  // expression.
+  for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
+    if (Factors[Idx].Power & 1)
+      OuterProduct.push_back(Factors[Idx].Base);
+    Factors[Idx].Power >>= 1;
+  }
+  if (Factors[0].Power) {
+    Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
+    OuterProduct.push_back(SquareRoot);
+    OuterProduct.push_back(SquareRoot);
+  }
+  if (OuterProduct.size() == 1)
+    return OuterProduct.front();
+
+  Value *V = buildMultiplyTree(Builder, OuterProduct);
+  return V;
+}
+
+Value *Reassociate::OptimizeMul(BinaryOperator *I,
+                                SmallVectorImpl<ValueEntry> &Ops) {
+  // We can only optimize the multiplies when there is a chain of more than
+  // three, such that a balanced tree might require fewer total multiplies.
+  if (Ops.size() < 4)
+    return nullptr;
+
+  // Try to turn linear trees of multiplies without other uses of the
+  // intermediate stages into minimal multiply DAGs with perfect sub-expression
+  // re-use.
+  SmallVector<Factor, 4> Factors;
+  if (!collectMultiplyFactors(Ops, Factors))
+    return nullptr; // All distinct factors, so nothing left for us to do.
+
+  IRBuilder<> Builder(I);
+  Value *V = buildMinimalMultiplyDAG(Builder, Factors);
+  if (Ops.empty())
+    return V;
+
+  ValueEntry NewEntry = ValueEntry(getRank(V), V);
+  Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry);
+  return nullptr;
+}
+
+Value *Reassociate::OptimizeExpression(BinaryOperator *I,
+                                       SmallVectorImpl<ValueEntry> &Ops) {
+  // Now that we have the linearized expression tree, try to optimize it.
+  // Start by folding any constants that we found.
+  Constant *Cst = nullptr;
+  unsigned Opcode = I->getOpcode();
+  while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
+    Constant *C = cast<Constant>(Ops.pop_back_val().Op);
+    Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
+  }
+  // If there was nothing but constants then we are done.
+  if (Ops.empty())
+    return Cst;
+
+  // Put the combined constant back at the end of the operand list, except if
+  // there is no point.  For example, an add of 0 gets dropped here, while a
+  // multiplication by zero turns the whole expression into zero.
+  if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
+    if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
+      return Cst;
+    Ops.push_back(ValueEntry(0, Cst));
+  }
+
+  if (Ops.size() == 1) return Ops[0].Op;
+
+  // Handle destructive annihilation due to identities between elements in the
+  // argument list here.
+  unsigned NumOps = Ops.size();
+  switch (Opcode) {
+  default: break;
+  case Instruction::And:
+  case Instruction::Or:
+    if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
+      return Result;
+    break;
+
+  case Instruction::Xor:
+    if (Value *Result = OptimizeXor(I, Ops))
+      return Result;
+    break;
+
+  case Instruction::Add:
+  case Instruction::FAdd:
+    if (Value *Result = OptimizeAdd(I, Ops))
+      return Result;
+    break;
+
+  case Instruction::Mul:
+  case Instruction::FMul:
+    if (Value *Result = OptimizeMul(I, Ops))
+      return Result;
+    break;
+  }
+
+  if (Ops.size() != NumOps)
+    return OptimizeExpression(I, Ops);
+  return nullptr;
+}
+
+// Remove dead instructions and if any operands are trivially dead add them to
+// Insts so they will be removed as well.
+void Reassociate::RecursivelyEraseDeadInsts(
+    Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
+  assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+  SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
+  ValueRankMap.erase(I);
+  Insts.remove(I);
+  RedoInsts.remove(I);
+  I->eraseFromParent();
+  for (auto Op : Ops)
+    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+      if (OpInst->use_empty())
+        Insts.insert(OpInst);
+}
+
+/// Zap the given instruction, adding interesting operands to the work list.
+void Reassociate::EraseInst(Instruction *I) {
+  assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+  SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
+  // Erase the dead instruction.
+  ValueRankMap.erase(I);
+  RedoInsts.remove(I);
+  I->eraseFromParent();
+  // Optimize its operands.
+  SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) {
+      // If this is a node in an expression tree, climb to the expression root
+      // and add that since that's where optimization actually happens.
+      unsigned Opcode = Op->getOpcode();
+      while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
+             Visited.insert(Op).second)
+        Op = Op->user_back();
+      RedoInsts.insert(Op);
+    }
+}
+
+// Canonicalize expressions of the following form:
+//  x + (-Constant * y) -> x - (Constant * y)
+//  x - (-Constant * y) -> x + (Constant * y)
+Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) {
+  if (!I->hasOneUse() || I->getType()->isVectorTy())
+    return nullptr;
+
+  // Must be a fmul or fdiv instruction.
+  unsigned Opcode = I->getOpcode();
+  if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv)
+    return nullptr;
+
+  auto *C0 = dyn_cast<ConstantFP>(I->getOperand(0));
+  auto *C1 = dyn_cast<ConstantFP>(I->getOperand(1));
+
+  // Both operands are constant, let it get constant folded away.
+  if (C0 && C1)
+    return nullptr;
+
+  ConstantFP *CF = C0 ? C0 : C1;
+
+  // Must have one constant operand.
+  if (!CF)
+    return nullptr;
+
+  // Must be a negative ConstantFP.
+  if (!CF->isNegative())
+    return nullptr;
+
+  // User must be a binary operator with one or more uses.
+  Instruction *User = I->user_back();
+  if (!isa<BinaryOperator>(User) || !User->hasNUsesOrMore(1))
+    return nullptr;
+
+  unsigned UserOpcode = User->getOpcode();
+  if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub)
+    return nullptr;
+
+  // Subtraction is not commutative. Explicitly, the following transform is
+  // not valid: (-Constant * y) - x  -> x + (Constant * y)
+  if (!User->isCommutative() && User->getOperand(1) != I)
+    return nullptr;
+
+  // Change the sign of the constant.
+  APFloat Val = CF->getValueAPF();
+  Val.changeSign();
+  I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val));
+
+  // Canonicalize I to RHS to simplify the next bit of logic. E.g.,
+  // ((-Const*y) + x) -> (x + (-Const*y)).
+  if (User->getOperand(0) == I && User->isCommutative())
+    cast<BinaryOperator>(User)->swapOperands();
+
+  Value *Op0 = User->getOperand(0);
+  Value *Op1 = User->getOperand(1);
+  BinaryOperator *NI;
+  switch (UserOpcode) {
+  default:
+    llvm_unreachable("Unexpected Opcode!");
+  case Instruction::FAdd:
+    NI = BinaryOperator::CreateFSub(Op0, Op1);
+    NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
+    break;
+  case Instruction::FSub:
+    NI = BinaryOperator::CreateFAdd(Op0, Op1);
+    NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
+    break;
+  }
+
+  NI->insertBefore(User);
+  NI->setName(User->getName());
+  User->replaceAllUsesWith(NI);
+  NI->setDebugLoc(I->getDebugLoc());
+  RedoInsts.insert(I);
+  MadeChange = true;
+  return NI;
+}
+
+/// Inspect and optimize the given instruction. Note that erasing
+/// instructions is not allowed.
+void Reassociate::OptimizeInst(Instruction *I) {
+  // Only consider operations that we understand.
+  if (!isa<BinaryOperator>(I))
+    return;
+
+  if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
+    // If an operand of this shift is a reassociable multiply, or if the shift
+    // is used by a reassociable multiply or add, turn into a multiply.
+    if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
+        (I->hasOneUse() &&
+         (isReassociableOp(I->user_back(), Instruction::Mul) ||
+          isReassociableOp(I->user_back(), Instruction::Add)))) {
+      Instruction *NI = ConvertShiftToMul(I);
+      RedoInsts.insert(I);
+      MadeChange = true;
+      I = NI;
+    }
+
+  // Canonicalize negative constants out of expressions.
+  if (Instruction *Res = canonicalizeNegConstExpr(I))
+    I = Res;
+
+  // Commute binary operators, to canonicalize the order of their operands.
+  // This can potentially expose more CSE opportunities, and makes writing other
+  // transformations simpler.
+  if (I->isCommutative())
+    canonicalizeOperands(I);
+
+  // TODO: We should optimize vector Xor instructions, but they are
+  // currently unsupported.
+  if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor)
+    return;
+
+  // Don't optimize floating point instructions that don't have unsafe algebra.
+  if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())
+    return;
+
+  // Do not reassociate boolean (i1) expressions.  We want to preserve the
+  // original order of evaluation for short-circuited comparisons that
+  // SimplifyCFG has folded to AND/OR expressions.  If the expression
+  // is not further optimized, it is likely to be transformed back to a
+  // short-circuited form for code gen, and the source order may have been
+  // optimized for the most likely conditions.
+  if (I->getType()->isIntegerTy(1))
+    return;
+
+  // If this is a subtract instruction which is not already in negate form,
+  // see if we can convert it to X+-Y.
+  if (I->getOpcode() == Instruction::Sub) {
+    if (ShouldBreakUpSubtract(I)) {
+      Instruction *NI = BreakUpSubtract(I, RedoInsts);
+      RedoInsts.insert(I);
+      MadeChange = true;
+      I = NI;
+    } else if (BinaryOperator::isNeg(I)) {
+      // Otherwise, this is a negation.  See if the operand is a multiply tree
+      // and if this is not an inner node of a multiply tree.
+      if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
+          (!I->hasOneUse() ||
+           !isReassociableOp(I->user_back(), Instruction::Mul))) {
+        Instruction *NI = LowerNegateToMultiply(I);
+        // If the negate was simplified, revisit the users to see if we can
+        // reassociate further.
+        for (User *U : NI->users()) {
+          if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+            RedoInsts.insert(Tmp);
+        }
+        RedoInsts.insert(I);
+        MadeChange = true;
+        I = NI;
+      }
+    }
+  } else if (I->getOpcode() == Instruction::FSub) {
+    if (ShouldBreakUpSubtract(I)) {
+      Instruction *NI = BreakUpSubtract(I, RedoInsts);
+      RedoInsts.insert(I);
+      MadeChange = true;
+      I = NI;
+    } else if (BinaryOperator::isFNeg(I)) {
+      // Otherwise, this is a negation.  See if the operand is a multiply tree
+      // and if this is not an inner node of a multiply tree.
+      if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
+          (!I->hasOneUse() ||
+           !isReassociableOp(I->user_back(), Instruction::FMul))) {
+        // If the negate was simplified, revisit the users to see if we can
+        // reassociate further.
+        Instruction *NI = LowerNegateToMultiply(I);
+        for (User *U : NI->users()) {
+          if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+            RedoInsts.insert(Tmp);
+        }
+        RedoInsts.insert(I);
+        MadeChange = true;
+        I = NI;
+      }
+    }
+  }
+
+  // If this instruction is an associative binary operator, process it.
+  if (!I->isAssociative()) return;
+  BinaryOperator *BO = cast<BinaryOperator>(I);
+
+  // If this is an interior node of a reassociable tree, ignore it until we
+  // get to the root of the tree, to avoid N^2 analysis.
+  unsigned Opcode = BO->getOpcode();
+  if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
+    // During the initial run we will get to the root of the tree.
+    // But if we get here while we are redoing instructions, there is no
+    // guarantee that the root will be visited. So Redo later
+    if (BO->user_back() != BO &&
+        BO->getParent() == BO->user_back()->getParent())
+      RedoInsts.insert(BO->user_back());
+    return;
+  }
+
+  // If this is an add tree that is used by a sub instruction, ignore it
+  // until we process the subtract.
+  if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
+      cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
+    return;
+  if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
+      cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
+    return;
+
+  ReassociateExpression(BO);
+}
+
+void Reassociate::ReassociateExpression(BinaryOperator *I) {
+  // First, walk the expression tree, linearizing the tree, collecting the
+  // operand information.
+  SmallVector<RepeatedValue, 8> Tree;
+  MadeChange |= LinearizeExprTree(I, Tree);
+  SmallVector<ValueEntry, 8> Ops;
+  Ops.reserve(Tree.size());
+  for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+    RepeatedValue E = Tree[i];
+    Ops.append(E.second.getZExtValue(),
+               ValueEntry(getRank(E.first), E.first));
+  }
+
+  DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+  // Now that we have linearized the tree to a list and have gathered all of
+  // the operands and their ranks, sort the operands by their rank.  Use a
+  // stable_sort so that values with equal ranks will have their relative
+  // positions maintained (and so the compiler is deterministic).  Note that
+  // this sorts so that the highest ranking values end up at the beginning of
+  // the vector.
+  std::stable_sort(Ops.begin(), Ops.end());
+
+  // Now that we have the expression tree in a convenient
+  // sorted form, optimize it globally if possible.
+  if (Value *V = OptimizeExpression(I, Ops)) {
+    if (V == I)
+      // Self-referential expression in unreachable code.
+      return;
+    // This expression tree simplified to something that isn't a tree,
+    // eliminate it.
+    DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
+    I->replaceAllUsesWith(V);
+    if (Instruction *VI = dyn_cast<Instruction>(V))
+      VI->setDebugLoc(I->getDebugLoc());
+    RedoInsts.insert(I);
+    ++NumAnnihil;
+    return;
+  }
+
+  // We want to sink immediates as deeply as possible except in the case where
+  // this is a multiply tree used only by an add, and the immediate is a -1.
+  // In this case we reassociate to put the negation on the outside so that we
+  // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
+  if (I->hasOneUse()) {
+    if (I->getOpcode() == Instruction::Mul &&
+        cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
+        isa<ConstantInt>(Ops.back().Op) &&
+        cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
+      ValueEntry Tmp = Ops.pop_back_val();
+      Ops.insert(Ops.begin(), Tmp);
+    } else if (I->getOpcode() == Instruction::FMul &&
+               cast<Instruction>(I->user_back())->getOpcode() ==
+                   Instruction::FAdd &&
+               isa<ConstantFP>(Ops.back().Op) &&
+               cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) {
+      ValueEntry Tmp = Ops.pop_back_val();
+      Ops.insert(Ops.begin(), Tmp);
+    }
+  }
+
+  DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+  if (Ops.size() == 1) {
+    if (Ops[0].Op == I)
+      // Self-referential expression in unreachable code.
+      return;
+
+    // This expression tree simplified to something that isn't a tree,
+    // eliminate it.
+    I->replaceAllUsesWith(Ops[0].Op);
+    if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
+      OI->setDebugLoc(I->getDebugLoc());
+    RedoInsts.insert(I);
+    return;
+  }
+
+  // Now that we ordered and optimized the expressions, splat them back into
+  // the expression tree, removing any unneeded nodes.
+  RewriteExprTree(I, Ops);
+}
+
+bool Reassociate::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  // Calculate the rank map for F
+  BuildRankMap(F);
+
+  MadeChange = false;
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    // Optimize every instruction in the basic block.
+    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; )
+      if (isInstructionTriviallyDead(&*II)) {
+        EraseInst(&*II++);
+      } else {
+        OptimizeInst(&*II);
+        assert(II->getParent() == BI && "Moved to a different block!");
+        ++II;
+      }
+
+    // Make a copy of all the instructions to be redone so we can remove dead
+    // instructions.
+    SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
+    // Iterate over all instructions to be reevaluated and remove trivially dead
+    // instructions. If any operand of the trivially dead instruction becomes
+    // dead mark it for deletion as well. Continue this process until all
+    // trivially dead instructions have been removed.
+    while (!ToRedo.empty()) {
+      Instruction *I = ToRedo.pop_back_val();
+      if (isInstructionTriviallyDead(I))
+        RecursivelyEraseDeadInsts(I, ToRedo);
+    }
+
+    // Now that we have removed dead instructions, we can reoptimize the
+    // remaining instructions.
+    while (!RedoInsts.empty()) {
+      Instruction *I = RedoInsts.pop_back_val();
+      if (isInstructionTriviallyDead(I))
+        EraseInst(I);
+      else
+        OptimizeInst(I);
+    }
+  }
+
+  // We are done with the rank map.
+  RankMap.clear();
+  ValueRankMap.clear();
+
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
new file mode 100644
index 0000000..915f897
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -0,0 +1,132 @@
+//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file demotes all registers to memory references.  It is intended to be
+// the inverse of PromoteMemoryToRegister.  By converting to loads, the only
+// values live across basic blocks are allocas and loads before phi nodes.
+// It is intended that this should make CFG hacking much easier.
+// To make later hacking easier, the entry block is split into two, such that
+// all introduced allocas and nothing else are in the entry block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <list>
+using namespace llvm;
+
+#define DEBUG_TYPE "reg2mem"
+
+STATISTIC(NumRegsDemoted, "Number of registers demoted");
+STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
+
+namespace {
+  struct RegToMem : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    RegToMem() : FunctionPass(ID) {
+      initializeRegToMemPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addPreservedID(BreakCriticalEdgesID);
+    }
+
+    bool valueEscapes(const Instruction *Inst) const {
+      const BasicBlock *BB = Inst->getParent();
+      for (const User *U : Inst->users()) {
+        const Instruction *UI = cast<Instruction>(U);
+        if (UI->getParent() != BB || isa<PHINode>(UI))
+          return true;
+      }
+      return false;
+    }
+
+    bool runOnFunction(Function &F) override;
+  };
+}
+
+char RegToMem::ID = 0;
+INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots",
+                false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
+INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots",
+                false, false)
+
+bool RegToMem::runOnFunction(Function &F) {
+  if (F.isDeclaration())
+    return false;
+
+  // Insert all new allocas into entry block.
+  BasicBlock *BBEntry = &F.getEntryBlock();
+  assert(pred_empty(BBEntry) &&
+         "Entry block to function must not have predecessors!");
+
+  // Find first non-alloca instruction and create insertion point. This is
+  // safe if block is well-formed: it always have terminator, otherwise
+  // we'll get and assertion.
+  BasicBlock::iterator I = BBEntry->begin();
+  while (isa<AllocaInst>(I)) ++I;
+
+  CastInst *AllocaInsertionPoint = new BitCastInst(
+      Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+      Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I);
+
+  // Find the escaped instructions. But don't create stack slots for
+  // allocas in entry block.
+  std::list<Instruction*> WorkList;
+  for (Function::iterator ibb = F.begin(), ibe = F.end();
+       ibb != ibe; ++ibb)
+    for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
+         iib != iie; ++iib) {
+      if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) &&
+          valueEscapes(&*iib)) {
+        WorkList.push_front(&*iib);
+      }
+    }
+
+  // Demote escaped instructions
+  NumRegsDemoted += WorkList.size();
+  for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
+       ile = WorkList.end(); ilb != ile; ++ilb)
+    DemoteRegToStack(**ilb, false, AllocaInsertionPoint);
+
+  WorkList.clear();
+
+  // Find all phi's
+  for (Function::iterator ibb = F.begin(), ibe = F.end();
+       ibb != ibe; ++ibb)
+    for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
+         iib != iie; ++iib)
+      if (isa<PHINode>(iib))
+        WorkList.push_front(&*iib);
+
+  // Demote phi nodes
+  NumPhisDemoted += WorkList.size();
+  for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
+       ile = WorkList.end(); ilb != ile; ++ilb)
+    DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint);
+
+  return true;
+}
+
+
+// createDemoteRegisterToMemory - Provide an entry point to create this pass.
+char &llvm::DemoteRegisterToMemoryID = RegToMem::ID;
+FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
+  return new RegToMem();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
new file mode 100644
index 0000000..d77d574
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -0,0 +1,2915 @@
+//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Rewrite an existing set of gc.statepoints such that they make potential
+// relocations performed by the garbage collector explicit in the IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+#define DEBUG_TYPE "rewrite-statepoints-for-gc"
+
+using namespace llvm;
+
+// Print the liveset found at the insert location
+static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
+                                  cl::init(false));
+static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden,
+                                      cl::init(false));
+// Print out the base pointers for debugging
+static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden,
+                                       cl::init(false));
+
+// Cost threshold measuring when it is profitable to rematerialize value instead
+// of relocating it
+static cl::opt<unsigned>
+RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden,
+                           cl::init(6));
+
+#ifdef XDEBUG
+static bool ClobberNonLive = true;
+#else
+static bool ClobberNonLive = false;
+#endif
+static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",
+                                                  cl::location(ClobberNonLive),
+                                                  cl::Hidden);
+
+static cl::opt<bool> UseDeoptBundles("rs4gc-use-deopt-bundles", cl::Hidden,
+                                     cl::init(false));
+static cl::opt<bool>
+    AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info",
+                                   cl::Hidden, cl::init(true));
+
+/// Should we split vectors of pointers into their individual elements?  This
+/// is known to be buggy, but the alternate implementation isn't yet ready.
+/// This is purely to provide a debugging and dianostic hook until the vector
+/// split is replaced with vector relocations.
+static cl::opt<bool> UseVectorSplit("rs4gc-split-vector-values", cl::Hidden,
+                                    cl::init(true));
+
+namespace {
+struct RewriteStatepointsForGC : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+
+  RewriteStatepointsForGC() : ModulePass(ID) {
+    initializeRewriteStatepointsForGCPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F);
+  bool runOnModule(Module &M) override {
+    bool Changed = false;
+    for (Function &F : M)
+      Changed |= runOnFunction(F);
+
+    if (Changed) {
+      // stripNonValidAttributes asserts that shouldRewriteStatepointsIn
+      // returns true for at least one function in the module.  Since at least
+      // one function changed, we know that the precondition is satisfied.
+      stripNonValidAttributes(M);
+    }
+
+    return Changed;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // We add and rewrite a bunch of instructions, but don't really do much
+    // else.  We could in theory preserve a lot more analyses here.
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  /// The IR fed into RewriteStatepointsForGC may have had attributes implying
+  /// dereferenceability that are no longer valid/correct after
+  /// RewriteStatepointsForGC has run.  This is because semantically, after
+  /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
+  /// heap.  stripNonValidAttributes (conservatively) restores correctness
+  /// by erasing all attributes in the module that externally imply
+  /// dereferenceability.
+  /// Similar reasoning also applies to the noalias attributes. gc.statepoint
+  /// can touch the entire heap including noalias objects.
+  void stripNonValidAttributes(Module &M);
+
+  // Helpers for stripNonValidAttributes
+  void stripNonValidAttributesFromBody(Function &F);
+  void stripNonValidAttributesFromPrototype(Function &F);
+};
+} // namespace
+
+char RewriteStatepointsForGC::ID = 0;
+
+ModulePass *llvm::createRewriteStatepointsForGCPass() {
+  return new RewriteStatepointsForGC();
+}
+
+INITIALIZE_PASS_BEGIN(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+                      "Make relocations explicit at statepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(RewriteStatepointsForGC, "rewrite-statepoints-for-gc",
+                    "Make relocations explicit at statepoints", false, false)
+
+namespace {
+struct GCPtrLivenessData {
+  /// Values defined in this block.
+  DenseMap<BasicBlock *, DenseSet<Value *>> KillSet;
+  /// Values used in this block (and thus live); does not included values
+  /// killed within this block.
+  DenseMap<BasicBlock *, DenseSet<Value *>> LiveSet;
+
+  /// Values live into this basic block (i.e. used by any
+  /// instruction in this basic block or ones reachable from here)
+  DenseMap<BasicBlock *, DenseSet<Value *>> LiveIn;
+
+  /// Values live out of this basic block (i.e. live into
+  /// any successor block)
+  DenseMap<BasicBlock *, DenseSet<Value *>> LiveOut;
+};
+
+// The type of the internal cache used inside the findBasePointers family
+// of functions.  From the callers perspective, this is an opaque type and
+// should not be inspected.
+//
+// In the actual implementation this caches two relations:
+// - The base relation itself (i.e. this pointer is based on that one)
+// - The base defining value relation (i.e. before base_phi insertion)
+// Generally, after the execution of a full findBasePointer call, only the
+// base relation will remain.  Internally, we add a mixture of the two
+// types, then update all the second type to the first type
+typedef DenseMap<Value *, Value *> DefiningValueMapTy;
+typedef DenseSet<Value *> StatepointLiveSetTy;
+typedef DenseMap<AssertingVH<Instruction>, AssertingVH<Value>>
+  RematerializedValueMapTy;
+
+struct PartiallyConstructedSafepointRecord {
+  /// The set of values known to be live across this safepoint
+  StatepointLiveSetTy LiveSet;
+
+  /// Mapping from live pointers to a base-defining-value
+  DenseMap<Value *, Value *> PointerToBase;
+
+  /// The *new* gc.statepoint instruction itself.  This produces the token
+  /// that normal path gc.relocates and the gc.result are tied to.
+  Instruction *StatepointToken;
+
+  /// Instruction to which exceptional gc relocates are attached
+  /// Makes it easier to iterate through them during relocationViaAlloca.
+  Instruction *UnwindToken;
+
+  /// Record live values we are rematerialized instead of relocating.
+  /// They are not included into 'LiveSet' field.
+  /// Maps rematerialized copy to it's original value.
+  RematerializedValueMapTy RematerializedValues;
+};
+}
+
+static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) {
+  assert(UseDeoptBundles && "Should not be called otherwise!");
+
+  Optional<OperandBundleUse> DeoptBundle = CS.getOperandBundle("deopt");
+
+  if (!DeoptBundle.hasValue()) {
+    assert(AllowStatepointWithNoDeoptInfo &&
+           "Found non-leaf call without deopt info!");
+    return None;
+  }
+
+  return DeoptBundle.getValue().Inputs;
+}
+
+/// Compute the live-in set for every basic block in the function
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+                                GCPtrLivenessData &Data);
+
+/// Given results from the dataflow liveness computation, find the set of live
+/// Values at a particular instruction.
+static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data,
+                              StatepointLiveSetTy &out);
+
+// TODO: Once we can get to the GCStrategy, this becomes
+// Optional<bool> isGCManagedPointer(const Type *Ty) const override {
+
+static bool isGCPointerType(Type *T) {
+  if (auto *PT = dyn_cast<PointerType>(T))
+    // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+    // GC managed heap.  We know that a pointer into this heap needs to be
+    // updated and that no other pointer does.
+    return (1 == PT->getAddressSpace());
+  return false;
+}
+
+// Return true if this type is one which a) is a gc pointer or contains a GC
+// pointer and b) is of a type this code expects to encounter as a live value.
+// (The insertion code will assert that a type which matches (a) and not (b)
+// is not encountered.)
+static bool isHandledGCPointerType(Type *T) {
+  // We fully support gc pointers
+  if (isGCPointerType(T))
+    return true;
+  // We partially support vectors of gc pointers. The code will assert if it
+  // can't handle something.
+  if (auto VT = dyn_cast<VectorType>(T))
+    if (isGCPointerType(VT->getElementType()))
+      return true;
+  return false;
+}
+
+#ifndef NDEBUG
+/// Returns true if this type contains a gc pointer whether we know how to
+/// handle that type or not.
+static bool containsGCPtrType(Type *Ty) {
+  if (isGCPointerType(Ty))
+    return true;
+  if (VectorType *VT = dyn_cast<VectorType>(Ty))
+    return isGCPointerType(VT->getScalarType());
+  if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
+    return containsGCPtrType(AT->getElementType());
+  if (StructType *ST = dyn_cast<StructType>(Ty))
+    return std::any_of(ST->subtypes().begin(), ST->subtypes().end(),
+                       containsGCPtrType);
+  return false;
+}
+
+// Returns true if this is a type which a) is a gc pointer or contains a GC
+// pointer and b) is of a type which the code doesn't expect (i.e. first class
+// aggregates).  Used to trip assertions.
+static bool isUnhandledGCPointerType(Type *Ty) {
+  return containsGCPtrType(Ty) && !isHandledGCPointerType(Ty);
+}
+#endif
+
+static bool order_by_name(Value *a, Value *b) {
+  if (a->hasName() && b->hasName()) {
+    return -1 == a->getName().compare(b->getName());
+  } else if (a->hasName() && !b->hasName()) {
+    return true;
+  } else if (!a->hasName() && b->hasName()) {
+    return false;
+  } else {
+    // Better than nothing, but not stable
+    return a < b;
+  }
+}
+
+// Return the name of the value suffixed with the provided value, or if the
+// value didn't have a name, the default value specified.
+static std::string suffixed_name_or(Value *V, StringRef Suffix,
+                                    StringRef DefaultName) {
+  return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str();
+}
+
+// Conservatively identifies any definitions which might be live at the
+// given instruction. The  analysis is performed immediately before the
+// given instruction. Values defined by that instruction are not considered
+// live.  Values used by that instruction are considered live.
+static void analyzeParsePointLiveness(
+    DominatorTree &DT, GCPtrLivenessData &OriginalLivenessData,
+    const CallSite &CS, PartiallyConstructedSafepointRecord &result) {
+  Instruction *inst = CS.getInstruction();
+
+  StatepointLiveSetTy LiveSet;
+  findLiveSetAtInst(inst, OriginalLivenessData, LiveSet);
+
+  if (PrintLiveSet) {
+    // Note: This output is used by several of the test cases
+    // The order of elements in a set is not stable, put them in a vec and sort
+    // by name
+    SmallVector<Value *, 64> Temp;
+    Temp.insert(Temp.end(), LiveSet.begin(), LiveSet.end());
+    std::sort(Temp.begin(), Temp.end(), order_by_name);
+    errs() << "Live Variables:\n";
+    for (Value *V : Temp)
+      dbgs() << " " << V->getName() << " " << *V << "\n";
+  }
+  if (PrintLiveSetSize) {
+    errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n";
+    errs() << "Number live values: " << LiveSet.size() << "\n";
+  }
+  result.LiveSet = LiveSet;
+}
+
+static bool isKnownBaseResult(Value *V);
+namespace {
+/// A single base defining value - An immediate base defining value for an
+/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
+/// For instructions which have multiple pointer [vector] inputs or that
+/// transition between vector and scalar types, there is no immediate base
+/// defining value.  The 'base defining value' for 'Def' is the transitive
+/// closure of this relation stopping at the first instruction which has no
+/// immediate base defining value.  The b.d.v. might itself be a base pointer,
+/// but it can also be an arbitrary derived pointer. 
+struct BaseDefiningValueResult {
+  /// Contains the value which is the base defining value.
+  Value * const BDV;
+  /// True if the base defining value is also known to be an actual base
+  /// pointer.
+  const bool IsKnownBase;
+  BaseDefiningValueResult(Value *BDV, bool IsKnownBase)
+    : BDV(BDV), IsKnownBase(IsKnownBase) {
+#ifndef NDEBUG
+    // Check consistency between new and old means of checking whether a BDV is
+    // a base.
+    bool MustBeBase = isKnownBaseResult(BDV);
+    assert(!MustBeBase || MustBeBase == IsKnownBase);
+#endif
+  }
+};
+}
+
+static BaseDefiningValueResult findBaseDefiningValue(Value *I);
+
+/// Return a base defining value for the 'Index' element of the given vector
+/// instruction 'I'.  If Index is null, returns a BDV for the entire vector
+/// 'I'.  As an optimization, this method will try to determine when the 
+/// element is known to already be a base pointer.  If this can be established,
+/// the second value in the returned pair will be true.  Note that either a
+/// vector or a pointer typed value can be returned.  For the former, the
+/// vector returned is a BDV (and possibly a base) of the entire vector 'I'.
+/// If the later, the return pointer is a BDV (or possibly a base) for the
+/// particular element in 'I'.  
+static BaseDefiningValueResult
+findBaseDefiningValueOfVector(Value *I) {
+  // Each case parallels findBaseDefiningValue below, see that code for
+  // detailed motivation.
+
+  if (isa<Argument>(I))
+    // An incoming argument to the function is a base pointer
+    return BaseDefiningValueResult(I, true);
+
+  if (isa<Constant>(I))
+    // Constant vectors consist only of constant pointers.
+    return BaseDefiningValueResult(I, true);
+
+  if (isa<LoadInst>(I))
+    return BaseDefiningValueResult(I, true);
+
+  if (isa<InsertElementInst>(I))
+    // We don't know whether this vector contains entirely base pointers or
+    // not.  To be conservatively correct, we treat it as a BDV and will
+    // duplicate code as needed to construct a parallel vector of bases.
+    return BaseDefiningValueResult(I, false);
+
+  if (isa<ShuffleVectorInst>(I))
+    // We don't know whether this vector contains entirely base pointers or
+    // not.  To be conservatively correct, we treat it as a BDV and will
+    // duplicate code as needed to construct a parallel vector of bases.
+    // TODO: There a number of local optimizations which could be applied here
+    // for particular sufflevector patterns.
+    return BaseDefiningValueResult(I, false);
+
+  // A PHI or Select is a base defining value.  The outer findBasePointer
+  // algorithm is responsible for constructing a base value for this BDV.
+  assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+         "unknown vector instruction - no base found for vector element");
+  return BaseDefiningValueResult(I, false);
+}
+
+/// Helper function for findBasePointer - Will return a value which either a)
+/// defines the base pointer for the input, b) blocks the simple search
+/// (i.e. a PHI or Select of two derived pointers), or c) involves a change
+/// from pointer to vector type or back.
+static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
+  assert(I->getType()->isPtrOrPtrVectorTy() &&
+         "Illegal to ask for the base pointer of a non-pointer type");
+
+  if (I->getType()->isVectorTy())
+    return findBaseDefiningValueOfVector(I);
+
+  if (isa<Argument>(I))
+    // An incoming argument to the function is a base pointer
+    // We should have never reached here if this argument isn't an gc value
+    return BaseDefiningValueResult(I, true);
+
+  if (isa<Constant>(I))
+    // We assume that objects with a constant base (e.g. a global) can't move
+    // and don't need to be reported to the collector because they are always
+    // live.  All constants have constant bases.  Besides global references, all
+    // kinds of constants (e.g. undef, constant expressions, null pointers) can
+    // be introduced by the inliner or the optimizer, especially on dynamically
+    // dead paths.  See e.g. test4 in constants.ll.
+    return BaseDefiningValueResult(I, true);
+
+  if (CastInst *CI = dyn_cast<CastInst>(I)) {
+    Value *Def = CI->stripPointerCasts();
+    // If stripping pointer casts changes the address space there is an
+    // addrspacecast in between.
+    assert(cast<PointerType>(Def->getType())->getAddressSpace() ==
+               cast<PointerType>(CI->getType())->getAddressSpace() &&
+           "unsupported addrspacecast");
+    // If we find a cast instruction here, it means we've found a cast which is
+    // not simply a pointer cast (i.e. an inttoptr).  We don't know how to
+    // handle int->ptr conversion.
+    assert(!isa<CastInst>(Def) && "shouldn't find another cast here");
+    return findBaseDefiningValue(Def);
+  }
+
+  if (isa<LoadInst>(I))
+    // The value loaded is an gc base itself
+    return BaseDefiningValueResult(I, true);
+  
+
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
+    // The base of this GEP is the base
+    return findBaseDefiningValue(GEP->getPointerOperand());
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    default:
+      // fall through to general call handling
+      break;
+    case Intrinsic::experimental_gc_statepoint:
+      llvm_unreachable("statepoints don't produce pointers");
+    case Intrinsic::experimental_gc_relocate: {
+      // Rerunning safepoint insertion after safepoints are already
+      // inserted is not supported.  It could probably be made to work,
+      // but why are you doing this?  There's no good reason.
+      llvm_unreachable("repeat safepoint insertion is not supported");
+    }
+    case Intrinsic::gcroot:
+      // Currently, this mechanism hasn't been extended to work with gcroot.
+      // There's no reason it couldn't be, but I haven't thought about the
+      // implications much.
+      llvm_unreachable(
+          "interaction with the gcroot mechanism is not supported");
+    }
+  }
+  // We assume that functions in the source language only return base
+  // pointers.  This should probably be generalized via attributes to support
+  // both source language and internal functions.
+  if (isa<CallInst>(I) || isa<InvokeInst>(I))
+    return BaseDefiningValueResult(I, true);
+
+  // I have absolutely no idea how to implement this part yet.  It's not
+  // necessarily hard, I just haven't really looked at it yet.
+  assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
+
+  if (isa<AtomicCmpXchgInst>(I))
+    // A CAS is effectively a atomic store and load combined under a
+    // predicate.  From the perspective of base pointers, we just treat it
+    // like a load.
+    return BaseDefiningValueResult(I, true);
+
+  assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
+                                   "binary ops which don't apply to pointers");
+
+  // The aggregate ops.  Aggregates can either be in the heap or on the
+  // stack, but in either case, this is simply a field load.  As a result,
+  // this is a defining definition of the base just like a load is.
+  if (isa<ExtractValueInst>(I))
+    return BaseDefiningValueResult(I, true);
+
+  // We should never see an insert vector since that would require we be
+  // tracing back a struct value not a pointer value.
+  assert(!isa<InsertValueInst>(I) &&
+         "Base pointer for a struct is meaningless");
+
+  // An extractelement produces a base result exactly when it's input does.
+  // We may need to insert a parallel instruction to extract the appropriate
+  // element out of the base vector corresponding to the input. Given this,
+  // it's analogous to the phi and select case even though it's not a merge.
+  if (isa<ExtractElementInst>(I))
+    // Note: There a lot of obvious peephole cases here.  This are deliberately
+    // handled after the main base pointer inference algorithm to make writing
+    // test cases to exercise that code easier.
+    return BaseDefiningValueResult(I, false);
+
+  // The last two cases here don't return a base pointer.  Instead, they
+  // return a value which dynamically selects from among several base
+  // derived pointers (each with it's own base potentially).  It's the job of
+  // the caller to resolve these.
+  assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+         "missing instruction case in findBaseDefiningValing");
+  return BaseDefiningValueResult(I, false);
+}
+
+/// Returns the base defining value for this value.
+static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
+  Value *&Cached = Cache[I];
+  if (!Cached) {
+    Cached = findBaseDefiningValue(I).BDV;
+    DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
+                 << Cached->getName() << "\n");
+  }
+  assert(Cache[I] != nullptr);
+  return Cached;
+}
+
+/// Return a base pointer for this value if known.  Otherwise, return it's
+/// base defining value.
+static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) {
+  Value *Def = findBaseDefiningValueCached(I, Cache);
+  auto Found = Cache.find(Def);
+  if (Found != Cache.end()) {
+    // Either a base-of relation, or a self reference.  Caller must check.
+    return Found->second;
+  }
+  // Only a BDV available
+  return Def;
+}
+
+/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
+/// is it known to be a base pointer?  Or do we need to continue searching.
+static bool isKnownBaseResult(Value *V) {
+  if (!isa<PHINode>(V) && !isa<SelectInst>(V) &&
+      !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) &&
+      !isa<ShuffleVectorInst>(V)) {
+    // no recursion possible
+    return true;
+  }
+  if (isa<Instruction>(V) &&
+      cast<Instruction>(V)->getMetadata("is_base_value")) {
+    // This is a previously inserted base phi or select.  We know
+    // that this is a base value.
+    return true;
+  }
+
+  // We need to keep searching
+  return false;
+}
+
+namespace {
+/// Models the state of a single base defining value in the findBasePointer
+/// algorithm for determining where a new instruction is needed to propagate
+/// the base of this BDV.
+class BDVState {
+public:
+  enum Status { Unknown, Base, Conflict };
+
+  BDVState(Status s, Value *b = nullptr) : status(s), base(b) {
+    assert(status != Base || b);
+  }
+  explicit BDVState(Value *b) : status(Base), base(b) {}
+  BDVState() : status(Unknown), base(nullptr) {}
+
+  Status getStatus() const { return status; }
+  Value *getBase() const { return base; }
+
+  bool isBase() const { return getStatus() == Base; }
+  bool isUnknown() const { return getStatus() == Unknown; }
+  bool isConflict() const { return getStatus() == Conflict; }
+
+  bool operator==(const BDVState &other) const {
+    return base == other.base && status == other.status;
+  }
+
+  bool operator!=(const BDVState &other) const { return !(*this == other); }
+
+  LLVM_DUMP_METHOD
+  void dump() const { print(dbgs()); dbgs() << '\n'; }
+  
+  void print(raw_ostream &OS) const {
+    switch (status) {
+    case Unknown:
+      OS << "U";
+      break;
+    case Base:
+      OS << "B";
+      break;
+    case Conflict:
+      OS << "C";
+      break;
+    };
+    OS << " (" << base << " - "
+       << (base ? base->getName() : "nullptr") << "): ";
+  }
+
+private:
+  Status status;
+  AssertingVH<Value> base; // non null only if status == base
+};
+}
+
+#ifndef NDEBUG
+static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
+  State.print(OS);
+  return OS;
+}
+#endif
+
+namespace {
+// Values of type BDVState form a lattice, and this is a helper
+// class that implementes the meet operation.  The meat of the meet
+// operation is implemented in MeetBDVStates::pureMeet
+class MeetBDVStates {
+public:
+  /// Initializes the currentResult to the TOP state so that if can be met with
+  /// any other state to produce that state.
+  MeetBDVStates() {}
+
+  // Destructively meet the current result with the given BDVState
+  void meetWith(BDVState otherState) {
+    currentResult = meet(otherState, currentResult);
+  }
+
+  BDVState getResult() const { return currentResult; }
+
+private:
+  BDVState currentResult;
+
+  /// Perform a meet operation on two elements of the BDVState lattice.
+  static BDVState meet(BDVState LHS, BDVState RHS) {
+    assert((pureMeet(LHS, RHS) == pureMeet(RHS, LHS)) &&
+           "math is wrong: meet does not commute!");
+    BDVState Result = pureMeet(LHS, RHS);
+    DEBUG(dbgs() << "meet of " << LHS << " with " << RHS
+                 << " produced " << Result << "\n");
+    return Result;
+  }
+
+  static BDVState pureMeet(const BDVState &stateA, const BDVState &stateB) {
+    switch (stateA.getStatus()) {
+    case BDVState::Unknown:
+      return stateB;
+
+    case BDVState::Base:
+      assert(stateA.getBase() && "can't be null");
+      if (stateB.isUnknown())
+        return stateA;
+
+      if (stateB.isBase()) {
+        if (stateA.getBase() == stateB.getBase()) {
+          assert(stateA == stateB && "equality broken!");
+          return stateA;
+        }
+        return BDVState(BDVState::Conflict);
+      }
+      assert(stateB.isConflict() && "only three states!");
+      return BDVState(BDVState::Conflict);
+
+    case BDVState::Conflict:
+      return stateA;
+    }
+    llvm_unreachable("only three states!");
+  }
+};
+}
+
+
+/// For a given value or instruction, figure out what base ptr it's derived
+/// from.  For gc objects, this is simply itself.  On success, returns a value
+/// which is the base pointer.  (This is reliable and can be used for
+/// relocation.)  On failure, returns nullptr.
+static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) {
+  Value *def = findBaseOrBDV(I, cache);
+
+  if (isKnownBaseResult(def)) {
+    return def;
+  }
+
+  // Here's the rough algorithm:
+  // - For every SSA value, construct a mapping to either an actual base
+  //   pointer or a PHI which obscures the base pointer.
+  // - Construct a mapping from PHI to unknown TOP state.  Use an
+  //   optimistic algorithm to propagate base pointer information.  Lattice
+  //   looks like:
+  //   UNKNOWN
+  //   b1 b2 b3 b4
+  //   CONFLICT
+  //   When algorithm terminates, all PHIs will either have a single concrete
+  //   base or be in a conflict state.
+  // - For every conflict, insert a dummy PHI node without arguments.  Add
+  //   these to the base[Instruction] = BasePtr mapping.  For every
+  //   non-conflict, add the actual base.
+  //  - For every conflict, add arguments for the base[a] of each input
+  //   arguments.
+  //
+  // Note: A simpler form of this would be to add the conflict form of all
+  // PHIs without running the optimistic algorithm.  This would be
+  // analogous to pessimistic data flow and would likely lead to an
+  // overall worse solution.
+
+#ifndef NDEBUG
+  auto isExpectedBDVType = [](Value *BDV) {
+    return isa<PHINode>(BDV) || isa<SelectInst>(BDV) ||
+           isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV);
+  };
+#endif
+
+  // Once populated, will contain a mapping from each potentially non-base BDV
+  // to a lattice value (described above) which corresponds to that BDV.
+  // We use the order of insertion (DFS over the def/use graph) to provide a
+  // stable deterministic ordering for visiting DenseMaps (which are unordered)
+  // below.  This is important for deterministic compilation.
+  MapVector<Value *, BDVState> States;
+
+  // Recursively fill in all base defining values reachable from the initial
+  // one for which we don't already know a definite base value for
+  /* scope */ {
+    SmallVector<Value*, 16> Worklist;
+    Worklist.push_back(def);
+    States.insert(std::make_pair(def, BDVState()));
+    while (!Worklist.empty()) {
+      Value *Current = Worklist.pop_back_val();
+      assert(!isKnownBaseResult(Current) && "why did it get added?");
+
+      auto visitIncomingValue = [&](Value *InVal) {
+        Value *Base = findBaseOrBDV(InVal, cache);
+        if (isKnownBaseResult(Base))
+          // Known bases won't need new instructions introduced and can be
+          // ignored safely
+          return;
+        assert(isExpectedBDVType(Base) && "the only non-base values "
+               "we see should be base defining values");
+        if (States.insert(std::make_pair(Base, BDVState())).second)
+          Worklist.push_back(Base);
+      };
+      if (PHINode *Phi = dyn_cast<PHINode>(Current)) {
+        for (Value *InVal : Phi->incoming_values())
+          visitIncomingValue(InVal);
+      } else if (SelectInst *Sel = dyn_cast<SelectInst>(Current)) {
+        visitIncomingValue(Sel->getTrueValue());
+        visitIncomingValue(Sel->getFalseValue());
+      } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) {
+        visitIncomingValue(EE->getVectorOperand());
+      } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
+        visitIncomingValue(IE->getOperand(0)); // vector operand
+        visitIncomingValue(IE->getOperand(1)); // scalar operand
+      } else {
+        // There is one known class of instructions we know we don't handle.
+        assert(isa<ShuffleVectorInst>(Current));
+        llvm_unreachable("unimplemented instruction case");
+      }
+    }
+  }
+
+#ifndef NDEBUG
+  DEBUG(dbgs() << "States after initialization:\n");
+  for (auto Pair : States) {
+    DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+  }
+#endif
+
+  // Return a phi state for a base defining value.  We'll generate a new
+  // base state for known bases and expect to find a cached state otherwise.
+  auto getStateForBDV = [&](Value *baseValue) {
+    if (isKnownBaseResult(baseValue))
+      return BDVState(baseValue);
+    auto I = States.find(baseValue);
+    assert(I != States.end() && "lookup failed!");
+    return I->second;
+  };
+
+  bool progress = true;
+  while (progress) {
+#ifndef NDEBUG
+    const size_t oldSize = States.size();
+#endif
+    progress = false;
+    // We're only changing values in this loop, thus safe to keep iterators.
+    // Since this is computing a fixed point, the order of visit does not
+    // effect the result.  TODO: We could use a worklist here and make this run
+    // much faster.
+    for (auto Pair : States) {
+      Value *BDV = Pair.first;
+      assert(!isKnownBaseResult(BDV) && "why did it get added?");
+
+      // Given an input value for the current instruction, return a BDVState
+      // instance which represents the BDV of that value.
+      auto getStateForInput = [&](Value *V) mutable {
+        Value *BDV = findBaseOrBDV(V, cache);
+        return getStateForBDV(BDV);
+      };
+
+      MeetBDVStates calculateMeet;
+      if (SelectInst *select = dyn_cast<SelectInst>(BDV)) {
+        calculateMeet.meetWith(getStateForInput(select->getTrueValue()));
+        calculateMeet.meetWith(getStateForInput(select->getFalseValue()));
+      } else if (PHINode *Phi = dyn_cast<PHINode>(BDV)) {
+        for (Value *Val : Phi->incoming_values())
+          calculateMeet.meetWith(getStateForInput(Val));
+      } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) {
+        // The 'meet' for an extractelement is slightly trivial, but it's still
+        // useful in that it drives us to conflict if our input is.
+        calculateMeet.meetWith(getStateForInput(EE->getVectorOperand()));
+      } else {
+        // Given there's a inherent type mismatch between the operands, will
+        // *always* produce Conflict.
+        auto *IE = cast<InsertElementInst>(BDV);
+        calculateMeet.meetWith(getStateForInput(IE->getOperand(0)));
+        calculateMeet.meetWith(getStateForInput(IE->getOperand(1)));
+      }
+
+      BDVState oldState = States[BDV];
+      BDVState newState = calculateMeet.getResult();
+      if (oldState != newState) {
+        progress = true;
+        States[BDV] = newState;
+      }
+    }
+
+    assert(oldSize == States.size() &&
+           "fixed point shouldn't be adding any new nodes to state");
+  }
+
+#ifndef NDEBUG
+  DEBUG(dbgs() << "States after meet iteration:\n");
+  for (auto Pair : States) {
+    DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+  }
+#endif
+  
+  // Insert Phis for all conflicts
+  // TODO: adjust naming patterns to avoid this order of iteration dependency
+  for (auto Pair : States) {
+    Instruction *I = cast<Instruction>(Pair.first);
+    BDVState State = Pair.second;
+    assert(!isKnownBaseResult(I) && "why did it get added?");
+    assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+
+    // extractelement instructions are a bit special in that we may need to
+    // insert an extract even when we know an exact base for the instruction.
+    // The problem is that we need to convert from a vector base to a scalar
+    // base for the particular indice we're interested in.
+    if (State.isBase() && isa<ExtractElementInst>(I) &&
+        isa<VectorType>(State.getBase()->getType())) {
+      auto *EE = cast<ExtractElementInst>(I);
+      // TODO: In many cases, the new instruction is just EE itself.  We should
+      // exploit this, but can't do it here since it would break the invariant
+      // about the BDV not being known to be a base.
+      auto *BaseInst = ExtractElementInst::Create(State.getBase(),
+                                                  EE->getIndexOperand(),
+                                                  "base_ee", EE);
+      BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+      States[I] = BDVState(BDVState::Base, BaseInst);
+    }
+
+    // Since we're joining a vector and scalar base, they can never be the
+    // same.  As a result, we should always see insert element having reached
+    // the conflict state.
+    if (isa<InsertElementInst>(I)) {
+      assert(State.isConflict());
+    }
+    
+    if (!State.isConflict())
+      continue;
+
+    /// Create and insert a new instruction which will represent the base of
+    /// the given instruction 'I'.
+    auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
+      if (isa<PHINode>(I)) {
+        BasicBlock *BB = I->getParent();
+        int NumPreds = std::distance(pred_begin(BB), pred_end(BB));
+        assert(NumPreds > 0 && "how did we reach here");
+        std::string Name = suffixed_name_or(I, ".base", "base_phi");
+        return PHINode::Create(I->getType(), NumPreds, Name, I);
+      } else if (SelectInst *Sel = dyn_cast<SelectInst>(I)) {
+        // The undef will be replaced later
+        UndefValue *Undef = UndefValue::get(Sel->getType());
+        std::string Name = suffixed_name_or(I, ".base", "base_select");
+        return SelectInst::Create(Sel->getCondition(), Undef,
+                                  Undef, Name, Sel);
+      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+        UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType());
+        std::string Name = suffixed_name_or(I, ".base", "base_ee");
+        return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name,
+                                          EE);
+      } else {
+        auto *IE = cast<InsertElementInst>(I);
+        UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType());
+        UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType());
+        std::string Name = suffixed_name_or(I, ".base", "base_ie");
+        return InsertElementInst::Create(VecUndef, ScalarUndef,
+                                         IE->getOperand(2), Name, IE);
+      }
+
+    };
+    Instruction *BaseInst = MakeBaseInstPlaceholder(I);
+    // Add metadata marking this as a base value
+    BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+    States[I] = BDVState(BDVState::Conflict, BaseInst);
+  }
+
+  // Returns a instruction which produces the base pointer for a given
+  // instruction.  The instruction is assumed to be an input to one of the BDVs
+  // seen in the inference algorithm above.  As such, we must either already
+  // know it's base defining value is a base, or have inserted a new
+  // instruction to propagate the base of it's BDV and have entered that newly
+  // introduced instruction into the state table.  In either case, we are
+  // assured to be able to determine an instruction which produces it's base
+  // pointer. 
+  auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) {
+    Value *BDV = findBaseOrBDV(Input, cache);
+    Value *Base = nullptr;
+    if (isKnownBaseResult(BDV)) {
+      Base = BDV;
+    } else {
+      // Either conflict or base.
+      assert(States.count(BDV));
+      Base = States[BDV].getBase();
+    }
+    assert(Base && "can't be null");
+    // The cast is needed since base traversal may strip away bitcasts
+    if (Base->getType() != Input->getType() &&
+        InsertPt) {
+      Base = new BitCastInst(Base, Input->getType(), "cast",
+                             InsertPt);
+    }
+    return Base;
+  };
+
+  // Fixup all the inputs of the new PHIs.  Visit order needs to be
+  // deterministic and predictable because we're naming newly created
+  // instructions.
+  for (auto Pair : States) {
+    Instruction *BDV = cast<Instruction>(Pair.first);
+    BDVState State = Pair.second;
+
+    assert(!isKnownBaseResult(BDV) && "why did it get added?");
+    assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+    if (!State.isConflict())
+      continue;
+
+    if (PHINode *basephi = dyn_cast<PHINode>(State.getBase())) {
+      PHINode *phi = cast<PHINode>(BDV);
+      unsigned NumPHIValues = phi->getNumIncomingValues();
+      for (unsigned i = 0; i < NumPHIValues; i++) {
+        Value *InVal = phi->getIncomingValue(i);
+        BasicBlock *InBB = phi->getIncomingBlock(i);
+
+        // If we've already seen InBB, add the same incoming value
+        // we added for it earlier.  The IR verifier requires phi
+        // nodes with multiple entries from the same basic block
+        // to have the same incoming value for each of those
+        // entries.  If we don't do this check here and basephi
+        // has a different type than base, we'll end up adding two
+        // bitcasts (and hence two distinct values) as incoming
+        // values for the same basic block.
+
+        int blockIndex = basephi->getBasicBlockIndex(InBB);
+        if (blockIndex != -1) {
+          Value *oldBase = basephi->getIncomingValue(blockIndex);
+          basephi->addIncoming(oldBase, InBB);
+          
+#ifndef NDEBUG
+          Value *Base = getBaseForInput(InVal, nullptr);
+          // In essence this assert states: the only way two
+          // values incoming from the same basic block may be
+          // different is by being different bitcasts of the same
+          // value.  A cleanup that remains TODO is changing
+          // findBaseOrBDV to return an llvm::Value of the correct
+          // type (and still remain pure).  This will remove the
+          // need to add bitcasts.
+          assert(Base->stripPointerCasts() == oldBase->stripPointerCasts() &&
+                 "sanity -- findBaseOrBDV should be pure!");
+#endif
+          continue;
+        }
+
+        // Find the instruction which produces the base for each input.  We may
+        // need to insert a bitcast in the incoming block.
+        // TODO: Need to split critical edges if insertion is needed
+        Value *Base = getBaseForInput(InVal, InBB->getTerminator());
+        basephi->addIncoming(Base, InBB);
+      }
+      assert(basephi->getNumIncomingValues() == NumPHIValues);
+    } else if (SelectInst *BaseSel = dyn_cast<SelectInst>(State.getBase())) {
+      SelectInst *Sel = cast<SelectInst>(BDV);
+      // Operand 1 & 2 are true, false path respectively. TODO: refactor to
+      // something more safe and less hacky.
+      for (int i = 1; i <= 2; i++) {
+        Value *InVal = Sel->getOperand(i);
+        // Find the instruction which produces the base for each input.  We may
+        // need to insert a bitcast.
+        Value *Base = getBaseForInput(InVal, BaseSel);
+        BaseSel->setOperand(i, Base);
+      }
+    } else if (auto *BaseEE = dyn_cast<ExtractElementInst>(State.getBase())) {
+      Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand();
+      // Find the instruction which produces the base for each input.  We may
+      // need to insert a bitcast.
+      Value *Base = getBaseForInput(InVal, BaseEE);
+      BaseEE->setOperand(0, Base);
+    } else {
+      auto *BaseIE = cast<InsertElementInst>(State.getBase());
+      auto *BdvIE = cast<InsertElementInst>(BDV);
+      auto UpdateOperand = [&](int OperandIdx) {
+        Value *InVal = BdvIE->getOperand(OperandIdx);
+        Value *Base = getBaseForInput(InVal, BaseIE);
+        BaseIE->setOperand(OperandIdx, Base);
+      };
+      UpdateOperand(0); // vector operand
+      UpdateOperand(1); // scalar operand
+    }
+
+  }
+
+  // Now that we're done with the algorithm, see if we can optimize the 
+  // results slightly by reducing the number of new instructions needed. 
+  // Arguably, this should be integrated into the algorithm above, but 
+  // doing as a post process step is easier to reason about for the moment.
+  DenseMap<Value *, Value *> ReverseMap;
+  SmallPtrSet<Instruction *, 16> NewInsts;
+  SmallSetVector<AssertingVH<Instruction>, 16> Worklist;
+  // Note: We need to visit the states in a deterministic order.  We uses the
+  // Keys we sorted above for this purpose.  Note that we are papering over a
+  // bigger problem with the algorithm above - it's visit order is not
+  // deterministic.  A larger change is needed to fix this.
+  for (auto Pair : States) {
+    auto *BDV = Pair.first;
+    auto State = Pair.second;
+    Value *Base = State.getBase();
+    assert(BDV && Base);
+    assert(!isKnownBaseResult(BDV) && "why did it get added?");
+    assert(isKnownBaseResult(Base) &&
+           "must be something we 'know' is a base pointer");
+    if (!State.isConflict())
+      continue;
+
+    ReverseMap[Base] = BDV;
+    if (auto *BaseI = dyn_cast<Instruction>(Base)) {
+      NewInsts.insert(BaseI);
+      Worklist.insert(BaseI);
+    }
+  }
+  auto ReplaceBaseInstWith = [&](Value *BDV, Instruction *BaseI,
+                                 Value *Replacement) {
+    // Add users which are new instructions (excluding self references)
+    for (User *U : BaseI->users())
+      if (auto *UI = dyn_cast<Instruction>(U))
+        if (NewInsts.count(UI) && UI != BaseI)
+          Worklist.insert(UI);
+    // Then do the actual replacement
+    NewInsts.erase(BaseI);
+    ReverseMap.erase(BaseI);
+    BaseI->replaceAllUsesWith(Replacement);
+    assert(States.count(BDV));
+    assert(States[BDV].isConflict() && States[BDV].getBase() == BaseI);
+    States[BDV] = BDVState(BDVState::Conflict, Replacement);
+    BaseI->eraseFromParent();
+  };
+  const DataLayout &DL = cast<Instruction>(def)->getModule()->getDataLayout();
+  while (!Worklist.empty()) {
+    Instruction *BaseI = Worklist.pop_back_val();
+    assert(NewInsts.count(BaseI));
+    Value *Bdv = ReverseMap[BaseI];
+    if (auto *BdvI = dyn_cast<Instruction>(Bdv))
+      if (BaseI->isIdenticalTo(BdvI)) {
+        DEBUG(dbgs() << "Identical Base: " << *BaseI << "\n");
+        ReplaceBaseInstWith(Bdv, BaseI, Bdv);
+        continue;
+      }
+    if (Value *V = SimplifyInstruction(BaseI, DL)) {
+      DEBUG(dbgs() << "Base " << *BaseI << " simplified to " << *V << "\n");
+      ReplaceBaseInstWith(Bdv, BaseI, V);
+      continue;
+    }
+  }
+
+  // Cache all of our results so we can cheaply reuse them
+  // NOTE: This is actually two caches: one of the base defining value
+  // relation and one of the base pointer relation!  FIXME
+  for (auto Pair : States) {
+    auto *BDV = Pair.first;
+    Value *base = Pair.second.getBase();
+    assert(BDV && base);
+
+    std::string fromstr = cache.count(BDV) ? cache[BDV]->getName() : "none";
+    DEBUG(dbgs() << "Updating base value cache"
+          << " for: " << BDV->getName()
+          << " from: " << fromstr
+          << " to: " << base->getName() << "\n");
+
+    if (cache.count(BDV)) {
+      // Once we transition from the BDV relation being store in the cache to
+      // the base relation being stored, it must be stable
+      assert((!isKnownBaseResult(cache[BDV]) || cache[BDV] == base) &&
+             "base relation should be stable");
+    }
+    cache[BDV] = base;
+  }
+  assert(cache.count(def));
+  return cache[def];
+}
+
+// For a set of live pointers (base and/or derived), identify the base
+// pointer of the object which they are derived from.  This routine will
+// mutate the IR graph as needed to make the 'base' pointer live at the
+// definition site of 'derived'.  This ensures that any use of 'derived' can
+// also use 'base'.  This may involve the insertion of a number of
+// additional PHI nodes.
+//
+// preconditions: live is a set of pointer type Values
+//
+// side effects: may insert PHI nodes into the existing CFG, will preserve
+// CFG, will not remove or mutate any existing nodes
+//
+// post condition: PointerToBase contains one (derived, base) pair for every
+// pointer in live.  Note that derived can be equal to base if the original
+// pointer was a base pointer.
+static void
+findBasePointers(const StatepointLiveSetTy &live,
+                 DenseMap<Value *, Value *> &PointerToBase,
+                 DominatorTree *DT, DefiningValueMapTy &DVCache) {
+  // For the naming of values inserted to be deterministic - which makes for
+  // much cleaner and more stable tests - we need to assign an order to the
+  // live values.  DenseSets do not provide a deterministic order across runs.
+  SmallVector<Value *, 64> Temp;
+  Temp.insert(Temp.end(), live.begin(), live.end());
+  std::sort(Temp.begin(), Temp.end(), order_by_name);
+  for (Value *ptr : Temp) {
+    Value *base = findBasePointer(ptr, DVCache);
+    assert(base && "failed to find base pointer");
+    PointerToBase[ptr] = base;
+    assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) ||
+            DT->dominates(cast<Instruction>(base)->getParent(),
+                          cast<Instruction>(ptr)->getParent())) &&
+           "The base we found better dominate the derived pointer");
+
+    // If you see this trip and like to live really dangerously, the code should
+    // be correct, just with idioms the verifier can't handle.  You can try
+    // disabling the verifier at your own substantial risk.
+    assert(!isa<ConstantPointerNull>(base) &&
+           "the relocation code needs adjustment to handle the relocation of "
+           "a null pointer constant without causing false positives in the "
+           "safepoint ir verifier.");
+  }
+}
+
+/// Find the required based pointers (and adjust the live set) for the given
+/// parse point.
+static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
+                             const CallSite &CS,
+                             PartiallyConstructedSafepointRecord &result) {
+  DenseMap<Value *, Value *> PointerToBase;
+  findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache);
+
+  if (PrintBasePointers) {
+    // Note: Need to print these in a stable order since this is checked in
+    // some tests.
+    errs() << "Base Pairs (w/o Relocation):\n";
+    SmallVector<Value *, 64> Temp;
+    Temp.reserve(PointerToBase.size());
+    for (auto Pair : PointerToBase) {
+      Temp.push_back(Pair.first);
+    }
+    std::sort(Temp.begin(), Temp.end(), order_by_name);
+    for (Value *Ptr : Temp) {
+      Value *Base = PointerToBase[Ptr];
+      errs() << " derived ";
+      Ptr->printAsOperand(errs(), false);
+      errs() << " base ";
+      Base->printAsOperand(errs(), false);
+      errs() << "\n";;
+    }
+  }
+
+  result.PointerToBase = PointerToBase;
+}
+
+/// Given an updated version of the dataflow liveness results, update the
+/// liveset and base pointer maps for the call site CS.
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+                                  const CallSite &CS,
+                                  PartiallyConstructedSafepointRecord &result);
+
+static void recomputeLiveInValues(
+    Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,
+    MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+  // TODO-PERF: reuse the original liveness, then simply run the dataflow
+  // again.  The old values are still live and will help it stabilize quickly.
+  GCPtrLivenessData RevisedLivenessData;
+  computeLiveInValues(DT, F, RevisedLivenessData);
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    const CallSite &CS = toUpdate[i];
+    recomputeLiveInValues(RevisedLivenessData, CS, info);
+  }
+}
+
+// When inserting gc.relocate and gc.result calls, we need to ensure there are
+// no uses of the original value / return value between the gc.statepoint and
+// the gc.relocate / gc.result call.  One case which can arise is a phi node
+// starting one of the successor blocks.  We also need to be able to insert the
+// gc.relocates only on the path which goes through the statepoint.  We might
+// need to split an edge to make this possible.
+static BasicBlock *
+normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
+                            DominatorTree &DT) {
+  BasicBlock *Ret = BB;
+  if (!BB->getUniquePredecessor())
+    Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT);
+
+  // Now that 'Ret' has unique predecessor we can safely remove all phi nodes
+  // from it
+  FoldSingleEntryPHINodes(Ret);
+  assert(!isa<PHINode>(Ret->begin()) &&
+         "All PHI nodes should have been removed!");
+
+  // At this point, we can safely insert a gc.relocate or gc.result as the first
+  // instruction in Ret if needed.
+  return Ret;
+}
+
+// Create new attribute set containing only attributes which can be transferred
+// from original call to the safepoint.
+static AttributeSet legalizeCallAttributes(AttributeSet AS) {
+  AttributeSet Ret;
+
+  for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) {
+    unsigned Index = AS.getSlotIndex(Slot);
+
+    if (Index == AttributeSet::ReturnIndex ||
+        Index == AttributeSet::FunctionIndex) {
+
+      for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) {
+
+        // Do not allow certain attributes - just skip them
+        // Safepoint can not be read only or read none.
+        if (Attr.hasAttribute(Attribute::ReadNone) ||
+            Attr.hasAttribute(Attribute::ReadOnly))
+          continue;
+
+        // These attributes control the generation of the gc.statepoint call /
+        // invoke itself; and once the gc.statepoint is in place, they're of no
+        // use.
+        if (Attr.hasAttribute("statepoint-num-patch-bytes") ||
+            Attr.hasAttribute("statepoint-id"))
+          continue;
+
+        Ret = Ret.addAttributes(
+            AS.getContext(), Index,
+            AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr)));
+      }
+    }
+
+    // Just skip parameter attributes for now
+  }
+
+  return Ret;
+}
+
+/// Helper function to place all gc relocates necessary for the given
+/// statepoint.
+/// Inputs:
+///   liveVariables - list of variables to be relocated.
+///   liveStart - index of the first live variable.
+///   basePtrs - base pointers.
+///   statepointToken - statepoint instruction to which relocates should be
+///   bound.
+///   Builder - Llvm IR builder to be used to construct new calls.
+static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
+                              const int LiveStart,
+                              ArrayRef<Value *> BasePtrs,
+                              Instruction *StatepointToken,
+                              IRBuilder<> Builder) {
+  if (LiveVariables.empty())
+    return;
+
+  auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) {
+    auto ValIt = std::find(LiveVec.begin(), LiveVec.end(), Val);
+    assert(ValIt != LiveVec.end() && "Val not found in LiveVec!");
+    size_t Index = std::distance(LiveVec.begin(), ValIt);
+    assert(Index < LiveVec.size() && "Bug in std::find?");
+    return Index;
+  };
+  Module *M = StatepointToken->getModule();
+  
+  // All gc_relocate are generated as i8 addrspace(1)* (or a vector type whose
+  // element type is i8 addrspace(1)*). We originally generated unique
+  // declarations for each pointer type, but this proved problematic because
+  // the intrinsic mangling code is incomplete and fragile.  Since we're moving
+  // towards a single unified pointer type anyways, we can just cast everything
+  // to an i8* of the right address space.  A bitcast is added later to convert
+  // gc_relocate to the actual value's type.  
+  auto getGCRelocateDecl = [&] (Type *Ty) {
+    assert(isHandledGCPointerType(Ty));
+    auto AS = Ty->getScalarType()->getPointerAddressSpace();
+    Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS);
+    if (auto *VT = dyn_cast<VectorType>(Ty))
+      NewTy = VectorType::get(NewTy, VT->getNumElements());
+    return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate,
+                                     {NewTy});
+  };
+
+  // Lazily populated map from input types to the canonicalized form mentioned
+  // in the comment above.  This should probably be cached somewhere more
+  // broadly.
+  DenseMap<Type*, Value*> TypeToDeclMap;
+
+  for (unsigned i = 0; i < LiveVariables.size(); i++) {
+    // Generate the gc.relocate call and save the result
+    Value *BaseIdx =
+      Builder.getInt32(LiveStart + FindIndex(LiveVariables, BasePtrs[i]));
+    Value *LiveIdx = Builder.getInt32(LiveStart + i);
+
+    Type *Ty = LiveVariables[i]->getType();
+    if (!TypeToDeclMap.count(Ty))
+      TypeToDeclMap[Ty] = getGCRelocateDecl(Ty);
+    Value *GCRelocateDecl = TypeToDeclMap[Ty];
+
+    // only specify a debug name if we can give a useful one
+    CallInst *Reloc = Builder.CreateCall(
+        GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx},
+        suffixed_name_or(LiveVariables[i], ".relocated", ""));
+    // Trick CodeGen into thinking there are lots of free registers at this
+    // fake call.
+    Reloc->setCallingConv(CallingConv::Cold);
+  }
+}
+
+namespace {
+
+/// This struct is used to defer RAUWs and `eraseFromParent` s.  Using this
+/// avoids having to worry about keeping around dangling pointers to Values.
+class DeferredReplacement {
+  AssertingVH<Instruction> Old;
+  AssertingVH<Instruction> New;
+
+public:
+  explicit DeferredReplacement(Instruction *Old, Instruction *New) :
+    Old(Old), New(New) {
+    assert(Old != New && "Not allowed!");
+  }
+
+  /// Does the task represented by this instance.
+  void doReplacement() {
+    Instruction *OldI = Old;
+    Instruction *NewI = New;
+
+    assert(OldI != NewI && "Disallowed at construction?!");
+
+    Old = nullptr;
+    New = nullptr;
+
+    if (NewI)
+      OldI->replaceAllUsesWith(NewI);
+    OldI->eraseFromParent();
+  }
+};
+}
+
+static void
+makeStatepointExplicitImpl(const CallSite CS, /* to replace */
+                           const SmallVectorImpl<Value *> &BasePtrs,
+                           const SmallVectorImpl<Value *> &LiveVariables,
+                           PartiallyConstructedSafepointRecord &Result,
+                           std::vector<DeferredReplacement> &Replacements) {
+  assert(BasePtrs.size() == LiveVariables.size());
+  assert((UseDeoptBundles || isStatepoint(CS)) &&
+         "This method expects to be rewriting a statepoint");
+
+  // Then go ahead and use the builder do actually do the inserts.  We insert
+  // immediately before the previous instruction under the assumption that all
+  // arguments will be available here.  We can't insert afterwards since we may
+  // be replacing a terminator.
+  Instruction *InsertBefore = CS.getInstruction();
+  IRBuilder<> Builder(InsertBefore);
+
+  ArrayRef<Value *> GCArgs(LiveVariables);
+  uint64_t StatepointID = 0xABCDEF00;
+  uint32_t NumPatchBytes = 0;
+  uint32_t Flags = uint32_t(StatepointFlags::None);
+
+  ArrayRef<Use> CallArgs;
+  ArrayRef<Use> DeoptArgs;
+  ArrayRef<Use> TransitionArgs;
+
+  Value *CallTarget = nullptr;
+
+  if (UseDeoptBundles) {
+    CallArgs = {CS.arg_begin(), CS.arg_end()};
+    DeoptArgs = GetDeoptBundleOperands(CS);
+    // TODO: we don't fill in TransitionArgs or Flags in this branch, but we
+    // could have an operand bundle for that too.
+    AttributeSet OriginalAttrs = CS.getAttributes();
+
+    Attribute AttrID = OriginalAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                                  "statepoint-id");
+    if (AttrID.isStringAttribute())
+      AttrID.getValueAsString().getAsInteger(10, StatepointID);
+
+    Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute(
+        AttributeSet::FunctionIndex, "statepoint-num-patch-bytes");
+    if (AttrNumPatchBytes.isStringAttribute())
+      AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes);
+
+    CallTarget = CS.getCalledValue();
+  } else {
+    // This branch will be gone soon, and we will soon only support the
+    // UseDeoptBundles == true configuration.
+    Statepoint OldSP(CS);
+    StatepointID = OldSP.getID();
+    NumPatchBytes = OldSP.getNumPatchBytes();
+    Flags = OldSP.getFlags();
+
+    CallArgs = {OldSP.arg_begin(), OldSP.arg_end()};
+    DeoptArgs = {OldSP.vm_state_begin(), OldSP.vm_state_end()};
+    TransitionArgs = {OldSP.gc_transition_args_begin(),
+                      OldSP.gc_transition_args_end()};
+    CallTarget = OldSP.getCalledValue();
+  }
+
+  // Create the statepoint given all the arguments
+  Instruction *Token = nullptr;
+  AttributeSet ReturnAttrs;
+  if (CS.isCall()) {
+    CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
+    CallInst *Call = Builder.CreateGCStatepointCall(
+        StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
+        TransitionArgs, DeoptArgs, GCArgs, "safepoint_token");
+
+    Call->setTailCall(ToReplace->isTailCall());
+    Call->setCallingConv(ToReplace->getCallingConv());
+
+    // Currently we will fail on parameter attributes and on certain
+    // function attributes.
+    AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+    // In case if we can handle this set of attributes - set up function attrs
+    // directly on statepoint and return attrs later for gc_result intrinsic.
+    Call->setAttributes(NewAttrs.getFnAttributes());
+    ReturnAttrs = NewAttrs.getRetAttributes();
+
+    Token = Call;
+
+    // Put the following gc_result and gc_relocate calls immediately after the
+    // the old call (which we're about to delete)
+    assert(ToReplace->getNextNode() && "Not a terminator, must have next!");
+    Builder.SetInsertPoint(ToReplace->getNextNode());
+    Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc());
+  } else {
+    InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction());
+
+    // Insert the new invoke into the old block.  We'll remove the old one in a
+    // moment at which point this will become the new terminator for the
+    // original block.
+    InvokeInst *Invoke = Builder.CreateGCStatepointInvoke(
+        StatepointID, NumPatchBytes, CallTarget, ToReplace->getNormalDest(),
+        ToReplace->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs,
+        GCArgs, "statepoint_token");
+
+    Invoke->setCallingConv(ToReplace->getCallingConv());
+
+    // Currently we will fail on parameter attributes and on certain
+    // function attributes.
+    AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+    // In case if we can handle this set of attributes - set up function attrs
+    // directly on statepoint and return attrs later for gc_result intrinsic.
+    Invoke->setAttributes(NewAttrs.getFnAttributes());
+    ReturnAttrs = NewAttrs.getRetAttributes();
+
+    Token = Invoke;
+
+    // Generate gc relocates in exceptional path
+    BasicBlock *UnwindBlock = ToReplace->getUnwindDest();
+    assert(!isa<PHINode>(UnwindBlock->begin()) &&
+           UnwindBlock->getUniquePredecessor() &&
+           "can't safely insert in this block!");
+
+    Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt());
+    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
+
+    // Attach exceptional gc relocates to the landingpad.
+    Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
+    Result.UnwindToken = ExceptionalToken;
+
+    const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
+    CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken,
+                      Builder);
+
+    // Generate gc relocates and returns for normal block
+    BasicBlock *NormalDest = ToReplace->getNormalDest();
+    assert(!isa<PHINode>(NormalDest->begin()) &&
+           NormalDest->getUniquePredecessor() &&
+           "can't safely insert in this block!");
+
+    Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt());
+
+    // gc relocates will be generated later as if it were regular call
+    // statepoint
+  }
+  assert(Token && "Should be set in one of the above branches!");
+
+  if (UseDeoptBundles) {
+    Token->setName("statepoint_token");
+    if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) {
+      StringRef Name =
+          CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
+      CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name);
+      GCResult->setAttributes(CS.getAttributes().getRetAttributes());
+
+      // We cannot RAUW or delete CS.getInstruction() because it could be in the
+      // live set of some other safepoint, in which case that safepoint's
+      // PartiallyConstructedSafepointRecord will hold a raw pointer to this
+      // llvm::Instruction.  Instead, we defer the replacement and deletion to
+      // after the live sets have been made explicit in the IR, and we no longer
+      // have raw pointers to worry about.
+      Replacements.emplace_back(CS.getInstruction(), GCResult);
+    } else {
+      Replacements.emplace_back(CS.getInstruction(), nullptr);
+    }
+  } else {
+    assert(!CS.getInstruction()->hasNUsesOrMore(2) &&
+           "only valid use before rewrite is gc.result");
+    assert(!CS.getInstruction()->hasOneUse() ||
+           isGCResult(cast<Instruction>(*CS.getInstruction()->user_begin())));
+
+    // Take the name of the original statepoint token if there was one.
+    Token->takeName(CS.getInstruction());
+
+    // Update the gc.result of the original statepoint (if any) to use the newly
+    // inserted statepoint.  This is safe to do here since the token can't be
+    // considered a live reference.
+    CS.getInstruction()->replaceAllUsesWith(Token);
+    CS.getInstruction()->eraseFromParent();
+  }
+
+  Result.StatepointToken = Token;
+
+  // Second, create a gc.relocate for every live variable
+  const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
+  CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder);
+}
+
+namespace {
+struct NameOrdering {
+  Value *Base;
+  Value *Derived;
+
+  bool operator()(NameOrdering const &a, NameOrdering const &b) {
+    return -1 == a.Derived->getName().compare(b.Derived->getName());
+  }
+};
+}
+
+static void StabilizeOrder(SmallVectorImpl<Value *> &BaseVec,
+                           SmallVectorImpl<Value *> &LiveVec) {
+  assert(BaseVec.size() == LiveVec.size());
+
+  SmallVector<NameOrdering, 64> Temp;
+  for (size_t i = 0; i < BaseVec.size(); i++) {
+    NameOrdering v;
+    v.Base = BaseVec[i];
+    v.Derived = LiveVec[i];
+    Temp.push_back(v);
+  }
+
+  std::sort(Temp.begin(), Temp.end(), NameOrdering());
+  for (size_t i = 0; i < BaseVec.size(); i++) {
+    BaseVec[i] = Temp[i].Base;
+    LiveVec[i] = Temp[i].Derived;
+  }
+}
+
+// Replace an existing gc.statepoint with a new one and a set of gc.relocates
+// which make the relocations happening at this safepoint explicit.
+//
+// WARNING: Does not do any fixup to adjust users of the original live
+// values.  That's the callers responsibility.
+static void
+makeStatepointExplicit(DominatorTree &DT, const CallSite &CS,
+                       PartiallyConstructedSafepointRecord &Result,
+                       std::vector<DeferredReplacement> &Replacements) {
+  const auto &LiveSet = Result.LiveSet;
+  const auto &PointerToBase = Result.PointerToBase;
+
+  // Convert to vector for efficient cross referencing.
+  SmallVector<Value *, 64> BaseVec, LiveVec;
+  LiveVec.reserve(LiveSet.size());
+  BaseVec.reserve(LiveSet.size());
+  for (Value *L : LiveSet) {
+    LiveVec.push_back(L);
+    assert(PointerToBase.count(L));
+    Value *Base = PointerToBase.find(L)->second;
+    BaseVec.push_back(Base);
+  }
+  assert(LiveVec.size() == BaseVec.size());
+
+  // To make the output IR slightly more stable (for use in diffs), ensure a
+  // fixed order of the values in the safepoint (by sorting the value name).
+  // The order is otherwise meaningless.
+  StabilizeOrder(BaseVec, LiveVec);
+
+  // Do the actual rewriting and delete the old statepoint
+  makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements);
+}
+
+// Helper function for the relocationViaAlloca.
+//
+// It receives iterator to the statepoint gc relocates and emits a store to the
+// assigned location (via allocaMap) for the each one of them.  It adds the
+// visited values into the visitedLiveValues set, which we will later use them
+// for sanity checking.
+static void
+insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
+                       DenseMap<Value *, Value *> &AllocaMap,
+                       DenseSet<Value *> &VisitedLiveValues) {
+
+  for (User *U : GCRelocs) {
+    GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U);
+    if (!Relocate)
+      continue;
+
+    Value *OriginalValue = const_cast<Value *>(Relocate->getDerivedPtr());
+    assert(AllocaMap.count(OriginalValue));
+    Value *Alloca = AllocaMap[OriginalValue];
+
+    // Emit store into the related alloca
+    // All gc_relocates are i8 addrspace(1)* typed, and it must be bitcasted to
+    // the correct type according to alloca.
+    assert(Relocate->getNextNode() &&
+           "Should always have one since it's not a terminator");
+    IRBuilder<> Builder(Relocate->getNextNode());
+    Value *CastedRelocatedValue =
+      Builder.CreateBitCast(Relocate,
+                            cast<AllocaInst>(Alloca)->getAllocatedType(),
+                            suffixed_name_or(Relocate, ".casted", ""));
+
+    StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca);
+    Store->insertAfter(cast<Instruction>(CastedRelocatedValue));
+
+#ifndef NDEBUG
+    VisitedLiveValues.insert(OriginalValue);
+#endif
+  }
+}
+
+// Helper function for the "relocationViaAlloca". Similar to the
+// "insertRelocationStores" but works for rematerialized values.
+static void
+insertRematerializationStores(
+  RematerializedValueMapTy RematerializedValues,
+  DenseMap<Value *, Value *> &AllocaMap,
+  DenseSet<Value *> &VisitedLiveValues) {
+
+  for (auto RematerializedValuePair: RematerializedValues) {
+    Instruction *RematerializedValue = RematerializedValuePair.first;
+    Value *OriginalValue = RematerializedValuePair.second;
+
+    assert(AllocaMap.count(OriginalValue) &&
+           "Can not find alloca for rematerialized value");
+    Value *Alloca = AllocaMap[OriginalValue];
+
+    StoreInst *Store = new StoreInst(RematerializedValue, Alloca);
+    Store->insertAfter(RematerializedValue);
+
+#ifndef NDEBUG
+    VisitedLiveValues.insert(OriginalValue);
+#endif
+  }
+}
+
+/// Do all the relocation update via allocas and mem2reg
+static void relocationViaAlloca(
+    Function &F, DominatorTree &DT, ArrayRef<Value *> Live,
+    ArrayRef<PartiallyConstructedSafepointRecord> Records) {
+#ifndef NDEBUG
+  // record initial number of (static) allocas; we'll check we have the same
+  // number when we get done.
+  int InitialAllocaNum = 0;
+  for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E;
+       I++)
+    if (isa<AllocaInst>(*I))
+      InitialAllocaNum++;
+#endif
+
+  // TODO-PERF: change data structures, reserve
+  DenseMap<Value *, Value *> AllocaMap;
+  SmallVector<AllocaInst *, 200> PromotableAllocas;
+  // Used later to chack that we have enough allocas to store all values
+  std::size_t NumRematerializedValues = 0;
+  PromotableAllocas.reserve(Live.size());
+
+  // Emit alloca for "LiveValue" and record it in "allocaMap" and
+  // "PromotableAllocas"
+  auto emitAllocaFor = [&](Value *LiveValue) {
+    AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), "",
+                                        F.getEntryBlock().getFirstNonPHI());
+    AllocaMap[LiveValue] = Alloca;
+    PromotableAllocas.push_back(Alloca);
+  };
+
+  // Emit alloca for each live gc pointer
+  for (Value *V : Live)
+    emitAllocaFor(V);
+
+  // Emit allocas for rematerialized values
+  for (const auto &Info : Records)
+    for (auto RematerializedValuePair : Info.RematerializedValues) {
+      Value *OriginalValue = RematerializedValuePair.second;
+      if (AllocaMap.count(OriginalValue) != 0)
+        continue;
+
+      emitAllocaFor(OriginalValue);
+      ++NumRematerializedValues;
+    }
+
+  // The next two loops are part of the same conceptual operation.  We need to
+  // insert a store to the alloca after the original def and at each
+  // redefinition.  We need to insert a load before each use.  These are split
+  // into distinct loops for performance reasons.
+
+  // Update gc pointer after each statepoint: either store a relocated value or
+  // null (if no relocated value was found for this gc pointer and it is not a
+  // gc_result).  This must happen before we update the statepoint with load of
+  // alloca otherwise we lose the link between statepoint and old def.
+  for (const auto &Info : Records) {
+    Value *Statepoint = Info.StatepointToken;
+
+    // This will be used for consistency check
+    DenseSet<Value *> VisitedLiveValues;
+
+    // Insert stores for normal statepoint gc relocates
+    insertRelocationStores(Statepoint->users(), AllocaMap, VisitedLiveValues);
+
+    // In case if it was invoke statepoint
+    // we will insert stores for exceptional path gc relocates.
+    if (isa<InvokeInst>(Statepoint)) {
+      insertRelocationStores(Info.UnwindToken->users(), AllocaMap,
+                             VisitedLiveValues);
+    }
+
+    // Do similar thing with rematerialized values
+    insertRematerializationStores(Info.RematerializedValues, AllocaMap,
+                                  VisitedLiveValues);
+
+    if (ClobberNonLive) {
+      // As a debugging aid, pretend that an unrelocated pointer becomes null at
+      // the gc.statepoint.  This will turn some subtle GC problems into
+      // slightly easier to debug SEGVs.  Note that on large IR files with
+      // lots of gc.statepoints this is extremely costly both memory and time
+      // wise.
+      SmallVector<AllocaInst *, 64> ToClobber;
+      for (auto Pair : AllocaMap) {
+        Value *Def = Pair.first;
+        AllocaInst *Alloca = cast<AllocaInst>(Pair.second);
+
+        // This value was relocated
+        if (VisitedLiveValues.count(Def)) {
+          continue;
+        }
+        ToClobber.push_back(Alloca);
+      }
+
+      auto InsertClobbersAt = [&](Instruction *IP) {
+        for (auto *AI : ToClobber) {
+          auto AIType = cast<PointerType>(AI->getType());
+          auto PT = cast<PointerType>(AIType->getElementType());
+          Constant *CPN = ConstantPointerNull::get(PT);
+          StoreInst *Store = new StoreInst(CPN, AI);
+          Store->insertBefore(IP);
+        }
+      };
+
+      // Insert the clobbering stores.  These may get intermixed with the
+      // gc.results and gc.relocates, but that's fine.
+      if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
+        InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt());
+        InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt());
+      } else {
+        InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode());
+      }
+    }
+  }
+
+  // Update use with load allocas and add store for gc_relocated.
+  for (auto Pair : AllocaMap) {
+    Value *Def = Pair.first;
+    Value *Alloca = Pair.second;
+
+    // We pre-record the uses of allocas so that we dont have to worry about
+    // later update that changes the user information..
+
+    SmallVector<Instruction *, 20> Uses;
+    // PERF: trade a linear scan for repeated reallocation
+    Uses.reserve(std::distance(Def->user_begin(), Def->user_end()));
+    for (User *U : Def->users()) {
+      if (!isa<ConstantExpr>(U)) {
+        // If the def has a ConstantExpr use, then the def is either a
+        // ConstantExpr use itself or null.  In either case
+        // (recursively in the first, directly in the second), the oop
+        // it is ultimately dependent on is null and this particular
+        // use does not need to be fixed up.
+        Uses.push_back(cast<Instruction>(U));
+      }
+    }
+
+    std::sort(Uses.begin(), Uses.end());
+    auto Last = std::unique(Uses.begin(), Uses.end());
+    Uses.erase(Last, Uses.end());
+
+    for (Instruction *Use : Uses) {
+      if (isa<PHINode>(Use)) {
+        PHINode *Phi = cast<PHINode>(Use);
+        for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) {
+          if (Def == Phi->getIncomingValue(i)) {
+            LoadInst *Load = new LoadInst(
+                Alloca, "", Phi->getIncomingBlock(i)->getTerminator());
+            Phi->setIncomingValue(i, Load);
+          }
+        }
+      } else {
+        LoadInst *Load = new LoadInst(Alloca, "", Use);
+        Use->replaceUsesOfWith(Def, Load);
+      }
+    }
+
+    // Emit store for the initial gc value.  Store must be inserted after load,
+    // otherwise store will be in alloca's use list and an extra load will be
+    // inserted before it.
+    StoreInst *Store = new StoreInst(Def, Alloca);
+    if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
+      if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
+        // InvokeInst is a TerminatorInst so the store need to be inserted
+        // into its normal destination block.
+        BasicBlock *NormalDest = Invoke->getNormalDest();
+        Store->insertBefore(NormalDest->getFirstNonPHI());
+      } else {
+        assert(!Inst->isTerminator() &&
+               "The only TerminatorInst that can produce a value is "
+               "InvokeInst which is handled above.");
+        Store->insertAfter(Inst);
+      }
+    } else {
+      assert(isa<Argument>(Def));
+      Store->insertAfter(cast<Instruction>(Alloca));
+    }
+  }
+
+  assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues &&
+         "we must have the same allocas with lives");
+  if (!PromotableAllocas.empty()) {
+    // Apply mem2reg to promote alloca to SSA
+    PromoteMemToReg(PromotableAllocas, DT);
+  }
+
+#ifndef NDEBUG
+  for (auto &I : F.getEntryBlock())
+    if (isa<AllocaInst>(I))
+      InitialAllocaNum--;
+  assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas");
+#endif
+}
+
+/// Implement a unique function which doesn't require we sort the input
+/// vector.  Doing so has the effect of changing the output of a couple of
+/// tests in ways which make them less useful in testing fused safepoints.
+template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
+  SmallSet<T, 8> Seen;
+  Vec.erase(std::remove_if(Vec.begin(), Vec.end(), [&](const T &V) {
+              return !Seen.insert(V).second;
+            }), Vec.end());
+}
+
+/// Insert holders so that each Value is obviously live through the entire
+/// lifetime of the call.
+static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values,
+                                 SmallVectorImpl<CallInst *> &Holders) {
+  if (Values.empty())
+    // No values to hold live, might as well not insert the empty holder
+    return;
+
+  Module *M = CS.getInstruction()->getModule();
+  // Use a dummy vararg function to actually hold the values live
+  Function *Func = cast<Function>(M->getOrInsertFunction(
+      "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true)));
+  if (CS.isCall()) {
+    // For call safepoints insert dummy calls right after safepoint
+    Holders.push_back(CallInst::Create(Func, Values, "",
+                                       &*++CS.getInstruction()->getIterator()));
+    return;
+  }
+  // For invoke safepooints insert dummy calls both in normal and
+  // exceptional destination blocks
+  auto *II = cast<InvokeInst>(CS.getInstruction());
+  Holders.push_back(CallInst::Create(
+      Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt()));
+  Holders.push_back(CallInst::Create(
+      Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt()));
+}
+
+static void findLiveReferences(
+    Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,
+    MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+  GCPtrLivenessData OriginalLivenessData;
+  computeLiveInValues(DT, F, OriginalLivenessData);
+  for (size_t i = 0; i < records.size(); i++) {
+    struct PartiallyConstructedSafepointRecord &info = records[i];
+    const CallSite &CS = toUpdate[i];
+    analyzeParsePointLiveness(DT, OriginalLivenessData, CS, info);
+  }
+}
+
+/// Remove any vector of pointers from the live set by scalarizing them over the
+/// statepoint instruction.  Adds the scalarized pieces to the live set.  It
+/// would be preferable to include the vector in the statepoint itself, but
+/// the lowering code currently does not handle that.  Extending it would be
+/// slightly non-trivial since it requires a format change.  Given how rare
+/// such cases are (for the moment?) scalarizing is an acceptable compromise.
+static void splitVectorValues(Instruction *StatepointInst,
+                              StatepointLiveSetTy &LiveSet,
+                              DenseMap<Value *, Value *>& PointerToBase,
+                              DominatorTree &DT) {
+  SmallVector<Value *, 16> ToSplit;
+  for (Value *V : LiveSet)
+    if (isa<VectorType>(V->getType()))
+      ToSplit.push_back(V);
+
+  if (ToSplit.empty())
+    return;
+
+  DenseMap<Value *, SmallVector<Value *, 16>> ElementMapping;
+
+  Function &F = *(StatepointInst->getParent()->getParent());
+
+  DenseMap<Value *, AllocaInst *> AllocaMap;
+  // First is normal return, second is exceptional return (invoke only)
+  DenseMap<Value *, std::pair<Value *, Value *>> Replacements;
+  for (Value *V : ToSplit) {
+    AllocaInst *Alloca =
+        new AllocaInst(V->getType(), "", F.getEntryBlock().getFirstNonPHI());
+    AllocaMap[V] = Alloca;
+
+    VectorType *VT = cast<VectorType>(V->getType());
+    IRBuilder<> Builder(StatepointInst);
+    SmallVector<Value *, 16> Elements;
+    for (unsigned i = 0; i < VT->getNumElements(); i++)
+      Elements.push_back(Builder.CreateExtractElement(V, Builder.getInt32(i)));
+    ElementMapping[V] = Elements;
+
+    auto InsertVectorReform = [&](Instruction *IP) {
+      Builder.SetInsertPoint(IP);
+      Builder.SetCurrentDebugLocation(IP->getDebugLoc());
+      Value *ResultVec = UndefValue::get(VT);
+      for (unsigned i = 0; i < VT->getNumElements(); i++)
+        ResultVec = Builder.CreateInsertElement(ResultVec, Elements[i],
+                                                Builder.getInt32(i));
+      return ResultVec;
+    };
+
+    if (isa<CallInst>(StatepointInst)) {
+      BasicBlock::iterator Next(StatepointInst);
+      Next++;
+      Instruction *IP = &*(Next);
+      Replacements[V].first = InsertVectorReform(IP);
+      Replacements[V].second = nullptr;
+    } else {
+      InvokeInst *Invoke = cast<InvokeInst>(StatepointInst);
+      // We've already normalized - check that we don't have shared destination
+      // blocks
+      BasicBlock *NormalDest = Invoke->getNormalDest();
+      assert(!isa<PHINode>(NormalDest->begin()));
+      BasicBlock *UnwindDest = Invoke->getUnwindDest();
+      assert(!isa<PHINode>(UnwindDest->begin()));
+      // Insert insert element sequences in both successors
+      Instruction *IP = &*(NormalDest->getFirstInsertionPt());
+      Replacements[V].first = InsertVectorReform(IP);
+      IP = &*(UnwindDest->getFirstInsertionPt());
+      Replacements[V].second = InsertVectorReform(IP);
+    }
+  }
+
+  for (Value *V : ToSplit) {
+    AllocaInst *Alloca = AllocaMap[V];
+
+    // Capture all users before we start mutating use lists
+    SmallVector<Instruction *, 16> Users;
+    for (User *U : V->users())
+      Users.push_back(cast<Instruction>(U));
+
+    for (Instruction *I : Users) {
+      if (auto Phi = dyn_cast<PHINode>(I)) {
+        for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++)
+          if (V == Phi->getIncomingValue(i)) {
+            LoadInst *Load = new LoadInst(
+                Alloca, "", Phi->getIncomingBlock(i)->getTerminator());
+            Phi->setIncomingValue(i, Load);
+          }
+      } else {
+        LoadInst *Load = new LoadInst(Alloca, "", I);
+        I->replaceUsesOfWith(V, Load);
+      }
+    }
+
+    // Store the original value and the replacement value into the alloca
+    StoreInst *Store = new StoreInst(V, Alloca);
+    if (auto I = dyn_cast<Instruction>(V))
+      Store->insertAfter(I);
+    else
+      Store->insertAfter(Alloca);
+
+    // Normal return for invoke, or call return
+    Instruction *Replacement = cast<Instruction>(Replacements[V].first);
+    (new StoreInst(Replacement, Alloca))->insertAfter(Replacement);
+    // Unwind return for invoke only
+    Replacement = cast_or_null<Instruction>(Replacements[V].second);
+    if (Replacement)
+      (new StoreInst(Replacement, Alloca))->insertAfter(Replacement);
+  }
+
+  // apply mem2reg to promote alloca to SSA
+  SmallVector<AllocaInst *, 16> Allocas;
+  for (Value *V : ToSplit)
+    Allocas.push_back(AllocaMap[V]);
+  PromoteMemToReg(Allocas, DT);
+
+  // Update our tracking of live pointers and base mappings to account for the
+  // changes we just made.
+  for (Value *V : ToSplit) {
+    auto &Elements = ElementMapping[V];
+
+    LiveSet.erase(V);
+    LiveSet.insert(Elements.begin(), Elements.end());
+    // We need to update the base mapping as well.
+    assert(PointerToBase.count(V));
+    Value *OldBase = PointerToBase[V];
+    auto &BaseElements = ElementMapping[OldBase];
+    PointerToBase.erase(V);
+    assert(Elements.size() == BaseElements.size());
+    for (unsigned i = 0; i < Elements.size(); i++) {
+      Value *Elem = Elements[i];
+      PointerToBase[Elem] = BaseElements[i];
+    }
+  }
+}
+
+// Helper function for the "rematerializeLiveValues". It walks use chain
+// starting from the "CurrentValue" until it meets "BaseValue". Only "simple"
+// values are visited (currently it is GEP's and casts). Returns true if it
+// successfully reached "BaseValue" and false otherwise.
+// Fills "ChainToBase" array with all visited values. "BaseValue" is not
+// recorded.
+static bool findRematerializableChainToBasePointer(
+  SmallVectorImpl<Instruction*> &ChainToBase,
+  Value *CurrentValue, Value *BaseValue) {
+
+  // We have found a base value
+  if (CurrentValue == BaseValue) {
+    return true;
+  }
+
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) {
+    ChainToBase.push_back(GEP);
+    return findRematerializableChainToBasePointer(ChainToBase,
+                                                  GEP->getPointerOperand(),
+                                                  BaseValue);
+  }
+
+  if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) {
+    if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
+      return false;
+
+    ChainToBase.push_back(CI);
+    return findRematerializableChainToBasePointer(ChainToBase,
+                                                  CI->getOperand(0), BaseValue);
+  }
+
+  // Not supported instruction in the chain
+  return false;
+}
+
+// Helper function for the "rematerializeLiveValues". Compute cost of the use
+// chain we are going to rematerialize.
+static unsigned
+chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
+                       TargetTransformInfo &TTI) {
+  unsigned Cost = 0;
+
+  for (Instruction *Instr : Chain) {
+    if (CastInst *CI = dyn_cast<CastInst>(Instr)) {
+      assert(CI->isNoopCast(CI->getModule()->getDataLayout()) &&
+             "non noop cast is found during rematerialization");
+
+      Type *SrcTy = CI->getOperand(0)->getType();
+      Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy);
+
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
+      // Cost of the address calculation
+      Type *ValTy = GEP->getPointerOperandType()->getPointerElementType();
+      Cost += TTI.getAddressComputationCost(ValTy);
+
+      // And cost of the GEP itself
+      // TODO: Use TTI->getGEPCost here (it exists, but appears to be not
+      //       allowed for the external usage)
+      if (!GEP->hasAllConstantIndices())
+        Cost += 2;
+
+    } else {
+      llvm_unreachable("unsupported instruciton type during rematerialization");
+    }
+  }
+
+  return Cost;
+}
+
+// From the statepoint live set pick values that are cheaper to recompute then
+// to relocate. Remove this values from the live set, rematerialize them after
+// statepoint and record them in "Info" structure. Note that similar to
+// relocated values we don't do any user adjustments here.
+static void rematerializeLiveValues(CallSite CS,
+                                    PartiallyConstructedSafepointRecord &Info,
+                                    TargetTransformInfo &TTI) {
+  const unsigned int ChainLengthThreshold = 10;
+
+  // Record values we are going to delete from this statepoint live set.
+  // We can not di this in following loop due to iterator invalidation.
+  SmallVector<Value *, 32> LiveValuesToBeDeleted;
+
+  for (Value *LiveValue: Info.LiveSet) {
+    // For each live pointer find it's defining chain
+    SmallVector<Instruction *, 3> ChainToBase;
+    assert(Info.PointerToBase.count(LiveValue));
+    bool FoundChain =
+      findRematerializableChainToBasePointer(ChainToBase,
+                                             LiveValue,
+                                             Info.PointerToBase[LiveValue]);
+    // Nothing to do, or chain is too long
+    if (!FoundChain ||
+        ChainToBase.size() == 0 ||
+        ChainToBase.size() > ChainLengthThreshold)
+      continue;
+
+    // Compute cost of this chain
+    unsigned Cost = chainToBasePointerCost(ChainToBase, TTI);
+    // TODO: We can also account for cases when we will be able to remove some
+    //       of the rematerialized values by later optimization passes. I.e if
+    //       we rematerialized several intersecting chains. Or if original values
+    //       don't have any uses besides this statepoint.
+
+    // For invokes we need to rematerialize each chain twice - for normal and
+    // for unwind basic blocks. Model this by multiplying cost by two.
+    if (CS.isInvoke()) {
+      Cost *= 2;
+    }
+    // If it's too expensive - skip it
+    if (Cost >= RematerializationThreshold)
+      continue;
+
+    // Remove value from the live set
+    LiveValuesToBeDeleted.push_back(LiveValue);
+
+    // Clone instructions and record them inside "Info" structure
+
+    // Walk backwards to visit top-most instructions first
+    std::reverse(ChainToBase.begin(), ChainToBase.end());
+
+    // Utility function which clones all instructions from "ChainToBase"
+    // and inserts them before "InsertBefore". Returns rematerialized value
+    // which should be used after statepoint.
+    auto rematerializeChain = [&ChainToBase](Instruction *InsertBefore) {
+      Instruction *LastClonedValue = nullptr;
+      Instruction *LastValue = nullptr;
+      for (Instruction *Instr: ChainToBase) {
+        // Only GEP's and casts are suported as we need to be careful to not
+        // introduce any new uses of pointers not in the liveset.
+        // Note that it's fine to introduce new uses of pointers which were
+        // otherwise not used after this statepoint.
+        assert(isa<GetElementPtrInst>(Instr) || isa<CastInst>(Instr));
+
+        Instruction *ClonedValue = Instr->clone();
+        ClonedValue->insertBefore(InsertBefore);
+        ClonedValue->setName(Instr->getName() + ".remat");
+
+        // If it is not first instruction in the chain then it uses previously
+        // cloned value. We should update it to use cloned value.
+        if (LastClonedValue) {
+          assert(LastValue);
+          ClonedValue->replaceUsesOfWith(LastValue, LastClonedValue);
+#ifndef NDEBUG
+          // Assert that cloned instruction does not use any instructions from
+          // this chain other than LastClonedValue
+          for (auto OpValue : ClonedValue->operand_values()) {
+            assert(std::find(ChainToBase.begin(), ChainToBase.end(), OpValue) ==
+                       ChainToBase.end() &&
+                   "incorrect use in rematerialization chain");
+          }
+#endif
+        }
+
+        LastClonedValue = ClonedValue;
+        LastValue = Instr;
+      }
+      assert(LastClonedValue);
+      return LastClonedValue;
+    };
+
+    // Different cases for calls and invokes. For invokes we need to clone
+    // instructions both on normal and unwind path.
+    if (CS.isCall()) {
+      Instruction *InsertBefore = CS.getInstruction()->getNextNode();
+      assert(InsertBefore);
+      Instruction *RematerializedValue = rematerializeChain(InsertBefore);
+      Info.RematerializedValues[RematerializedValue] = LiveValue;
+    } else {
+      InvokeInst *Invoke = cast<InvokeInst>(CS.getInstruction());
+
+      Instruction *NormalInsertBefore =
+          &*Invoke->getNormalDest()->getFirstInsertionPt();
+      Instruction *UnwindInsertBefore =
+          &*Invoke->getUnwindDest()->getFirstInsertionPt();
+
+      Instruction *NormalRematerializedValue =
+          rematerializeChain(NormalInsertBefore);
+      Instruction *UnwindRematerializedValue =
+          rematerializeChain(UnwindInsertBefore);
+
+      Info.RematerializedValues[NormalRematerializedValue] = LiveValue;
+      Info.RematerializedValues[UnwindRematerializedValue] = LiveValue;
+    }
+  }
+
+  // Remove rematerializaed values from the live set
+  for (auto LiveValue: LiveValuesToBeDeleted) {
+    Info.LiveSet.erase(LiveValue);
+  }
+}
+
+static bool insertParsePoints(Function &F, DominatorTree &DT,
+                              TargetTransformInfo &TTI,
+                              SmallVectorImpl<CallSite> &ToUpdate) {
+#ifndef NDEBUG
+  // sanity check the input
+  std::set<CallSite> Uniqued;
+  Uniqued.insert(ToUpdate.begin(), ToUpdate.end());
+  assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!");
+
+  for (CallSite CS : ToUpdate) {
+    assert(CS.getInstruction()->getParent()->getParent() == &F);
+    assert((UseDeoptBundles || isStatepoint(CS)) &&
+           "expected to already be a deopt statepoint");
+  }
+#endif
+
+  // When inserting gc.relocates for invokes, we need to be able to insert at
+  // the top of the successor blocks.  See the comment on
+  // normalForInvokeSafepoint on exactly what is needed.  Note that this step
+  // may restructure the CFG.
+  for (CallSite CS : ToUpdate) {
+    if (!CS.isInvoke())
+      continue;
+    auto *II = cast<InvokeInst>(CS.getInstruction());
+    normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT);
+    normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT);
+  }
+
+  // A list of dummy calls added to the IR to keep various values obviously
+  // live in the IR.  We'll remove all of these when done.
+  SmallVector<CallInst *, 64> Holders;
+
+  // Insert a dummy call with all of the arguments to the vm_state we'll need
+  // for the actual safepoint insertion.  This ensures reference arguments in
+  // the deopt argument list are considered live through the safepoint (and
+  // thus makes sure they get relocated.)
+  for (CallSite CS : ToUpdate) {
+    SmallVector<Value *, 64> DeoptValues;
+
+    iterator_range<const Use *> DeoptStateRange =
+        UseDeoptBundles
+            ? iterator_range<const Use *>(GetDeoptBundleOperands(CS))
+            : iterator_range<const Use *>(Statepoint(CS).vm_state_args());
+
+    for (Value *Arg : DeoptStateRange) {
+      assert(!isUnhandledGCPointerType(Arg->getType()) &&
+             "support for FCA unimplemented");
+      if (isHandledGCPointerType(Arg->getType()))
+        DeoptValues.push_back(Arg);
+    }
+
+    insertUseHolderAfter(CS, DeoptValues, Holders);
+  }
+
+  SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size());
+
+  // A) Identify all gc pointers which are statically live at the given call
+  // site.
+  findLiveReferences(F, DT, ToUpdate, Records);
+
+  // B) Find the base pointers for each live pointer
+  /* scope for caching */ {
+    // Cache the 'defining value' relation used in the computation and
+    // insertion of base phis and selects.  This ensures that we don't insert
+    // large numbers of duplicate base_phis.
+    DefiningValueMapTy DVCache;
+
+    for (size_t i = 0; i < Records.size(); i++) {
+      PartiallyConstructedSafepointRecord &info = Records[i];
+      findBasePointers(DT, DVCache, ToUpdate[i], info);
+    }
+  } // end of cache scope
+
+  // The base phi insertion logic (for any safepoint) may have inserted new
+  // instructions which are now live at some safepoint.  The simplest such
+  // example is:
+  // loop:
+  //   phi a  <-- will be a new base_phi here
+  //   safepoint 1 <-- that needs to be live here
+  //   gep a + 1
+  //   safepoint 2
+  //   br loop
+  // We insert some dummy calls after each safepoint to definitely hold live
+  // the base pointers which were identified for that safepoint.  We'll then
+  // ask liveness for _every_ base inserted to see what is now live.  Then we
+  // remove the dummy calls.
+  Holders.reserve(Holders.size() + Records.size());
+  for (size_t i = 0; i < Records.size(); i++) {
+    PartiallyConstructedSafepointRecord &Info = Records[i];
+
+    SmallVector<Value *, 128> Bases;
+    for (auto Pair : Info.PointerToBase)
+      Bases.push_back(Pair.second);
+
+    insertUseHolderAfter(ToUpdate[i], Bases, Holders);
+  }
+
+  // By selecting base pointers, we've effectively inserted new uses. Thus, we
+  // need to rerun liveness.  We may *also* have inserted new defs, but that's
+  // not the key issue.
+  recomputeLiveInValues(F, DT, ToUpdate, Records);
+
+  if (PrintBasePointers) {
+    for (auto &Info : Records) {
+      errs() << "Base Pairs: (w/Relocation)\n";
+      for (auto Pair : Info.PointerToBase) {
+        errs() << " derived ";
+        Pair.first->printAsOperand(errs(), false);
+        errs() << " base ";
+        Pair.second->printAsOperand(errs(), false);
+        errs() << "\n";
+      }
+    }
+  }
+
+  // It is possible that non-constant live variables have a constant base.  For
+  // example, a GEP with a variable offset from a global.  In this case we can
+  // remove it from the liveset.  We already don't add constants to the liveset
+  // because we assume they won't move at runtime and the GC doesn't need to be
+  // informed about them.  The same reasoning applies if the base is constant.
+  // Note that the relocation placement code relies on this filtering for
+  // correctness as it expects the base to be in the liveset, which isn't true
+  // if the base is constant.
+  for (auto &Info : Records)
+    for (auto &BasePair : Info.PointerToBase)
+      if (isa<Constant>(BasePair.second))
+        Info.LiveSet.erase(BasePair.first);
+
+  for (CallInst *CI : Holders)
+    CI->eraseFromParent();
+
+  Holders.clear();
+
+  // Do a limited scalarization of any live at safepoint vector values which
+  // contain pointers.  This enables this pass to run after vectorization at
+  // the cost of some possible performance loss.  Note: This is known to not
+  // handle updating of the side tables correctly which can lead to relocation
+  // bugs when the same vector is live at multiple statepoints.  We're in the
+  // process of implementing the alternate lowering - relocating the
+  // vector-of-pointers as first class item and updating the backend to
+  // understand that - but that's not yet complete.  
+  if (UseVectorSplit)
+    for (size_t i = 0; i < Records.size(); i++) {
+      PartiallyConstructedSafepointRecord &Info = Records[i];
+      Instruction *Statepoint = ToUpdate[i].getInstruction();
+      splitVectorValues(cast<Instruction>(Statepoint), Info.LiveSet,
+                        Info.PointerToBase, DT);
+    }
+
+  // In order to reduce live set of statepoint we might choose to rematerialize
+  // some values instead of relocating them. This is purely an optimization and
+  // does not influence correctness.
+  for (size_t i = 0; i < Records.size(); i++)
+    rematerializeLiveValues(ToUpdate[i], Records[i], TTI);
+
+  // We need this to safely RAUW and delete call or invoke return values that
+  // may themselves be live over a statepoint.  For details, please see usage in
+  // makeStatepointExplicitImpl.
+  std::vector<DeferredReplacement> Replacements;
+
+  // Now run through and replace the existing statepoints with new ones with
+  // the live variables listed.  We do not yet update uses of the values being
+  // relocated. We have references to live variables that need to
+  // survive to the last iteration of this loop.  (By construction, the
+  // previous statepoint can not be a live variable, thus we can and remove
+  // the old statepoint calls as we go.)
+  for (size_t i = 0; i < Records.size(); i++)
+    makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements);
+
+  ToUpdate.clear(); // prevent accident use of invalid CallSites
+
+  for (auto &PR : Replacements)
+    PR.doReplacement();
+
+  Replacements.clear();
+
+  for (auto &Info : Records) {
+    // These live sets may contain state Value pointers, since we replaced calls
+    // with operand bundles with calls wrapped in gc.statepoint, and some of
+    // those calls may have been def'ing live gc pointers.  Clear these out to
+    // avoid accidentally using them.
+    //
+    // TODO: We should create a separate data structure that does not contain
+    // these live sets, and migrate to using that data structure from this point
+    // onward.
+    Info.LiveSet.clear();
+    Info.PointerToBase.clear();
+  }
+
+  // Do all the fixups of the original live variables to their relocated selves
+  SmallVector<Value *, 128> Live;
+  for (size_t i = 0; i < Records.size(); i++) {
+    PartiallyConstructedSafepointRecord &Info = Records[i];
+
+    // We can't simply save the live set from the original insertion.  One of
+    // the live values might be the result of a call which needs a safepoint.
+    // That Value* no longer exists and we need to use the new gc_result.
+    // Thankfully, the live set is embedded in the statepoint (and updated), so
+    // we just grab that.
+    Statepoint Statepoint(Info.StatepointToken);
+    Live.insert(Live.end(), Statepoint.gc_args_begin(),
+                Statepoint.gc_args_end());
+#ifndef NDEBUG
+    // Do some basic sanity checks on our liveness results before performing
+    // relocation.  Relocation can and will turn mistakes in liveness results
+    // into non-sensical code which is must harder to debug.
+    // TODO: It would be nice to test consistency as well
+    assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
+           "statepoint must be reachable or liveness is meaningless");
+    for (Value *V : Statepoint.gc_args()) {
+      if (!isa<Instruction>(V))
+        // Non-instruction values trivial dominate all possible uses
+        continue;
+      auto *LiveInst = cast<Instruction>(V);
+      assert(DT.isReachableFromEntry(LiveInst->getParent()) &&
+             "unreachable values should never be live");
+      assert(DT.dominates(LiveInst, Info.StatepointToken) &&
+             "basic SSA liveness expectation violated by liveness analysis");
+    }
+#endif
+  }
+  unique_unsorted(Live);
+
+#ifndef NDEBUG
+  // sanity check
+  for (auto *Ptr : Live)
+    assert(isHandledGCPointerType(Ptr->getType()) &&
+           "must be a gc pointer type");
+#endif
+
+  relocationViaAlloca(F, DT, Live, Records);
+  return !Records.empty();
+}
+
+// Handles both return values and arguments for Functions and CallSites.
+template <typename AttrHolder>
+static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
+                                      unsigned Index) {
+  AttrBuilder R;
+  if (AH.getDereferenceableBytes(Index))
+    R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable,
+                                  AH.getDereferenceableBytes(Index)));
+  if (AH.getDereferenceableOrNullBytes(Index))
+    R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
+                                  AH.getDereferenceableOrNullBytes(Index)));
+  if (AH.doesNotAlias(Index))
+    R.addAttribute(Attribute::NoAlias);
+
+  if (!R.empty())
+    AH.setAttributes(AH.getAttributes().removeAttributes(
+        Ctx, Index, AttributeSet::get(Ctx, Index, R)));
+}
+
+void
+RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
+  LLVMContext &Ctx = F.getContext();
+
+  for (Argument &A : F.args())
+    if (isa<PointerType>(A.getType()))
+      RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1);
+
+  if (isa<PointerType>(F.getReturnType()))
+    RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex);
+}
+
+void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
+  if (F.empty())
+    return;
+
+  LLVMContext &Ctx = F.getContext();
+  MDBuilder Builder(Ctx);
+
+  for (Instruction &I : instructions(F)) {
+    if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
+      assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
+      bool IsImmutableTBAA =
+          MD->getNumOperands() == 4 &&
+          mdconst::extract<ConstantInt>(MD->getOperand(3))->getValue() == 1;
+
+      if (!IsImmutableTBAA)
+        continue; // no work to do, MD_tbaa is already marked mutable
+
+      MDNode *Base = cast<MDNode>(MD->getOperand(0));
+      MDNode *Access = cast<MDNode>(MD->getOperand(1));
+      uint64_t Offset =
+          mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue();
+
+      MDNode *MutableTBAA =
+          Builder.createTBAAStructTagNode(Base, Access, Offset);
+      I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
+    }
+
+    if (CallSite CS = CallSite(&I)) {
+      for (int i = 0, e = CS.arg_size(); i != e; i++)
+        if (isa<PointerType>(CS.getArgument(i)->getType()))
+          RemoveNonValidAttrAtIndex(Ctx, CS, i + 1);
+      if (isa<PointerType>(CS.getType()))
+        RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex);
+    }
+  }
+}
+
+/// Returns true if this function should be rewritten by this pass.  The main
+/// point of this function is as an extension point for custom logic.
+static bool shouldRewriteStatepointsIn(Function &F) {
+  // TODO: This should check the GCStrategy
+  if (F.hasGC()) {
+    const auto &FunctionGCName = F.getGC();
+    const StringRef StatepointExampleName("statepoint-example");
+    const StringRef CoreCLRName("coreclr");
+    return (StatepointExampleName == FunctionGCName) ||
+           (CoreCLRName == FunctionGCName);
+  } else
+    return false;
+}
+
+void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) {
+#ifndef NDEBUG
+  assert(std::any_of(M.begin(), M.end(), shouldRewriteStatepointsIn) &&
+         "precondition!");
+#endif
+
+  for (Function &F : M)
+    stripNonValidAttributesFromPrototype(F);
+
+  for (Function &F : M)
+    stripNonValidAttributesFromBody(F);
+}
+
+bool RewriteStatepointsForGC::runOnFunction(Function &F) {
+  // Nothing to do for declarations.
+  if (F.isDeclaration() || F.empty())
+    return false;
+
+  // Policy choice says not to rewrite - the most common reason is that we're
+  // compiling code without a GCStrategy.
+  if (!shouldRewriteStatepointsIn(F))
+    return false;
+
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+  TargetTransformInfo &TTI =
+      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  auto NeedsRewrite = [](Instruction &I) {
+    if (UseDeoptBundles) {
+      if (ImmutableCallSite CS = ImmutableCallSite(&I))
+        return !callsGCLeafFunction(CS);
+      return false;
+    }
+
+    return isStatepoint(I);
+  };
+
+  // Gather all the statepoints which need rewritten.  Be careful to only
+  // consider those in reachable code since we need to ask dominance queries
+  // when rewriting.  We'll delete the unreachable ones in a moment.
+  SmallVector<CallSite, 64> ParsePointNeeded;
+  bool HasUnreachableStatepoint = false;
+  for (Instruction &I : instructions(F)) {
+    // TODO: only the ones with the flag set!
+    if (NeedsRewrite(I)) {
+      if (DT.isReachableFromEntry(I.getParent()))
+        ParsePointNeeded.push_back(CallSite(&I));
+      else
+        HasUnreachableStatepoint = true;
+    }
+  }
+
+  bool MadeChange = false;
+
+  // Delete any unreachable statepoints so that we don't have unrewritten
+  // statepoints surviving this pass.  This makes testing easier and the
+  // resulting IR less confusing to human readers.  Rather than be fancy, we
+  // just reuse a utility function which removes the unreachable blocks.
+  if (HasUnreachableStatepoint)
+    MadeChange |= removeUnreachableBlocks(F);
+
+  // Return early if no work to do.
+  if (ParsePointNeeded.empty())
+    return MadeChange;
+
+  // As a prepass, go ahead and aggressively destroy single entry phi nodes.
+  // These are created by LCSSA.  They have the effect of increasing the size
+  // of liveness sets for no good reason.  It may be harder to do this post
+  // insertion since relocations and base phis can confuse things.
+  for (BasicBlock &BB : F)
+    if (BB.getUniquePredecessor()) {
+      MadeChange = true;
+      FoldSingleEntryPHINodes(&BB);
+    }
+
+  // Before we start introducing relocations, we want to tweak the IR a bit to
+  // avoid unfortunate code generation effects.  The main example is that we 
+  // want to try to make sure the comparison feeding a branch is after any
+  // safepoints.  Otherwise, we end up with a comparison of pre-relocation
+  // values feeding a branch after relocation.  This is semantically correct,
+  // but results in extra register pressure since both the pre-relocation and
+  // post-relocation copies must be available in registers.  For code without
+  // relocations this is handled elsewhere, but teaching the scheduler to
+  // reverse the transform we're about to do would be slightly complex.
+  // Note: This may extend the live range of the inputs to the icmp and thus
+  // increase the liveset of any statepoint we move over.  This is profitable
+  // as long as all statepoints are in rare blocks.  If we had in-register
+  // lowering for live values this would be a much safer transform.
+  auto getConditionInst = [](TerminatorInst *TI) -> Instruction* {
+    if (auto *BI = dyn_cast<BranchInst>(TI))
+      if (BI->isConditional())
+        return dyn_cast<Instruction>(BI->getCondition());
+    // TODO: Extend this to handle switches
+    return nullptr;
+  };
+  for (BasicBlock &BB : F) {
+    TerminatorInst *TI = BB.getTerminator();
+    if (auto *Cond = getConditionInst(TI))
+      // TODO: Handle more than just ICmps here.  We should be able to move
+      // most instructions without side effects or memory access.  
+      if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) {
+        MadeChange = true;
+        Cond->moveBefore(TI);
+      }
+  }
+
+  MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded);
+  return MadeChange;
+}
+
+// liveness computation via standard dataflow
+// -------------------------------------------------------------------
+
+// TODO: Consider using bitvectors for liveness, the set of potentially
+// interesting values should be small and easy to pre-compute.
+
+/// Compute the live-in set for the location rbegin starting from
+/// the live-out set of the basic block
+static void computeLiveInValues(BasicBlock::reverse_iterator rbegin,
+                                BasicBlock::reverse_iterator rend,
+                                DenseSet<Value *> &LiveTmp) {
+
+  for (BasicBlock::reverse_iterator ritr = rbegin; ritr != rend; ritr++) {
+    Instruction *I = &*ritr;
+
+    // KILL/Def - Remove this definition from LiveIn
+    LiveTmp.erase(I);
+
+    // Don't consider *uses* in PHI nodes, we handle their contribution to
+    // predecessor blocks when we seed the LiveOut sets
+    if (isa<PHINode>(I))
+      continue;
+
+    // USE - Add to the LiveIn set for this instruction
+    for (Value *V : I->operands()) {
+      assert(!isUnhandledGCPointerType(V->getType()) &&
+             "support for FCA unimplemented");
+      if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
+        // The choice to exclude all things constant here is slightly subtle.
+        // There are two independent reasons:
+        // - We assume that things which are constant (from LLVM's definition)
+        // do not move at runtime.  For example, the address of a global
+        // variable is fixed, even though it's contents may not be.
+        // - Second, we can't disallow arbitrary inttoptr constants even
+        // if the language frontend does.  Optimization passes are free to
+        // locally exploit facts without respect to global reachability.  This
+        // can create sections of code which are dynamically unreachable and
+        // contain just about anything.  (see constants.ll in tests)
+        LiveTmp.insert(V);
+      }
+    }
+  }
+}
+
+static void computeLiveOutSeed(BasicBlock *BB, DenseSet<Value *> &LiveTmp) {
+
+  for (BasicBlock *Succ : successors(BB)) {
+    const BasicBlock::iterator E(Succ->getFirstNonPHI());
+    for (BasicBlock::iterator I = Succ->begin(); I != E; I++) {
+      PHINode *Phi = cast<PHINode>(&*I);
+      Value *V = Phi->getIncomingValueForBlock(BB);
+      assert(!isUnhandledGCPointerType(V->getType()) &&
+             "support for FCA unimplemented");
+      if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
+        LiveTmp.insert(V);
+      }
+    }
+  }
+}
+
+static DenseSet<Value *> computeKillSet(BasicBlock *BB) {
+  DenseSet<Value *> KillSet;
+  for (Instruction &I : *BB)
+    if (isHandledGCPointerType(I.getType()))
+      KillSet.insert(&I);
+  return KillSet;
+}
+
+#ifndef NDEBUG
+/// Check that the items in 'Live' dominate 'TI'.  This is used as a basic
+/// sanity check for the liveness computation.
+static void checkBasicSSA(DominatorTree &DT, DenseSet<Value *> &Live,
+                          TerminatorInst *TI, bool TermOkay = false) {
+  for (Value *V : Live) {
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      // The terminator can be a member of the LiveOut set.  LLVM's definition
+      // of instruction dominance states that V does not dominate itself.  As
+      // such, we need to special case this to allow it.
+      if (TermOkay && TI == I)
+        continue;
+      assert(DT.dominates(I, TI) &&
+             "basic SSA liveness expectation violated by liveness analysis");
+    }
+  }
+}
+
+/// Check that all the liveness sets used during the computation of liveness
+/// obey basic SSA properties.  This is useful for finding cases where we miss
+/// a def.
+static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data,
+                          BasicBlock &BB) {
+  checkBasicSSA(DT, Data.LiveSet[&BB], BB.getTerminator());
+  checkBasicSSA(DT, Data.LiveOut[&BB], BB.getTerminator(), true);
+  checkBasicSSA(DT, Data.LiveIn[&BB], BB.getTerminator());
+}
+#endif
+
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+                                GCPtrLivenessData &Data) {
+
+  SmallSetVector<BasicBlock *, 200> Worklist;
+  auto AddPredsToWorklist = [&](BasicBlock *BB) {
+    // We use a SetVector so that we don't have duplicates in the worklist.
+    Worklist.insert(pred_begin(BB), pred_end(BB));
+  };
+  auto NextItem = [&]() {
+    BasicBlock *BB = Worklist.back();
+    Worklist.pop_back();
+    return BB;
+  };
+
+  // Seed the liveness for each individual block
+  for (BasicBlock &BB : F) {
+    Data.KillSet[&BB] = computeKillSet(&BB);
+    Data.LiveSet[&BB].clear();
+    computeLiveInValues(BB.rbegin(), BB.rend(), Data.LiveSet[&BB]);
+
+#ifndef NDEBUG
+    for (Value *Kill : Data.KillSet[&BB])
+      assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
+#endif
+
+    Data.LiveOut[&BB] = DenseSet<Value *>();
+    computeLiveOutSeed(&BB, Data.LiveOut[&BB]);
+    Data.LiveIn[&BB] = Data.LiveSet[&BB];
+    set_union(Data.LiveIn[&BB], Data.LiveOut[&BB]);
+    set_subtract(Data.LiveIn[&BB], Data.KillSet[&BB]);
+    if (!Data.LiveIn[&BB].empty())
+      AddPredsToWorklist(&BB);
+  }
+
+  // Propagate that liveness until stable
+  while (!Worklist.empty()) {
+    BasicBlock *BB = NextItem();
+
+    // Compute our new liveout set, then exit early if it hasn't changed
+    // despite the contribution of our successor.
+    DenseSet<Value *> LiveOut = Data.LiveOut[BB];
+    const auto OldLiveOutSize = LiveOut.size();
+    for (BasicBlock *Succ : successors(BB)) {
+      assert(Data.LiveIn.count(Succ));
+      set_union(LiveOut, Data.LiveIn[Succ]);
+    }
+    // assert OutLiveOut is a subset of LiveOut
+    if (OldLiveOutSize == LiveOut.size()) {
+      // If the sets are the same size, then we didn't actually add anything
+      // when unioning our successors LiveIn  Thus, the LiveIn of this block
+      // hasn't changed.
+      continue;
+    }
+    Data.LiveOut[BB] = LiveOut;
+
+    // Apply the effects of this basic block
+    DenseSet<Value *> LiveTmp = LiveOut;
+    set_union(LiveTmp, Data.LiveSet[BB]);
+    set_subtract(LiveTmp, Data.KillSet[BB]);
+
+    assert(Data.LiveIn.count(BB));
+    const DenseSet<Value *> &OldLiveIn = Data.LiveIn[BB];
+    // assert: OldLiveIn is a subset of LiveTmp
+    if (OldLiveIn.size() != LiveTmp.size()) {
+      Data.LiveIn[BB] = LiveTmp;
+      AddPredsToWorklist(BB);
+    }
+  } // while( !worklist.empty() )
+
+#ifndef NDEBUG
+  // Sanity check our output against SSA properties.  This helps catch any
+  // missing kills during the above iteration.
+  for (BasicBlock &BB : F) {
+    checkBasicSSA(DT, Data, BB);
+  }
+#endif
+}
+
+static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
+                              StatepointLiveSetTy &Out) {
+
+  BasicBlock *BB = Inst->getParent();
+
+  // Note: The copy is intentional and required
+  assert(Data.LiveOut.count(BB));
+  DenseSet<Value *> LiveOut = Data.LiveOut[BB];
+
+  // We want to handle the statepoint itself oddly.  It's
+  // call result is not live (normal), nor are it's arguments
+  // (unless they're used again later).  This adjustment is
+  // specifically what we need to relocate
+  BasicBlock::reverse_iterator rend(Inst->getIterator());
+  computeLiveInValues(BB->rbegin(), rend, LiveOut);
+  LiveOut.erase(Inst);
+  Out.insert(LiveOut.begin(), LiveOut.end());
+}
+
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+                                  const CallSite &CS,
+                                  PartiallyConstructedSafepointRecord &Info) {
+  Instruction *Inst = CS.getInstruction();
+  StatepointLiveSetTy Updated;
+  findLiveSetAtInst(Inst, RevisedLivenessData, Updated);
+
+#ifndef NDEBUG
+  DenseSet<Value *> Bases;
+  for (auto KVPair : Info.PointerToBase) {
+    Bases.insert(KVPair.second);
+  }
+#endif
+  // We may have base pointers which are now live that weren't before.  We need
+  // to update the PointerToBase structure to reflect this.
+  for (auto V : Updated)
+    if (!Info.PointerToBase.count(V)) {
+      assert(Bases.count(V) && "can't find base for unexpected live value");
+      Info.PointerToBase[V] = V;
+      continue;
+    }
+
+#ifndef NDEBUG
+  for (auto V : Updated) {
+    assert(Info.PointerToBase.count(V) &&
+           "must be able to find base for live value");
+  }
+#endif
+
+  // Remove any stale base mappings - this can happen since our liveness is
+  // more precise then the one inherent in the base pointer analysis
+  DenseSet<Value *> ToErase;
+  for (auto KVPair : Info.PointerToBase)
+    if (!Updated.count(KVPair.first))
+      ToErase.insert(KVPair.first);
+  for (auto V : ToErase)
+    Info.PointerToBase.erase(V);
+
+#ifndef NDEBUG
+  for (auto KVPair : Info.PointerToBase)
+    assert(Updated.count(KVPair.first) && "record for non-live value");
+#endif
+
+  Info.LiveSet = Updated;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
new file mode 100644
index 0000000..8569e08
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -0,0 +1,1980 @@
+//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sparse conditional constant propagation and merging:
+//
+// Specifically, this:
+//   * Assumes values are constant unless proven otherwise
+//   * Assumes BasicBlocks are dead unless proven otherwise
+//   * Proves values to be constant, and replaces them with constants
+//   * Proves conditional branches to be unconditional
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "sccp"
+
+STATISTIC(NumInstRemoved, "Number of instructions removed");
+STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
+
+STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
+STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
+STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
+
+namespace {
+/// LatticeVal class - This class represents the different lattice values that
+/// an LLVM value may occupy.  It is a simple class with value semantics.
+///
+class LatticeVal {
+  enum LatticeValueTy {
+    /// undefined - This LLVM Value has no known value yet.
+    undefined,
+
+    /// constant - This LLVM Value has a specific constant value.
+    constant,
+
+    /// forcedconstant - This LLVM Value was thought to be undef until
+    /// ResolvedUndefsIn.  This is treated just like 'constant', but if merged
+    /// with another (different) constant, it goes to overdefined, instead of
+    /// asserting.
+    forcedconstant,
+
+    /// overdefined - This instruction is not known to be constant, and we know
+    /// it has a value.
+    overdefined
+  };
+
+  /// Val: This stores the current lattice value along with the Constant* for
+  /// the constant if this is a 'constant' or 'forcedconstant' value.
+  PointerIntPair<Constant *, 2, LatticeValueTy> Val;
+
+  LatticeValueTy getLatticeValue() const {
+    return Val.getInt();
+  }
+
+public:
+  LatticeVal() : Val(nullptr, undefined) {}
+
+  bool isUndefined() const { return getLatticeValue() == undefined; }
+  bool isConstant() const {
+    return getLatticeValue() == constant || getLatticeValue() == forcedconstant;
+  }
+  bool isOverdefined() const { return getLatticeValue() == overdefined; }
+
+  Constant *getConstant() const {
+    assert(isConstant() && "Cannot get the constant of a non-constant!");
+    return Val.getPointer();
+  }
+
+  /// markOverdefined - Return true if this is a change in status.
+  bool markOverdefined() {
+    if (isOverdefined())
+      return false;
+
+    Val.setInt(overdefined);
+    return true;
+  }
+
+  /// markConstant - Return true if this is a change in status.
+  bool markConstant(Constant *V) {
+    if (getLatticeValue() == constant) { // Constant but not forcedconstant.
+      assert(getConstant() == V && "Marking constant with different value");
+      return false;
+    }
+
+    if (isUndefined()) {
+      Val.setInt(constant);
+      assert(V && "Marking constant with NULL");
+      Val.setPointer(V);
+    } else {
+      assert(getLatticeValue() == forcedconstant &&
+             "Cannot move from overdefined to constant!");
+      // Stay at forcedconstant if the constant is the same.
+      if (V == getConstant()) return false;
+
+      // Otherwise, we go to overdefined.  Assumptions made based on the
+      // forced value are possibly wrong.  Assuming this is another constant
+      // could expose a contradiction.
+      Val.setInt(overdefined);
+    }
+    return true;
+  }
+
+  /// getConstantInt - If this is a constant with a ConstantInt value, return it
+  /// otherwise return null.
+  ConstantInt *getConstantInt() const {
+    if (isConstant())
+      return dyn_cast<ConstantInt>(getConstant());
+    return nullptr;
+  }
+
+  void markForcedConstant(Constant *V) {
+    assert(isUndefined() && "Can't force a defined value!");
+    Val.setInt(forcedconstant);
+    Val.setPointer(V);
+  }
+};
+} // end anonymous namespace.
+
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+//
+/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
+/// Constant Propagation.
+///
+class SCCPSolver : public InstVisitor<SCCPSolver> {
+  const DataLayout &DL;
+  const TargetLibraryInfo *TLI;
+  SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
+  DenseMap<Value*, LatticeVal> ValueState;  // The state each value is in.
+
+  /// StructValueState - This maintains ValueState for values that have
+  /// StructType, for example for formal arguments, calls, insertelement, etc.
+  ///
+  DenseMap<std::pair<Value*, unsigned>, LatticeVal> StructValueState;
+
+  /// GlobalValue - If we are tracking any values for the contents of a global
+  /// variable, we keep a mapping from the constant accessor to the element of
+  /// the global, to the currently known value.  If the value becomes
+  /// overdefined, it's entry is simply removed from this map.
+  DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals;
+
+  /// TrackedRetVals - If we are tracking arguments into and the return
+  /// value out of a function, it will have an entry in this map, indicating
+  /// what the known return value for the function is.
+  DenseMap<Function*, LatticeVal> TrackedRetVals;
+
+  /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
+  /// that return multiple values.
+  DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals;
+
+  /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
+  /// represented here for efficient lookup.
+  SmallPtrSet<Function*, 16> MRVFunctionsTracked;
+
+  /// TrackingIncomingArguments - This is the set of functions for whose
+  /// arguments we make optimistic assumptions about and try to prove as
+  /// constants.
+  SmallPtrSet<Function*, 16> TrackingIncomingArguments;
+
+  /// The reason for two worklists is that overdefined is the lowest state
+  /// on the lattice, and moving things to overdefined as fast as possible
+  /// makes SCCP converge much faster.
+  ///
+  /// By having a separate worklist, we accomplish this because everything
+  /// possibly overdefined will become overdefined at the soonest possible
+  /// point.
+  SmallVector<Value*, 64> OverdefinedInstWorkList;
+  SmallVector<Value*, 64> InstWorkList;
+
+
+  SmallVector<BasicBlock*, 64>  BBWorkList;  // The BasicBlock work list
+
+  /// KnownFeasibleEdges - Entries in this set are edges which have already had
+  /// PHI nodes retriggered.
+  typedef std::pair<BasicBlock*, BasicBlock*> Edge;
+  DenseSet<Edge> KnownFeasibleEdges;
+public:
+  SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli)
+      : DL(DL), TLI(tli) {}
+
+  /// MarkBlockExecutable - This method can be used by clients to mark all of
+  /// the blocks that are known to be intrinsically live in the processed unit.
+  ///
+  /// This returns true if the block was not considered live before.
+  bool MarkBlockExecutable(BasicBlock *BB) {
+    if (!BBExecutable.insert(BB).second)
+      return false;
+    DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
+    BBWorkList.push_back(BB);  // Add the block to the work list!
+    return true;
+  }
+
+  /// TrackValueOfGlobalVariable - Clients can use this method to
+  /// inform the SCCPSolver that it should track loads and stores to the
+  /// specified global variable if it can.  This is only legal to call if
+  /// performing Interprocedural SCCP.
+  void TrackValueOfGlobalVariable(GlobalVariable *GV) {
+    // We only track the contents of scalar globals.
+    if (GV->getType()->getElementType()->isSingleValueType()) {
+      LatticeVal &IV = TrackedGlobals[GV];
+      if (!isa<UndefValue>(GV->getInitializer()))
+        IV.markConstant(GV->getInitializer());
+    }
+  }
+
+  /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
+  /// and out of the specified function (which cannot have its address taken),
+  /// this method must be called.
+  void AddTrackedFunction(Function *F) {
+    // Add an entry, F -> undef.
+    if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+      MRVFunctionsTracked.insert(F);
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
+                                                     LatticeVal()));
+    } else
+      TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
+  }
+
+  void AddArgumentTrackedFunction(Function *F) {
+    TrackingIncomingArguments.insert(F);
+  }
+
+  /// Solve - Solve for constants and executable blocks.
+  ///
+  void Solve();
+
+  /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+  /// that branches on undef values cannot reach any of their successors.
+  /// However, this is not a safe assumption.  After we solve dataflow, this
+  /// method should be use to handle this.  If this returns true, the solver
+  /// should be rerun.
+  bool ResolvedUndefsIn(Function &F);
+
+  bool isBlockExecutable(BasicBlock *BB) const {
+    return BBExecutable.count(BB);
+  }
+
+  LatticeVal getLatticeValueFor(Value *V) const {
+    DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
+    assert(I != ValueState.end() && "V is not in valuemap!");
+    return I->second;
+  }
+
+  /// getTrackedRetVals - Get the inferred return value map.
+  ///
+  const DenseMap<Function*, LatticeVal> &getTrackedRetVals() {
+    return TrackedRetVals;
+  }
+
+  /// getTrackedGlobals - Get and return the set of inferred initializers for
+  /// global variables.
+  const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() {
+    return TrackedGlobals;
+  }
+
+  void markOverdefined(Value *V) {
+    assert(!V->getType()->isStructTy() && "Should use other method");
+    markOverdefined(ValueState[V], V);
+  }
+
+  /// markAnythingOverdefined - Mark the specified value overdefined.  This
+  /// works with both scalars and structs.
+  void markAnythingOverdefined(Value *V) {
+    if (StructType *STy = dyn_cast<StructType>(V->getType()))
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        markOverdefined(getStructValueState(V, i), V);
+    else
+      markOverdefined(V);
+  }
+
+private:
+  // markConstant - Make a value be marked as "constant".  If the value
+  // is not already a constant, add it to the instruction work list so that
+  // the users of the instruction are updated later.
+  //
+  void markConstant(LatticeVal &IV, Value *V, Constant *C) {
+    if (!IV.markConstant(C)) return;
+    DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
+    if (IV.isOverdefined())
+      OverdefinedInstWorkList.push_back(V);
+    else
+      InstWorkList.push_back(V);
+  }
+
+  void markConstant(Value *V, Constant *C) {
+    assert(!V->getType()->isStructTy() && "Should use other method");
+    markConstant(ValueState[V], V, C);
+  }
+
+  void markForcedConstant(Value *V, Constant *C) {
+    assert(!V->getType()->isStructTy() && "Should use other method");
+    LatticeVal &IV = ValueState[V];
+    IV.markForcedConstant(C);
+    DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
+    if (IV.isOverdefined())
+      OverdefinedInstWorkList.push_back(V);
+    else
+      InstWorkList.push_back(V);
+  }
+
+
+  // markOverdefined - Make a value be marked as "overdefined". If the
+  // value is not already overdefined, add it to the overdefined instruction
+  // work list so that the users of the instruction are updated later.
+  void markOverdefined(LatticeVal &IV, Value *V) {
+    if (!IV.markOverdefined()) return;
+
+    DEBUG(dbgs() << "markOverdefined: ";
+          if (Function *F = dyn_cast<Function>(V))
+            dbgs() << "Function '" << F->getName() << "'\n";
+          else
+            dbgs() << *V << '\n');
+    // Only instructions go on the work list
+    OverdefinedInstWorkList.push_back(V);
+  }
+
+  void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
+    if (IV.isOverdefined() || MergeWithV.isUndefined())
+      return;  // Noop.
+    if (MergeWithV.isOverdefined())
+      markOverdefined(IV, V);
+    else if (IV.isUndefined())
+      markConstant(IV, V, MergeWithV.getConstant());
+    else if (IV.getConstant() != MergeWithV.getConstant())
+      markOverdefined(IV, V);
+  }
+
+  void mergeInValue(Value *V, LatticeVal MergeWithV) {
+    assert(!V->getType()->isStructTy() && "Should use other method");
+    mergeInValue(ValueState[V], V, MergeWithV);
+  }
+
+
+  /// getValueState - Return the LatticeVal object that corresponds to the
+  /// value.  This function handles the case when the value hasn't been seen yet
+  /// by properly seeding constants etc.
+  LatticeVal &getValueState(Value *V) {
+    assert(!V->getType()->isStructTy() && "Should use getStructValueState");
+
+    std::pair<DenseMap<Value*, LatticeVal>::iterator, bool> I =
+      ValueState.insert(std::make_pair(V, LatticeVal()));
+    LatticeVal &LV = I.first->second;
+
+    if (!I.second)
+      return LV;  // Common case, already in the map.
+
+    if (Constant *C = dyn_cast<Constant>(V)) {
+      // Undef values remain undefined.
+      if (!isa<UndefValue>(V))
+        LV.markConstant(C);          // Constants are constant
+    }
+
+    // All others are underdefined by default.
+    return LV;
+  }
+
+  /// getStructValueState - Return the LatticeVal object that corresponds to the
+  /// value/field pair.  This function handles the case when the value hasn't
+  /// been seen yet by properly seeding constants etc.
+  LatticeVal &getStructValueState(Value *V, unsigned i) {
+    assert(V->getType()->isStructTy() && "Should use getValueState");
+    assert(i < cast<StructType>(V->getType())->getNumElements() &&
+           "Invalid element #");
+
+    std::pair<DenseMap<std::pair<Value*, unsigned>, LatticeVal>::iterator,
+              bool> I = StructValueState.insert(
+                        std::make_pair(std::make_pair(V, i), LatticeVal()));
+    LatticeVal &LV = I.first->second;
+
+    if (!I.second)
+      return LV;  // Common case, already in the map.
+
+    if (Constant *C = dyn_cast<Constant>(V)) {
+      Constant *Elt = C->getAggregateElement(i);
+
+      if (!Elt)
+        LV.markOverdefined();      // Unknown sort of constant.
+      else if (isa<UndefValue>(Elt))
+        ; // Undef values remain undefined.
+      else
+        LV.markConstant(Elt);      // Constants are constant.
+    }
+
+    // All others are underdefined by default.
+    return LV;
+  }
+
+
+  /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
+  /// work list if it is not already executable.
+  void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+    if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+      return;  // This edge is already known to be executable!
+
+    if (!MarkBlockExecutable(Dest)) {
+      // If the destination is already executable, we just made an *edge*
+      // feasible that wasn't before.  Revisit the PHI nodes in the block
+      // because they have potentially new operands.
+      DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+            << " -> " << Dest->getName() << '\n');
+
+      PHINode *PN;
+      for (BasicBlock::iterator I = Dest->begin();
+           (PN = dyn_cast<PHINode>(I)); ++I)
+        visitPHINode(*PN);
+    }
+  }
+
+  // getFeasibleSuccessors - Return a vector of booleans to indicate which
+  // successors are reachable from a given terminator instruction.
+  //
+  void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);
+
+  // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+  // block to the 'To' basic block is currently feasible.
+  //
+  bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+
+  // OperandChangedState - This method is invoked on all of the users of an
+  // instruction that was just changed state somehow.  Based on this
+  // information, we need to update the specified user of this instruction.
+  //
+  void OperandChangedState(Instruction *I) {
+    if (BBExecutable.count(I->getParent()))   // Inst is executable?
+      visit(*I);
+  }
+
+private:
+  friend class InstVisitor<SCCPSolver>;
+
+  // visit implementations - Something changed in this instruction.  Either an
+  // operand made a transition, or the instruction is newly executable.  Change
+  // the value type of I to reflect these changes if appropriate.
+  void visitPHINode(PHINode &I);
+
+  // Terminators
+  void visitReturnInst(ReturnInst &I);
+  void visitTerminatorInst(TerminatorInst &TI);
+
+  void visitCastInst(CastInst &I);
+  void visitSelectInst(SelectInst &I);
+  void visitBinaryOperator(Instruction &I);
+  void visitCmpInst(CmpInst &I);
+  void visitExtractElementInst(ExtractElementInst &I);
+  void visitInsertElementInst(InsertElementInst &I);
+  void visitShuffleVectorInst(ShuffleVectorInst &I);
+  void visitExtractValueInst(ExtractValueInst &EVI);
+  void visitInsertValueInst(InsertValueInst &IVI);
+  void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
+  void visitFuncletPadInst(FuncletPadInst &FPI) {
+    markAnythingOverdefined(&FPI);
+  }
+  void visitCatchSwitchInst(CatchSwitchInst &CPI) {
+    markAnythingOverdefined(&CPI);
+    visitTerminatorInst(CPI);
+  }
+
+  // Instructions that cannot be folded away.
+  void visitStoreInst     (StoreInst &I);
+  void visitLoadInst      (LoadInst &I);
+  void visitGetElementPtrInst(GetElementPtrInst &I);
+  void visitCallInst      (CallInst &I) {
+    visitCallSite(&I);
+  }
+  void visitInvokeInst    (InvokeInst &II) {
+    visitCallSite(&II);
+    visitTerminatorInst(II);
+  }
+  void visitCallSite      (CallSite CS);
+  void visitResumeInst    (TerminatorInst &I) { /*returns void*/ }
+  void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
+  void visitFenceInst     (FenceInst &I) { /*returns void*/ }
+  void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+    markAnythingOverdefined(&I);
+  }
+  void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); }
+  void visitAllocaInst    (Instruction &I) { markOverdefined(&I); }
+  void visitVAArgInst     (Instruction &I) { markAnythingOverdefined(&I); }
+
+  void visitInstruction(Instruction &I) {
+    // If a new instruction is added to LLVM that we don't handle.
+    dbgs() << "SCCP: Don't know how to handle: " << I << '\n';
+    markAnythingOverdefined(&I);   // Just in case
+  }
+};
+
+} // end anonymous namespace
+
+
+// getFeasibleSuccessors - Return a vector of booleans to indicate which
+// successors are reachable from a given terminator instruction.
+//
+void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
+                                       SmallVectorImpl<bool> &Succs) {
+  Succs.resize(TI.getNumSuccessors());
+  if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) {
+    if (BI->isUnconditional()) {
+      Succs[0] = true;
+      return;
+    }
+
+    LatticeVal BCValue = getValueState(BI->getCondition());
+    ConstantInt *CI = BCValue.getConstantInt();
+    if (!CI) {
+      // Overdefined condition variables, and branches on unfoldable constant
+      // conditions, mean the branch could go either way.
+      if (!BCValue.isUndefined())
+        Succs[0] = Succs[1] = true;
+      return;
+    }
+
+    // Constant condition variables mean the branch can only go a single way.
+    Succs[CI->isZero()] = true;
+    return;
+  }
+
+  // Unwinding instructions successors are always executable.
+  if (TI.isExceptional()) {
+    Succs.assign(TI.getNumSuccessors(), true);
+    return;
+  }
+
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) {
+    if (!SI->getNumCases()) {
+      Succs[0] = true;
+      return;
+    }
+    LatticeVal SCValue = getValueState(SI->getCondition());
+    ConstantInt *CI = SCValue.getConstantInt();
+
+    if (!CI) {   // Overdefined or undefined condition?
+      // All destinations are executable!
+      if (!SCValue.isUndefined())
+        Succs.assign(TI.getNumSuccessors(), true);
+      return;
+    }
+
+    Succs[SI->findCaseValue(CI).getSuccessorIndex()] = true;
+    return;
+  }
+
+  // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
+  if (isa<IndirectBrInst>(&TI)) {
+    // Just mark all destinations executable!
+    Succs.assign(TI.getNumSuccessors(), true);
+    return;
+  }
+
+#ifndef NDEBUG
+  dbgs() << "Unknown terminator instruction: " << TI << '\n';
+#endif
+  llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+}
+
+
+// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+// block to the 'To' basic block is currently feasible.
+//
+bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
+  assert(BBExecutable.count(To) && "Dest should always be alive!");
+
+  // Make sure the source basic block is executable!!
+  if (!BBExecutable.count(From)) return false;
+
+  // Check to make sure this edge itself is actually feasible now.
+  TerminatorInst *TI = From->getTerminator();
+  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isUnconditional())
+      return true;
+
+    LatticeVal BCValue = getValueState(BI->getCondition());
+
+    // Overdefined condition variables mean the branch could go either way,
+    // undef conditions mean that neither edge is feasible yet.
+    ConstantInt *CI = BCValue.getConstantInt();
+    if (!CI)
+      return !BCValue.isUndefined();
+
+    // Constant condition variables mean the branch can only go a single way.
+    return BI->getSuccessor(CI->isZero()) == To;
+  }
+
+  // Unwinding instructions successors are always executable.
+  if (TI->isExceptional())
+    return true;
+
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    if (SI->getNumCases() < 1)
+      return true;
+
+    LatticeVal SCValue = getValueState(SI->getCondition());
+    ConstantInt *CI = SCValue.getConstantInt();
+
+    if (!CI)
+      return !SCValue.isUndefined();
+
+    return SI->findCaseValue(CI).getCaseSuccessor() == To;
+  }
+
+  // Just mark all destinations executable!
+  // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
+  if (isa<IndirectBrInst>(TI))
+    return true;
+
+#ifndef NDEBUG
+  dbgs() << "Unknown terminator instruction: " << *TI << '\n';
+#endif
+  llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+}
+
+// visit Implementations - Something changed in this instruction, either an
+// operand made a transition, or the instruction is newly executable.  Change
+// the value type of I to reflect these changes if appropriate.  This method
+// makes sure to do the following actions:
+//
+// 1. If a phi node merges two constants in, and has conflicting value coming
+//    from different branches, or if the PHI node merges in an overdefined
+//    value, then the PHI node becomes overdefined.
+// 2. If a phi node merges only constants in, and they all agree on value, the
+//    PHI node becomes a constant value equal to that.
+// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
+// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
+// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
+// 6. If a conditional branch has a value that is constant, make the selected
+//    destination executable
+// 7. If a conditional branch has a value that is overdefined, make all
+//    successors executable.
+//
+void SCCPSolver::visitPHINode(PHINode &PN) {
+  // If this PN returns a struct, just mark the result overdefined.
+  // TODO: We could do a lot better than this if code actually uses this.
+  if (PN.getType()->isStructTy())
+    return markAnythingOverdefined(&PN);
+
+  if (getValueState(&PN).isOverdefined())
+    return;  // Quick exit
+
+  // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
+  // and slow us down a lot.  Just mark them overdefined.
+  if (PN.getNumIncomingValues() > 64)
+    return markOverdefined(&PN);
+
+  // Look at all of the executable operands of the PHI node.  If any of them
+  // are overdefined, the PHI becomes overdefined as well.  If they are all
+  // constant, and they agree with each other, the PHI becomes the identical
+  // constant.  If they are constant and don't agree, the PHI is overdefined.
+  // If there are no executable operands, the PHI remains undefined.
+  //
+  Constant *OperandVal = nullptr;
+  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+    LatticeVal IV = getValueState(PN.getIncomingValue(i));
+    if (IV.isUndefined()) continue;  // Doesn't influence PHI node.
+
+    if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
+      continue;
+
+    if (IV.isOverdefined())    // PHI node becomes overdefined!
+      return markOverdefined(&PN);
+
+    if (!OperandVal) {   // Grab the first value.
+      OperandVal = IV.getConstant();
+      continue;
+    }
+
+    // There is already a reachable operand.  If we conflict with it,
+    // then the PHI node becomes overdefined.  If we agree with it, we
+    // can continue on.
+
+    // Check to see if there are two different constants merging, if so, the PHI
+    // node is overdefined.
+    if (IV.getConstant() != OperandVal)
+      return markOverdefined(&PN);
+  }
+
+  // If we exited the loop, this means that the PHI node only has constant
+  // arguments that agree with each other(and OperandVal is the constant) or
+  // OperandVal is null because there are no defined incoming arguments.  If
+  // this is the case, the PHI remains undefined.
+  //
+  if (OperandVal)
+    markConstant(&PN, OperandVal);      // Acquire operand value
+}
+
+void SCCPSolver::visitReturnInst(ReturnInst &I) {
+  if (I.getNumOperands() == 0) return;  // ret void
+
+  Function *F = I.getParent()->getParent();
+  Value *ResultOp = I.getOperand(0);
+
+  // If we are tracking the return value of this function, merge it in.
+  if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) {
+    DenseMap<Function*, LatticeVal>::iterator TFRVI =
+      TrackedRetVals.find(F);
+    if (TFRVI != TrackedRetVals.end()) {
+      mergeInValue(TFRVI->second, F, getValueState(ResultOp));
+      return;
+    }
+  }
+
+  // Handle functions that return multiple values.
+  if (!TrackedMultipleRetVals.empty()) {
+    if (StructType *STy = dyn_cast<StructType>(ResultOp->getType()))
+      if (MRVFunctionsTracked.count(F))
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+          mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F,
+                       getStructValueState(ResultOp, i));
+
+  }
+}
+
+void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) {
+  SmallVector<bool, 16> SuccFeasible;
+  getFeasibleSuccessors(TI, SuccFeasible);
+
+  BasicBlock *BB = TI.getParent();
+
+  // Mark all feasible successors executable.
+  for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
+    if (SuccFeasible[i])
+      markEdgeExecutable(BB, TI.getSuccessor(i));
+}
+
+void SCCPSolver::visitCastInst(CastInst &I) {
+  LatticeVal OpSt = getValueState(I.getOperand(0));
+  if (OpSt.isOverdefined())          // Inherit overdefinedness of operand
+    markOverdefined(&I);
+  else if (OpSt.isConstant()) {
+    Constant *C =
+        ConstantExpr::getCast(I.getOpcode(), OpSt.getConstant(), I.getType());
+    if (isa<UndefValue>(C))
+      return;
+    // Propagate constant value
+    markConstant(&I, C);
+  }
+}
+
+
+void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
+  // If this returns a struct, mark all elements over defined, we don't track
+  // structs in structs.
+  if (EVI.getType()->isStructTy())
+    return markAnythingOverdefined(&EVI);
+
+  // If this is extracting from more than one level of struct, we don't know.
+  if (EVI.getNumIndices() != 1)
+    return markOverdefined(&EVI);
+
+  Value *AggVal = EVI.getAggregateOperand();
+  if (AggVal->getType()->isStructTy()) {
+    unsigned i = *EVI.idx_begin();
+    LatticeVal EltVal = getStructValueState(AggVal, i);
+    mergeInValue(getValueState(&EVI), &EVI, EltVal);
+  } else {
+    // Otherwise, must be extracting from an array.
+    return markOverdefined(&EVI);
+  }
+}
+
+void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
+  StructType *STy = dyn_cast<StructType>(IVI.getType());
+  if (!STy)
+    return markOverdefined(&IVI);
+
+  // If this has more than one index, we can't handle it, drive all results to
+  // undef.
+  if (IVI.getNumIndices() != 1)
+    return markAnythingOverdefined(&IVI);
+
+  Value *Aggr = IVI.getAggregateOperand();
+  unsigned Idx = *IVI.idx_begin();
+
+  // Compute the result based on what we're inserting.
+  for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+    // This passes through all values that aren't the inserted element.
+    if (i != Idx) {
+      LatticeVal EltVal = getStructValueState(Aggr, i);
+      mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal);
+      continue;
+    }
+
+    Value *Val = IVI.getInsertedValueOperand();
+    if (Val->getType()->isStructTy())
+      // We don't track structs in structs.
+      markOverdefined(getStructValueState(&IVI, i), &IVI);
+    else {
+      LatticeVal InVal = getValueState(Val);
+      mergeInValue(getStructValueState(&IVI, i), &IVI, InVal);
+    }
+  }
+}
+
+void SCCPSolver::visitSelectInst(SelectInst &I) {
+  // If this select returns a struct, just mark the result overdefined.
+  // TODO: We could do a lot better than this if code actually uses this.
+  if (I.getType()->isStructTy())
+    return markAnythingOverdefined(&I);
+
+  LatticeVal CondValue = getValueState(I.getCondition());
+  if (CondValue.isUndefined())
+    return;
+
+  if (ConstantInt *CondCB = CondValue.getConstantInt()) {
+    Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue();
+    mergeInValue(&I, getValueState(OpVal));
+    return;
+  }
+
+  // Otherwise, the condition is overdefined or a constant we can't evaluate.
+  // See if we can produce something better than overdefined based on the T/F
+  // value.
+  LatticeVal TVal = getValueState(I.getTrueValue());
+  LatticeVal FVal = getValueState(I.getFalseValue());
+
+  // select ?, C, C -> C.
+  if (TVal.isConstant() && FVal.isConstant() &&
+      TVal.getConstant() == FVal.getConstant())
+    return markConstant(&I, FVal.getConstant());
+
+  if (TVal.isUndefined())   // select ?, undef, X -> X.
+    return mergeInValue(&I, FVal);
+  if (FVal.isUndefined())   // select ?, X, undef -> X.
+    return mergeInValue(&I, TVal);
+  markOverdefined(&I);
+}
+
+// Handle Binary Operators.
+void SCCPSolver::visitBinaryOperator(Instruction &I) {
+  LatticeVal V1State = getValueState(I.getOperand(0));
+  LatticeVal V2State = getValueState(I.getOperand(1));
+
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  if (V1State.isConstant() && V2State.isConstant()) {
+    Constant *C = ConstantExpr::get(I.getOpcode(), V1State.getConstant(),
+                                    V2State.getConstant());
+    // X op Y -> undef.
+    if (isa<UndefValue>(C))
+      return;
+    return markConstant(IV, &I, C);
+  }
+
+  // If something is undef, wait for it to resolve.
+  if (!V1State.isOverdefined() && !V2State.isOverdefined())
+    return;
+
+  // Otherwise, one of our operands is overdefined.  Try to produce something
+  // better than overdefined with some tricks.
+
+  // If this is an AND or OR with 0 or -1, it doesn't matter that the other
+  // operand is overdefined.
+  if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) {
+    LatticeVal *NonOverdefVal = nullptr;
+    if (!V1State.isOverdefined())
+      NonOverdefVal = &V1State;
+    else if (!V2State.isOverdefined())
+      NonOverdefVal = &V2State;
+
+    if (NonOverdefVal) {
+      if (NonOverdefVal->isUndefined()) {
+        // Could annihilate value.
+        if (I.getOpcode() == Instruction::And)
+          markConstant(IV, &I, Constant::getNullValue(I.getType()));
+        else if (VectorType *PT = dyn_cast<VectorType>(I.getType()))
+          markConstant(IV, &I, Constant::getAllOnesValue(PT));
+        else
+          markConstant(IV, &I,
+                       Constant::getAllOnesValue(I.getType()));
+        return;
+      }
+
+      if (I.getOpcode() == Instruction::And) {
+        // X and 0 = 0
+        if (NonOverdefVal->getConstant()->isNullValue())
+          return markConstant(IV, &I, NonOverdefVal->getConstant());
+      } else {
+        if (ConstantInt *CI = NonOverdefVal->getConstantInt())
+          if (CI->isAllOnesValue())     // X or -1 = -1
+            return markConstant(IV, &I, NonOverdefVal->getConstant());
+      }
+    }
+  }
+
+
+  markOverdefined(&I);
+}
+
+// Handle ICmpInst instruction.
+void SCCPSolver::visitCmpInst(CmpInst &I) {
+  LatticeVal V1State = getValueState(I.getOperand(0));
+  LatticeVal V2State = getValueState(I.getOperand(1));
+
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  if (V1State.isConstant() && V2State.isConstant()) {
+    Constant *C = ConstantExpr::getCompare(
+        I.getPredicate(), V1State.getConstant(), V2State.getConstant());
+    if (isa<UndefValue>(C))
+      return;
+    return markConstant(IV, &I, C);
+  }
+
+  // If operands are still undefined, wait for it to resolve.
+  if (!V1State.isOverdefined() && !V2State.isOverdefined())
+    return;
+
+  markOverdefined(&I);
+}
+
+void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) {
+  // TODO : SCCP does not handle vectors properly.
+  return markOverdefined(&I);
+
+#if 0
+  LatticeVal &ValState = getValueState(I.getOperand(0));
+  LatticeVal &IdxState = getValueState(I.getOperand(1));
+
+  if (ValState.isOverdefined() || IdxState.isOverdefined())
+    markOverdefined(&I);
+  else if(ValState.isConstant() && IdxState.isConstant())
+    markConstant(&I, ConstantExpr::getExtractElement(ValState.getConstant(),
+                                                     IdxState.getConstant()));
+#endif
+}
+
+void SCCPSolver::visitInsertElementInst(InsertElementInst &I) {
+  // TODO : SCCP does not handle vectors properly.
+  return markOverdefined(&I);
+#if 0
+  LatticeVal &ValState = getValueState(I.getOperand(0));
+  LatticeVal &EltState = getValueState(I.getOperand(1));
+  LatticeVal &IdxState = getValueState(I.getOperand(2));
+
+  if (ValState.isOverdefined() || EltState.isOverdefined() ||
+      IdxState.isOverdefined())
+    markOverdefined(&I);
+  else if(ValState.isConstant() && EltState.isConstant() &&
+          IdxState.isConstant())
+    markConstant(&I, ConstantExpr::getInsertElement(ValState.getConstant(),
+                                                    EltState.getConstant(),
+                                                    IdxState.getConstant()));
+  else if (ValState.isUndefined() && EltState.isConstant() &&
+           IdxState.isConstant())
+    markConstant(&I,ConstantExpr::getInsertElement(UndefValue::get(I.getType()),
+                                                   EltState.getConstant(),
+                                                   IdxState.getConstant()));
+#endif
+}
+
+void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) {
+  // TODO : SCCP does not handle vectors properly.
+  return markOverdefined(&I);
+#if 0
+  LatticeVal &V1State   = getValueState(I.getOperand(0));
+  LatticeVal &V2State   = getValueState(I.getOperand(1));
+  LatticeVal &MaskState = getValueState(I.getOperand(2));
+
+  if (MaskState.isUndefined() ||
+      (V1State.isUndefined() && V2State.isUndefined()))
+    return;  // Undefined output if mask or both inputs undefined.
+
+  if (V1State.isOverdefined() || V2State.isOverdefined() ||
+      MaskState.isOverdefined()) {
+    markOverdefined(&I);
+  } else {
+    // A mix of constant/undef inputs.
+    Constant *V1 = V1State.isConstant() ?
+        V1State.getConstant() : UndefValue::get(I.getType());
+    Constant *V2 = V2State.isConstant() ?
+        V2State.getConstant() : UndefValue::get(I.getType());
+    Constant *Mask = MaskState.isConstant() ?
+      MaskState.getConstant() : UndefValue::get(I.getOperand(2)->getType());
+    markConstant(&I, ConstantExpr::getShuffleVector(V1, V2, Mask));
+  }
+#endif
+}
+
+// Handle getelementptr instructions.  If all operands are constants then we
+// can turn this into a getelementptr ConstantExpr.
+//
+void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
+  if (ValueState[&I].isOverdefined()) return;
+
+  SmallVector<Constant*, 8> Operands;
+  Operands.reserve(I.getNumOperands());
+
+  for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+    LatticeVal State = getValueState(I.getOperand(i));
+    if (State.isUndefined())
+      return;  // Operands are not resolved yet.
+
+    if (State.isOverdefined())
+      return markOverdefined(&I);
+
+    assert(State.isConstant() && "Unknown state!");
+    Operands.push_back(State.getConstant());
+  }
+
+  Constant *Ptr = Operands[0];
+  auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end());
+  Constant *C =
+      ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, Indices);
+  if (isa<UndefValue>(C))
+      return;
+  markConstant(&I, C);
+}
+
+void SCCPSolver::visitStoreInst(StoreInst &SI) {
+  // If this store is of a struct, ignore it.
+  if (SI.getOperand(0)->getType()->isStructTy())
+    return;
+
+  if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1)))
+    return;
+
+  GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
+  DenseMap<GlobalVariable*, LatticeVal>::iterator I = TrackedGlobals.find(GV);
+  if (I == TrackedGlobals.end() || I->second.isOverdefined()) return;
+
+  // Get the value we are storing into the global, then merge it.
+  mergeInValue(I->second, GV, getValueState(SI.getOperand(0)));
+  if (I->second.isOverdefined())
+    TrackedGlobals.erase(I);      // No need to keep tracking this!
+}
+
+
+// Handle load instructions.  If the operand is a constant pointer to a constant
+// global, we can replace the load with the loaded constant value!
+void SCCPSolver::visitLoadInst(LoadInst &I) {
+  // If this load is of a struct, just mark the result overdefined.
+  if (I.getType()->isStructTy())
+    return markAnythingOverdefined(&I);
+
+  LatticeVal PtrVal = getValueState(I.getOperand(0));
+  if (PtrVal.isUndefined()) return;   // The pointer is not resolved yet!
+
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  if (!PtrVal.isConstant() || I.isVolatile())
+    return markOverdefined(IV, &I);
+
+  Constant *Ptr = PtrVal.getConstant();
+
+  // load null is undefined.
+  if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0)
+    return;
+
+  // Transform load (constant global) into the value loaded.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+    if (!TrackedGlobals.empty()) {
+      // If we are tracking this global, merge in the known value for it.
+      DenseMap<GlobalVariable*, LatticeVal>::iterator It =
+        TrackedGlobals.find(GV);
+      if (It != TrackedGlobals.end()) {
+        mergeInValue(IV, &I, It->second);
+        return;
+      }
+    }
+  }
+
+  // Transform load from a constant into a constant if possible.
+  if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, DL)) {
+    if (isa<UndefValue>(C))
+      return;
+    return markConstant(IV, &I, C);
+  }
+
+  // Otherwise we cannot say for certain what value this load will produce.
+  // Bail out.
+  markOverdefined(IV, &I);
+}
+
+void SCCPSolver::visitCallSite(CallSite CS) {
+  Function *F = CS.getCalledFunction();
+  Instruction *I = CS.getInstruction();
+
+  // The common case is that we aren't tracking the callee, either because we
+  // are not doing interprocedural analysis or the callee is indirect, or is
+  // external.  Handle these cases first.
+  if (!F || F->isDeclaration()) {
+CallOverdefined:
+    // Void return and not tracking callee, just bail.
+    if (I->getType()->isVoidTy()) return;
+
+    // Otherwise, if we have a single return value case, and if the function is
+    // a declaration, maybe we can constant fold it.
+    if (F && F->isDeclaration() && !I->getType()->isStructTy() &&
+        canConstantFoldCallTo(F)) {
+
+      SmallVector<Constant*, 8> Operands;
+      for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
+           AI != E; ++AI) {
+        LatticeVal State = getValueState(*AI);
+
+        if (State.isUndefined())
+          return;  // Operands are not resolved yet.
+        if (State.isOverdefined())
+          return markOverdefined(I);
+        assert(State.isConstant() && "Unknown state!");
+        Operands.push_back(State.getConstant());
+      }
+
+      if (getValueState(I).isOverdefined())
+        return;
+
+      // If we can constant fold this, mark the result of the call as a
+      // constant.
+      if (Constant *C = ConstantFoldCall(F, Operands, TLI)) {
+        // call -> undef.
+        if (isa<UndefValue>(C))
+          return;
+        return markConstant(I, C);
+      }
+    }
+
+    // Otherwise, we don't know anything about this call, mark it overdefined.
+    return markAnythingOverdefined(I);
+  }
+
+  // If this is a local function that doesn't have its address taken, mark its
+  // entry block executable and merge in the actual arguments to the call into
+  // the formal arguments of the function.
+  if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){
+    MarkBlockExecutable(&F->front());
+
+    // Propagate information from this call site into the callee.
+    CallSite::arg_iterator CAI = CS.arg_begin();
+    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+         AI != E; ++AI, ++CAI) {
+      // If this argument is byval, and if the function is not readonly, there
+      // will be an implicit copy formed of the input aggregate.
+      if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
+        markOverdefined(&*AI);
+        continue;
+      }
+
+      if (StructType *STy = dyn_cast<StructType>(AI->getType())) {
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+          LatticeVal CallArg = getStructValueState(*CAI, i);
+          mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg);
+        }
+      } else {
+        mergeInValue(&*AI, getValueState(*CAI));
+      }
+    }
+  }
+
+  // If this is a single/zero retval case, see if we're tracking the function.
+  if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+    if (!MRVFunctionsTracked.count(F))
+      goto CallOverdefined;  // Not tracking this callee.
+
+    // If we are tracking this callee, propagate the result of the function
+    // into this call site.
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+      mergeInValue(getStructValueState(I, i), I,
+                   TrackedMultipleRetVals[std::make_pair(F, i)]);
+  } else {
+    DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F);
+    if (TFRVI == TrackedRetVals.end())
+      goto CallOverdefined;  // Not tracking this callee.
+
+    // If so, propagate the return value of the callee into this call result.
+    mergeInValue(I, TFRVI->second);
+  }
+}
+
+void SCCPSolver::Solve() {
+  // Process the work lists until they are empty!
+  while (!BBWorkList.empty() || !InstWorkList.empty() ||
+         !OverdefinedInstWorkList.empty()) {
+    // Process the overdefined instruction's work list first, which drives other
+    // things to overdefined more quickly.
+    while (!OverdefinedInstWorkList.empty()) {
+      Value *I = OverdefinedInstWorkList.pop_back_val();
+
+      DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
+
+      // "I" got into the work list because it either made the transition from
+      // bottom to constant, or to overdefined.
+      //
+      // Anything on this worklist that is overdefined need not be visited
+      // since all of its users will have already been marked as overdefined
+      // Update all of the users of this instruction's value.
+      //
+      for (User *U : I->users())
+        if (Instruction *UI = dyn_cast<Instruction>(U))
+          OperandChangedState(UI);
+    }
+
+    // Process the instruction work list.
+    while (!InstWorkList.empty()) {
+      Value *I = InstWorkList.pop_back_val();
+
+      DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
+
+      // "I" got into the work list because it made the transition from undef to
+      // constant.
+      //
+      // Anything on this worklist that is overdefined need not be visited
+      // since all of its users will have already been marked as overdefined.
+      // Update all of the users of this instruction's value.
+      //
+      if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
+        for (User *U : I->users())
+          if (Instruction *UI = dyn_cast<Instruction>(U))
+            OperandChangedState(UI);
+    }
+
+    // Process the basic block work list.
+    while (!BBWorkList.empty()) {
+      BasicBlock *BB = BBWorkList.back();
+      BBWorkList.pop_back();
+
+      DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
+
+      // Notify all instructions in this basic block that they are newly
+      // executable.
+      visit(BB);
+    }
+  }
+}
+
+/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+/// that branches on undef values cannot reach any of their successors.
+/// However, this is not a safe assumption.  After we solve dataflow, this
+/// method should be use to handle this.  If this returns true, the solver
+/// should be rerun.
+///
+/// This method handles this by finding an unresolved branch and marking it one
+/// of the edges from the block as being feasible, even though the condition
+/// doesn't say it would otherwise be.  This allows SCCP to find the rest of the
+/// CFG and only slightly pessimizes the analysis results (by marking one,
+/// potentially infeasible, edge feasible).  This cannot usefully modify the
+/// constraints on the condition of the branch, as that would impact other users
+/// of the value.
+///
+/// This scan also checks for values that use undefs, whose results are actually
+/// defined.  For example, 'zext i8 undef to i32' should produce all zeros
+/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
+/// even if X isn't defined.
+bool SCCPSolver::ResolvedUndefsIn(Function &F) {
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (!BBExecutable.count(&*BB))
+      continue;
+
+    for (Instruction &I : *BB) {
+      // Look for instructions which produce undef values.
+      if (I.getType()->isVoidTy()) continue;
+
+      if (StructType *STy = dyn_cast<StructType>(I.getType())) {
+        // Only a few things that can be structs matter for undef.
+
+        // Tracked calls must never be marked overdefined in ResolvedUndefsIn.
+        if (CallSite CS = CallSite(&I))
+          if (Function *F = CS.getCalledFunction())
+            if (MRVFunctionsTracked.count(F))
+              continue;
+
+        // extractvalue and insertvalue don't need to be marked; they are
+        // tracked as precisely as their operands.
+        if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
+          continue;
+
+        // Send the results of everything else to overdefined.  We could be
+        // more precise than this but it isn't worth bothering.
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+          LatticeVal &LV = getStructValueState(&I, i);
+          if (LV.isUndefined())
+            markOverdefined(LV, &I);
+        }
+        continue;
+      }
+
+      LatticeVal &LV = getValueState(&I);
+      if (!LV.isUndefined()) continue;
+
+      // extractvalue is safe; check here because the argument is a struct.
+      if (isa<ExtractValueInst>(I))
+        continue;
+
+      // Compute the operand LatticeVals, for convenience below.
+      // Anything taking a struct is conservatively assumed to require
+      // overdefined markings.
+      if (I.getOperand(0)->getType()->isStructTy()) {
+        markOverdefined(&I);
+        return true;
+      }
+      LatticeVal Op0LV = getValueState(I.getOperand(0));
+      LatticeVal Op1LV;
+      if (I.getNumOperands() == 2) {
+        if (I.getOperand(1)->getType()->isStructTy()) {
+          markOverdefined(&I);
+          return true;
+        }
+
+        Op1LV = getValueState(I.getOperand(1));
+      }
+      // If this is an instructions whose result is defined even if the input is
+      // not fully defined, propagate the information.
+      Type *ITy = I.getType();
+      switch (I.getOpcode()) {
+      case Instruction::Add:
+      case Instruction::Sub:
+      case Instruction::Trunc:
+      case Instruction::FPTrunc:
+      case Instruction::BitCast:
+        break; // Any undef -> undef
+      case Instruction::FSub:
+      case Instruction::FAdd:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FRem:
+        // Floating-point binary operation: be conservative.
+        if (Op0LV.isUndefined() && Op1LV.isUndefined())
+          markForcedConstant(&I, Constant::getNullValue(ITy));
+        else
+          markOverdefined(&I);
+        return true;
+      case Instruction::ZExt:
+      case Instruction::SExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::FPExt:
+      case Instruction::PtrToInt:
+      case Instruction::IntToPtr:
+      case Instruction::SIToFP:
+      case Instruction::UIToFP:
+        // undef -> 0; some outputs are impossible
+        markForcedConstant(&I, Constant::getNullValue(ITy));
+        return true;
+      case Instruction::Mul:
+      case Instruction::And:
+        // Both operands undef -> undef
+        if (Op0LV.isUndefined() && Op1LV.isUndefined())
+          break;
+        // undef * X -> 0.   X could be zero.
+        // undef & X -> 0.   X could be zero.
+        markForcedConstant(&I, Constant::getNullValue(ITy));
+        return true;
+
+      case Instruction::Or:
+        // Both operands undef -> undef
+        if (Op0LV.isUndefined() && Op1LV.isUndefined())
+          break;
+        // undef | X -> -1.   X could be -1.
+        markForcedConstant(&I, Constant::getAllOnesValue(ITy));
+        return true;
+
+      case Instruction::Xor:
+        // undef ^ undef -> 0; strictly speaking, this is not strictly
+        // necessary, but we try to be nice to people who expect this
+        // behavior in simple cases
+        if (Op0LV.isUndefined() && Op1LV.isUndefined()) {
+          markForcedConstant(&I, Constant::getNullValue(ITy));
+          return true;
+        }
+        // undef ^ X -> undef
+        break;
+
+      case Instruction::SDiv:
+      case Instruction::UDiv:
+      case Instruction::SRem:
+      case Instruction::URem:
+        // X / undef -> undef.  No change.
+        // X % undef -> undef.  No change.
+        if (Op1LV.isUndefined()) break;
+
+        // X / 0 -> undef.  No change.
+        // X % 0 -> undef.  No change.
+        if (Op1LV.isConstant() && Op1LV.getConstant()->isZeroValue())
+          break;
+
+        // undef / X -> 0.   X could be maxint.
+        // undef % X -> 0.   X could be 1.
+        markForcedConstant(&I, Constant::getNullValue(ITy));
+        return true;
+
+      case Instruction::AShr:
+        // X >>a undef -> undef.
+        if (Op1LV.isUndefined()) break;
+
+        // undef >>a X -> all ones
+        markForcedConstant(&I, Constant::getAllOnesValue(ITy));
+        return true;
+      case Instruction::LShr:
+      case Instruction::Shl:
+        // X << undef -> undef.
+        // X >> undef -> undef.
+        if (Op1LV.isUndefined()) break;
+
+        // undef << X -> 0
+        // undef >> X -> 0
+        markForcedConstant(&I, Constant::getNullValue(ITy));
+        return true;
+      case Instruction::Select:
+        Op1LV = getValueState(I.getOperand(1));
+        // undef ? X : Y  -> X or Y.  There could be commonality between X/Y.
+        if (Op0LV.isUndefined()) {
+          if (!Op1LV.isConstant())  // Pick the constant one if there is any.
+            Op1LV = getValueState(I.getOperand(2));
+        } else if (Op1LV.isUndefined()) {
+          // c ? undef : undef -> undef.  No change.
+          Op1LV = getValueState(I.getOperand(2));
+          if (Op1LV.isUndefined())
+            break;
+          // Otherwise, c ? undef : x -> x.
+        } else {
+          // Leave Op1LV as Operand(1)'s LatticeValue.
+        }
+
+        if (Op1LV.isConstant())
+          markForcedConstant(&I, Op1LV.getConstant());
+        else
+          markOverdefined(&I);
+        return true;
+      case Instruction::Load:
+        // A load here means one of two things: a load of undef from a global,
+        // a load from an unknown pointer.  Either way, having it return undef
+        // is okay.
+        break;
+      case Instruction::ICmp:
+        // X == undef -> undef.  Other comparisons get more complicated.
+        if (cast<ICmpInst>(&I)->isEquality())
+          break;
+        markOverdefined(&I);
+        return true;
+      case Instruction::Call:
+      case Instruction::Invoke: {
+        // There are two reasons a call can have an undef result
+        // 1. It could be tracked.
+        // 2. It could be constant-foldable.
+        // Because of the way we solve return values, tracked calls must
+        // never be marked overdefined in ResolvedUndefsIn.
+        if (Function *F = CallSite(&I).getCalledFunction())
+          if (TrackedRetVals.count(F))
+            break;
+
+        // If the call is constant-foldable, we mark it overdefined because
+        // we do not know what return values are valid.
+        markOverdefined(&I);
+        return true;
+      }
+      default:
+        // If we don't know what should happen here, conservatively mark it
+        // overdefined.
+        markOverdefined(&I);
+        return true;
+      }
+    }
+
+    // Check to see if we have a branch or switch on an undefined value.  If so
+    // we force the branch to go one way or the other to make the successor
+    // values live.  It doesn't really matter which way we force it.
+    TerminatorInst *TI = BB->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (!BI->isConditional()) continue;
+      if (!getValueState(BI->getCondition()).isUndefined())
+        continue;
+
+      // If the input to SCCP is actually branch on undef, fix the undef to
+      // false.
+      if (isa<UndefValue>(BI->getCondition())) {
+        BI->setCondition(ConstantInt::getFalse(BI->getContext()));
+        markEdgeExecutable(&*BB, TI->getSuccessor(1));
+        return true;
+      }
+
+      // Otherwise, it is a branch on a symbolic value which is currently
+      // considered to be undef.  Handle this by forcing the input value to the
+      // branch to false.
+      markForcedConstant(BI->getCondition(),
+                         ConstantInt::getFalse(TI->getContext()));
+      return true;
+    }
+
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      if (!SI->getNumCases())
+        continue;
+      if (!getValueState(SI->getCondition()).isUndefined())
+        continue;
+
+      // If the input to SCCP is actually switch on undef, fix the undef to
+      // the first constant.
+      if (isa<UndefValue>(SI->getCondition())) {
+        SI->setCondition(SI->case_begin().getCaseValue());
+        markEdgeExecutable(&*BB, SI->case_begin().getCaseSuccessor());
+        return true;
+      }
+
+      markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue());
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  //
+  /// SCCP Class - This class uses the SCCPSolver to implement a per-function
+  /// Sparse Conditional Constant Propagator.
+  ///
+  struct SCCP : public FunctionPass {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
+    }
+    static char ID; // Pass identification, replacement for typeid
+    SCCP() : FunctionPass(ID) {
+      initializeSCCPPass(*PassRegistry::getPassRegistry());
+    }
+
+    // runOnFunction - Run the Sparse Conditional Constant Propagation
+    // algorithm, and return true if the function was modified.
+    //
+    bool runOnFunction(Function &F) override;
+  };
+} // end anonymous namespace
+
+char SCCP::ID = 0;
+INITIALIZE_PASS(SCCP, "sccp",
+                "Sparse Conditional Constant Propagation", false, false)
+
+// createSCCPPass - This is the public interface to this file.
+FunctionPass *llvm::createSCCPPass() {
+  return new SCCP();
+}
+
+static void DeleteInstructionInBlock(BasicBlock *BB) {
+  DEBUG(dbgs() << "  BasicBlock Dead:" << *BB);
+  ++NumDeadBlocks;
+
+  // Check to see if there are non-terminating instructions to delete.
+  if (isa<TerminatorInst>(BB->begin()))
+    return;
+
+  // Delete the instructions backwards, as it has a reduced likelihood of having
+  // to update as many def-use and use-def chains.
+  Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
+  while (EndInst != BB->begin()) {
+    // Delete the next to last instruction.
+    Instruction *Inst = &*--EndInst->getIterator();
+    if (!Inst->use_empty())
+      Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+    if (Inst->isEHPad()) {
+      EndInst = Inst;
+      continue;
+    }
+    BB->getInstList().erase(Inst);
+    ++NumInstRemoved;
+  }
+}
+
+// runOnFunction() - Run the Sparse Conditional Constant Propagation algorithm,
+// and return true if the function was modified.
+//
+bool SCCP::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  SCCPSolver Solver(DL, TLI);
+
+  // Mark the first block of the function as being executable.
+  Solver.MarkBlockExecutable(&F.front());
+
+  // Mark all arguments to the function as being overdefined.
+  for (Argument &AI : F.args())
+    Solver.markAnythingOverdefined(&AI);
+
+  // Solve for constants.
+  bool ResolvedUndefs = true;
+  while (ResolvedUndefs) {
+    Solver.Solve();
+    DEBUG(dbgs() << "RESOLVING UNDEFs\n");
+    ResolvedUndefs = Solver.ResolvedUndefsIn(F);
+  }
+
+  bool MadeChanges = false;
+
+  // If we decided that there are basic blocks that are dead in this function,
+  // delete their contents now.  Note that we cannot actually delete the blocks,
+  // as we cannot modify the CFG of the function.
+
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (!Solver.isBlockExecutable(&*BB)) {
+      DeleteInstructionInBlock(&*BB);
+      MadeChanges = true;
+      continue;
+    }
+
+    // Iterate over all of the instructions in a function, replacing them with
+    // constants if we have found them to be of constant values.
+    //
+    for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+      Instruction *Inst = &*BI++;
+      if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst))
+        continue;
+
+      // TODO: Reconstruct structs from their elements.
+      if (Inst->getType()->isStructTy())
+        continue;
+
+      LatticeVal IV = Solver.getLatticeValueFor(Inst);
+      if (IV.isOverdefined())
+        continue;
+
+      Constant *Const = IV.isConstant()
+        ? IV.getConstant() : UndefValue::get(Inst->getType());
+      DEBUG(dbgs() << "  Constant: " << *Const << " = " << *Inst << '\n');
+
+      // Replaces all of the uses of a variable with uses of the constant.
+      Inst->replaceAllUsesWith(Const);
+
+      // Delete the instruction.
+      Inst->eraseFromParent();
+
+      // Hey, we just changed something!
+      MadeChanges = true;
+      ++NumInstRemoved;
+    }
+  }
+
+  return MadeChanges;
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  //
+  /// IPSCCP Class - This class implements interprocedural Sparse Conditional
+  /// Constant Propagation.
+  ///
+  struct IPSCCP : public ModulePass {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+    }
+    static char ID;
+    IPSCCP() : ModulePass(ID) {
+      initializeIPSCCPPass(*PassRegistry::getPassRegistry());
+    }
+    bool runOnModule(Module &M) override;
+  };
+} // end anonymous namespace
+
+char IPSCCP::ID = 0;
+INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp",
+                "Interprocedural Sparse Conditional Constant Propagation",
+                false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(IPSCCP, "ipsccp",
+                "Interprocedural Sparse Conditional Constant Propagation",
+                false, false)
+
+// createIPSCCPPass - This is the public interface to this file.
+ModulePass *llvm::createIPSCCPPass() {
+  return new IPSCCP();
+}
+
+
+static bool AddressIsTaken(const GlobalValue *GV) {
+  // Delete any dead constantexpr klingons.
+  GV->removeDeadConstantUsers();
+
+  for (const Use &U : GV->uses()) {
+    const User *UR = U.getUser();
+    if (const StoreInst *SI = dyn_cast<StoreInst>(UR)) {
+      if (SI->getOperand(0) == GV || SI->isVolatile())
+        return true;  // Storing addr of GV.
+    } else if (isa<InvokeInst>(UR) || isa<CallInst>(UR)) {
+      // Make sure we are calling the function, not passing the address.
+      ImmutableCallSite CS(cast<Instruction>(UR));
+      if (!CS.isCallee(&U))
+        return true;
+    } else if (const LoadInst *LI = dyn_cast<LoadInst>(UR)) {
+      if (LI->isVolatile())
+        return true;
+    } else if (isa<BlockAddress>(UR)) {
+      // blockaddress doesn't take the address of the function, it takes addr
+      // of label.
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool IPSCCP::runOnModule(Module &M) {
+  const DataLayout &DL = M.getDataLayout();
+  const TargetLibraryInfo *TLI =
+      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  SCCPSolver Solver(DL, TLI);
+
+  // AddressTakenFunctions - This set keeps track of the address-taken functions
+  // that are in the input.  As IPSCCP runs through and simplifies code,
+  // functions that were address taken can end up losing their
+  // address-taken-ness.  Because of this, we keep track of their addresses from
+  // the first pass so we can use them for the later simplification pass.
+  SmallPtrSet<Function*, 32> AddressTakenFunctions;
+
+  // Loop over all functions, marking arguments to those with their addresses
+  // taken or that are external as overdefined.
+  //
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration())
+      continue;
+
+    // If this is a strong or ODR definition of this function, then we can
+    // propagate information about its result into callsites of it.
+    if (!F->mayBeOverridden())
+      Solver.AddTrackedFunction(&*F);
+
+    // If this function only has direct calls that we can see, we can track its
+    // arguments and return value aggressively, and can assume it is not called
+    // unless we see evidence to the contrary.
+    if (F->hasLocalLinkage()) {
+      if (AddressIsTaken(&*F))
+        AddressTakenFunctions.insert(&*F);
+      else {
+        Solver.AddArgumentTrackedFunction(&*F);
+        continue;
+      }
+    }
+
+    // Assume the function is called.
+    Solver.MarkBlockExecutable(&F->front());
+
+    // Assume nothing about the incoming arguments.
+    for (Argument &AI : F->args())
+      Solver.markAnythingOverdefined(&AI);
+  }
+
+  // Loop over global variables.  We inform the solver about any internal global
+  // variables that do not have their 'addresses taken'.  If they don't have
+  // their addresses taken, we can propagate constants through them.
+  for (GlobalVariable &G : M.globals())
+    if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G))
+      Solver.TrackValueOfGlobalVariable(&G);
+
+  // Solve for constants.
+  bool ResolvedUndefs = true;
+  while (ResolvedUndefs) {
+    Solver.Solve();
+
+    DEBUG(dbgs() << "RESOLVING UNDEFS\n");
+    ResolvedUndefs = false;
+    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+      ResolvedUndefs |= Solver.ResolvedUndefsIn(*F);
+  }
+
+  bool MadeChanges = false;
+
+  // Iterate over all of the instructions in the module, replacing them with
+  // constants if we have found them to be of constant values.
+  //
+  SmallVector<BasicBlock*, 512> BlocksToErase;
+
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration())
+      continue;
+
+    if (Solver.isBlockExecutable(&F->front())) {
+      for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+           AI != E; ++AI) {
+        if (AI->use_empty() || AI->getType()->isStructTy()) continue;
+
+        // TODO: Could use getStructLatticeValueFor to find out if the entire
+        // result is a constant and replace it entirely if so.
+
+        LatticeVal IV = Solver.getLatticeValueFor(&*AI);
+        if (IV.isOverdefined()) continue;
+
+        Constant *CST = IV.isConstant() ?
+        IV.getConstant() : UndefValue::get(AI->getType());
+        DEBUG(dbgs() << "***  Arg " << *AI << " = " << *CST <<"\n");
+
+        // Replaces all of the uses of a variable with uses of the
+        // constant.
+        AI->replaceAllUsesWith(CST);
+        ++IPNumArgsElimed;
+      }
+    }
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      if (!Solver.isBlockExecutable(&*BB)) {
+        DeleteInstructionInBlock(&*BB);
+        MadeChanges = true;
+
+        TerminatorInst *TI = BB->getTerminator();
+        for (BasicBlock *Succ : TI->successors()) {
+          if (!Succ->empty() && isa<PHINode>(Succ->begin()))
+            Succ->removePredecessor(&*BB);
+        }
+        if (!TI->use_empty())
+          TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+        TI->eraseFromParent();
+        new UnreachableInst(M.getContext(), &*BB);
+
+        if (&*BB != &F->front())
+          BlocksToErase.push_back(&*BB);
+        continue;
+      }
+
+      for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+        Instruction *Inst = &*BI++;
+        if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy())
+          continue;
+
+        // TODO: Could use getStructLatticeValueFor to find out if the entire
+        // result is a constant and replace it entirely if so.
+
+        LatticeVal IV = Solver.getLatticeValueFor(Inst);
+        if (IV.isOverdefined())
+          continue;
+
+        Constant *Const = IV.isConstant()
+          ? IV.getConstant() : UndefValue::get(Inst->getType());
+        DEBUG(dbgs() << "  Constant: " << *Const << " = " << *Inst << '\n');
+
+        // Replaces all of the uses of a variable with uses of the
+        // constant.
+        Inst->replaceAllUsesWith(Const);
+
+        // Delete the instruction.
+        if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
+          Inst->eraseFromParent();
+
+        // Hey, we just changed something!
+        MadeChanges = true;
+        ++IPNumInstRemoved;
+      }
+    }
+
+    // Now that all instructions in the function are constant folded, erase dead
+    // blocks, because we can now use ConstantFoldTerminator to get rid of
+    // in-edges.
+    for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
+      // If there are any PHI nodes in this successor, drop entries for BB now.
+      BasicBlock *DeadBB = BlocksToErase[i];
+      for (Value::user_iterator UI = DeadBB->user_begin(),
+                                UE = DeadBB->user_end();
+           UI != UE;) {
+        // Grab the user and then increment the iterator early, as the user
+        // will be deleted. Step past all adjacent uses from the same user.
+        Instruction *I = dyn_cast<Instruction>(*UI);
+        do { ++UI; } while (UI != UE && *UI == I);
+
+        // Ignore blockaddress users; BasicBlock's dtor will handle them.
+        if (!I) continue;
+
+        bool Folded = ConstantFoldTerminator(I->getParent());
+        if (!Folded) {
+          // The constant folder may not have been able to fold the terminator
+          // if this is a branch or switch on undef.  Fold it manually as a
+          // branch to the first successor.
+#ifndef NDEBUG
+          if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+            assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
+                   "Branch should be foldable!");
+          } else if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+            assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
+          } else {
+            llvm_unreachable("Didn't fold away reference to block!");
+          }
+#endif
+
+          // Make this an uncond branch to the first successor.
+          TerminatorInst *TI = I->getParent()->getTerminator();
+          BranchInst::Create(TI->getSuccessor(0), TI);
+
+          // Remove entries in successor phi nodes to remove edges.
+          for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
+            TI->getSuccessor(i)->removePredecessor(TI->getParent());
+
+          // Remove the old terminator.
+          TI->eraseFromParent();
+        }
+      }
+
+      // Finally, delete the basic block.
+      F->getBasicBlockList().erase(DeadBB);
+    }
+    BlocksToErase.clear();
+  }
+
+  // If we inferred constant or undef return values for a function, we replaced
+  // all call uses with the inferred value.  This means we don't need to bother
+  // actually returning anything from the function.  Replace all return
+  // instructions with return undef.
+  //
+  // Do this in two stages: first identify the functions we should process, then
+  // actually zap their returns.  This is important because we can only do this
+  // if the address of the function isn't taken.  In cases where a return is the
+  // last use of a function, the order of processing functions would affect
+  // whether other functions are optimizable.
+  SmallVector<ReturnInst*, 8> ReturnsToZap;
+
+  // TODO: Process multiple value ret instructions also.
+  const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
+  for (DenseMap<Function*, LatticeVal>::const_iterator I = RV.begin(),
+       E = RV.end(); I != E; ++I) {
+    Function *F = I->first;
+    if (I->second.isOverdefined() || F->getReturnType()->isVoidTy())
+      continue;
+
+    // We can only do this if we know that nothing else can call the function.
+    if (!F->hasLocalLinkage() || AddressTakenFunctions.count(F))
+      continue;
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+        if (!isa<UndefValue>(RI->getOperand(0)))
+          ReturnsToZap.push_back(RI);
+  }
+
+  // Zap all returns which we've identified as zap to change.
+  for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
+    Function *F = ReturnsToZap[i]->getParent()->getParent();
+    ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
+  }
+
+  // If we inferred constant or undef values for globals variables, we can
+  // delete the global and any stores that remain to it.
+  const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
+  for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
+         E = TG.end(); I != E; ++I) {
+    GlobalVariable *GV = I->first;
+    assert(!I->second.isOverdefined() &&
+           "Overdefined values should have been taken out of the map!");
+    DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n");
+    while (!GV->use_empty()) {
+      StoreInst *SI = cast<StoreInst>(GV->user_back());
+      SI->eraseFromParent();
+    }
+    M.getGlobalList().erase(GV);
+    ++IPNumGlobalConst;
+  }
+
+  return MadeChanges;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
new file mode 100644
index 0000000..a7361b5
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -0,0 +1,4291 @@
+//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation implements the well known scalar replacement of
+/// aggregates transformation. It tries to identify promotable elements of an
+/// aggregate alloca, and promote them to registers. It will also try to
+/// convert uses of an element (or set of elements) of an alloca into a vector
+/// or bitfield-style integer scalar if appropriate.
+///
+/// It works to do this with minimal slicing of the alloca so that regions
+/// which are merely transferred in and out of external memory remain unchanged
+/// and are not decomposed to scalar code.
+///
+/// Because this also performs alloca promotion, it can be thought of as also
+/// serving the purpose of SSA formation. The algorithm iterates on the
+/// function until all opportunities for promotion have been realized.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TimeValue.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+
+#if __cplusplus >= 201103L && !defined(NDEBUG)
+// We only use this for a debug check in C++11
+#include <random>
+#endif
+
+using namespace llvm;
+using namespace llvm::sroa;
+
+#define DEBUG_TYPE "sroa"
+
+STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
+STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
+STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
+STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
+STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
+STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
+STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
+STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
+STATISTIC(NumDeleted, "Number of instructions deleted");
+STATISTIC(NumVectorized, "Number of vectorized aggregates");
+
+/// Hidden option to enable randomly shuffling the slices to help uncover
+/// instability in their order.
+static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
+                                             cl::init(false), cl::Hidden);
+
+/// Hidden option to experiment with completely strict handling of inbounds
+/// GEPs.
+static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
+                                        cl::Hidden);
+
+namespace {
+/// \brief A custom IRBuilder inserter which prefixes all names if they are
+/// preserved.
+template <bool preserveNames = true>
+class IRBuilderPrefixedInserter
+    : public IRBuilderDefaultInserter<preserveNames> {
+  std::string Prefix;
+
+public:
+  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
+
+protected:
+  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
+                    BasicBlock::iterator InsertPt) const {
+    IRBuilderDefaultInserter<preserveNames>::InsertHelper(
+        I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt);
+  }
+};
+
+// Specialization for not preserving the name is trivial.
+template <>
+class IRBuilderPrefixedInserter<false>
+    : public IRBuilderDefaultInserter<false> {
+public:
+  void SetNamePrefix(const Twine &P) {}
+};
+
+/// \brief Provide a typedef for IRBuilder that drops names in release builds.
+#ifndef NDEBUG
+typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>>
+    IRBuilderTy;
+#else
+typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>>
+    IRBuilderTy;
+#endif
+}
+
+namespace {
+/// \brief A used slice of an alloca.
+///
+/// This structure represents a slice of an alloca used by some instruction. It
+/// stores both the begin and end offsets of this use, a pointer to the use
+/// itself, and a flag indicating whether we can classify the use as splittable
+/// or not when forming partitions of the alloca.
+class Slice {
+  /// \brief The beginning offset of the range.
+  uint64_t BeginOffset;
+
+  /// \brief The ending offset, not included in the range.
+  uint64_t EndOffset;
+
+  /// \brief Storage for both the use of this slice and whether it can be
+  /// split.
+  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
+
+public:
+  Slice() : BeginOffset(), EndOffset() {}
+  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
+      : BeginOffset(BeginOffset), EndOffset(EndOffset),
+        UseAndIsSplittable(U, IsSplittable) {}
+
+  uint64_t beginOffset() const { return BeginOffset; }
+  uint64_t endOffset() const { return EndOffset; }
+
+  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
+  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
+
+  Use *getUse() const { return UseAndIsSplittable.getPointer(); }
+
+  bool isDead() const { return getUse() == nullptr; }
+  void kill() { UseAndIsSplittable.setPointer(nullptr); }
+
+  /// \brief Support for ordering ranges.
+  ///
+  /// This provides an ordering over ranges such that start offsets are
+  /// always increasing, and within equal start offsets, the end offsets are
+  /// decreasing. Thus the spanning range comes first in a cluster with the
+  /// same start position.
+  bool operator<(const Slice &RHS) const {
+    if (beginOffset() < RHS.beginOffset())
+      return true;
+    if (beginOffset() > RHS.beginOffset())
+      return false;
+    if (isSplittable() != RHS.isSplittable())
+      return !isSplittable();
+    if (endOffset() > RHS.endOffset())
+      return true;
+    return false;
+  }
+
+  /// \brief Support comparison with a single offset to allow binary searches.
+  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
+                                              uint64_t RHSOffset) {
+    return LHS.beginOffset() < RHSOffset;
+  }
+  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
+                                              const Slice &RHS) {
+    return LHSOffset < RHS.beginOffset();
+  }
+
+  bool operator==(const Slice &RHS) const {
+    return isSplittable() == RHS.isSplittable() &&
+           beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
+  }
+  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
+};
+} // end anonymous namespace
+
+namespace llvm {
+template <typename T> struct isPodLike;
+template <> struct isPodLike<Slice> { static const bool value = true; };
+}
+
+/// \brief Representation of the alloca slices.
+///
+/// This class represents the slices of an alloca which are formed by its
+/// various uses. If a pointer escapes, we can't fully build a representation
+/// for the slices used and we reflect that in this structure. The uses are
+/// stored, sorted by increasing beginning offset and with unsplittable slices
+/// starting at a particular offset before splittable slices.
+class llvm::sroa::AllocaSlices {
+public:
+  /// \brief Construct the slices of a particular alloca.
+  AllocaSlices(const DataLayout &DL, AllocaInst &AI);
+
+  /// \brief Test whether a pointer to the allocation escapes our analysis.
+  ///
+  /// If this is true, the slices are never fully built and should be
+  /// ignored.
+  bool isEscaped() const { return PointerEscapingInstr; }
+
+  /// \brief Support for iterating over the slices.
+  /// @{
+  typedef SmallVectorImpl<Slice>::iterator iterator;
+  typedef iterator_range<iterator> range;
+  iterator begin() { return Slices.begin(); }
+  iterator end() { return Slices.end(); }
+
+  typedef SmallVectorImpl<Slice>::const_iterator const_iterator;
+  typedef iterator_range<const_iterator> const_range;
+  const_iterator begin() const { return Slices.begin(); }
+  const_iterator end() const { return Slices.end(); }
+  /// @}
+
+  /// \brief Erase a range of slices.
+  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
+
+  /// \brief Insert new slices for this alloca.
+  ///
+  /// This moves the slices into the alloca's slices collection, and re-sorts
+  /// everything so that the usual ordering properties of the alloca's slices
+  /// hold.
+  void insert(ArrayRef<Slice> NewSlices) {
+    int OldSize = Slices.size();
+    Slices.append(NewSlices.begin(), NewSlices.end());
+    auto SliceI = Slices.begin() + OldSize;
+    std::sort(SliceI, Slices.end());
+    std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+  }
+
+  // Forward declare the iterator and range accessor for walking the
+  // partitions.
+  class partition_iterator;
+  iterator_range<partition_iterator> partitions();
+
+  /// \brief Access the dead users for this alloca.
+  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
+
+  /// \brief Access the dead operands referring to this alloca.
+  ///
+  /// These are operands which have cannot actually be used to refer to the
+  /// alloca as they are outside its range and the user doesn't correct for
+  /// that. These mostly consist of PHI node inputs and the like which we just
+  /// need to replace with undef.
+  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
+  void printSlice(raw_ostream &OS, const_iterator I,
+                  StringRef Indent = "  ") const;
+  void printUse(raw_ostream &OS, const_iterator I,
+                StringRef Indent = "  ") const;
+  void print(raw_ostream &OS) const;
+  void dump(const_iterator I) const;
+  void dump() const;
+#endif
+
+private:
+  template <typename DerivedT, typename RetT = void> class BuilderBase;
+  class SliceBuilder;
+  friend class AllocaSlices::SliceBuilder;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// \brief Handle to alloca instruction to simplify method interfaces.
+  AllocaInst &AI;
+#endif
+
+  /// \brief The instruction responsible for this alloca not having a known set
+  /// of slices.
+  ///
+  /// When an instruction (potentially) escapes the pointer to the alloca, we
+  /// store a pointer to that here and abort trying to form slices of the
+  /// alloca. This will be null if the alloca slices are analyzed successfully.
+  Instruction *PointerEscapingInstr;
+
+  /// \brief The slices of the alloca.
+  ///
+  /// We store a vector of the slices formed by uses of the alloca here. This
+  /// vector is sorted by increasing begin offset, and then the unsplittable
+  /// slices before the splittable ones. See the Slice inner class for more
+  /// details.
+  SmallVector<Slice, 8> Slices;
+
+  /// \brief Instructions which will become dead if we rewrite the alloca.
+  ///
+  /// Note that these are not separated by slice. This is because we expect an
+  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
+  /// all these instructions can simply be removed and replaced with undef as
+  /// they come from outside of the allocated space.
+  SmallVector<Instruction *, 8> DeadUsers;
+
+  /// \brief Operands which will become dead if we rewrite the alloca.
+  ///
+  /// These are operands that in their particular use can be replaced with
+  /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
+  /// to PHI nodes and the like. They aren't entirely dead (there might be
+  /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
+  /// want to swap this particular input for undef to simplify the use lists of
+  /// the alloca.
+  SmallVector<Use *, 8> DeadOperands;
+};
+
+/// \brief A partition of the slices.
+///
+/// An ephemeral representation for a range of slices which can be viewed as
+/// a partition of the alloca. This range represents a span of the alloca's
+/// memory which cannot be split, and provides access to all of the slices
+/// overlapping some part of the partition.
+///
+/// Objects of this type are produced by traversing the alloca's slices, but
+/// are only ephemeral and not persistent.
+class llvm::sroa::Partition {
+private:
+  friend class AllocaSlices;
+  friend class AllocaSlices::partition_iterator;
+
+  typedef AllocaSlices::iterator iterator;
+
+  /// \brief The beginning and ending offsets of the alloca for this
+  /// partition.
+  uint64_t BeginOffset, EndOffset;
+
+  /// \brief The start end end iterators of this partition.
+  iterator SI, SJ;
+
+  /// \brief A collection of split slice tails overlapping the partition.
+  SmallVector<Slice *, 4> SplitTails;
+
+  /// \brief Raw constructor builds an empty partition starting and ending at
+  /// the given iterator.
+  Partition(iterator SI) : SI(SI), SJ(SI) {}
+
+public:
+  /// \brief The start offset of this partition.
+  ///
+  /// All of the contained slices start at or after this offset.
+  uint64_t beginOffset() const { return BeginOffset; }
+
+  /// \brief The end offset of this partition.
+  ///
+  /// All of the contained slices end at or before this offset.
+  uint64_t endOffset() const { return EndOffset; }
+
+  /// \brief The size of the partition.
+  ///
+  /// Note that this can never be zero.
+  uint64_t size() const {
+    assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
+    return EndOffset - BeginOffset;
+  }
+
+  /// \brief Test whether this partition contains no slices, and merely spans
+  /// a region occupied by split slices.
+  bool empty() const { return SI == SJ; }
+
+  /// \name Iterate slices that start within the partition.
+  /// These may be splittable or unsplittable. They have a begin offset >= the
+  /// partition begin offset.
+  /// @{
+  // FIXME: We should probably define a "concat_iterator" helper and use that
+  // to stitch together pointee_iterators over the split tails and the
+  // contiguous iterators of the partition. That would give a much nicer
+  // interface here. We could then additionally expose filtered iterators for
+  // split, unsplit, and unsplittable splices based on the usage patterns.
+  iterator begin() const { return SI; }
+  iterator end() const { return SJ; }
+  /// @}
+
+  /// \brief Get the sequence of split slice tails.
+  ///
+  /// These tails are of slices which start before this partition but are
+  /// split and overlap into the partition. We accumulate these while forming
+  /// partitions.
+  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
+};
+
+/// \brief An iterator over partitions of the alloca's slices.
+///
+/// This iterator implements the core algorithm for partitioning the alloca's
+/// slices. It is a forward iterator as we don't support backtracking for
+/// efficiency reasons, and re-use a single storage area to maintain the
+/// current set of split slices.
+///
+/// It is templated on the slice iterator type to use so that it can operate
+/// with either const or non-const slice iterators.
+class AllocaSlices::partition_iterator
+    : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
+                                  Partition> {
+  friend class AllocaSlices;
+
+  /// \brief Most of the state for walking the partitions is held in a class
+  /// with a nice interface for examining them.
+  Partition P;
+
+  /// \brief We need to keep the end of the slices to know when to stop.
+  AllocaSlices::iterator SE;
+
+  /// \brief We also need to keep track of the maximum split end offset seen.
+  /// FIXME: Do we really?
+  uint64_t MaxSplitSliceEndOffset;
+
+  /// \brief Sets the partition to be empty at given iterator, and sets the
+  /// end iterator.
+  partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
+      : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
+    // If not already at the end, advance our state to form the initial
+    // partition.
+    if (SI != SE)
+      advance();
+  }
+
+  /// \brief Advance the iterator to the next partition.
+  ///
+  /// Requires that the iterator not be at the end of the slices.
+  void advance() {
+    assert((P.SI != SE || !P.SplitTails.empty()) &&
+           "Cannot advance past the end of the slices!");
+
+    // Clear out any split uses which have ended.
+    if (!P.SplitTails.empty()) {
+      if (P.EndOffset >= MaxSplitSliceEndOffset) {
+        // If we've finished all splits, this is easy.
+        P.SplitTails.clear();
+        MaxSplitSliceEndOffset = 0;
+      } else {
+        // Remove the uses which have ended in the prior partition. This
+        // cannot change the max split slice end because we just checked that
+        // the prior partition ended prior to that max.
+        P.SplitTails.erase(
+            std::remove_if(
+                P.SplitTails.begin(), P.SplitTails.end(),
+                [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
+            P.SplitTails.end());
+        assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
+                           [&](Slice *S) {
+                             return S->endOffset() == MaxSplitSliceEndOffset;
+                           }) &&
+               "Could not find the current max split slice offset!");
+        assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
+                           [&](Slice *S) {
+                             return S->endOffset() <= MaxSplitSliceEndOffset;
+                           }) &&
+               "Max split slice end offset is not actually the max!");
+      }
+    }
+
+    // If P.SI is already at the end, then we've cleared the split tail and
+    // now have an end iterator.
+    if (P.SI == SE) {
+      assert(P.SplitTails.empty() && "Failed to clear the split slices!");
+      return;
+    }
+
+    // If we had a non-empty partition previously, set up the state for
+    // subsequent partitions.
+    if (P.SI != P.SJ) {
+      // Accumulate all the splittable slices which started in the old
+      // partition into the split list.
+      for (Slice &S : P)
+        if (S.isSplittable() && S.endOffset() > P.EndOffset) {
+          P.SplitTails.push_back(&S);
+          MaxSplitSliceEndOffset =
+              std::max(S.endOffset(), MaxSplitSliceEndOffset);
+        }
+
+      // Start from the end of the previous partition.
+      P.SI = P.SJ;
+
+      // If P.SI is now at the end, we at most have a tail of split slices.
+      if (P.SI == SE) {
+        P.BeginOffset = P.EndOffset;
+        P.EndOffset = MaxSplitSliceEndOffset;
+        return;
+      }
+
+      // If the we have split slices and the next slice is after a gap and is
+      // not splittable immediately form an empty partition for the split
+      // slices up until the next slice begins.
+      if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
+          !P.SI->isSplittable()) {
+        P.BeginOffset = P.EndOffset;
+        P.EndOffset = P.SI->beginOffset();
+        return;
+      }
+    }
+
+    // OK, we need to consume new slices. Set the end offset based on the
+    // current slice, and step SJ past it. The beginning offset of the
+    // partition is the beginning offset of the next slice unless we have
+    // pre-existing split slices that are continuing, in which case we begin
+    // at the prior end offset.
+    P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
+    P.EndOffset = P.SI->endOffset();
+    ++P.SJ;
+
+    // There are two strategies to form a partition based on whether the
+    // partition starts with an unsplittable slice or a splittable slice.
+    if (!P.SI->isSplittable()) {
+      // When we're forming an unsplittable region, it must always start at
+      // the first slice and will extend through its end.
+      assert(P.BeginOffset == P.SI->beginOffset());
+
+      // Form a partition including all of the overlapping slices with this
+      // unsplittable slice.
+      while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+        if (!P.SJ->isSplittable())
+          P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+        ++P.SJ;
+      }
+
+      // We have a partition across a set of overlapping unsplittable
+      // partitions.
+      return;
+    }
+
+    // If we're starting with a splittable slice, then we need to form
+    // a synthetic partition spanning it and any other overlapping splittable
+    // splices.
+    assert(P.SI->isSplittable() && "Forming a splittable partition!");
+
+    // Collect all of the overlapping splittable slices.
+    while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
+           P.SJ->isSplittable()) {
+      P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+      ++P.SJ;
+    }
+
+    // Back upiP.EndOffset if we ended the span early when encountering an
+    // unsplittable slice. This synthesizes the early end offset of
+    // a partition spanning only splittable slices.
+    if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+      assert(!P.SJ->isSplittable());
+      P.EndOffset = P.SJ->beginOffset();
+    }
+  }
+
+public:
+  bool operator==(const partition_iterator &RHS) const {
+    assert(SE == RHS.SE &&
+           "End iterators don't match between compared partition iterators!");
+
+    // The observed positions of partitions is marked by the P.SI iterator and
+    // the emptiness of the split slices. The latter is only relevant when
+    // P.SI == SE, as the end iterator will additionally have an empty split
+    // slices list, but the prior may have the same P.SI and a tail of split
+    // slices.
+    if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
+      assert(P.SJ == RHS.P.SJ &&
+             "Same set of slices formed two different sized partitions!");
+      assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
+             "Same slice position with differently sized non-empty split "
+             "slice tails!");
+      return true;
+    }
+    return false;
+  }
+
+  partition_iterator &operator++() {
+    advance();
+    return *this;
+  }
+
+  Partition &operator*() { return P; }
+};
+
+/// \brief A forward range over the partitions of the alloca's slices.
+///
+/// This accesses an iterator range over the partitions of the alloca's
+/// slices. It computes these partitions on the fly based on the overlapping
+/// offsets of the slices and the ability to split them. It will visit "empty"
+/// partitions to cover regions of the alloca only accessed via split
+/// slices.
+iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
+  return make_range(partition_iterator(begin(), end()),
+                    partition_iterator(end(), end()));
+}
+
+static Value *foldSelectInst(SelectInst &SI) {
+  // If the condition being selected on is a constant or the same value is
+  // being selected between, fold the select. Yes this does (rarely) happen
+  // early on.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
+    return SI.getOperand(1 + CI->isZero());
+  if (SI.getOperand(1) == SI.getOperand(2))
+    return SI.getOperand(1);
+
+  return nullptr;
+}
+
+/// \brief A helper that folds a PHI node or a select.
+static Value *foldPHINodeOrSelectInst(Instruction &I) {
+  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+    // If PN merges together the same value, return that value.
+    return PN->hasConstantValue();
+  }
+  return foldSelectInst(cast<SelectInst>(I));
+}
+
+/// \brief Builder for the alloca slices.
+///
+/// This class builds a set of alloca slices by recursively visiting the uses
+/// of an alloca and making a slice for each load and store at each offset.
+class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
+  friend class PtrUseVisitor<SliceBuilder>;
+  friend class InstVisitor<SliceBuilder>;
+  typedef PtrUseVisitor<SliceBuilder> Base;
+
+  const uint64_t AllocSize;
+  AllocaSlices &AS;
+
+  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
+  SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
+
+  /// \brief Set to de-duplicate dead instructions found in the use walk.
+  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
+
+public:
+  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
+      : PtrUseVisitor<SliceBuilder>(DL),
+        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {}
+
+private:
+  void markAsDead(Instruction &I) {
+    if (VisitedDeadInsts.insert(&I).second)
+      AS.DeadUsers.push_back(&I);
+  }
+
+  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
+                 bool IsSplittable = false) {
+    // Completely skip uses which have a zero size or start either before or
+    // past the end of the allocation.
+    if (Size == 0 || Offset.uge(AllocSize)) {
+      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
+                   << " which has zero size or starts outside of the "
+                   << AllocSize << " byte alloca:\n"
+                   << "    alloca: " << AS.AI << "\n"
+                   << "       use: " << I << "\n");
+      return markAsDead(I);
+    }
+
+    uint64_t BeginOffset = Offset.getZExtValue();
+    uint64_t EndOffset = BeginOffset + Size;
+
+    // Clamp the end offset to the end of the allocation. Note that this is
+    // formulated to handle even the case where "BeginOffset + Size" overflows.
+    // This may appear superficially to be something we could ignore entirely,
+    // but that is not so! There may be widened loads or PHI-node uses where
+    // some instructions are dead but not others. We can't completely ignore
+    // them, and so have to record at least the information here.
+    assert(AllocSize >= BeginOffset); // Established above.
+    if (Size > AllocSize - BeginOffset) {
+      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
+                   << " to remain within the " << AllocSize << " byte alloca:\n"
+                   << "    alloca: " << AS.AI << "\n"
+                   << "       use: " << I << "\n");
+      EndOffset = AllocSize;
+    }
+
+    AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
+  }
+
+  void visitBitCastInst(BitCastInst &BC) {
+    if (BC.use_empty())
+      return markAsDead(BC);
+
+    return Base::visitBitCastInst(BC);
+  }
+
+  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    if (GEPI.use_empty())
+      return markAsDead(GEPI);
+
+    if (SROAStrictInbounds && GEPI.isInBounds()) {
+      // FIXME: This is a manually un-factored variant of the basic code inside
+      // of GEPs with checking of the inbounds invariant specified in the
+      // langref in a very strict sense. If we ever want to enable
+      // SROAStrictInbounds, this code should be factored cleanly into
+      // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
+      // by writing out the code here where we have tho underlying allocation
+      // size readily available.
+      APInt GEPOffset = Offset;
+      const DataLayout &DL = GEPI.getModule()->getDataLayout();
+      for (gep_type_iterator GTI = gep_type_begin(GEPI),
+                             GTE = gep_type_end(GEPI);
+           GTI != GTE; ++GTI) {
+        ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+        if (!OpC)
+          break;
+
+        // Handle a struct index, which adds its field offset to the pointer.
+        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+          unsigned ElementIdx = OpC->getZExtValue();
+          const StructLayout *SL = DL.getStructLayout(STy);
+          GEPOffset +=
+              APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
+        } else {
+          // For array or vector indices, scale the index by the size of the
+          // type.
+          APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
+          GEPOffset += Index * APInt(Offset.getBitWidth(),
+                                     DL.getTypeAllocSize(GTI.getIndexedType()));
+        }
+
+        // If this index has computed an intermediate pointer which is not
+        // inbounds, then the result of the GEP is a poison value and we can
+        // delete it and all uses.
+        if (GEPOffset.ugt(AllocSize))
+          return markAsDead(GEPI);
+      }
+    }
+
+    return Base::visitGetElementPtrInst(GEPI);
+  }
+
+  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
+                         uint64_t Size, bool IsVolatile) {
+    // We allow splitting of non-volatile loads and stores where the type is an
+    // integer type. These may be used to implement 'memcpy' or other "transfer
+    // of bits" patterns.
+    bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
+
+    insertUse(I, Offset, Size, IsSplittable);
+  }
+
+  void visitLoadInst(LoadInst &LI) {
+    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+           "All simple FCA loads should have been pre-split");
+
+    if (!IsOffsetKnown)
+      return PI.setAborted(&LI);
+
+    const DataLayout &DL = LI.getModule()->getDataLayout();
+    uint64_t Size = DL.getTypeStoreSize(LI.getType());
+    return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
+  }
+
+  void visitStoreInst(StoreInst &SI) {
+    Value *ValOp = SI.getValueOperand();
+    if (ValOp == *U)
+      return PI.setEscapedAndAborted(&SI);
+    if (!IsOffsetKnown)
+      return PI.setAborted(&SI);
+
+    const DataLayout &DL = SI.getModule()->getDataLayout();
+    uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
+
+    // If this memory access can be shown to *statically* extend outside the
+    // bounds of of the allocation, it's behavior is undefined, so simply
+    // ignore it. Note that this is more strict than the generic clamping
+    // behavior of insertUse. We also try to handle cases which might run the
+    // risk of overflow.
+    // FIXME: We should instead consider the pointer to have escaped if this
+    // function is being instrumented for addressing bugs or race conditions.
+    if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
+      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
+                   << " which extends past the end of the " << AllocSize
+                   << " byte alloca:\n"
+                   << "    alloca: " << AS.AI << "\n"
+                   << "       use: " << SI << "\n");
+      return markAsDead(SI);
+    }
+
+    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+           "All simple FCA stores should have been pre-split");
+    handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
+  }
+
+  void visitMemSetInst(MemSetInst &II) {
+    assert(II.getRawDest() == *U && "Pointer use is not the destination?");
+    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+    if ((Length && Length->getValue() == 0) ||
+        (IsOffsetKnown && Offset.uge(AllocSize)))
+      // Zero-length mem transfer intrinsics can be ignored entirely.
+      return markAsDead(II);
+
+    if (!IsOffsetKnown)
+      return PI.setAborted(&II);
+
+    insertUse(II, Offset, Length ? Length->getLimitedValue()
+                                 : AllocSize - Offset.getLimitedValue(),
+              (bool)Length);
+  }
+
+  void visitMemTransferInst(MemTransferInst &II) {
+    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+    if (Length && Length->getValue() == 0)
+      // Zero-length mem transfer intrinsics can be ignored entirely.
+      return markAsDead(II);
+
+    // Because we can visit these intrinsics twice, also check to see if the
+    // first time marked this instruction as dead. If so, skip it.
+    if (VisitedDeadInsts.count(&II))
+      return;
+
+    if (!IsOffsetKnown)
+      return PI.setAborted(&II);
+
+    // This side of the transfer is completely out-of-bounds, and so we can
+    // nuke the entire transfer. However, we also need to nuke the other side
+    // if already added to our partitions.
+    // FIXME: Yet another place we really should bypass this when
+    // instrumenting for ASan.
+    if (Offset.uge(AllocSize)) {
+      SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
+          MemTransferSliceMap.find(&II);
+      if (MTPI != MemTransferSliceMap.end())
+        AS.Slices[MTPI->second].kill();
+      return markAsDead(II);
+    }
+
+    uint64_t RawOffset = Offset.getLimitedValue();
+    uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
+
+    // Check for the special case where the same exact value is used for both
+    // source and dest.
+    if (*U == II.getRawDest() && *U == II.getRawSource()) {
+      // For non-volatile transfers this is a no-op.
+      if (!II.isVolatile())
+        return markAsDead(II);
+
+      return insertUse(II, Offset, Size, /*IsSplittable=*/false);
+    }
+
+    // If we have seen both source and destination for a mem transfer, then
+    // they both point to the same alloca.
+    bool Inserted;
+    SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
+    std::tie(MTPI, Inserted) =
+        MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
+    unsigned PrevIdx = MTPI->second;
+    if (!Inserted) {
+      Slice &PrevP = AS.Slices[PrevIdx];
+
+      // Check if the begin offsets match and this is a non-volatile transfer.
+      // In that case, we can completely elide the transfer.
+      if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
+        PrevP.kill();
+        return markAsDead(II);
+      }
+
+      // Otherwise we have an offset transfer within the same alloca. We can't
+      // split those.
+      PrevP.makeUnsplittable();
+    }
+
+    // Insert the use now that we've fixed up the splittable nature.
+    insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
+
+    // Check that we ended up with a valid index in the map.
+    assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
+           "Map index doesn't point back to a slice with this user.");
+  }
+
+  // Disable SRoA for any intrinsics except for lifetime invariants.
+  // FIXME: What about debug intrinsics? This matches old behavior, but
+  // doesn't make sense.
+  void visitIntrinsicInst(IntrinsicInst &II) {
+    if (!IsOffsetKnown)
+      return PI.setAborted(&II);
+
+    if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
+        II.getIntrinsicID() == Intrinsic::lifetime_end) {
+      ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+      uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
+                               Length->getLimitedValue());
+      insertUse(II, Offset, Size, true);
+      return;
+    }
+
+    Base::visitIntrinsicInst(II);
+  }
+
+  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
+    // We consider any PHI or select that results in a direct load or store of
+    // the same offset to be a viable use for slicing purposes. These uses
+    // are considered unsplittable and the size is the maximum loaded or stored
+    // size.
+    SmallPtrSet<Instruction *, 4> Visited;
+    SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
+    Visited.insert(Root);
+    Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+    const DataLayout &DL = Root->getModule()->getDataLayout();
+    // If there are no loads or stores, the access is dead. We mark that as
+    // a size zero access.
+    Size = 0;
+    do {
+      Instruction *I, *UsedI;
+      std::tie(UsedI, I) = Uses.pop_back_val();
+
+      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
+        continue;
+      }
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        Value *Op = SI->getOperand(0);
+        if (Op == UsedI)
+          return SI;
+        Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
+        continue;
+      }
+
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+        if (!GEP->hasAllZeroIndices())
+          return GEP;
+      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
+                 !isa<SelectInst>(I)) {
+        return I;
+      }
+
+      for (User *U : I->users())
+        if (Visited.insert(cast<Instruction>(U)).second)
+          Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
+    } while (!Uses.empty());
+
+    return nullptr;
+  }
+
+  void visitPHINodeOrSelectInst(Instruction &I) {
+    assert(isa<PHINode>(I) || isa<SelectInst>(I));
+    if (I.use_empty())
+      return markAsDead(I);
+
+    // TODO: We could use SimplifyInstruction here to fold PHINodes and
+    // SelectInsts. However, doing so requires to change the current
+    // dead-operand-tracking mechanism. For instance, suppose neither loading
+    // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
+    // trap either.  However, if we simply replace %U with undef using the
+    // current dead-operand-tracking mechanism, "load (select undef, undef,
+    // %other)" may trap because the select may return the first operand
+    // "undef".
+    if (Value *Result = foldPHINodeOrSelectInst(I)) {
+      if (Result == *U)
+        // If the result of the constant fold will be the pointer, recurse
+        // through the PHI/select as if we had RAUW'ed it.
+        enqueueUsers(I);
+      else
+        // Otherwise the operand to the PHI/select is dead, and we can replace
+        // it with undef.
+        AS.DeadOperands.push_back(U);
+
+      return;
+    }
+
+    if (!IsOffsetKnown)
+      return PI.setAborted(&I);
+
+    // See if we already have computed info on this node.
+    uint64_t &Size = PHIOrSelectSizes[&I];
+    if (!Size) {
+      // This is a new PHI/Select, check for an unsafe use of it.
+      if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
+        return PI.setAborted(UnsafeI);
+    }
+
+    // For PHI and select operands outside the alloca, we can't nuke the entire
+    // phi or select -- the other side might still be relevant, so we special
+    // case them here and use a separate structure to track the operands
+    // themselves which should be replaced with undef.
+    // FIXME: This should instead be escaped in the event we're instrumenting
+    // for address sanitization.
+    if (Offset.uge(AllocSize)) {
+      AS.DeadOperands.push_back(U);
+      return;
+    }
+
+    insertUse(I, Offset, Size);
+  }
+
+  void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
+
+  void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
+
+  /// \brief Disable SROA entirely if there are unhandled users of the alloca.
+  void visitInstruction(Instruction &I) { PI.setAborted(&I); }
+};
+
+AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
+    :
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+      AI(AI),
+#endif
+      PointerEscapingInstr(nullptr) {
+  SliceBuilder PB(DL, AI, *this);
+  SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
+  if (PtrI.isEscaped() || PtrI.isAborted()) {
+    // FIXME: We should sink the escape vs. abort info into the caller nicely,
+    // possibly by just storing the PtrInfo in the AllocaSlices.
+    PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
+                                                  : PtrI.getAbortingInst();
+    assert(PointerEscapingInstr && "Did not track a bad instruction");
+    return;
+  }
+
+  Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
+                              [](const Slice &S) {
+                                return S.isDead();
+                              }),
+               Slices.end());
+
+#if __cplusplus >= 201103L && !defined(NDEBUG)
+  if (SROARandomShuffleSlices) {
+    std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec()));
+    std::shuffle(Slices.begin(), Slices.end(), MT);
+  }
+#endif
+
+  // Sort the uses. This arranges for the offsets to be in ascending order,
+  // and the sizes to be in descending order.
+  std::sort(Slices.begin(), Slices.end());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+void AllocaSlices::print(raw_ostream &OS, const_iterator I,
+                         StringRef Indent) const {
+  printSlice(OS, I, Indent);
+  OS << "\n";
+  printUse(OS, I, Indent);
+}
+
+void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
+                              StringRef Indent) const {
+  OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
+     << " slice #" << (I - begin())
+     << (I->isSplittable() ? " (splittable)" : "");
+}
+
+void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
+                            StringRef Indent) const {
+  OS << Indent << "  used by: " << *I->getUse()->getUser() << "\n";
+}
+
+void AllocaSlices::print(raw_ostream &OS) const {
+  if (PointerEscapingInstr) {
+    OS << "Can't analyze slices for alloca: " << AI << "\n"
+       << "  A pointer to this alloca escaped by:\n"
+       << "  " << *PointerEscapingInstr << "\n";
+    return;
+  }
+
+  OS << "Slices of alloca: " << AI << "\n";
+  for (const_iterator I = begin(), E = end(); I != E; ++I)
+    print(OS, I);
+}
+
+LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
+  print(dbgs(), I);
+}
+LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
+
+#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+/// Walk the range of a partitioning looking for a common type to cover this
+/// sequence of slices.
+static Type *findCommonType(AllocaSlices::const_iterator B,
+                            AllocaSlices::const_iterator E,
+                            uint64_t EndOffset) {
+  Type *Ty = nullptr;
+  bool TyIsCommon = true;
+  IntegerType *ITy = nullptr;
+
+  // Note that we need to look at *every* alloca slice's Use to ensure we
+  // always get consistent results regardless of the order of slices.
+  for (AllocaSlices::const_iterator I = B; I != E; ++I) {
+    Use *U = I->getUse();
+    if (isa<IntrinsicInst>(*U->getUser()))
+      continue;
+    if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
+      continue;
+
+    Type *UserTy = nullptr;
+    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+      UserTy = LI->getType();
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+      UserTy = SI->getValueOperand()->getType();
+    }
+
+    if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
+      // If the type is larger than the partition, skip it. We only encounter
+      // this for split integer operations where we want to use the type of the
+      // entity causing the split. Also skip if the type is not a byte width
+      // multiple.
+      if (UserITy->getBitWidth() % 8 != 0 ||
+          UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
+        continue;
+
+      // Track the largest bitwidth integer type used in this way in case there
+      // is no common type.
+      if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
+        ITy = UserITy;
+    }
+
+    // To avoid depending on the order of slices, Ty and TyIsCommon must not
+    // depend on types skipped above.
+    if (!UserTy || (Ty && Ty != UserTy))
+      TyIsCommon = false; // Give up on anything but an iN type.
+    else
+      Ty = UserTy;
+  }
+
+  return TyIsCommon ? Ty : ITy;
+}
+
+/// PHI instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers in the pred blocks and then PHI the
+/// results, allowing the load of the alloca to be promoted.
+/// From this:
+///   %P2 = phi [i32* %Alloca, i32* %Other]
+///   %V = load i32* %P2
+/// to:
+///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+///   ...
+///   %V2 = load i32* %Other
+///   ...
+///   %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a select if its only uses are loads and if the operands
+/// to the select can be loaded unconditionally.
+///
+/// FIXME: This should be hoisted into a generic utility, likely in
+/// Transforms/Util/Local.h
+static bool isSafePHIToSpeculate(PHINode &PN) {
+  // For now, we can only do this promotion if the load is in the same block
+  // as the PHI, and if there are no stores between the phi and load.
+  // TODO: Allow recursive phi users.
+  // TODO: Allow stores.
+  BasicBlock *BB = PN.getParent();
+  unsigned MaxAlign = 0;
+  bool HaveLoad = false;
+  for (User *U : PN.users()) {
+    LoadInst *LI = dyn_cast<LoadInst>(U);
+    if (!LI || !LI->isSimple())
+      return false;
+
+    // For now we only allow loads in the same block as the PHI.  This is
+    // a common case that happens when instcombine merges two loads through
+    // a PHI.
+    if (LI->getParent() != BB)
+      return false;
+
+    // Ensure that there are no instructions between the PHI and the load that
+    // could store.
+    for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
+      if (BBI->mayWriteToMemory())
+        return false;
+
+    MaxAlign = std::max(MaxAlign, LI->getAlignment());
+    HaveLoad = true;
+  }
+
+  if (!HaveLoad)
+    return false;
+
+  const DataLayout &DL = PN.getModule()->getDataLayout();
+
+  // We can only transform this if it is safe to push the loads into the
+  // predecessor blocks. The only thing to watch out for is that we can't put
+  // a possibly trapping load in the predecessor if it is a critical edge.
+  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+    TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
+    Value *InVal = PN.getIncomingValue(Idx);
+
+    // If the value is produced by the terminator of the predecessor (an
+    // invoke) or it has side-effects, there is no valid place to put a load
+    // in the predecessor.
+    if (TI == InVal || TI->mayHaveSideEffects())
+      return false;
+
+    // If the predecessor has a single successor, then the edge isn't
+    // critical.
+    if (TI->getNumSuccessors() == 1)
+      continue;
+
+    // If this pointer is always safe to load, or if we can prove that there
+    // is already a load in the block, then we can move the load to the pred
+    // block.
+    if (isDereferenceablePointer(InVal, DL) ||
+        isSafeToLoadUnconditionally(InVal, TI, MaxAlign))
+      continue;
+
+    return false;
+  }
+
+  return true;
+}
+
+static void speculatePHINodeLoads(PHINode &PN) {
+  DEBUG(dbgs() << "    original: " << PN << "\n");
+
+  Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
+  IRBuilderTy PHIBuilder(&PN);
+  PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
+                                        PN.getName() + ".sroa.speculated");
+
+  // Get the AA tags and alignment to use from one of the loads.  It doesn't
+  // matter which one we get and if any differ.
+  LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
+
+  AAMDNodes AATags;
+  SomeLoad->getAAMetadata(AATags);
+  unsigned Align = SomeLoad->getAlignment();
+
+  // Rewrite all loads of the PN to use the new PHI.
+  while (!PN.use_empty()) {
+    LoadInst *LI = cast<LoadInst>(PN.user_back());
+    LI->replaceAllUsesWith(NewPN);
+    LI->eraseFromParent();
+  }
+
+  // Inject loads into all of the pred blocks.
+  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+    BasicBlock *Pred = PN.getIncomingBlock(Idx);
+    TerminatorInst *TI = Pred->getTerminator();
+    Value *InVal = PN.getIncomingValue(Idx);
+    IRBuilderTy PredBuilder(TI);
+
+    LoadInst *Load = PredBuilder.CreateLoad(
+        InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
+    ++NumLoadsSpeculated;
+    Load->setAlignment(Align);
+    if (AATags)
+      Load->setAAMetadata(AATags);
+    NewPN->addIncoming(Load, Pred);
+  }
+
+  DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
+  PN.eraseFromParent();
+}
+
+/// Select instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers and then select between the result,
+/// allowing the load of the alloca to be promoted.
+/// From this:
+///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+///   %V = load i32* %P2
+/// to:
+///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+///   %V2 = load i32* %Other
+///   %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// We can do this to a select if its only uses are loads and if the operand
+/// to the select can be loaded unconditionally.
+static bool isSafeSelectToSpeculate(SelectInst &SI) {
+  Value *TValue = SI.getTrueValue();
+  Value *FValue = SI.getFalseValue();
+  const DataLayout &DL = SI.getModule()->getDataLayout();
+  bool TDerefable = isDereferenceablePointer(TValue, DL);
+  bool FDerefable = isDereferenceablePointer(FValue, DL);
+
+  for (User *U : SI.users()) {
+    LoadInst *LI = dyn_cast<LoadInst>(U);
+    if (!LI || !LI->isSimple())
+      return false;
+
+    // Both operands to the select need to be dereferencable, either
+    // absolutely (e.g. allocas) or at this point because we can see other
+    // accesses to it.
+    if (!TDerefable &&
+        !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment()))
+      return false;
+    if (!FDerefable &&
+        !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment()))
+      return false;
+  }
+
+  return true;
+}
+
+static void speculateSelectInstLoads(SelectInst &SI) {
+  DEBUG(dbgs() << "    original: " << SI << "\n");
+
+  IRBuilderTy IRB(&SI);
+  Value *TV = SI.getTrueValue();
+  Value *FV = SI.getFalseValue();
+  // Replace the loads of the select with a select of two loads.
+  while (!SI.use_empty()) {
+    LoadInst *LI = cast<LoadInst>(SI.user_back());
+    assert(LI->isSimple() && "We only speculate simple loads");
+
+    IRB.SetInsertPoint(LI);
+    LoadInst *TL =
+        IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
+    LoadInst *FL =
+        IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
+    NumLoadsSpeculated += 2;
+
+    // Transfer alignment and AA info if present.
+    TL->setAlignment(LI->getAlignment());
+    FL->setAlignment(LI->getAlignment());
+
+    AAMDNodes Tags;
+    LI->getAAMetadata(Tags);
+    if (Tags) {
+      TL->setAAMetadata(Tags);
+      FL->setAAMetadata(Tags);
+    }
+
+    Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
+                                LI->getName() + ".sroa.speculated");
+
+    DEBUG(dbgs() << "          speculated to: " << *V << "\n");
+    LI->replaceAllUsesWith(V);
+    LI->eraseFromParent();
+  }
+  SI.eraseFromParent();
+}
+
+/// \brief Build a GEP out of a base pointer and indices.
+///
+/// This will return the BasePtr if that is valid, or build a new GEP
+/// instruction using the IRBuilder if GEP-ing is needed.
+static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
+                       SmallVectorImpl<Value *> &Indices, Twine NamePrefix) {
+  if (Indices.empty())
+    return BasePtr;
+
+  // A single zero index is a no-op, so check for this and avoid building a GEP
+  // in that case.
+  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
+    return BasePtr;
+
+  return IRB.CreateInBoundsGEP(nullptr, BasePtr, Indices,
+                               NamePrefix + "sroa_idx");
+}
+
+/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
+/// TargetTy without changing the offset of the pointer.
+///
+/// This routine assumes we've already established a properly offset GEP with
+/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
+/// zero-indices down through type layers until we find one the same as
+/// TargetTy. If we can't find one with the same type, we at least try to use
+/// one with the same size. If none of that works, we just produce the GEP as
+/// indicated by Indices to have the correct offset.
+static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
+                                    Value *BasePtr, Type *Ty, Type *TargetTy,
+                                    SmallVectorImpl<Value *> &Indices,
+                                    Twine NamePrefix) {
+  if (Ty == TargetTy)
+    return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+
+  // Pointer size to use for the indices.
+  unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType());
+
+  // See if we can descend into a struct and locate a field with the correct
+  // type.
+  unsigned NumLayers = 0;
+  Type *ElementTy = Ty;
+  do {
+    if (ElementTy->isPointerTy())
+      break;
+
+    if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
+      ElementTy = ArrayTy->getElementType();
+      Indices.push_back(IRB.getIntN(PtrSize, 0));
+    } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
+      ElementTy = VectorTy->getElementType();
+      Indices.push_back(IRB.getInt32(0));
+    } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
+      if (STy->element_begin() == STy->element_end())
+        break; // Nothing left to descend into.
+      ElementTy = *STy->element_begin();
+      Indices.push_back(IRB.getInt32(0));
+    } else {
+      break;
+    }
+    ++NumLayers;
+  } while (ElementTy != TargetTy);
+  if (ElementTy != TargetTy)
+    Indices.erase(Indices.end() - NumLayers, Indices.end());
+
+  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+}
+
+/// \brief Recursively compute indices for a natural GEP.
+///
+/// This is the recursive step for getNaturalGEPWithOffset that walks down the
+/// element types adding appropriate indices for the GEP.
+static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
+                                       Value *Ptr, Type *Ty, APInt &Offset,
+                                       Type *TargetTy,
+                                       SmallVectorImpl<Value *> &Indices,
+                                       Twine NamePrefix) {
+  if (Offset == 0)
+    return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
+                                 NamePrefix);
+
+  // We can't recurse through pointer types.
+  if (Ty->isPointerTy())
+    return nullptr;
+
+  // We try to analyze GEPs over vectors here, but note that these GEPs are
+  // extremely poorly defined currently. The long-term goal is to remove GEPing
+  // over a vector from the IR completely.
+  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
+    unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
+    if (ElementSizeInBits % 8 != 0) {
+      // GEPs over non-multiple of 8 size vector elements are invalid.
+      return nullptr;
+    }
+    APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
+    APInt NumSkippedElements = Offset.sdiv(ElementSize);
+    if (NumSkippedElements.ugt(VecTy->getNumElements()))
+      return nullptr;
+    Offset -= NumSkippedElements * ElementSize;
+    Indices.push_back(IRB.getInt(NumSkippedElements));
+    return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
+                                    Offset, TargetTy, Indices, NamePrefix);
+  }
+
+  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+    Type *ElementTy = ArrTy->getElementType();
+    APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
+    APInt NumSkippedElements = Offset.sdiv(ElementSize);
+    if (NumSkippedElements.ugt(ArrTy->getNumElements()))
+      return nullptr;
+
+    Offset -= NumSkippedElements * ElementSize;
+    Indices.push_back(IRB.getInt(NumSkippedElements));
+    return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+                                    Indices, NamePrefix);
+  }
+
+  StructType *STy = dyn_cast<StructType>(Ty);
+  if (!STy)
+    return nullptr;
+
+  const StructLayout *SL = DL.getStructLayout(STy);
+  uint64_t StructOffset = Offset.getZExtValue();
+  if (StructOffset >= SL->getSizeInBytes())
+    return nullptr;
+  unsigned Index = SL->getElementContainingOffset(StructOffset);
+  Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
+  Type *ElementTy = STy->getElementType(Index);
+  if (Offset.uge(DL.getTypeAllocSize(ElementTy)))
+    return nullptr; // The offset points into alignment padding.
+
+  Indices.push_back(IRB.getInt32(Index));
+  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+                                  Indices, NamePrefix);
+}
+
+/// \brief Get a natural GEP from a base pointer to a particular offset and
+/// resulting in a particular type.
+///
+/// The goal is to produce a "natural" looking GEP that works with the existing
+/// composite types to arrive at the appropriate offset and element type for
+/// a pointer. TargetTy is the element type the returned GEP should point-to if
+/// possible. We recurse by decreasing Offset, adding the appropriate index to
+/// Indices, and setting Ty to the result subtype.
+///
+/// If no natural GEP can be constructed, this function returns null.
+static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
+                                      Value *Ptr, APInt Offset, Type *TargetTy,
+                                      SmallVectorImpl<Value *> &Indices,
+                                      Twine NamePrefix) {
+  PointerType *Ty = cast<PointerType>(Ptr->getType());
+
+  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
+  // an i8.
+  if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
+    return nullptr;
+
+  Type *ElementTy = Ty->getElementType();
+  if (!ElementTy->isSized())
+    return nullptr; // We can't GEP through an unsized element.
+  APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
+  if (ElementSize == 0)
+    return nullptr; // Zero-length arrays can't help us build a natural GEP.
+  APInt NumSkippedElements = Offset.sdiv(ElementSize);
+
+  Offset -= NumSkippedElements * ElementSize;
+  Indices.push_back(IRB.getInt(NumSkippedElements));
+  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+                                  Indices, NamePrefix);
+}
+
+/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
+/// resulting pointer has PointerTy.
+///
+/// This tries very hard to compute a "natural" GEP which arrives at the offset
+/// and produces the pointer type desired. Where it cannot, it will try to use
+/// the natural GEP to arrive at the offset and bitcast to the type. Where that
+/// fails, it will try to use an existing i8* and GEP to the byte offset and
+/// bitcast to the type.
+///
+/// The strategy for finding the more natural GEPs is to peel off layers of the
+/// pointer, walking back through bit casts and GEPs, searching for a base
+/// pointer from which we can compute a natural GEP with the desired
+/// properties. The algorithm tries to fold as many constant indices into
+/// a single GEP as possible, thus making each GEP more independent of the
+/// surrounding code.
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
+                             APInt Offset, Type *PointerTy, Twine NamePrefix) {
+  // Even though we don't look through PHI nodes, we could be called on an
+  // instruction in an unreachable block, which may be on a cycle.
+  SmallPtrSet<Value *, 4> Visited;
+  Visited.insert(Ptr);
+  SmallVector<Value *, 4> Indices;
+
+  // We may end up computing an offset pointer that has the wrong type. If we
+  // never are able to compute one directly that has the correct type, we'll
+  // fall back to it, so keep it and the base it was computed from around here.
+  Value *OffsetPtr = nullptr;
+  Value *OffsetBasePtr;
+
+  // Remember any i8 pointer we come across to re-use if we need to do a raw
+  // byte offset.
+  Value *Int8Ptr = nullptr;
+  APInt Int8PtrOffset(Offset.getBitWidth(), 0);
+
+  Type *TargetTy = PointerTy->getPointerElementType();
+
+  do {
+    // First fold any existing GEPs into the offset.
+    while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+      APInt GEPOffset(Offset.getBitWidth(), 0);
+      if (!GEP->accumulateConstantOffset(DL, GEPOffset))
+        break;
+      Offset += GEPOffset;
+      Ptr = GEP->getPointerOperand();
+      if (!Visited.insert(Ptr).second)
+        break;
+    }
+
+    // See if we can perform a natural GEP here.
+    Indices.clear();
+    if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
+                                           Indices, NamePrefix)) {
+      // If we have a new natural pointer at the offset, clear out any old
+      // offset pointer we computed. Unless it is the base pointer or
+      // a non-instruction, we built a GEP we don't need. Zap it.
+      if (OffsetPtr && OffsetPtr != OffsetBasePtr)
+        if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
+          assert(I->use_empty() && "Built a GEP with uses some how!");
+          I->eraseFromParent();
+        }
+      OffsetPtr = P;
+      OffsetBasePtr = Ptr;
+      // If we also found a pointer of the right type, we're done.
+      if (P->getType() == PointerTy)
+        return P;
+    }
+
+    // Stash this pointer if we've found an i8*.
+    if (Ptr->getType()->isIntegerTy(8)) {
+      Int8Ptr = Ptr;
+      Int8PtrOffset = Offset;
+    }
+
+    // Peel off a layer of the pointer and update the offset appropriately.
+    if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+      Ptr = cast<Operator>(Ptr)->getOperand(0);
+    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
+      if (GA->mayBeOverridden())
+        break;
+      Ptr = GA->getAliasee();
+    } else {
+      break;
+    }
+    assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
+  } while (Visited.insert(Ptr).second);
+
+  if (!OffsetPtr) {
+    if (!Int8Ptr) {
+      Int8Ptr = IRB.CreateBitCast(
+          Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
+          NamePrefix + "sroa_raw_cast");
+      Int8PtrOffset = Offset;
+    }
+
+    OffsetPtr = Int8PtrOffset == 0
+                    ? Int8Ptr
+                    : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
+                                            IRB.getInt(Int8PtrOffset),
+                                            NamePrefix + "sroa_raw_idx");
+  }
+  Ptr = OffsetPtr;
+
+  // On the off chance we were targeting i8*, guard the bitcast here.
+  if (Ptr->getType() != PointerTy)
+    Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast");
+
+  return Ptr;
+}
+
+/// \brief Compute the adjusted alignment for a load or store from an offset.
+static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
+                                     const DataLayout &DL) {
+  unsigned Alignment;
+  Type *Ty;
+  if (auto *LI = dyn_cast<LoadInst>(I)) {
+    Alignment = LI->getAlignment();
+    Ty = LI->getType();
+  } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+    Alignment = SI->getAlignment();
+    Ty = SI->getValueOperand()->getType();
+  } else {
+    llvm_unreachable("Only loads and stores are allowed!");
+  }
+
+  if (!Alignment)
+    Alignment = DL.getABITypeAlignment(Ty);
+
+  return MinAlign(Alignment, Offset);
+}
+
+/// \brief Test whether we can convert a value from the old to the new type.
+///
+/// This predicate should be used to guard calls to convertValue in order to
+/// ensure that we only try to convert viable values. The strategy is that we
+/// will peel off single element struct and array wrappings to get to an
+/// underlying value, and convert that value.
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+  if (OldTy == NewTy)
+    return true;
+
+  // For integer types, we can't handle any bit-width differences. This would
+  // break both vector conversions with extension and introduce endianness
+  // issues when in conjunction with loads and stores.
+  if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
+    assert(cast<IntegerType>(OldTy)->getBitWidth() !=
+               cast<IntegerType>(NewTy)->getBitWidth() &&
+           "We can't have the same bitwidth for different int types");
+    return false;
+  }
+
+  if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
+    return false;
+  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
+    return false;
+
+  // We can convert pointers to integers and vice-versa. Same for vectors
+  // of pointers and integers.
+  OldTy = OldTy->getScalarType();
+  NewTy = NewTy->getScalarType();
+  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
+    if (NewTy->isPointerTy() && OldTy->isPointerTy())
+      return true;
+    if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
+      return true;
+    return false;
+  }
+
+  return true;
+}
+
+/// \brief Generic routine to convert an SSA value to a value of a different
+/// type.
+///
+/// This will try various different casting techniques, such as bitcasts,
+/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
+/// two types for viability with this routine.
+static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+                           Type *NewTy) {
+  Type *OldTy = V->getType();
+  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+  if (OldTy == NewTy)
+    return V;
+
+  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
+         "Integer types must be the exact same to convert.");
+
+  // See if we need inttoptr for this type pair. A cast involving both scalars
+  // and vectors requires and additional bitcast.
+  if (OldTy->getScalarType()->isIntegerTy() &&
+      NewTy->getScalarType()->isPointerTy()) {
+    // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
+    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+                                NewTy);
+
+    // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
+    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+                                NewTy);
+
+    return IRB.CreateIntToPtr(V, NewTy);
+  }
+
+  // See if we need ptrtoint for this type pair. A cast involving both scalars
+  // and vectors requires and additional bitcast.
+  if (OldTy->getScalarType()->isPointerTy() &&
+      NewTy->getScalarType()->isIntegerTy()) {
+    // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
+    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                               NewTy);
+
+    // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
+    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                               NewTy);
+
+    return IRB.CreatePtrToInt(V, NewTy);
+  }
+
+  return IRB.CreateBitCast(V, NewTy);
+}
+
+/// \brief Test whether the given slice use can be promoted to a vector.
+///
+/// This function is called to test each entry in a partition which is slated
+/// for a single slice.
+static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
+                                            VectorType *Ty,
+                                            uint64_t ElementSize,
+                                            const DataLayout &DL) {
+  // First validate the slice offsets.
+  uint64_t BeginOffset =
+      std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
+  uint64_t BeginIndex = BeginOffset / ElementSize;
+  if (BeginIndex * ElementSize != BeginOffset ||
+      BeginIndex >= Ty->getNumElements())
+    return false;
+  uint64_t EndOffset =
+      std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
+  uint64_t EndIndex = EndOffset / ElementSize;
+  if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
+    return false;
+
+  assert(EndIndex > BeginIndex && "Empty vector!");
+  uint64_t NumElements = EndIndex - BeginIndex;
+  Type *SliceTy = (NumElements == 1)
+                      ? Ty->getElementType()
+                      : VectorType::get(Ty->getElementType(), NumElements);
+
+  Type *SplitIntTy =
+      Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
+
+  Use *U = S.getUse();
+
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+    if (MI->isVolatile())
+      return false;
+    if (!S.isSplittable())
+      return false; // Skip any unsplittable intrinsics.
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+        II->getIntrinsicID() != Intrinsic::lifetime_end)
+      return false;
+  } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
+    // Disable vector promotion when there are loads or stores of an FCA.
+    return false;
+  } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+    if (LI->isVolatile())
+      return false;
+    Type *LTy = LI->getType();
+    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
+      assert(LTy->isIntegerTy());
+      LTy = SplitIntTy;
+    }
+    if (!canConvertValue(DL, SliceTy, LTy))
+      return false;
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+    if (SI->isVolatile())
+      return false;
+    Type *STy = SI->getValueOperand()->getType();
+    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
+      assert(STy->isIntegerTy());
+      STy = SplitIntTy;
+    }
+    if (!canConvertValue(DL, STy, SliceTy))
+      return false;
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+/// \brief Test whether the given alloca partitioning and range of slices can be
+/// promoted to a vector.
+///
+/// This is a quick test to check whether we can rewrite a particular alloca
+/// partition (and its newly formed alloca) into a vector alloca with only
+/// whole-vector loads and stores such that it could be promoted to a vector
+/// SSA value. We only can ensure this for a limited set of operations, and we
+/// don't want to do the rewrites unless we are confident that the result will
+/// be promotable, so we have an early test here.
+static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
+  // Collect the candidate types for vector-based promotion. Also track whether
+  // we have different element types.
+  SmallVector<VectorType *, 4> CandidateTys;
+  Type *CommonEltTy = nullptr;
+  bool HaveCommonEltTy = true;
+  auto CheckCandidateType = [&](Type *Ty) {
+    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+      CandidateTys.push_back(VTy);
+      if (!CommonEltTy)
+        CommonEltTy = VTy->getElementType();
+      else if (CommonEltTy != VTy->getElementType())
+        HaveCommonEltTy = false;
+    }
+  };
+  // Consider any loads or stores that are the exact size of the slice.
+  for (const Slice &S : P)
+    if (S.beginOffset() == P.beginOffset() &&
+        S.endOffset() == P.endOffset()) {
+      if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
+        CheckCandidateType(LI->getType());
+      else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
+        CheckCandidateType(SI->getValueOperand()->getType());
+    }
+
+  // If we didn't find a vector type, nothing to do here.
+  if (CandidateTys.empty())
+    return nullptr;
+
+  // Remove non-integer vector types if we had multiple common element types.
+  // FIXME: It'd be nice to replace them with integer vector types, but we can't
+  // do that until all the backends are known to produce good code for all
+  // integer vector types.
+  if (!HaveCommonEltTy) {
+    CandidateTys.erase(std::remove_if(CandidateTys.begin(), CandidateTys.end(),
+                                      [](VectorType *VTy) {
+                         return !VTy->getElementType()->isIntegerTy();
+                       }),
+                       CandidateTys.end());
+
+    // If there were no integer vector types, give up.
+    if (CandidateTys.empty())
+      return nullptr;
+
+    // Rank the remaining candidate vector types. This is easy because we know
+    // they're all integer vectors. We sort by ascending number of elements.
+    auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+      assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
+             "Cannot have vector types of different sizes!");
+      assert(RHSTy->getElementType()->isIntegerTy() &&
+             "All non-integer types eliminated!");
+      assert(LHSTy->getElementType()->isIntegerTy() &&
+             "All non-integer types eliminated!");
+      return RHSTy->getNumElements() < LHSTy->getNumElements();
+    };
+    std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
+    CandidateTys.erase(
+        std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
+        CandidateTys.end());
+  } else {
+// The only way to have the same element type in every vector type is to
+// have the same vector type. Check that and remove all but one.
+#ifndef NDEBUG
+    for (VectorType *VTy : CandidateTys) {
+      assert(VTy->getElementType() == CommonEltTy &&
+             "Unaccounted for element type!");
+      assert(VTy == CandidateTys[0] &&
+             "Different vector types with the same element type!");
+    }
+#endif
+    CandidateTys.resize(1);
+  }
+
+  // Try each vector type, and return the one which works.
+  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
+    uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
+
+    // While the definition of LLVM vectors is bitpacked, we don't support sizes
+    // that aren't byte sized.
+    if (ElementSize % 8)
+      return false;
+    assert((DL.getTypeSizeInBits(VTy) % 8) == 0 &&
+           "vector size not a multiple of element size?");
+    ElementSize /= 8;
+
+    for (const Slice &S : P)
+      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+        return false;
+
+    for (const Slice *S : P.splitSliceTails())
+      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+        return false;
+
+    return true;
+  };
+  for (VectorType *VTy : CandidateTys)
+    if (CheckVectorTypeForPromotion(VTy))
+      return VTy;
+
+  return nullptr;
+}
+
+/// \brief Test whether a slice of an alloca is valid for integer widening.
+///
+/// This implements the necessary checking for the \c isIntegerWideningViable
+/// test below on a single slice of the alloca.
+static bool isIntegerWideningViableForSlice(const Slice &S,
+                                            uint64_t AllocBeginOffset,
+                                            Type *AllocaTy,
+                                            const DataLayout &DL,
+                                            bool &WholeAllocaOp) {
+  uint64_t Size = DL.getTypeStoreSize(AllocaTy);
+
+  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
+  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
+
+  // We can't reasonably handle cases where the load or store extends past
+  // the end of the alloca's type and into its padding.
+  if (RelEnd > Size)
+    return false;
+
+  Use *U = S.getUse();
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+    if (LI->isVolatile())
+      return false;
+    // We can't handle loads that extend past the allocated memory.
+    if (DL.getTypeStoreSize(LI->getType()) > Size)
+      return false;
+    // Note that we don't count vector loads or stores as whole-alloca
+    // operations which enable integer widening because we would prefer to use
+    // vector widening instead.
+    if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
+      WholeAllocaOp = true;
+    if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
+        return false;
+    } else if (RelBegin != 0 || RelEnd != Size ||
+               !canConvertValue(DL, AllocaTy, LI->getType())) {
+      // Non-integer loads need to be convertible from the alloca type so that
+      // they are promotable.
+      return false;
+    }
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+    Type *ValueTy = SI->getValueOperand()->getType();
+    if (SI->isVolatile())
+      return false;
+    // We can't handle stores that extend past the allocated memory.
+    if (DL.getTypeStoreSize(ValueTy) > Size)
+      return false;
+    // Note that we don't count vector loads or stores as whole-alloca
+    // operations which enable integer widening because we would prefer to use
+    // vector widening instead.
+    if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
+      WholeAllocaOp = true;
+    if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
+      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
+        return false;
+    } else if (RelBegin != 0 || RelEnd != Size ||
+               !canConvertValue(DL, ValueTy, AllocaTy)) {
+      // Non-integer stores need to be convertible to the alloca type so that
+      // they are promotable.
+      return false;
+    }
+  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+    if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
+      return false;
+    if (!S.isSplittable())
+      return false; // Skip any unsplittable intrinsics.
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+        II->getIntrinsicID() != Intrinsic::lifetime_end)
+      return false;
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+/// \brief Test whether the given alloca partition's integer operations can be
+/// widened to promotable ones.
+///
+/// This is a quick test to check whether we can rewrite the integer loads and
+/// stores to a particular alloca into wider loads and stores and be able to
+/// promote the resulting alloca.
+static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
+                                    const DataLayout &DL) {
+  uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
+  // Don't create integer types larger than the maximum bitwidth.
+  if (SizeInBits > IntegerType::MAX_INT_BITS)
+    return false;
+
+  // Don't try to handle allocas with bit-padding.
+  if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy))
+    return false;
+
+  // We need to ensure that an integer type with the appropriate bitwidth can
+  // be converted to the alloca type, whatever that is. We don't want to force
+  // the alloca itself to have an integer type if there is a more suitable one.
+  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+  if (!canConvertValue(DL, AllocaTy, IntTy) ||
+      !canConvertValue(DL, IntTy, AllocaTy))
+    return false;
+
+  // While examining uses, we ensure that the alloca has a covering load or
+  // store. We don't want to widen the integer operations only to fail to
+  // promote due to some other unsplittable entry (which we may make splittable
+  // later). However, if there are only splittable uses, go ahead and assume
+  // that we cover the alloca.
+  // FIXME: We shouldn't consider split slices that happen to start in the
+  // partition here...
+  bool WholeAllocaOp =
+      P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);
+
+  for (const Slice &S : P)
+    if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+                                         WholeAllocaOp))
+      return false;
+
+  for (const Slice *S : P.splitSliceTails())
+    if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
+                                         WholeAllocaOp))
+      return false;
+
+  return WholeAllocaOp;
+}
+
+static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+                             IntegerType *Ty, uint64_t Offset,
+                             const Twine &Name) {
+  DEBUG(dbgs() << "       start: " << *V << "\n");
+  IntegerType *IntTy = cast<IntegerType>(V->getType());
+  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+         "Element extends past full value");
+  uint64_t ShAmt = 8 * Offset;
+  if (DL.isBigEndian())
+    ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+  if (ShAmt) {
+    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
+    DEBUG(dbgs() << "     shifted: " << *V << "\n");
+  }
+  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+         "Cannot extract to a larger integer!");
+  if (Ty != IntTy) {
+    V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
+    DEBUG(dbgs() << "     trunced: " << *V << "\n");
+  }
+  return V;
+}
+
+static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
+                            Value *V, uint64_t Offset, const Twine &Name) {
+  IntegerType *IntTy = cast<IntegerType>(Old->getType());
+  IntegerType *Ty = cast<IntegerType>(V->getType());
+  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+         "Cannot insert a larger integer!");
+  DEBUG(dbgs() << "       start: " << *V << "\n");
+  if (Ty != IntTy) {
+    V = IRB.CreateZExt(V, IntTy, Name + ".ext");
+    DEBUG(dbgs() << "    extended: " << *V << "\n");
+  }
+  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+         "Element store outside of alloca store");
+  uint64_t ShAmt = 8 * Offset;
+  if (DL.isBigEndian())
+    ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+  if (ShAmt) {
+    V = IRB.CreateShl(V, ShAmt, Name + ".shift");
+    DEBUG(dbgs() << "     shifted: " << *V << "\n");
+  }
+
+  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
+    APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
+    Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
+    DEBUG(dbgs() << "      masked: " << *Old << "\n");
+    V = IRB.CreateOr(Old, V, Name + ".insert");
+    DEBUG(dbgs() << "    inserted: " << *V << "\n");
+  }
+  return V;
+}
+
+static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
+                            unsigned EndIndex, const Twine &Name) {
+  VectorType *VecTy = cast<VectorType>(V->getType());
+  unsigned NumElements = EndIndex - BeginIndex;
+  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+  if (NumElements == VecTy->getNumElements())
+    return V;
+
+  if (NumElements == 1) {
+    V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
+                                 Name + ".extract");
+    DEBUG(dbgs() << "     extract: " << *V << "\n");
+    return V;
+  }
+
+  SmallVector<Constant *, 8> Mask;
+  Mask.reserve(NumElements);
+  for (unsigned i = BeginIndex; i != EndIndex; ++i)
+    Mask.push_back(IRB.getInt32(i));
+  V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                              ConstantVector::get(Mask), Name + ".extract");
+  DEBUG(dbgs() << "     shuffle: " << *V << "\n");
+  return V;
+}
+
+static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
+                           unsigned BeginIndex, const Twine &Name) {
+  VectorType *VecTy = cast<VectorType>(Old->getType());
+  assert(VecTy && "Can only insert a vector into a vector");
+
+  VectorType *Ty = dyn_cast<VectorType>(V->getType());
+  if (!Ty) {
+    // Single element to insert.
+    V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
+                                Name + ".insert");
+    DEBUG(dbgs() << "     insert: " << *V << "\n");
+    return V;
+  }
+
+  assert(Ty->getNumElements() <= VecTy->getNumElements() &&
+         "Too many elements!");
+  if (Ty->getNumElements() == VecTy->getNumElements()) {
+    assert(V->getType() == VecTy && "Vector type mismatch");
+    return V;
+  }
+  unsigned EndIndex = BeginIndex + Ty->getNumElements();
+
+  // When inserting a smaller vector into the larger to store, we first
+  // use a shuffle vector to widen it with undef elements, and then
+  // a second shuffle vector to select between the loaded vector and the
+  // incoming vector.
+  SmallVector<Constant *, 8> Mask;
+  Mask.reserve(VecTy->getNumElements());
+  for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+    if (i >= BeginIndex && i < EndIndex)
+      Mask.push_back(IRB.getInt32(i - BeginIndex));
+    else
+      Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
+  V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                              ConstantVector::get(Mask), Name + ".expand");
+  DEBUG(dbgs() << "    shuffle: " << *V << "\n");
+
+  Mask.clear();
+  for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+    Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
+
+  V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
+
+  DEBUG(dbgs() << "    blend: " << *V << "\n");
+  return V;
+}
+
+/// \brief Visitor to rewrite instructions using p particular slice of an alloca
+/// to use a new alloca.
+///
+/// Also implements the rewriting to vector-based accesses when the partition
+/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
+/// lives here.
+class llvm::sroa::AllocaSliceRewriter
+    : public InstVisitor<AllocaSliceRewriter, bool> {
+  // Befriend the base class so it can delegate to private visit methods.
+  friend class llvm::InstVisitor<AllocaSliceRewriter, bool>;
+  typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;
+
+  const DataLayout &DL;
+  AllocaSlices &AS;
+  SROA &Pass;
+  AllocaInst &OldAI, &NewAI;
+  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
+  Type *NewAllocaTy;
+
+  // This is a convenience and flag variable that will be null unless the new
+  // alloca's integer operations should be widened to this integer type due to
+  // passing isIntegerWideningViable above. If it is non-null, the desired
+  // integer type will be stored here for easy access during rewriting.
+  IntegerType *IntTy;
+
+  // If we are rewriting an alloca partition which can be written as pure
+  // vector operations, we stash extra information here. When VecTy is
+  // non-null, we have some strict guarantees about the rewritten alloca:
+  //   - The new alloca is exactly the size of the vector type here.
+  //   - The accesses all either map to the entire vector or to a single
+  //     element.
+  //   - The set of accessing instructions is only one of those handled above
+  //     in isVectorPromotionViable. Generally these are the same access kinds
+  //     which are promotable via mem2reg.
+  VectorType *VecTy;
+  Type *ElementTy;
+  uint64_t ElementSize;
+
+  // The original offset of the slice currently being rewritten relative to
+  // the original alloca.
+  uint64_t BeginOffset, EndOffset;
+  // The new offsets of the slice currently being rewritten relative to the
+  // original alloca.
+  uint64_t NewBeginOffset, NewEndOffset;
+
+  uint64_t SliceSize;
+  bool IsSplittable;
+  bool IsSplit;
+  Use *OldUse;
+  Instruction *OldPtr;
+
+  // Track post-rewrite users which are PHI nodes and Selects.
+  SmallPtrSetImpl<PHINode *> &PHIUsers;
+  SmallPtrSetImpl<SelectInst *> &SelectUsers;
+
+  // Utility IR builder, whose name prefix is setup for each visited use, and
+  // the insertion point is set to point to the user.
+  IRBuilderTy IRB;
+
+public:
+  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
+                      AllocaInst &OldAI, AllocaInst &NewAI,
+                      uint64_t NewAllocaBeginOffset,
+                      uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
+                      VectorType *PromotableVecTy,
+                      SmallPtrSetImpl<PHINode *> &PHIUsers,
+                      SmallPtrSetImpl<SelectInst *> &SelectUsers)
+      : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+        NewAllocaBeginOffset(NewAllocaBeginOffset),
+        NewAllocaEndOffset(NewAllocaEndOffset),
+        NewAllocaTy(NewAI.getAllocatedType()),
+        IntTy(IsIntegerPromotable
+                  ? Type::getIntNTy(
+                        NewAI.getContext(),
+                        DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+                  : nullptr),
+        VecTy(PromotableVecTy),
+        ElementTy(VecTy ? VecTy->getElementType() : nullptr),
+        ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
+        BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
+        OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
+        IRB(NewAI.getContext(), ConstantFolder()) {
+    if (VecTy) {
+      assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
+             "Only multiple-of-8 sized vector elements are viable");
+      ++NumVectorized;
+    }
+    assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
+  }
+
+  bool visit(AllocaSlices::const_iterator I) {
+    bool CanSROA = true;
+    BeginOffset = I->beginOffset();
+    EndOffset = I->endOffset();
+    IsSplittable = I->isSplittable();
+    IsSplit =
+        BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
+    DEBUG(dbgs() << "  rewriting " << (IsSplit ? "split " : ""));
+    DEBUG(AS.printSlice(dbgs(), I, ""));
+    DEBUG(dbgs() << "\n");
+
+    // Compute the intersecting offset range.
+    assert(BeginOffset < NewAllocaEndOffset);
+    assert(EndOffset > NewAllocaBeginOffset);
+    NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+    NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+
+    SliceSize = NewEndOffset - NewBeginOffset;
+
+    OldUse = I->getUse();
+    OldPtr = cast<Instruction>(OldUse->get());
+
+    Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
+    IRB.SetInsertPoint(OldUserI);
+    IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
+    IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
+
+    CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
+    if (VecTy || IntTy)
+      assert(CanSROA);
+    return CanSROA;
+  }
+
+private:
+  // Make sure the other visit overloads are visible.
+  using Base::visit;
+
+  // Every instruction which can end up as a user must have a rewrite rule.
+  bool visitInstruction(Instruction &I) {
+    DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
+    llvm_unreachable("No rewrite rule for this instruction!");
+  }
+
+  Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
+    // Note that the offset computation can use BeginOffset or NewBeginOffset
+    // interchangeably for unsplit slices.
+    assert(IsSplit || BeginOffset == NewBeginOffset);
+    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+
+#ifndef NDEBUG
+    StringRef OldName = OldPtr->getName();
+    // Skip through the last '.sroa.' component of the name.
+    size_t LastSROAPrefix = OldName.rfind(".sroa.");
+    if (LastSROAPrefix != StringRef::npos) {
+      OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
+      // Look for an SROA slice index.
+      size_t IndexEnd = OldName.find_first_not_of("0123456789");
+      if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
+        // Strip the index and look for the offset.
+        OldName = OldName.substr(IndexEnd + 1);
+        size_t OffsetEnd = OldName.find_first_not_of("0123456789");
+        if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
+          // Strip the offset.
+          OldName = OldName.substr(OffsetEnd + 1);
+      }
+    }
+    // Strip any SROA suffixes as well.
+    OldName = OldName.substr(0, OldName.find(".sroa_"));
+#endif
+
+    return getAdjustedPtr(IRB, DL, &NewAI,
+                          APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
+#ifndef NDEBUG
+                          Twine(OldName) + "."
+#else
+                          Twine()
+#endif
+                          );
+  }
+
+  /// \brief Compute suitable alignment to access this slice of the *new*
+  /// alloca.
+  ///
+  /// You can optionally pass a type to this routine and if that type's ABI
+  /// alignment is itself suitable, this will return zero.
+  unsigned getSliceAlign(Type *Ty = nullptr) {
+    unsigned NewAIAlign = NewAI.getAlignment();
+    if (!NewAIAlign)
+      NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
+    unsigned Align =
+        MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
+    return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align;
+  }
+
+  unsigned getIndex(uint64_t Offset) {
+    assert(VecTy && "Can only call getIndex when rewriting a vector");
+    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
+    assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
+    uint32_t Index = RelOffset / ElementSize;
+    assert(Index * ElementSize == RelOffset);
+    return Index;
+  }
+
+  void deleteIfTriviallyDead(Value *V) {
+    Instruction *I = cast<Instruction>(V);
+    if (isInstructionTriviallyDead(I))
+      Pass.DeadInsts.insert(I);
+  }
+
+  Value *rewriteVectorizedLoadInst() {
+    unsigned BeginIndex = getIndex(NewBeginOffset);
+    unsigned EndIndex = getIndex(NewEndOffset);
+    assert(EndIndex > BeginIndex && "Empty vector!");
+
+    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+    return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
+  }
+
+  Value *rewriteIntegerLoad(LoadInst &LI) {
+    assert(IntTy && "We cannot insert an integer to the alloca");
+    assert(!LI.isVolatile());
+    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+    V = convertValue(DL, IRB, V, IntTy);
+    assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+    if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
+      IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
+      V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
+    }
+    // It is possible that the extracted type is not the load type. This
+    // happens if there is a load past the end of the alloca, and as
+    // a consequence the slice is narrower but still a candidate for integer
+    // lowering. To handle this case, we just zero extend the extracted
+    // integer.
+    assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
+           "Can only handle an extract for an overly wide load");
+    if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
+      V = IRB.CreateZExt(V, LI.getType());
+    return V;
+  }
+
+  bool visitLoadInst(LoadInst &LI) {
+    DEBUG(dbgs() << "    original: " << LI << "\n");
+    Value *OldOp = LI.getOperand(0);
+    assert(OldOp == OldPtr);
+
+    Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
+                             : LI.getType();
+    const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
+    bool IsPtrAdjusted = false;
+    Value *V;
+    if (VecTy) {
+      V = rewriteVectorizedLoadInst();
+    } else if (IntTy && LI.getType()->isIntegerTy()) {
+      V = rewriteIntegerLoad(LI);
+    } else if (NewBeginOffset == NewAllocaBeginOffset &&
+               NewEndOffset == NewAllocaEndOffset &&
+               (canConvertValue(DL, NewAllocaTy, TargetTy) ||
+                (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
+                 TargetTy->isIntegerTy()))) {
+      LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                              LI.isVolatile(), LI.getName());
+      if (LI.isVolatile())
+        NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+      V = NewLI;
+
+      // If this is an integer load past the end of the slice (which means the
+      // bytes outside the slice are undef or this load is dead) just forcibly
+      // fix the integer size with correct handling of endianness.
+      if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+        if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
+          if (AITy->getBitWidth() < TITy->getBitWidth()) {
+            V = IRB.CreateZExt(V, TITy, "load.ext");
+            if (DL.isBigEndian())
+              V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
+                                "endian_shift");
+          }
+    } else {
+      Type *LTy = TargetTy->getPointerTo();
+      LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
+                                              getSliceAlign(TargetTy),
+                                              LI.isVolatile(), LI.getName());
+      if (LI.isVolatile())
+        NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+
+      V = NewLI;
+      IsPtrAdjusted = true;
+    }
+    V = convertValue(DL, IRB, V, TargetTy);
+
+    if (IsSplit) {
+      assert(!LI.isVolatile());
+      assert(LI.getType()->isIntegerTy() &&
+             "Only integer type loads and stores are split");
+      assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
+             "Split load isn't smaller than original load");
+      assert(LI.getType()->getIntegerBitWidth() ==
+                 DL.getTypeStoreSizeInBits(LI.getType()) &&
+             "Non-byte-multiple bit width");
+      // Move the insertion point just past the load so that we can refer to it.
+      IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
+      // Create a placeholder value with the same type as LI to use as the
+      // basis for the new value. This allows us to replace the uses of LI with
+      // the computed value, and then replace the placeholder with LI, leaving
+      // LI only used for this computation.
+      Value *Placeholder =
+          new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
+                        "insert");
+      LI.replaceAllUsesWith(V);
+      Placeholder->replaceAllUsesWith(&LI);
+      delete Placeholder;
+    } else {
+      LI.replaceAllUsesWith(V);
+    }
+
+    Pass.DeadInsts.insert(&LI);
+    deleteIfTriviallyDead(OldOp);
+    DEBUG(dbgs() << "          to: " << *V << "\n");
+    return !LI.isVolatile() && !IsPtrAdjusted;
+  }
+
+  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
+    if (V->getType() != VecTy) {
+      unsigned BeginIndex = getIndex(NewBeginOffset);
+      unsigned EndIndex = getIndex(NewEndOffset);
+      assert(EndIndex > BeginIndex && "Empty vector!");
+      unsigned NumElements = EndIndex - BeginIndex;
+      assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+      Type *SliceTy = (NumElements == 1)
+                          ? ElementTy
+                          : VectorType::get(ElementTy, NumElements);
+      if (V->getType() != SliceTy)
+        V = convertValue(DL, IRB, V, SliceTy);
+
+      // Mix in the existing elements.
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+      V = insertVector(IRB, Old, V, BeginIndex, "vec");
+    }
+    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+    Pass.DeadInsts.insert(&SI);
+
+    (void)Store;
+    DEBUG(dbgs() << "          to: " << *Store << "\n");
+    return true;
+  }
+
+  bool rewriteIntegerStore(Value *V, StoreInst &SI) {
+    assert(IntTy && "We cannot extract an integer from the alloca");
+    assert(!SI.isVolatile());
+    if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
+      Value *Old =
+          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+      Old = convertValue(DL, IRB, Old, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
+    }
+    V = convertValue(DL, IRB, V, NewAllocaTy);
+    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+    Pass.DeadInsts.insert(&SI);
+    (void)Store;
+    DEBUG(dbgs() << "          to: " << *Store << "\n");
+    return true;
+  }
+
+  bool visitStoreInst(StoreInst &SI) {
+    DEBUG(dbgs() << "    original: " << SI << "\n");
+    Value *OldOp = SI.getOperand(1);
+    assert(OldOp == OldPtr);
+
+    Value *V = SI.getValueOperand();
+
+    // Strip all inbounds GEPs and pointer casts to try to dig out any root
+    // alloca that should be re-examined after promoting this alloca.
+    if (V->getType()->isPointerTy())
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
+        Pass.PostPromotionWorklist.insert(AI);
+
+    if (SliceSize < DL.getTypeStoreSize(V->getType())) {
+      assert(!SI.isVolatile());
+      assert(V->getType()->isIntegerTy() &&
+             "Only integer type loads and stores are split");
+      assert(V->getType()->getIntegerBitWidth() ==
+                 DL.getTypeStoreSizeInBits(V->getType()) &&
+             "Non-byte-multiple bit width");
+      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
+      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
+                         "extract");
+    }
+
+    if (VecTy)
+      return rewriteVectorizedStoreInst(V, SI, OldOp);
+    if (IntTy && V->getType()->isIntegerTy())
+      return rewriteIntegerStore(V, SI);
+
+    const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
+    StoreInst *NewSI;
+    if (NewBeginOffset == NewAllocaBeginOffset &&
+        NewEndOffset == NewAllocaEndOffset &&
+        (canConvertValue(DL, V->getType(), NewAllocaTy) ||
+         (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
+          V->getType()->isIntegerTy()))) {
+      // If this is an integer store past the end of slice (and thus the bytes
+      // past that point are irrelevant or this is unreachable), truncate the
+      // value prior to storing.
+      if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
+        if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+          if (VITy->getBitWidth() > AITy->getBitWidth()) {
+            if (DL.isBigEndian())
+              V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
+                                 "endian_shift");
+            V = IRB.CreateTrunc(V, AITy, "load.trunc");
+          }
+
+      V = convertValue(DL, IRB, V, NewAllocaTy);
+      NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+                                     SI.isVolatile());
+    } else {
+      Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
+      NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
+                                     SI.isVolatile());
+    }
+    if (SI.isVolatile())
+      NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope());
+    Pass.DeadInsts.insert(&SI);
+    deleteIfTriviallyDead(OldOp);
+
+    DEBUG(dbgs() << "          to: " << *NewSI << "\n");
+    return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
+  }
+
+  /// \brief Compute an integer value from splatting an i8 across the given
+  /// number of bytes.
+  ///
+  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
+  /// call this routine.
+  /// FIXME: Heed the advice above.
+  ///
+  /// \param V The i8 value to splat.
+  /// \param Size The number of bytes in the output (assuming i8 is one byte)
+  Value *getIntegerSplat(Value *V, unsigned Size) {
+    assert(Size > 0 && "Expected a positive number of bytes.");
+    IntegerType *VTy = cast<IntegerType>(V->getType());
+    assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
+    if (Size == 1)
+      return V;
+
+    Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
+    V = IRB.CreateMul(
+        IRB.CreateZExt(V, SplatIntTy, "zext"),
+        ConstantExpr::getUDiv(
+            Constant::getAllOnesValue(SplatIntTy),
+            ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
+                                  SplatIntTy)),
+        "isplat");
+    return V;
+  }
+
+  /// \brief Compute a vector splat for a given element value.
+  Value *getVectorSplat(Value *V, unsigned NumElements) {
+    V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
+    DEBUG(dbgs() << "       splat: " << *V << "\n");
+    return V;
+  }
+
+  bool visitMemSetInst(MemSetInst &II) {
+    DEBUG(dbgs() << "    original: " << II << "\n");
+    assert(II.getRawDest() == OldPtr);
+
+    // If the memset has a variable size, it cannot be split, just adjust the
+    // pointer to the new alloca.
+    if (!isa<Constant>(II.getLength())) {
+      assert(!IsSplit);
+      assert(NewBeginOffset == BeginOffset);
+      II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
+      Type *CstTy = II.getAlignmentCst()->getType();
+      II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));
+
+      deleteIfTriviallyDead(OldPtr);
+      return false;
+    }
+
+    // Record this instruction for deletion.
+    Pass.DeadInsts.insert(&II);
+
+    Type *AllocaTy = NewAI.getAllocatedType();
+    Type *ScalarTy = AllocaTy->getScalarType();
+
+    // If this doesn't map cleanly onto the alloca type, and that type isn't
+    // a single value type, just emit a memset.
+    if (!VecTy && !IntTy &&
+        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+         SliceSize != DL.getTypeStoreSize(AllocaTy) ||
+         !AllocaTy->isSingleValueType() ||
+         !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
+         DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) {
+      Type *SizeTy = II.getLength()->getType();
+      Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+      CallInst *New = IRB.CreateMemSet(
+          getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
+          getSliceAlign(), II.isVolatile());
+      (void)New;
+      DEBUG(dbgs() << "          to: " << *New << "\n");
+      return false;
+    }
+
+    // If we can represent this as a simple value, we have to build the actual
+    // value to store, which requires expanding the byte present in memset to
+    // a sensible representation for the alloca type. This is essentially
+    // splatting the byte to a sufficiently wide integer, splatting it across
+    // any desired vector width, and bitcasting to the final type.
+    Value *V;
+
+    if (VecTy) {
+      // If this is a memset of a vectorized alloca, insert it.
+      assert(ElementTy == ScalarTy);
+
+      unsigned BeginIndex = getIndex(NewBeginOffset);
+      unsigned EndIndex = getIndex(NewEndOffset);
+      assert(EndIndex > BeginIndex && "Empty vector!");
+      unsigned NumElements = EndIndex - BeginIndex;
+      assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+      Value *Splat =
+          getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8);
+      Splat = convertValue(DL, IRB, Splat, ElementTy);
+      if (NumElements > 1)
+        Splat = getVectorSplat(Splat, NumElements);
+
+      Value *Old =
+          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+      V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
+    } else if (IntTy) {
+      // If this is a memset on an alloca where we can widen stores, insert the
+      // set integer.
+      assert(!II.isVolatile());
+
+      uint64_t Size = NewEndOffset - NewBeginOffset;
+      V = getIntegerSplat(II.getValue(), Size);
+
+      if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
+                    EndOffset != NewAllocaBeginOffset)) {
+        Value *Old =
+            IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+        Old = convertValue(DL, IRB, Old, IntTy);
+        uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+        V = insertInteger(DL, IRB, Old, V, Offset, "insert");
+      } else {
+        assert(V->getType() == IntTy &&
+               "Wrong type for an alloca wide integer!");
+      }
+      V = convertValue(DL, IRB, V, AllocaTy);
+    } else {
+      // Established these invariants above.
+      assert(NewBeginOffset == NewAllocaBeginOffset);
+      assert(NewEndOffset == NewAllocaEndOffset);
+
+      V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8);
+      if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
+        V = getVectorSplat(V, AllocaVecTy->getNumElements());
+
+      V = convertValue(DL, IRB, V, AllocaTy);
+    }
+
+    Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+                                        II.isVolatile());
+    (void)New;
+    DEBUG(dbgs() << "          to: " << *New << "\n");
+    return !II.isVolatile();
+  }
+
+  bool visitMemTransferInst(MemTransferInst &II) {
+    // Rewriting of memory transfer instructions can be a bit tricky. We break
+    // them into two categories: split intrinsics and unsplit intrinsics.
+
+    DEBUG(dbgs() << "    original: " << II << "\n");
+
+    bool IsDest = &II.getRawDestUse() == OldUse;
+    assert((IsDest && II.getRawDest() == OldPtr) ||
+           (!IsDest && II.getRawSource() == OldPtr));
+
+    unsigned SliceAlign = getSliceAlign();
+
+    // For unsplit intrinsics, we simply modify the source and destination
+    // pointers in place. This isn't just an optimization, it is a matter of
+    // correctness. With unsplit intrinsics we may be dealing with transfers
+    // within a single alloca before SROA ran, or with transfers that have
+    // a variable length. We may also be dealing with memmove instead of
+    // memcpy, and so simply updating the pointers is the necessary for us to
+    // update both source and dest of a single call.
+    if (!IsSplittable) {
+      Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+      if (IsDest)
+        II.setDest(AdjustedPtr);
+      else
+        II.setSource(AdjustedPtr);
+
+      if (II.getAlignment() > SliceAlign) {
+        Type *CstTy = II.getAlignmentCst()->getType();
+        II.setAlignment(
+            ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
+      }
+
+      DEBUG(dbgs() << "          to: " << II << "\n");
+      deleteIfTriviallyDead(OldPtr);
+      return false;
+    }
+    // For split transfer intrinsics we have an incredibly useful assurance:
+    // the source and destination do not reside within the same alloca, and at
+    // least one of them does not escape. This means that we can replace
+    // memmove with memcpy, and we don't need to worry about all manner of
+    // downsides to splitting and transforming the operations.
+
+    // If this doesn't map cleanly onto the alloca type, and that type isn't
+    // a single value type, just emit a memcpy.
+    bool EmitMemCpy =
+        !VecTy && !IntTy &&
+        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+         SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) ||
+         !NewAI.getAllocatedType()->isSingleValueType());
+
+    // If we're just going to emit a memcpy, the alloca hasn't changed, and the
+    // size hasn't been shrunk based on analysis of the viable range, this is
+    // a no-op.
+    if (EmitMemCpy && &OldAI == &NewAI) {
+      // Ensure the start lines up.
+      assert(NewBeginOffset == BeginOffset);
+
+      // Rewrite the size as needed.
+      if (NewEndOffset != EndOffset)
+        II.setLength(ConstantInt::get(II.getLength()->getType(),
+                                      NewEndOffset - NewBeginOffset));
+      return false;
+    }
+    // Record this instruction for deletion.
+    Pass.DeadInsts.insert(&II);
+
+    // Strip all inbounds GEPs and pointer casts to try to dig out any root
+    // alloca that should be re-examined after rewriting this instruction.
+    Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
+    if (AllocaInst *AI =
+            dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+      assert(AI != &OldAI && AI != &NewAI &&
+             "Splittable transfers cannot reach the same alloca on both ends.");
+      Pass.Worklist.insert(AI);
+    }
+
+    Type *OtherPtrTy = OtherPtr->getType();
+    unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
+
+    // Compute the relative offset for the other pointer within the transfer.
+    unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
+    APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
+    unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
+                                   OtherOffset.zextOrTrunc(64).getZExtValue());
+
+    if (EmitMemCpy) {
+      // Compute the other pointer, folding as much as possible to produce
+      // a single, simple GEP in most cases.
+      OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+                                OtherPtr->getName() + ".");
+
+      Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+      Type *SizeTy = II.getLength()->getType();
+      Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+
+      CallInst *New = IRB.CreateMemCpy(
+          IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
+          MinAlign(SliceAlign, OtherAlign), II.isVolatile());
+      (void)New;
+      DEBUG(dbgs() << "          to: " << *New << "\n");
+      return false;
+    }
+
+    bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
+                         NewEndOffset == NewAllocaEndOffset;
+    uint64_t Size = NewEndOffset - NewBeginOffset;
+    unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
+    unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
+    unsigned NumElements = EndIndex - BeginIndex;
+    IntegerType *SubIntTy =
+        IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
+
+    // Reset the other pointer type to match the register type we're going to
+    // use, but using the address space of the original other pointer.
+    if (VecTy && !IsWholeAlloca) {
+      if (NumElements == 1)
+        OtherPtrTy = VecTy->getElementType();
+      else
+        OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements);
+
+      OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS);
+    } else if (IntTy && !IsWholeAlloca) {
+      OtherPtrTy = SubIntTy->getPointerTo(OtherAS);
+    } else {
+      OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS);
+    }
+
+    Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+                                   OtherPtr->getName() + ".");
+    unsigned SrcAlign = OtherAlign;
+    Value *DstPtr = &NewAI;
+    unsigned DstAlign = SliceAlign;
+    if (!IsDest) {
+      std::swap(SrcPtr, DstPtr);
+      std::swap(SrcAlign, DstAlign);
+    }
+
+    Value *Src;
+    if (VecTy && !IsWholeAlloca && !IsDest) {
+      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+      Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
+    } else if (IntTy && !IsWholeAlloca && !IsDest) {
+      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
+      Src = convertValue(DL, IRB, Src, IntTy);
+      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+      Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
+    } else {
+      Src =
+          IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
+    }
+
+    if (VecTy && !IsWholeAlloca && IsDest) {
+      Value *Old =
+          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+      Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
+    } else if (IntTy && !IsWholeAlloca && IsDest) {
+      Value *Old =
+          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
+      Old = convertValue(DL, IRB, Old, IntTy);
+      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+      Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
+      Src = convertValue(DL, IRB, Src, NewAllocaTy);
+    }
+
+    StoreInst *Store = cast<StoreInst>(
+        IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
+    (void)Store;
+    DEBUG(dbgs() << "          to: " << *Store << "\n");
+    return !II.isVolatile();
+  }
+
+  bool visitIntrinsicInst(IntrinsicInst &II) {
+    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+           II.getIntrinsicID() == Intrinsic::lifetime_end);
+    DEBUG(dbgs() << "    original: " << II << "\n");
+    assert(II.getArgOperand(1) == OldPtr);
+
+    // Record this instruction for deletion.
+    Pass.DeadInsts.insert(&II);
+
+    ConstantInt *Size =
+        ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+                         NewEndOffset - NewBeginOffset);
+    Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+    Value *New;
+    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
+      New = IRB.CreateLifetimeStart(Ptr, Size);
+    else
+      New = IRB.CreateLifetimeEnd(Ptr, Size);
+
+    (void)New;
+    DEBUG(dbgs() << "          to: " << *New << "\n");
+    return true;
+  }
+
+  bool visitPHINode(PHINode &PN) {
+    DEBUG(dbgs() << "    original: " << PN << "\n");
+    assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
+    assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
+
+    // We would like to compute a new pointer in only one place, but have it be
+    // as local as possible to the PHI. To do that, we re-use the location of
+    // the old pointer, which necessarily must be in the right position to
+    // dominate the PHI.
+    IRBuilderTy PtrBuilder(IRB);
+    if (isa<PHINode>(OldPtr))
+      PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
+    else
+      PtrBuilder.SetInsertPoint(OldPtr);
+    PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc());
+
+    Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType());
+    // Replace the operands which were using the old pointer.
+    std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
+
+    DEBUG(dbgs() << "          to: " << PN << "\n");
+    deleteIfTriviallyDead(OldPtr);
+
+    // PHIs can't be promoted on their own, but often can be speculated. We
+    // check the speculation outside of the rewriter so that we see the
+    // fully-rewritten alloca.
+    PHIUsers.insert(&PN);
+    return true;
+  }
+
+  bool visitSelectInst(SelectInst &SI) {
+    DEBUG(dbgs() << "    original: " << SI << "\n");
+    assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
+           "Pointer isn't an operand!");
+    assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
+    assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
+
+    Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+    // Replace the operands which were using the old pointer.
+    if (SI.getOperand(1) == OldPtr)
+      SI.setOperand(1, NewPtr);
+    if (SI.getOperand(2) == OldPtr)
+      SI.setOperand(2, NewPtr);
+
+    DEBUG(dbgs() << "          to: " << SI << "\n");
+    deleteIfTriviallyDead(OldPtr);
+
+    // Selects can't be promoted on their own, but often can be speculated. We
+    // check the speculation outside of the rewriter so that we see the
+    // fully-rewritten alloca.
+    SelectUsers.insert(&SI);
+    return true;
+  }
+};
+
+namespace {
+/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+  // Befriend the base class so it can delegate to private visit methods.
+  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+
+  /// Queue of pointer uses to analyze and potentially rewrite.
+  SmallVector<Use *, 8> Queue;
+
+  /// Set to prevent us from cycling with phi nodes and loops.
+  SmallPtrSet<User *, 8> Visited;
+
+  /// The current pointer use being rewritten. This is used to dig up the used
+  /// value (as opposed to the user).
+  Use *U;
+
+public:
+  /// Rewrite loads and stores through a pointer and all pointers derived from
+  /// it.
+  bool rewrite(Instruction &I) {
+    DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
+    enqueueUsers(I);
+    bool Changed = false;
+    while (!Queue.empty()) {
+      U = Queue.pop_back_val();
+      Changed |= visit(cast<Instruction>(U->getUser()));
+    }
+    return Changed;
+  }
+
+private:
+  /// Enqueue all the users of the given instruction for further processing.
+  /// This uses a set to de-duplicate users.
+  void enqueueUsers(Instruction &I) {
+    for (Use &U : I.uses())
+      if (Visited.insert(U.getUser()).second)
+        Queue.push_back(&U);
+  }
+
+  // Conservative default is to not rewrite anything.
+  bool visitInstruction(Instruction &I) { return false; }
+
+  /// \brief Generic recursive split emission class.
+  template <typename Derived> class OpSplitter {
+  protected:
+    /// The builder used to form new instructions.
+    IRBuilderTy IRB;
+    /// The indices which to be used with insert- or extractvalue to select the
+    /// appropriate value within the aggregate.
+    SmallVector<unsigned, 4> Indices;
+    /// The indices to a GEP instruction which will move Ptr to the correct slot
+    /// within the aggregate.
+    SmallVector<Value *, 4> GEPIndices;
+    /// The base pointer of the original op, used as a base for GEPing the
+    /// split operations.
+    Value *Ptr;
+
+    /// Initialize the splitter with an insertion point, Ptr and start with a
+    /// single zero GEP index.
+    OpSplitter(Instruction *InsertionPoint, Value *Ptr)
+        : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
+
+  public:
+    /// \brief Generic recursive split emission routine.
+    ///
+    /// This method recursively splits an aggregate op (load or store) into
+    /// scalar or vector ops. It splits recursively until it hits a single value
+    /// and emits that single value operation via the template argument.
+    ///
+    /// The logic of this routine relies on GEPs and insertvalue and
+    /// extractvalue all operating with the same fundamental index list, merely
+    /// formatted differently (GEPs need actual values).
+    ///
+    /// \param Ty  The type being split recursively into smaller ops.
+    /// \param Agg The aggregate value being built up or stored, depending on
+    /// whether this is splitting a load or a store respectively.
+    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
+      if (Ty->isSingleValueType())
+        return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
+
+      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+        unsigned OldSize = Indices.size();
+        (void)OldSize;
+        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+             ++Idx) {
+          assert(Indices.size() == OldSize && "Did not return to the old size");
+          Indices.push_back(Idx);
+          GEPIndices.push_back(IRB.getInt32(Idx));
+          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
+          GEPIndices.pop_back();
+          Indices.pop_back();
+        }
+        return;
+      }
+
+      if (StructType *STy = dyn_cast<StructType>(Ty)) {
+        unsigned OldSize = Indices.size();
+        (void)OldSize;
+        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+             ++Idx) {
+          assert(Indices.size() == OldSize && "Did not return to the old size");
+          Indices.push_back(Idx);
+          GEPIndices.push_back(IRB.getInt32(Idx));
+          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
+          GEPIndices.pop_back();
+          Indices.pop_back();
+        }
+        return;
+      }
+
+      llvm_unreachable("Only arrays and structs are aggregate loadable types");
+    }
+  };
+
+  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
+    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+        : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+
+    /// Emit a leaf load of a single value. This is called at the leaves of the
+    /// recursive emission to actually load values.
+    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+      assert(Ty->isSingleValueType());
+      // Load the single value and insert it using the indices.
+      Value *GEP =
+          IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
+      Value *Load = IRB.CreateLoad(GEP, Name + ".load");
+      Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+      DEBUG(dbgs() << "          to: " << *Load << "\n");
+    }
+  };
+
+  bool visitLoadInst(LoadInst &LI) {
+    assert(LI.getPointerOperand() == *U);
+    if (!LI.isSimple() || LI.getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being loaded, split it apart.
+    DEBUG(dbgs() << "    original: " << LI << "\n");
+    LoadOpSplitter Splitter(&LI, *U);
+    Value *V = UndefValue::get(LI.getType());
+    Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+    LI.replaceAllUsesWith(V);
+    LI.eraseFromParent();
+    return true;
+  }
+
+  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
+    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+        : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+
+    /// Emit a leaf store of a single value. This is called at the leaves of the
+    /// recursive emission to actually produce stores.
+    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+      assert(Ty->isSingleValueType());
+      // Extract the single value and store it using the indices.
+      Value *Store = IRB.CreateStore(
+          IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
+          IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"));
+      (void)Store;
+      DEBUG(dbgs() << "          to: " << *Store << "\n");
+    }
+  };
+
+  bool visitStoreInst(StoreInst &SI) {
+    if (!SI.isSimple() || SI.getPointerOperand() != *U)
+      return false;
+    Value *V = SI.getValueOperand();
+    if (V->getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being stored, split it apart.
+    DEBUG(dbgs() << "    original: " << SI << "\n");
+    StoreOpSplitter Splitter(&SI, *U);
+    Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+    SI.eraseFromParent();
+    return true;
+  }
+
+  bool visitBitCastInst(BitCastInst &BC) {
+    enqueueUsers(BC);
+    return false;
+  }
+
+  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    enqueueUsers(GEPI);
+    return false;
+  }
+
+  bool visitPHINode(PHINode &PN) {
+    enqueueUsers(PN);
+    return false;
+  }
+
+  bool visitSelectInst(SelectInst &SI) {
+    enqueueUsers(SI);
+    return false;
+  }
+};
+}
+
+/// \brief Strip aggregate type wrapping.
+///
+/// This removes no-op aggregate types wrapping an underlying type. It will
+/// strip as many layers of types as it can without changing either the type
+/// size or the allocated size.
+static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
+  if (Ty->isSingleValueType())
+    return Ty;
+
+  uint64_t AllocSize = DL.getTypeAllocSize(Ty);
+  uint64_t TypeSize = DL.getTypeSizeInBits(Ty);
+
+  Type *InnerTy;
+  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+    InnerTy = ArrTy->getElementType();
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    const StructLayout *SL = DL.getStructLayout(STy);
+    unsigned Index = SL->getElementContainingOffset(0);
+    InnerTy = STy->getElementType(Index);
+  } else {
+    return Ty;
+  }
+
+  if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
+      TypeSize > DL.getTypeSizeInBits(InnerTy))
+    return Ty;
+
+  return stripAggregateTypeWrapping(DL, InnerTy);
+}
+
+/// \brief Try to find a partition of the aggregate type passed in for a given
+/// offset and size.
+///
+/// This recurses through the aggregate type and tries to compute a subtype
+/// based on the offset and size. When the offset and size span a sub-section
+/// of an array, it will even compute a new array type for that sub-section,
+/// and the same for structs.
+///
+/// Note that this routine is very strict and tries to find a partition of the
+/// type which produces the *exact* right offset and size. It is not forgiving
+/// when the size or offset cause either end of type-based partition to be off.
+/// Also, this is a best-effort routine. It is reasonable to give up and not
+/// return a type if necessary.
+static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
+                              uint64_t Size) {
+  if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
+    return stripAggregateTypeWrapping(DL, Ty);
+  if (Offset > DL.getTypeAllocSize(Ty) ||
+      (DL.getTypeAllocSize(Ty) - Offset) < Size)
+    return nullptr;
+
+  if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
+    // We can't partition pointers...
+    if (SeqTy->isPointerTy())
+      return nullptr;
+
+    Type *ElementTy = SeqTy->getElementType();
+    uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
+    uint64_t NumSkippedElements = Offset / ElementSize;
+    if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
+      if (NumSkippedElements >= ArrTy->getNumElements())
+        return nullptr;
+    } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) {
+      if (NumSkippedElements >= VecTy->getNumElements())
+        return nullptr;
+    }
+    Offset -= NumSkippedElements * ElementSize;
+
+    // First check if we need to recurse.
+    if (Offset > 0 || Size < ElementSize) {
+      // Bail if the partition ends in a different array element.
+      if ((Offset + Size) > ElementSize)
+        return nullptr;
+      // Recurse through the element type trying to peel off offset bytes.
+      return getTypePartition(DL, ElementTy, Offset, Size);
+    }
+    assert(Offset == 0);
+
+    if (Size == ElementSize)
+      return stripAggregateTypeWrapping(DL, ElementTy);
+    assert(Size > ElementSize);
+    uint64_t NumElements = Size / ElementSize;
+    if (NumElements * ElementSize != Size)
+      return nullptr;
+    return ArrayType::get(ElementTy, NumElements);
+  }
+
+  StructType *STy = dyn_cast<StructType>(Ty);
+  if (!STy)
+    return nullptr;
+
+  const StructLayout *SL = DL.getStructLayout(STy);
+  if (Offset >= SL->getSizeInBytes())
+    return nullptr;
+  uint64_t EndOffset = Offset + Size;
+  if (EndOffset > SL->getSizeInBytes())
+    return nullptr;
+
+  unsigned Index = SL->getElementContainingOffset(Offset);
+  Offset -= SL->getElementOffset(Index);
+
+  Type *ElementTy = STy->getElementType(Index);
+  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
+  if (Offset >= ElementSize)
+    return nullptr; // The offset points into alignment padding.
+
+  // See if any partition must be contained by the element.
+  if (Offset > 0 || Size < ElementSize) {
+    if ((Offset + Size) > ElementSize)
+      return nullptr;
+    return getTypePartition(DL, ElementTy, Offset, Size);
+  }
+  assert(Offset == 0);
+
+  if (Size == ElementSize)
+    return stripAggregateTypeWrapping(DL, ElementTy);
+
+  StructType::element_iterator EI = STy->element_begin() + Index,
+                               EE = STy->element_end();
+  if (EndOffset < SL->getSizeInBytes()) {
+    unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
+    if (Index == EndIndex)
+      return nullptr; // Within a single element and its padding.
+
+    // Don't try to form "natural" types if the elements don't line up with the
+    // expected size.
+    // FIXME: We could potentially recurse down through the last element in the
+    // sub-struct to find a natural end point.
+    if (SL->getElementOffset(EndIndex) != EndOffset)
+      return nullptr;
+
+    assert(Index < EndIndex);
+    EE = STy->element_begin() + EndIndex;
+  }
+
+  // Try to build up a sub-structure.
+  StructType *SubTy =
+      StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
+  const StructLayout *SubSL = DL.getStructLayout(SubTy);
+  if (Size != SubSL->getSizeInBytes())
+    return nullptr; // The sub-struct doesn't have quite the size needed.
+
+  return SubTy;
+}
+
+/// \brief Pre-split loads and stores to simplify rewriting.
+///
+/// We want to break up the splittable load+store pairs as much as
+/// possible. This is important to do as a preprocessing step, as once we
+/// start rewriting the accesses to partitions of the alloca we lose the
+/// necessary information to correctly split apart paired loads and stores
+/// which both point into this alloca. The case to consider is something like
+/// the following:
+///
+///   %a = alloca [12 x i8]
+///   %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+///   %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+///   %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+///   %iptr1 = bitcast i8* %gep1 to i64*
+///   %iptr2 = bitcast i8* %gep2 to i64*
+///   %fptr1 = bitcast i8* %gep1 to float*
+///   %fptr2 = bitcast i8* %gep2 to float*
+///   %fptr3 = bitcast i8* %gep3 to float*
+///   store float 0.0, float* %fptr1
+///   store float 1.0, float* %fptr2
+///   %v = load i64* %iptr1
+///   store i64 %v, i64* %iptr2
+///   %f1 = load float* %fptr2
+///   %f2 = load float* %fptr3
+///
+/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
+/// promote everything so we recover the 2 SSA values that should have been
+/// there all along.
+///
+/// \returns true if any changes are made.
+bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+  DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+
+  // Track the loads and stores which are candidates for pre-splitting here, in
+  // the order they first appear during the partition scan. These give stable
+  // iteration order and a basis for tracking which loads and stores we
+  // actually split.
+  SmallVector<LoadInst *, 4> Loads;
+  SmallVector<StoreInst *, 4> Stores;
+
+  // We need to accumulate the splits required of each load or store where we
+  // can find them via a direct lookup. This is important to cross-check loads
+  // and stores against each other. We also track the slice so that we can kill
+  // all the slices that end up split.
+  struct SplitOffsets {
+    Slice *S;
+    std::vector<uint64_t> Splits;
+  };
+  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
+
+  // Track loads out of this alloca which cannot, for any reason, be pre-split.
+  // This is important as we also cannot pre-split stores of those loads!
+  // FIXME: This is all pretty gross. It means that we can be more aggressive
+  // in pre-splitting when the load feeding the store happens to come from
+  // a separate alloca. Put another way, the effectiveness of SROA would be
+  // decreased by a frontend which just concatenated all of its local allocas
+  // into one big flat alloca. But defeating such patterns is exactly the job
+  // SROA is tasked with! Sadly, to not have this discrepancy we would have
+  // change store pre-splitting to actually force pre-splitting of the load
+  // that feeds it *and all stores*. That makes pre-splitting much harder, but
+  // maybe it would make it more principled?
+  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
+
+  DEBUG(dbgs() << "  Searching for candidate loads and stores\n");
+  for (auto &P : AS.partitions()) {
+    for (Slice &S : P) {
+      Instruction *I = cast<Instruction>(S.getUse()->getUser());
+      if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) {
+        // If this was a load we have to track that it can't participate in any
+        // pre-splitting!
+        if (auto *LI = dyn_cast<LoadInst>(I))
+          UnsplittableLoads.insert(LI);
+        continue;
+      }
+      assert(P.endOffset() > S.beginOffset() &&
+             "Empty or backwards partition!");
+
+      // Determine if this is a pre-splittable slice.
+      if (auto *LI = dyn_cast<LoadInst>(I)) {
+        assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+        // The load must be used exclusively to store into other pointers for
+        // us to be able to arbitrarily pre-split it. The stores must also be
+        // simple to avoid changing semantics.
+        auto IsLoadSimplyStored = [](LoadInst *LI) {
+          for (User *LU : LI->users()) {
+            auto *SI = dyn_cast<StoreInst>(LU);
+            if (!SI || !SI->isSimple())
+              return false;
+          }
+          return true;
+        };
+        if (!IsLoadSimplyStored(LI)) {
+          UnsplittableLoads.insert(LI);
+          continue;
+        }
+
+        Loads.push_back(LI);
+      } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) {
+        if (!SI ||
+            S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+          continue;
+        auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+        if (!StoredLoad || !StoredLoad->isSimple())
+          continue;
+        assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+        Stores.push_back(SI);
+      } else {
+        // Other uses cannot be pre-split.
+        continue;
+      }
+
+      // Record the initial split.
+      DEBUG(dbgs() << "    Candidate: " << *I << "\n");
+      auto &Offsets = SplitOffsetsMap[I];
+      assert(Offsets.Splits.empty() &&
+             "Should not have splits the first time we see an instruction!");
+      Offsets.S = &S;
+      Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+    }
+
+    // Now scan the already split slices, and add a split for any of them which
+    // we're going to pre-split.
+    for (Slice *S : P.splitSliceTails()) {
+      auto SplitOffsetsMapI =
+          SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+      if (SplitOffsetsMapI == SplitOffsetsMap.end())
+        continue;
+      auto &Offsets = SplitOffsetsMapI->second;
+
+      assert(Offsets.S == S && "Found a mismatched slice!");
+      assert(!Offsets.Splits.empty() &&
+             "Cannot have an empty set of splits on the second partition!");
+      assert(Offsets.Splits.back() ==
+                 P.beginOffset() - Offsets.S->beginOffset() &&
+             "Previous split does not end where this one begins!");
+
+      // Record each split. The last partition's end isn't needed as the size
+      // of the slice dictates that.
+      if (S->endOffset() > P.endOffset())
+        Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
+    }
+  }
+
+  // We may have split loads where some of their stores are split stores. For
+  // such loads and stores, we can only pre-split them if their splits exactly
+  // match relative to their starting offset. We have to verify this prior to
+  // any rewriting.
+  Stores.erase(
+      std::remove_if(Stores.begin(), Stores.end(),
+                     [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
+                       // Lookup the load we are storing in our map of split
+                       // offsets.
+                       auto *LI = cast<LoadInst>(SI->getValueOperand());
+                       // If it was completely unsplittable, then we're done,
+                       // and this store can't be pre-split.
+                       if (UnsplittableLoads.count(LI))
+                         return true;
+
+                       auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+                       if (LoadOffsetsI == SplitOffsetsMap.end())
+                         return false; // Unrelated loads are definitely safe.
+                       auto &LoadOffsets = LoadOffsetsI->second;
+
+                       // Now lookup the store's offsets.
+                       auto &StoreOffsets = SplitOffsetsMap[SI];
+
+                       // If the relative offsets of each split in the load and
+                       // store match exactly, then we can split them and we
+                       // don't need to remove them here.
+                       if (LoadOffsets.Splits == StoreOffsets.Splits)
+                         return false;
+
+                       DEBUG(dbgs()
+                             << "    Mismatched splits for load and store:\n"
+                             << "      " << *LI << "\n"
+                             << "      " << *SI << "\n");
+
+                       // We've found a store and load that we need to split
+                       // with mismatched relative splits. Just give up on them
+                       // and remove both instructions from our list of
+                       // candidates.
+                       UnsplittableLoads.insert(LI);
+                       return true;
+                     }),
+      Stores.end());
+  // Now we have to go *back* through all the stores, because a later store may
+  // have caused an earlier store's load to become unsplittable and if it is
+  // unsplittable for the later store, then we can't rely on it being split in
+  // the earlier store either.
+  Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
+                              [&UnsplittableLoads](StoreInst *SI) {
+                                auto *LI =
+                                    cast<LoadInst>(SI->getValueOperand());
+                                return UnsplittableLoads.count(LI);
+                              }),
+               Stores.end());
+  // Once we've established all the loads that can't be split for some reason,
+  // filter any that made it into our list out.
+  Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
+                             [&UnsplittableLoads](LoadInst *LI) {
+                               return UnsplittableLoads.count(LI);
+                             }),
+              Loads.end());
+
+
+  // If no loads or stores are left, there is no pre-splitting to be done for
+  // this alloca.
+  if (Loads.empty() && Stores.empty())
+    return false;
+
+  // From here on, we can't fail and will be building new accesses, so rig up
+  // an IR builder.
+  IRBuilderTy IRB(&AI);
+
+  // Collect the new slices which we will merge into the alloca slices.
+  SmallVector<Slice, 4> NewSlices;
+
+  // Track any allocas we end up splitting loads and stores for so we iterate
+  // on them.
+  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+  // At this point, we have collected all of the loads and stores we can
+  // pre-split, and the specific splits needed for them. We actually do the
+  // splitting in a specific order in order to handle when one of the loads in
+  // the value operand to one of the stores.
+  //
+  // First, we rewrite all of the split loads, and just accumulate each split
+  // load in a parallel structure. We also build the slices for them and append
+  // them to the alloca slices.
+  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+  std::vector<LoadInst *> SplitLoads;
+  const DataLayout &DL = AI.getModule()->getDataLayout();
+  for (LoadInst *LI : Loads) {
+    SplitLoads.clear();
+
+    IntegerType *Ty = cast<IntegerType>(LI->getType());
+    uint64_t LoadSize = Ty->getBitWidth() / 8;
+    assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+    auto &Offsets = SplitOffsetsMap[LI];
+    assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+           "Slice size should always match load size exactly!");
+    uint64_t BaseOffset = Offsets.S->beginOffset();
+    assert(BaseOffset + LoadSize > BaseOffset &&
+           "Cannot represent alloca access size using 64-bit integers!");
+
+    Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+    IRB.SetInsertPoint(LI);
+
+    DEBUG(dbgs() << "  Splitting load: " << *LI << "\n");
+
+    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+    int Idx = 0, Size = Offsets.Splits.size();
+    for (;;) {
+      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+      auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+      LoadInst *PLoad = IRB.CreateAlignedLoad(
+          getAdjustedPtr(IRB, DL, BasePtr,
+                         APInt(DL.getPointerSizeInBits(), PartOffset),
+                         PartPtrTy, BasePtr->getName() + "."),
+          getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
+          LI->getName());
+
+      // Append this load onto the list of split loads so we can find it later
+      // to rewrite the stores.
+      SplitLoads.push_back(PLoad);
+
+      // Now build a new slice for the alloca.
+      NewSlices.push_back(
+          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+                &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+                /*IsSplittable*/ false));
+      DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
+                   << ", " << NewSlices.back().endOffset() << "): " << *PLoad
+                   << "\n");
+
+      // See if we've handled all the splits.
+      if (Idx >= Size)
+        break;
+
+      // Setup the next partition.
+      PartOffset = Offsets.Splits[Idx];
+      ++Idx;
+      PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+    }
+
+    // Now that we have the split loads, do the slow walk over all uses of the
+    // load and rewrite them as split stores, or save the split loads to use
+    // below if the store is going to be split there anyways.
+    bool DeferredStores = false;
+    for (User *LU : LI->users()) {
+      StoreInst *SI = cast<StoreInst>(LU);
+      if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+        DeferredStores = true;
+        DEBUG(dbgs() << "    Deferred splitting of store: " << *SI << "\n");
+        continue;
+      }
+
+      Value *StoreBasePtr = SI->getPointerOperand();
+      IRB.SetInsertPoint(SI);
+
+      DEBUG(dbgs() << "    Splitting store of load: " << *SI << "\n");
+
+      for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+        LoadInst *PLoad = SplitLoads[Idx];
+        uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+        auto *PartPtrTy =
+            PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+        StoreInst *PStore = IRB.CreateAlignedStore(
+            PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
+                                  APInt(DL.getPointerSizeInBits(), PartOffset),
+                                  PartPtrTy, StoreBasePtr->getName() + "."),
+            getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
+        (void)PStore;
+        DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
+      }
+
+      // We want to immediately iterate on any allocas impacted by splitting
+      // this store, and we have to track any promotable alloca (indicated by
+      // a direct store) as needing to be resplit because it is no longer
+      // promotable.
+      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+        ResplitPromotableAllocas.insert(OtherAI);
+        Worklist.insert(OtherAI);
+      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+                     StoreBasePtr->stripInBoundsOffsets())) {
+        Worklist.insert(OtherAI);
+      }
+
+      // Mark the original store as dead.
+      DeadInsts.insert(SI);
+    }
+
+    // Save the split loads if there are deferred stores among the users.
+    if (DeferredStores)
+      SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+    // Mark the original load as dead and kill the original slice.
+    DeadInsts.insert(LI);
+    Offsets.S->kill();
+  }
+
+  // Second, we rewrite all of the split stores. At this point, we know that
+  // all loads from this alloca have been split already. For stores of such
+  // loads, we can simply look up the pre-existing split loads. For stores of
+  // other loads, we split those loads first and then write split stores of
+  // them.
+  for (StoreInst *SI : Stores) {
+    auto *LI = cast<LoadInst>(SI->getValueOperand());
+    IntegerType *Ty = cast<IntegerType>(LI->getType());
+    uint64_t StoreSize = Ty->getBitWidth() / 8;
+    assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+    auto &Offsets = SplitOffsetsMap[SI];
+    assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+           "Slice size should always match load size exactly!");
+    uint64_t BaseOffset = Offsets.S->beginOffset();
+    assert(BaseOffset + StoreSize > BaseOffset &&
+           "Cannot represent alloca access size using 64-bit integers!");
+
+    Value *LoadBasePtr = LI->getPointerOperand();
+    Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+    DEBUG(dbgs() << "  Splitting store: " << *SI << "\n");
+
+    // Check whether we have an already split load.
+    auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+    std::vector<LoadInst *> *SplitLoads = nullptr;
+    if (SplitLoadsMapI != SplitLoadsMap.end()) {
+      SplitLoads = &SplitLoadsMapI->second;
+      assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+             "Too few split loads for the number of splits in the store!");
+    } else {
+      DEBUG(dbgs() << "          of load: " << *LI << "\n");
+    }
+
+    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+    int Idx = 0, Size = Offsets.Splits.size();
+    for (;;) {
+      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+      auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+      // Either lookup a split load or create one.
+      LoadInst *PLoad;
+      if (SplitLoads) {
+        PLoad = (*SplitLoads)[Idx];
+      } else {
+        IRB.SetInsertPoint(LI);
+        PLoad = IRB.CreateAlignedLoad(
+            getAdjustedPtr(IRB, DL, LoadBasePtr,
+                           APInt(DL.getPointerSizeInBits(), PartOffset),
+                           PartPtrTy, LoadBasePtr->getName() + "."),
+            getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
+            LI->getName());
+      }
+
+      // And store this partition.
+      IRB.SetInsertPoint(SI);
+      StoreInst *PStore = IRB.CreateAlignedStore(
+          PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
+                                APInt(DL.getPointerSizeInBits(), PartOffset),
+                                PartPtrTy, StoreBasePtr->getName() + "."),
+          getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
+
+      // Now build a new slice for the alloca.
+      NewSlices.push_back(
+          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+                &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+                /*IsSplittable*/ false));
+      DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
+                   << ", " << NewSlices.back().endOffset() << "): " << *PStore
+                   << "\n");
+      if (!SplitLoads) {
+        DEBUG(dbgs() << "      of split load: " << *PLoad << "\n");
+      }
+
+      // See if we've finished all the splits.
+      if (Idx >= Size)
+        break;
+
+      // Setup the next partition.
+      PartOffset = Offsets.Splits[Idx];
+      ++Idx;
+      PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+    }
+
+    // We want to immediately iterate on any allocas impacted by splitting
+    // this load, which is only relevant if it isn't a load of this alloca and
+    // thus we didn't already split the loads above. We also have to keep track
+    // of any promotable allocas we split loads on as they can no longer be
+    // promoted.
+    if (!SplitLoads) {
+      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+        assert(OtherAI != &AI && "We can't re-split our own alloca!");
+        ResplitPromotableAllocas.insert(OtherAI);
+        Worklist.insert(OtherAI);
+      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+                     LoadBasePtr->stripInBoundsOffsets())) {
+        assert(OtherAI != &AI && "We can't re-split our own alloca!");
+        Worklist.insert(OtherAI);
+      }
+    }
+
+    // Mark the original store as dead now that we've split it up and kill its
+    // slice. Note that we leave the original load in place unless this store
+    // was its only use. It may in turn be split up if it is an alloca load
+    // for some other alloca, but it may be a normal load. This may introduce
+    // redundant loads, but where those can be merged the rest of the optimizer
+    // should handle the merging, and this uncovers SSA splits which is more
+    // important. In practice, the original loads will almost always be fully
+    // split and removed eventually, and the splits will be merged by any
+    // trivial CSE, including instcombine.
+    if (LI->hasOneUse()) {
+      assert(*LI->user_begin() == SI && "Single use isn't this store!");
+      DeadInsts.insert(LI);
+    }
+    DeadInsts.insert(SI);
+    Offsets.S->kill();
+  }
+
+  // Remove the killed slices that have ben pre-split.
+  AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
+    return S.isDead();
+  }), AS.end());
+
+  // Insert our new slices. This will sort and merge them into the sorted
+  // sequence.
+  AS.insert(NewSlices);
+
+  DEBUG(dbgs() << "  Pre-split slices:\n");
+#ifndef NDEBUG
+  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+    DEBUG(AS.print(dbgs(), I, "    "));
+#endif
+
+  // Finally, don't try to promote any allocas that new require re-splitting.
+  // They have already been added to the worklist above.
+  PromotableAllocas.erase(
+      std::remove_if(
+          PromotableAllocas.begin(), PromotableAllocas.end(),
+          [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
+      PromotableAllocas.end());
+
+  return true;
+}
+
+/// \brief Rewrite an alloca partition's users.
+///
+/// This routine drives both of the rewriting goals of the SROA pass. It tries
+/// to rewrite uses of an alloca partition to be conducive for SSA value
+/// promotion. If the partition needs a new, more refined alloca, this will
+/// build that new alloca, preserving as much type information as possible, and
+/// rewrite the uses of the old alloca to point at the new one and have the
+/// appropriate new offsets. It also evaluates how successful the rewrite was
+/// at enabling promotion and if it was successful queues the alloca to be
+/// promoted.
+AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+                                   Partition &P) {
+  // Try to compute a friendly type for this partition of the alloca. This
+  // won't always succeed, in which case we fall back to a legal integer type
+  // or an i8 array of an appropriate size.
+  Type *SliceTy = nullptr;
+  const DataLayout &DL = AI.getModule()->getDataLayout();
+  if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
+    if (DL.getTypeAllocSize(CommonUseTy) >= P.size())
+      SliceTy = CommonUseTy;
+  if (!SliceTy)
+    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
+                                                 P.beginOffset(), P.size()))
+      SliceTy = TypePartitionTy;
+  if ((!SliceTy || (SliceTy->isArrayTy() &&
+                    SliceTy->getArrayElementType()->isIntegerTy())) &&
+      DL.isLegalInteger(P.size() * 8))
+    SliceTy = Type::getIntNTy(*C, P.size() * 8);
+  if (!SliceTy)
+    SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
+  assert(DL.getTypeAllocSize(SliceTy) >= P.size());
+
+  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+
+  VectorType *VecTy =
+      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
+  if (VecTy)
+    SliceTy = VecTy;
+
+  // Check for the case where we're going to rewrite to a new alloca of the
+  // exact same type as the original, and with the same access offsets. In that
+  // case, re-use the existing alloca, but still run through the rewriter to
+  // perform phi and select speculation.
+  AllocaInst *NewAI;
+  if (SliceTy == AI.getAllocatedType()) {
+    assert(P.beginOffset() == 0 &&
+           "Non-zero begin offset but same alloca type");
+    NewAI = &AI;
+    // FIXME: We should be able to bail at this point with "nothing changed".
+    // FIXME: We might want to defer PHI speculation until after here.
+    // FIXME: return nullptr;
+  } else {
+    unsigned Alignment = AI.getAlignment();
+    if (!Alignment) {
+      // The minimum alignment which users can rely on when the explicit
+      // alignment is omitted or zero is that required by the ABI for this
+      // type.
+      Alignment = DL.getABITypeAlignment(AI.getAllocatedType());
+    }
+    Alignment = MinAlign(Alignment, P.beginOffset());
+    // If we will get at least this much alignment from the type alone, leave
+    // the alloca's alignment unconstrained.
+    if (Alignment <= DL.getABITypeAlignment(SliceTy))
+      Alignment = 0;
+    NewAI = new AllocaInst(
+        SliceTy, nullptr, Alignment,
+        AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
+    ++NumNewAllocas;
+  }
+
+  DEBUG(dbgs() << "Rewriting alloca partition "
+               << "[" << P.beginOffset() << "," << P.endOffset()
+               << ") to: " << *NewAI << "\n");
+
+  // Track the high watermark on the worklist as it is only relevant for
+  // promoted allocas. We will reset it to this point if the alloca is not in
+  // fact scheduled for promotion.
+  unsigned PPWOldSize = PostPromotionWorklist.size();
+  unsigned NumUses = 0;
+  SmallPtrSet<PHINode *, 8> PHIUsers;
+  SmallPtrSet<SelectInst *, 8> SelectUsers;
+
+  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
+                               P.endOffset(), IsIntegerPromotable, VecTy,
+                               PHIUsers, SelectUsers);
+  bool Promotable = true;
+  for (Slice *S : P.splitSliceTails()) {
+    Promotable &= Rewriter.visit(S);
+    ++NumUses;
+  }
+  for (Slice &S : P) {
+    Promotable &= Rewriter.visit(&S);
+    ++NumUses;
+  }
+
+  NumAllocaPartitionUses += NumUses;
+  MaxUsesPerAllocaPartition =
+      std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
+
+  // Now that we've processed all the slices in the new partition, check if any
+  // PHIs or Selects would block promotion.
+  for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
+                                            E = PHIUsers.end();
+       I != E; ++I)
+    if (!isSafePHIToSpeculate(**I)) {
+      Promotable = false;
+      PHIUsers.clear();
+      SelectUsers.clear();
+      break;
+    }
+  for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
+                                               E = SelectUsers.end();
+       I != E; ++I)
+    if (!isSafeSelectToSpeculate(**I)) {
+      Promotable = false;
+      PHIUsers.clear();
+      SelectUsers.clear();
+      break;
+    }
+
+  if (Promotable) {
+    if (PHIUsers.empty() && SelectUsers.empty()) {
+      // Promote the alloca.
+      PromotableAllocas.push_back(NewAI);
+    } else {
+      // If we have either PHIs or Selects to speculate, add them to those
+      // worklists and re-queue the new alloca so that we promote in on the
+      // next iteration.
+      for (PHINode *PHIUser : PHIUsers)
+        SpeculatablePHIs.insert(PHIUser);
+      for (SelectInst *SelectUser : SelectUsers)
+        SpeculatableSelects.insert(SelectUser);
+      Worklist.insert(NewAI);
+    }
+  } else {
+    // If we can't promote the alloca, iterate on it to check for new
+    // refinements exposed by splitting the current alloca. Don't iterate on an
+    // alloca which didn't actually change and didn't get promoted.
+    if (NewAI != &AI)
+      Worklist.insert(NewAI);
+
+    // Drop any post-promotion work items if promotion didn't happen.
+    while (PostPromotionWorklist.size() > PPWOldSize)
+      PostPromotionWorklist.pop_back();
+  }
+
+  return NewAI;
+}
+
+/// \brief Walks the slices of an alloca and form partitions based on them,
+/// rewriting each of their uses.
+bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
+  if (AS.begin() == AS.end())
+    return false;
+
+  unsigned NumPartitions = 0;
+  bool Changed = false;
+  const DataLayout &DL = AI.getModule()->getDataLayout();
+
+  // First try to pre-split loads and stores.
+  Changed |= presplitLoadsAndStores(AI, AS);
+
+  // Now that we have identified any pre-splitting opportunities, mark any
+  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+  // to split these during pre-splitting, we want to force them to be
+  // rewritten into a partition.
+  bool IsSorted = true;
+  for (Slice &S : AS) {
+    if (!S.isSplittable())
+      continue;
+    // FIXME: We currently leave whole-alloca splittable loads and stores. This
+    // used to be the only splittable loads and stores and we need to be
+    // confident that the above handling of splittable loads and stores is
+    // completely sufficient before we forcibly disable the remaining handling.
+    if (S.beginOffset() == 0 &&
+        S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
+      continue;
+    if (isa<LoadInst>(S.getUse()->getUser()) ||
+        isa<StoreInst>(S.getUse()->getUser())) {
+      S.makeUnsplittable();
+      IsSorted = false;
+    }
+  }
+  if (!IsSorted)
+    std::sort(AS.begin(), AS.end());
+
+  /// \brief Describes the allocas introduced by rewritePartition
+  /// in order to migrate the debug info.
+  struct Piece {
+    AllocaInst *Alloca;
+    uint64_t Offset;
+    uint64_t Size;
+    Piece(AllocaInst *AI, uint64_t O, uint64_t S)
+      : Alloca(AI), Offset(O), Size(S) {}
+  };
+  SmallVector<Piece, 4> Pieces;
+
+  // Rewrite each partition.
+  for (auto &P : AS.partitions()) {
+    if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
+      Changed = true;
+      if (NewAI != &AI) {
+        uint64_t SizeOfByte = 8;
+        uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
+        // Don't include any padding.
+        uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
+        Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
+      }
+    }
+    ++NumPartitions;
+  }
+
+  NumAllocaPartitions += NumPartitions;
+  MaxPartitionsPerAlloca =
+      std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
+
+  // Migrate debug information from the old alloca to the new alloca(s)
+  // and the individual partitions.
+  if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
+    auto *Var = DbgDecl->getVariable();
+    auto *Expr = DbgDecl->getExpression();
+    DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
+    bool IsSplit = Pieces.size() > 1;
+    for (auto Piece : Pieces) {
+      // Create a piece expression describing the new partition or reuse AI's
+      // expression if there is only one partition.
+      auto *PieceExpr = Expr;
+      if (IsSplit || Expr->isBitPiece()) {
+        // If this alloca is already a scalar replacement of a larger aggregate,
+        // Piece.Offset describes the offset inside the scalar.
+        uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0;
+        uint64_t Start = Offset + Piece.Offset;
+        uint64_t Size = Piece.Size;
+        if (Expr->isBitPiece()) {
+          uint64_t AbsEnd = Expr->getBitPieceOffset() + Expr->getBitPieceSize();
+          if (Start >= AbsEnd)
+            // No need to describe a SROAed padding.
+            continue;
+          Size = std::min(Size, AbsEnd - Start);
+        }
+        PieceExpr = DIB.createBitPieceExpression(Start, Size);
+      }
+
+      // Remove any existing dbg.declare intrinsic describing the same alloca.
+      if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
+        OldDDI->eraseFromParent();
+
+      DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, DbgDecl->getDebugLoc(),
+                        &AI);
+    }
+  }
+  return Changed;
+}
+
+/// \brief Clobber a use with undef, deleting the used value if it becomes dead.
+void SROA::clobberUse(Use &U) {
+  Value *OldV = U;
+  // Replace the use with an undef value.
+  U = UndefValue::get(OldV->getType());
+
+  // Check for this making an instruction dead. We have to garbage collect
+  // all the dead instructions to ensure the uses of any alloca end up being
+  // minimal.
+  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
+    if (isInstructionTriviallyDead(OldI)) {
+      DeadInsts.insert(OldI);
+    }
+}
+
+/// \brief Analyze an alloca for SROA.
+///
+/// This analyzes the alloca to ensure we can reason about it, builds
+/// the slices of the alloca, and then hands it off to be split and
+/// rewritten as needed.
+bool SROA::runOnAlloca(AllocaInst &AI) {
+  DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+  ++NumAllocasAnalyzed;
+
+  // Special case dead allocas, as they're trivial.
+  if (AI.use_empty()) {
+    AI.eraseFromParent();
+    return true;
+  }
+  const DataLayout &DL = AI.getModule()->getDataLayout();
+
+  // Skip alloca forms that this analysis can't handle.
+  if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
+      DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
+    return false;
+
+  bool Changed = false;
+
+  // First, split any FCA loads and stores touching this alloca to promote
+  // better splitting and promotion opportunities.
+  AggLoadStoreRewriter AggRewriter;
+  Changed |= AggRewriter.rewrite(AI);
+
+  // Build the slices using a recursive instruction-visiting builder.
+  AllocaSlices AS(DL, AI);
+  DEBUG(AS.print(dbgs()));
+  if (AS.isEscaped())
+    return Changed;
+
+  // Delete all the dead users of this alloca before splitting and rewriting it.
+  for (Instruction *DeadUser : AS.getDeadUsers()) {
+    // Free up everything used by this instruction.
+    for (Use &DeadOp : DeadUser->operands())
+      clobberUse(DeadOp);
+
+    // Now replace the uses of this instruction.
+    DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
+
+    // And mark it for deletion.
+    DeadInsts.insert(DeadUser);
+    Changed = true;
+  }
+  for (Use *DeadOp : AS.getDeadOperands()) {
+    clobberUse(*DeadOp);
+    Changed = true;
+  }
+
+  // No slices to split. Leave the dead alloca for a later pass to clean up.
+  if (AS.begin() == AS.end())
+    return Changed;
+
+  Changed |= splitAlloca(AI, AS);
+
+  DEBUG(dbgs() << "  Speculating PHIs\n");
+  while (!SpeculatablePHIs.empty())
+    speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
+
+  DEBUG(dbgs() << "  Speculating Selects\n");
+  while (!SpeculatableSelects.empty())
+    speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
+
+  return Changed;
+}
+
+/// \brief Delete the dead instructions accumulated in this run.
+///
+/// Recursively deletes the dead instructions we've accumulated. This is done
+/// at the very end to maximize locality of the recursive delete and to
+/// minimize the problems of invalidated instruction pointers as such pointers
+/// are used heavily in the intermediate stages of the algorithm.
+///
+/// We also record the alloca instructions deleted here so that they aren't
+/// subsequently handed to mem2reg to promote.
+void SROA::deleteDeadInstructions(
+    SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
+  while (!DeadInsts.empty()) {
+    Instruction *I = DeadInsts.pop_back_val();
+    DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+
+    I->replaceAllUsesWith(UndefValue::get(I->getType()));
+
+    for (Use &Operand : I->operands())
+      if (Instruction *U = dyn_cast<Instruction>(Operand)) {
+        // Zero out the operand and see if it becomes trivially dead.
+        Operand = nullptr;
+        if (isInstructionTriviallyDead(U))
+          DeadInsts.insert(U);
+      }
+
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+      DeletedAllocas.insert(AI);
+      if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI))
+        DbgDecl->eraseFromParent();
+    }
+
+    ++NumDeleted;
+    I->eraseFromParent();
+  }
+}
+
+/// \brief Promote the allocas, using the best available technique.
+///
+/// This attempts to promote whatever allocas have been identified as viable in
+/// the PromotableAllocas list. If that list is empty, there is nothing to do.
+/// This function returns whether any promotion occurred.
+bool SROA::promoteAllocas(Function &F) {
+  if (PromotableAllocas.empty())
+    return false;
+
+  NumPromoted += PromotableAllocas.size();
+
+  DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+  PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
+  PromotableAllocas.clear();
+  return true;
+}
+
+PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
+                                AssumptionCache &RunAC) {
+  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+  C = &F.getContext();
+  DT = &RunDT;
+  AC = &RunAC;
+
+  BasicBlock &EntryBB = F.getEntryBlock();
+  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
+       I != E; ++I) {
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+      Worklist.insert(AI);
+  }
+
+  bool Changed = false;
+  // A set of deleted alloca instruction pointers which should be removed from
+  // the list of promotable allocas.
+  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
+
+  do {
+    while (!Worklist.empty()) {
+      Changed |= runOnAlloca(*Worklist.pop_back_val());
+      deleteDeadInstructions(DeletedAllocas);
+
+      // Remove the deleted allocas from various lists so that we don't try to
+      // continue processing them.
+      if (!DeletedAllocas.empty()) {
+        auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
+        Worklist.remove_if(IsInSet);
+        PostPromotionWorklist.remove_if(IsInSet);
+        PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
+                                               PromotableAllocas.end(),
+                                               IsInSet),
+                                PromotableAllocas.end());
+        DeletedAllocas.clear();
+      }
+    }
+
+    Changed |= promoteAllocas(F);
+
+    Worklist = PostPromotionWorklist;
+    PostPromotionWorklist.clear();
+  } while (!Worklist.empty());
+
+  // FIXME: Even when promoting allocas we should preserve some abstract set of
+  // CFG-specific analyses.
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> *AM) {
+  return runImpl(F, AM->getResult<DominatorTreeAnalysis>(F),
+                 AM->getResult<AssumptionAnalysis>(F));
+}
+
+/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
+///
+/// This is in the llvm namespace purely to allow it to be a friend of the \c
+/// SROA pass.
+class llvm::sroa::SROALegacyPass : public FunctionPass {
+  /// The SROA implementation.
+  SROA Impl;
+
+public:
+  SROALegacyPass() : FunctionPass(ID) {
+    initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override {
+    if (skipOptnoneFunction(F))
+      return false;
+
+    auto PA = Impl.runImpl(
+        F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
+    return !PA.areAllPreserved();
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+  const char *getPassName() const override { return "SROA"; }
+  static char ID;
+};
+
+char SROALegacyPass::ID = 0;
+
+FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
+                      "Scalar Replacement Of Aggregates", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
+                    false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
new file mode 100644
index 0000000..52d477c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -0,0 +1,244 @@
+//===-- Scalar.cpp --------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMScalarOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+
+using namespace llvm;
+
+/// initializeScalarOptsPasses - Initialize all passes linked into the
+/// ScalarOpts library.
+void llvm::initializeScalarOpts(PassRegistry &Registry) {
+  initializeADCELegacyPassPass(Registry);
+  initializeBDCEPass(Registry);
+  initializeAlignmentFromAssumptionsPass(Registry);
+  initializeConstantHoistingPass(Registry);
+  initializeConstantPropagationPass(Registry);
+  initializeCorrelatedValuePropagationPass(Registry);
+  initializeDCEPass(Registry);
+  initializeDeadInstEliminationPass(Registry);
+  initializeScalarizerPass(Registry);
+  initializeDSEPass(Registry);
+  initializeGVNPass(Registry);
+  initializeEarlyCSELegacyPassPass(Registry);
+  initializeFlattenCFGPassPass(Registry);
+  initializeInductiveRangeCheckEliminationPass(Registry);
+  initializeIndVarSimplifyPass(Registry);
+  initializeJumpThreadingPass(Registry);
+  initializeLICMPass(Registry);
+  initializeLoopDeletionPass(Registry);
+  initializeLoopAccessAnalysisPass(Registry);
+  initializeLoopInstSimplifyPass(Registry);
+  initializeLoopInterchangePass(Registry);
+  initializeLoopRotatePass(Registry);
+  initializeLoopStrengthReducePass(Registry);
+  initializeLoopRerollPass(Registry);
+  initializeLoopUnrollPass(Registry);
+  initializeLoopUnswitchPass(Registry);
+  initializeLoopIdiomRecognizePass(Registry);
+  initializeLowerAtomicPass(Registry);
+  initializeLowerExpectIntrinsicPass(Registry);
+  initializeMemCpyOptPass(Registry);
+  initializeMergedLoadStoreMotionPass(Registry);
+  initializeNaryReassociatePass(Registry);
+  initializePartiallyInlineLibCallsPass(Registry);
+  initializeReassociatePass(Registry);
+  initializeRegToMemPass(Registry);
+  initializeRewriteStatepointsForGCPass(Registry);
+  initializeSCCPPass(Registry);
+  initializeIPSCCPPass(Registry);
+  initializeSROALegacyPassPass(Registry);
+  initializeSROA_DTPass(Registry);
+  initializeSROA_SSAUpPass(Registry);
+  initializeCFGSimplifyPassPass(Registry);
+  initializeStructurizeCFGPass(Registry);
+  initializeSinkingPass(Registry);
+  initializeTailCallElimPass(Registry);
+  initializeSeparateConstOffsetFromGEPPass(Registry);
+  initializeSpeculativeExecutionPass(Registry);
+  initializeStraightLineStrengthReducePass(Registry);
+  initializeLoadCombinePass(Registry);
+  initializePlaceBackedgeSafepointsImplPass(Registry);
+  initializePlaceSafepointsPass(Registry);
+  initializeFloat2IntPass(Registry);
+  initializeLoopDistributePass(Registry);
+  initializeLoopLoadEliminationPass(Registry);
+}
+
+void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
+  initializeScalarOpts(*unwrap(R));
+}
+
+void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createAggressiveDCEPass());
+}
+
+void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createBitTrackingDCEPass());
+}
+
+void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createAlignmentFromAssumptionsPass());
+}
+
+void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createCFGSimplificationPass());
+}
+
+void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createDeadStoreEliminationPass());
+}
+
+void LLVMAddScalarizerPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createScalarizerPass());
+}
+
+void LLVMAddGVNPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createGVNPass());
+}
+
+void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createMergedLoadStoreMotionPass());
+}
+
+void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createIndVarSimplifyPass());
+}
+
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createInstructionCombiningPass());
+}
+
+void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createJumpThreadingPass());
+}
+
+void LLVMAddLICMPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLICMPass());
+}
+
+void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopDeletionPass());
+}
+
+void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopIdiomPass());
+}
+
+void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopRotatePass());
+}
+
+void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopRerollPass());
+}
+
+void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopUnrollPass());
+}
+
+void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopUnswitchPass());
+}
+
+void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createMemCpyOptPass());
+}
+
+void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createPartiallyInlineLibCallsPass());
+}
+
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLowerSwitchPass());
+}
+
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createPromoteMemoryToRegisterPass());
+}
+
+void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createReassociatePass());
+}
+
+void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createSCCPPass());
+}
+
+void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createScalarReplAggregatesPass());
+}
+
+void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createScalarReplAggregatesPass(-1, false));
+}
+
+void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
+                                                  int Threshold) {
+  unwrap(PM)->add(createScalarReplAggregatesPass(Threshold));
+}
+
+void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
+  // NOTE: The simplify-libcalls pass has been removed.
+}
+
+void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createTailCallEliminationPass());
+}
+
+void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createConstantPropagationPass());
+}
+
+void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createDemoteRegisterToMemoryPass());
+}
+
+void LLVMAddVerifierPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createVerifierPass());
+}
+
+void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createCorrelatedValuePropagationPass());
+}
+
+void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createEarlyCSEPass());
+}
+
+void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createTypeBasedAAWrapperPass());
+}
+
+void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createScopedNoAliasAAWrapperPass());
+}
+
+void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createBasicAAWrapperPass());
+}
+
+void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLowerExpectIntrinsicPass());
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
new file mode 100644
index 0000000..114d22d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -0,0 +1,2630 @@
+//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation implements the well known scalar replacement of
+// aggregates transformation.  This xform breaks up alloca instructions of
+// aggregate type (structure or array) into individual alloca instructions for
+// each member (if possible).  Then, if possible, it transforms the individual
+// alloca instructions into nice clean scalar SSA form.
+//
+// This combines a simple SRoA algorithm with the Mem2Reg algorithm because they
+// often interact, especially for C++ programs.  As such, iterating between
+// SRoA, then Mem2Reg until we run out of things to promote works well.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarrepl"
+
+STATISTIC(NumReplaced,  "Number of allocas broken up");
+STATISTIC(NumPromoted,  "Number of allocas promoted");
+STATISTIC(NumAdjusted,  "Number of scalar allocas adjusted to allow promotion");
+STATISTIC(NumConverted, "Number of aggregates converted to scalar");
+
+namespace {
+#define SROA SROA_
+  struct SROA : public FunctionPass {
+    SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT)
+      : FunctionPass(ID), HasDomTree(hasDT) {
+      if (T == -1)
+        SRThreshold = 128;
+      else
+        SRThreshold = T;
+      if (ST == -1)
+        StructMemberThreshold = 32;
+      else
+        StructMemberThreshold = ST;
+      if (AT == -1)
+        ArrayElementThreshold = 8;
+      else
+        ArrayElementThreshold = AT;
+      if (SLT == -1)
+        // Do not limit the scalar integer load size if no threshold is given.
+        ScalarLoadThreshold = -1;
+      else
+        ScalarLoadThreshold = SLT;
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    bool performScalarRepl(Function &F);
+    bool performPromotion(Function &F);
+
+  private:
+    bool HasDomTree;
+
+    /// DeadInsts - Keep track of instructions we have made dead, so that
+    /// we can remove them after we are done working.
+    SmallVector<Value*, 32> DeadInsts;
+
+    /// AllocaInfo - When analyzing uses of an alloca instruction, this captures
+    /// information about the uses.  All these fields are initialized to false
+    /// and set to true when something is learned.
+    struct AllocaInfo {
+      /// The alloca to promote.
+      AllocaInst *AI;
+
+      /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite
+      /// looping and avoid redundant work.
+      SmallPtrSet<PHINode*, 8> CheckedPHIs;
+
+      /// isUnsafe - This is set to true if the alloca cannot be SROA'd.
+      bool isUnsafe : 1;
+
+      /// isMemCpySrc - This is true if this aggregate is memcpy'd from.
+      bool isMemCpySrc : 1;
+
+      /// isMemCpyDst - This is true if this aggregate is memcpy'd into.
+      bool isMemCpyDst : 1;
+
+      /// hasSubelementAccess - This is true if a subelement of the alloca is
+      /// ever accessed, or false if the alloca is only accessed with mem
+      /// intrinsics or load/store that only access the entire alloca at once.
+      bool hasSubelementAccess : 1;
+
+      /// hasALoadOrStore - This is true if there are any loads or stores to it.
+      /// The alloca may just be accessed with memcpy, for example, which would
+      /// not set this.
+      bool hasALoadOrStore : 1;
+
+      explicit AllocaInfo(AllocaInst *ai)
+        : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false),
+          hasSubelementAccess(false), hasALoadOrStore(false) {}
+    };
+
+    /// SRThreshold - The maximum alloca size to considered for SROA.
+    unsigned SRThreshold;
+
+    /// StructMemberThreshold - The maximum number of members a struct can
+    /// contain to be considered for SROA.
+    unsigned StructMemberThreshold;
+
+    /// ArrayElementThreshold - The maximum number of elements an array can
+    /// have to be considered for SROA.
+    unsigned ArrayElementThreshold;
+
+    /// ScalarLoadThreshold - The maximum size in bits of scalars to load when
+    /// converting to scalar
+    unsigned ScalarLoadThreshold;
+
+    void MarkUnsafe(AllocaInfo &I, Instruction *User) {
+      I.isUnsafe = true;
+      DEBUG(dbgs() << "  Transformation preventing inst: " << *User << '\n');
+    }
+
+    bool isSafeAllocaToScalarRepl(AllocaInst *AI);
+
+    void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info);
+    void isSafePHISelectUseForScalarRepl(Instruction *User, uint64_t Offset,
+                                         AllocaInfo &Info);
+    void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info);
+    void isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
+                         Type *MemOpType, bool isStore, AllocaInfo &Info,
+                         Instruction *TheAccess, bool AllowWholeAccess);
+    bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size,
+                          const DataLayout &DL);
+    uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy,
+                                  const DataLayout &DL);
+
+    void DoScalarReplacement(AllocaInst *AI,
+                             std::vector<AllocaInst*> &WorkList);
+    void DeleteDeadInstructions();
+
+    void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
+                              SmallVectorImpl<AllocaInst *> &NewElts);
+    void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
+                        SmallVectorImpl<AllocaInst *> &NewElts);
+    void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
+                    SmallVectorImpl<AllocaInst *> &NewElts);
+    void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
+                                  uint64_t Offset,
+                                  SmallVectorImpl<AllocaInst *> &NewElts);
+    void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
+                                      AllocaInst *AI,
+                                      SmallVectorImpl<AllocaInst *> &NewElts);
+    void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
+                                       SmallVectorImpl<AllocaInst *> &NewElts);
+    void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
+                                      SmallVectorImpl<AllocaInst *> &NewElts);
+    bool ShouldAttemptScalarRepl(AllocaInst *AI);
+  };
+
+  // SROA_DT - SROA that uses DominatorTree.
+  struct SROA_DT : public SROA {
+    static char ID;
+  public:
+    SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1) :
+        SROA(T, true, ID, ST, AT, SLT) {
+      initializeSROA_DTPass(*PassRegistry::getPassRegistry());
+    }
+
+    // getAnalysisUsage - This pass does not require any passes, but we know it
+    // will not alter the CFG, so say so.
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.setPreservesCFG();
+    }
+  };
+
+  // SROA_SSAUp - SROA that uses SSAUpdater.
+  struct SROA_SSAUp : public SROA {
+    static char ID;
+  public:
+    SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1) :
+        SROA(T, false, ID, ST, AT, SLT) {
+      initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry());
+    }
+
+    // getAnalysisUsage - This pass does not require any passes, but we know it
+    // will not alter the CFG, so say so.
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.setPreservesCFG();
+    }
+  };
+
+}
+
+char SROA_DT::ID = 0;
+char SROA_SSAUp::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
+                "Scalar Replacement of Aggregates (DT)", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
+                "Scalar Replacement of Aggregates (DT)", false, false)
+
+INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa",
+                      "Scalar Replacement of Aggregates (SSAUp)", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
+                    "Scalar Replacement of Aggregates (SSAUp)", false, false)
+
+// Public interface to the ScalarReplAggregates pass
+FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold,
+                                                   bool UseDomTree,
+                                                   int StructMemberThreshold,
+                                                   int ArrayElementThreshold,
+                                                   int ScalarLoadThreshold) {
+  if (UseDomTree)
+    return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold,
+                       ScalarLoadThreshold);
+  return new SROA_SSAUp(Threshold, StructMemberThreshold,
+                        ArrayElementThreshold, ScalarLoadThreshold);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Convert To Scalar Optimization.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// ConvertToScalarInfo - This class implements the "Convert To Scalar"
+/// optimization, which scans the uses of an alloca and determines if it can
+/// rewrite it in terms of a single new alloca that can be mem2reg'd.
+class ConvertToScalarInfo {
+  /// AllocaSize - The size of the alloca being considered in bytes.
+  unsigned AllocaSize;
+  const DataLayout &DL;
+  unsigned ScalarLoadThreshold;
+
+  /// IsNotTrivial - This is set to true if there is some access to the object
+  /// which means that mem2reg can't promote it.
+  bool IsNotTrivial;
+
+  /// ScalarKind - Tracks the kind of alloca being considered for promotion,
+  /// computed based on the uses of the alloca rather than the LLVM type system.
+  enum {
+    Unknown,
+
+    // Accesses via GEPs that are consistent with element access of a vector
+    // type. This will not be converted into a vector unless there is a later
+    // access using an actual vector type.
+    ImplicitVector,
+
+    // Accesses via vector operations and GEPs that are consistent with the
+    // layout of a vector type.
+    Vector,
+
+    // An integer bag-of-bits with bitwise operations for insertion and
+    // extraction. Any combination of types can be converted into this kind
+    // of scalar.
+    Integer
+  } ScalarKind;
+
+  /// VectorTy - This tracks the type that we should promote the vector to if
+  /// it is possible to turn it into a vector.  This starts out null, and if it
+  /// isn't possible to turn into a vector type, it gets set to VoidTy.
+  VectorType *VectorTy;
+
+  /// HadNonMemTransferAccess - True if there is at least one access to the
+  /// alloca that is not a MemTransferInst.  We don't want to turn structs into
+  /// large integers unless there is some potential for optimization.
+  bool HadNonMemTransferAccess;
+
+  /// HadDynamicAccess - True if some element of this alloca was dynamic.
+  /// We don't yet have support for turning a dynamic access into a large
+  /// integer.
+  bool HadDynamicAccess;
+
+public:
+  explicit ConvertToScalarInfo(unsigned Size, const DataLayout &DL,
+                               unsigned SLT)
+    : AllocaSize(Size), DL(DL), ScalarLoadThreshold(SLT), IsNotTrivial(false),
+    ScalarKind(Unknown), VectorTy(nullptr), HadNonMemTransferAccess(false),
+    HadDynamicAccess(false) { }
+
+  AllocaInst *TryConvert(AllocaInst *AI);
+
+private:
+  bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx);
+  void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset);
+  bool MergeInVectorType(VectorType *VInTy, uint64_t Offset);
+  void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset,
+                           Value *NonConstantIdx);
+
+  Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType,
+                                    uint64_t Offset, Value* NonConstantIdx,
+                                    IRBuilder<> &Builder);
+  Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
+                                   uint64_t Offset, Value* NonConstantIdx,
+                                   IRBuilder<> &Builder);
+};
+} // end anonymous namespace.
+
+
+/// TryConvert - Analyze the specified alloca, and if it is safe to do so,
+/// rewrite it to be a new alloca which is mem2reg'able.  This returns the new
+/// alloca if possible or null if not.
+AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
+  // If we can't convert this scalar, or if mem2reg can trivially do it, bail
+  // out.
+  if (!CanConvertToScalar(AI, 0, nullptr) || !IsNotTrivial)
+    return nullptr;
+
+  // If an alloca has only memset / memcpy uses, it may still have an Unknown
+  // ScalarKind. Treat it as an Integer below.
+  if (ScalarKind == Unknown)
+    ScalarKind = Integer;
+
+  if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8)
+    ScalarKind = Integer;
+
+  // If we were able to find a vector type that can handle this with
+  // insert/extract elements, and if there was at least one use that had
+  // a vector type, promote this to a vector.  We don't want to promote
+  // random stuff that doesn't use vectors (e.g. <9 x double>) because then
+  // we just get a lot of insert/extracts.  If at least one vector is
+  // involved, then we probably really do have a union of vector/array.
+  Type *NewTy;
+  if (ScalarKind == Vector) {
+    assert(VectorTy && "Missing type for vector scalar.");
+    DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n  TYPE = "
+          << *VectorTy << '\n');
+    NewTy = VectorTy;  // Use the vector type.
+  } else {
+    unsigned BitWidth = AllocaSize * 8;
+
+    // Do not convert to scalar integer if the alloca size exceeds the
+    // scalar load threshold.
+    if (BitWidth > ScalarLoadThreshold)
+      return nullptr;
+
+    if ((ScalarKind == ImplicitVector || ScalarKind == Integer) &&
+        !HadNonMemTransferAccess && !DL.fitsInLegalInteger(BitWidth))
+      return nullptr;
+    // Dynamic accesses on integers aren't yet supported.  They need us to shift
+    // by a dynamic amount which could be difficult to work out as we might not
+    // know whether to use a left or right shift.
+    if (ScalarKind == Integer && HadDynamicAccess)
+      return nullptr;
+
+    DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
+    // Create and insert the integer alloca.
+    NewTy = IntegerType::get(AI->getContext(), BitWidth);
+  }
+  AllocaInst *NewAI =
+      new AllocaInst(NewTy, nullptr, "", &AI->getParent()->front());
+  ConvertUsesToScalar(AI, NewAI, 0, nullptr);
+  return NewAI;
+}
+
+/// MergeInTypeForLoadOrStore - Add the 'In' type to the accumulated vector type
+/// (VectorTy) so far at the offset specified by Offset (which is specified in
+/// bytes).
+///
+/// There are two cases we handle here:
+///   1) A union of vector types of the same size and potentially its elements.
+///      Here we turn element accesses into insert/extract element operations.
+///      This promotes a <4 x float> with a store of float to the third element
+///      into a <4 x float> that uses insert element.
+///   2) A fully general blob of memory, which we turn into some (potentially
+///      large) integer type with extract and insert operations where the loads
+///      and stores would mutate the memory.  We mark this by setting VectorTy
+///      to VoidTy.
+void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In,
+                                                    uint64_t Offset) {
+  // If we already decided to turn this into a blob of integer memory, there is
+  // nothing to be done.
+  if (ScalarKind == Integer)
+    return;
+
+  // If this could be contributing to a vector, analyze it.
+
+  // If the In type is a vector that is the same size as the alloca, see if it
+  // matches the existing VecTy.
+  if (VectorType *VInTy = dyn_cast<VectorType>(In)) {
+    if (MergeInVectorType(VInTy, Offset))
+      return;
+  } else if (In->isFloatTy() || In->isDoubleTy() ||
+             (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 &&
+              isPowerOf2_32(In->getPrimitiveSizeInBits()))) {
+    // Full width accesses can be ignored, because they can always be turned
+    // into bitcasts.
+    unsigned EltSize = In->getPrimitiveSizeInBits()/8;
+    if (EltSize == AllocaSize)
+      return;
+
+    // If we're accessing something that could be an element of a vector, see
+    // if the implied vector agrees with what we already have and if Offset is
+    // compatible with it.
+    if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 &&
+        (!VectorTy || EltSize == VectorTy->getElementType()
+                                         ->getPrimitiveSizeInBits()/8)) {
+      if (!VectorTy) {
+        ScalarKind = ImplicitVector;
+        VectorTy = VectorType::get(In, AllocaSize/EltSize);
+      }
+      return;
+    }
+  }
+
+  // Otherwise, we have a case that we can't handle with an optimized vector
+  // form.  We can still turn this into a large integer.
+  ScalarKind = Integer;
+}
+
+/// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore,
+/// returning true if the type was successfully merged and false otherwise.
+bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy,
+                                            uint64_t Offset) {
+  if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
+    // If we're storing/loading a vector of the right size, allow it as a
+    // vector.  If this the first vector we see, remember the type so that
+    // we know the element size. If this is a subsequent access, ignore it
+    // even if it is a differing type but the same size. Worst case we can
+    // bitcast the resultant vectors.
+    if (!VectorTy)
+      VectorTy = VInTy;
+    ScalarKind = Vector;
+    return true;
+  }
+
+  return false;
+}
+
+/// CanConvertToScalar - V is a pointer.  If we can convert the pointee and all
+/// its accesses to a single vector type, return true and set VecTy to
+/// the new type.  If we could convert the alloca into a single promotable
+/// integer, return true but set VecTy to VoidTy.  Further, if the use is not a
+/// completely trivial use that mem2reg could promote, set IsNotTrivial.  Offset
+/// is the current offset from the base of the alloca being analyzed.
+///
+/// If we see at least one access to the value that is as a vector type, set the
+/// SawVec flag.
+bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
+                                             Value* NonConstantIdx) {
+  for (User *U : V->users()) {
+    Instruction *UI = cast<Instruction>(U);
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
+      // Don't break volatile loads.
+      if (!LI->isSimple())
+        return false;
+      // Don't touch MMX operations.
+      if (LI->getType()->isX86_MMXTy())
+        return false;
+      HadNonMemTransferAccess = true;
+      MergeInTypeForLoadOrStore(LI->getType(), Offset);
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+      // Storing the pointer, not into the value?
+      if (SI->getOperand(0) == V || !SI->isSimple()) return false;
+      // Don't touch MMX operations.
+      if (SI->getOperand(0)->getType()->isX86_MMXTy())
+        return false;
+      HadNonMemTransferAccess = true;
+      MergeInTypeForLoadOrStore(SI->getOperand(0)->getType(), Offset);
+      continue;
+    }
+
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(UI)) {
+      if (!onlyUsedByLifetimeMarkers(BCI))
+        IsNotTrivial = true;  // Can't be mem2reg'd.
+      if (!CanConvertToScalar(BCI, Offset, NonConstantIdx))
+        return false;
+      continue;
+    }
+
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UI)) {
+      // If this is a GEP with a variable indices, we can't handle it.
+      PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType());
+      if (!PtrTy)
+        return false;
+
+      // Compute the offset that this GEP adds to the pointer.
+      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+      Value *GEPNonConstantIdx = nullptr;
+      if (!GEP->hasAllConstantIndices()) {
+        if (!isa<VectorType>(PtrTy->getElementType()))
+          return false;
+        if (NonConstantIdx)
+          return false;
+        GEPNonConstantIdx = Indices.pop_back_val();
+        if (!GEPNonConstantIdx->getType()->isIntegerTy(32))
+          return false;
+        HadDynamicAccess = true;
+      } else
+        GEPNonConstantIdx = NonConstantIdx;
+      uint64_t GEPOffset = DL.getIndexedOffset(PtrTy,
+                                               Indices);
+      // See if all uses can be converted.
+      if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx))
+        return false;
+      IsNotTrivial = true;  // Can't be mem2reg'd.
+      HadNonMemTransferAccess = true;
+      continue;
+    }
+
+    // If this is a constant sized memset of a constant value (e.g. 0) we can
+    // handle it.
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(UI)) {
+      // Store to dynamic index.
+      if (NonConstantIdx)
+        return false;
+      // Store of constant value.
+      if (!isa<ConstantInt>(MSI->getValue()))
+        return false;
+
+      // Store of constant size.
+      ConstantInt *Len = dyn_cast<ConstantInt>(MSI->getLength());
+      if (!Len)
+        return false;
+
+      // If the size differs from the alloca, we can only convert the alloca to
+      // an integer bag-of-bits.
+      // FIXME: This should handle all of the cases that are currently accepted
+      // as vector element insertions.
+      if (Len->getZExtValue() != AllocaSize || Offset != 0)
+        ScalarKind = Integer;
+
+      IsNotTrivial = true;  // Can't be mem2reg'd.
+      HadNonMemTransferAccess = true;
+      continue;
+    }
+
+    // If this is a memcpy or memmove into or out of the whole allocation, we
+    // can handle it like a load or store of the scalar type.
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(UI)) {
+      // Store to dynamic index.
+      if (NonConstantIdx)
+        return false;
+      ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength());
+      if (!Len || Len->getZExtValue() != AllocaSize || Offset != 0)
+        return false;
+
+      IsNotTrivial = true;  // Can't be mem2reg'd.
+      continue;
+    }
+
+    // If this is a lifetime intrinsic, we can handle it.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(UI)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        continue;
+      }
+    }
+
+    // Otherwise, we cannot handle this!
+    return false;
+  }
+
+  return true;
+}
+
+/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca
+/// directly.  This happens when we are converting an "integer union" to a
+/// single integer scalar, or when we are converting a "vector union" to a
+/// vector with insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.  By the end of this, there should be no uses of Ptr.
+void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
+                                              uint64_t Offset,
+                                              Value* NonConstantIdx) {
+  while (!Ptr->use_empty()) {
+    Instruction *User = cast<Instruction>(Ptr->user_back());
+
+    if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
+      ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx);
+      CI->eraseFromParent();
+      continue;
+    }
+
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
+      // Compute the offset that this GEP adds to the pointer.
+      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+      Value* GEPNonConstantIdx = nullptr;
+      if (!GEP->hasAllConstantIndices()) {
+        assert(!NonConstantIdx &&
+               "Dynamic GEP reading from dynamic GEP unsupported");
+        GEPNonConstantIdx = Indices.pop_back_val();
+      } else
+        GEPNonConstantIdx = NonConstantIdx;
+      uint64_t GEPOffset = DL.getIndexedOffset(GEP->getPointerOperandType(),
+                                               Indices);
+      ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx);
+      GEP->eraseFromParent();
+      continue;
+    }
+
+    IRBuilder<> Builder(User);
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      // The load is a bit extract from NewAI shifted right by Offset bits.
+      Value *LoadedVal = Builder.CreateLoad(NewAI);
+      Value *NewLoadVal
+        = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset,
+                                     NonConstantIdx, Builder);
+      LI->replaceAllUsesWith(NewLoadVal);
+      LI->eraseFromParent();
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      assert(SI->getOperand(0) != Ptr && "Consistency error!");
+      Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
+      Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
+                                             NonConstantIdx, Builder);
+      Builder.CreateStore(New, NewAI);
+      SI->eraseFromParent();
+
+      // If the load we just inserted is now dead, then the inserted store
+      // overwrote the entire thing.
+      if (Old->use_empty())
+        Old->eraseFromParent();
+      continue;
+    }
+
+    // If this is a constant sized memset of a constant value (e.g. 0) we can
+    // transform it into a store of the expanded constant value.
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
+      assert(MSI->getRawDest() == Ptr && "Consistency error!");
+      assert(!NonConstantIdx && "Cannot replace dynamic memset with insert");
+      int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue();
+      if (SNumBytes > 0 && (SNumBytes >> 32) == 0) {
+        unsigned NumBytes = static_cast<unsigned>(SNumBytes);
+        unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue();
+
+        // Compute the value replicated the right number of times.
+        APInt APVal(NumBytes*8, Val);
+
+        // Splat the value if non-zero.
+        if (Val)
+          for (unsigned i = 1; i != NumBytes; ++i)
+            APVal |= APVal << 8;
+
+        Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
+        Value *New = ConvertScalar_InsertValue(
+                                    ConstantInt::get(User->getContext(), APVal),
+                                               Old, Offset, nullptr, Builder);
+        Builder.CreateStore(New, NewAI);
+
+        // If the load we just inserted is now dead, then the memset overwrote
+        // the entire thing.
+        if (Old->use_empty())
+          Old->eraseFromParent();
+      }
+      MSI->eraseFromParent();
+      continue;
+    }
+
+    // If this is a memcpy or memmove into or out of the whole allocation, we
+    // can handle it like a load or store of the scalar type.
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
+      assert(Offset == 0 && "must be store to start of alloca");
+      assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert");
+
+      // If the source and destination are both to the same alloca, then this is
+      // a noop copy-to-self, just delete it.  Otherwise, emit a load and store
+      // as appropriate.
+      AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, DL, 0));
+
+      if (GetUnderlyingObject(MTI->getSource(), DL, 0) != OrigAI) {
+        // Dest must be OrigAI, change this to be a load from the original
+        // pointer (bitcasted), then a store to our new alloca.
+        assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
+        Value *SrcPtr = MTI->getSource();
+        PointerType* SPTy = cast<PointerType>(SrcPtr->getType());
+        PointerType* AIPTy = cast<PointerType>(NewAI->getType());
+        if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
+          AIPTy = PointerType::get(AIPTy->getElementType(),
+                                   SPTy->getAddressSpace());
+        }
+        SrcPtr = Builder.CreateBitCast(SrcPtr, AIPTy);
+
+        LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval");
+        SrcVal->setAlignment(MTI->getAlignment());
+        Builder.CreateStore(SrcVal, NewAI);
+      } else if (GetUnderlyingObject(MTI->getDest(), DL, 0) != OrigAI) {
+        // Src must be OrigAI, change this to be a load from NewAI then a store
+        // through the original dest pointer (bitcasted).
+        assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
+        LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval");
+
+        PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType());
+        PointerType* AIPTy = cast<PointerType>(NewAI->getType());
+        if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
+          AIPTy = PointerType::get(AIPTy->getElementType(),
+                                   DPTy->getAddressSpace());
+        }
+        Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), AIPTy);
+
+        StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr);
+        NewStore->setAlignment(MTI->getAlignment());
+      } else {
+        // Noop transfer. Src == Dst
+      }
+
+      MTI->eraseFromParent();
+      continue;
+    }
+
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        // There's no need to preserve these, as the resulting alloca will be
+        // converted to a register anyways.
+        II->eraseFromParent();
+        continue;
+      }
+    }
+
+    llvm_unreachable("Unsupported operation!");
+  }
+}
+
+/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
+/// or vector value FromVal, extracting the bits from the offset specified by
+/// Offset.  This returns the value, which is of type ToType.
+///
+/// This happens when we are converting an "integer union" to a single
+/// integer scalar, or when we are converting a "vector union" to a vector with
+/// insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.
+Value *ConvertToScalarInfo::
+ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
+                           uint64_t Offset, Value* NonConstantIdx,
+                           IRBuilder<> &Builder) {
+  // If the load is of the whole new alloca, no conversion is needed.
+  Type *FromType = FromVal->getType();
+  if (FromType == ToType && Offset == 0)
+    return FromVal;
+
+  // If the result alloca is a vector type, this is either an element
+  // access or a bitcast to another vector type of the same size.
+  if (VectorType *VTy = dyn_cast<VectorType>(FromType)) {
+    unsigned FromTypeSize = DL.getTypeAllocSize(FromType);
+    unsigned ToTypeSize = DL.getTypeAllocSize(ToType);
+    if (FromTypeSize == ToTypeSize)
+        return Builder.CreateBitCast(FromVal, ToType);
+
+    // Otherwise it must be an element access.
+    unsigned Elt = 0;
+    if (Offset) {
+      unsigned EltSize = DL.getTypeAllocSizeInBits(VTy->getElementType());
+      Elt = Offset/EltSize;
+      assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
+    }
+    // Return the element extracted out of it.
+    Value *Idx;
+    if (NonConstantIdx) {
+      if (Elt)
+        Idx = Builder.CreateAdd(NonConstantIdx,
+                                Builder.getInt32(Elt),
+                                "dyn.offset");
+      else
+        Idx = NonConstantIdx;
+    } else
+      Idx = Builder.getInt32(Elt);
+    Value *V = Builder.CreateExtractElement(FromVal, Idx);
+    if (V->getType() != ToType)
+      V = Builder.CreateBitCast(V, ToType);
+    return V;
+  }
+
+  // If ToType is a first class aggregate, extract out each of the pieces and
+  // use insertvalue's to form the FCA.
+  if (StructType *ST = dyn_cast<StructType>(ToType)) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into struct types not supported");
+    const StructLayout &Layout = *DL.getStructLayout(ST);
+    Value *Res = UndefValue::get(ST);
+    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+      Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
+                                        Offset+Layout.getElementOffsetInBits(i),
+                                              nullptr, Builder);
+      Res = Builder.CreateInsertValue(Res, Elt, i);
+    }
+    return Res;
+  }
+
+  if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into array types not supported");
+    uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType());
+    Value *Res = UndefValue::get(AT);
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
+                                              Offset+i*EltSize, nullptr,
+                                              Builder);
+      Res = Builder.CreateInsertValue(Res, Elt, i);
+    }
+    return Res;
+  }
+
+  // Otherwise, this must be a union that was converted to an integer value.
+  IntegerType *NTy = cast<IntegerType>(FromVal->getType());
+
+  // If this is a big-endian system and the load is narrower than the
+  // full alloca type, we need to do a shift to get the right bits.
+  int ShAmt = 0;
+  if (DL.isBigEndian()) {
+    // On big-endian machines, the lowest bit is stored at the bit offset
+    // from the pointer given by getTypeStoreSizeInBits.  This matters for
+    // integers with a bitwidth that is not a multiple of 8.
+    ShAmt = DL.getTypeStoreSizeInBits(NTy) -
+            DL.getTypeStoreSizeInBits(ToType) - Offset;
+  } else {
+    ShAmt = Offset;
+  }
+
+  // Note: we support negative bitwidths (with shl) which are not defined.
+  // We do this to support (f.e.) loads off the end of a structure where
+  // only some bits are used.
+  if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
+    FromVal = Builder.CreateLShr(FromVal,
+                                 ConstantInt::get(FromVal->getType(), ShAmt));
+  else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
+    FromVal = Builder.CreateShl(FromVal,
+                                ConstantInt::get(FromVal->getType(), -ShAmt));
+
+  // Finally, unconditionally truncate the integer to the right width.
+  unsigned LIBitWidth = DL.getTypeSizeInBits(ToType);
+  if (LIBitWidth < NTy->getBitWidth())
+    FromVal =
+      Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(),
+                                                    LIBitWidth));
+  else if (LIBitWidth > NTy->getBitWidth())
+    FromVal =
+       Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(),
+                                                    LIBitWidth));
+
+  // If the result is an integer, this is a trunc or bitcast.
+  if (ToType->isIntegerTy()) {
+    // Should be done.
+  } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) {
+    // Just do a bitcast, we know the sizes match up.
+    FromVal = Builder.CreateBitCast(FromVal, ToType);
+  } else {
+    // Otherwise must be a pointer.
+    FromVal = Builder.CreateIntToPtr(FromVal, ToType);
+  }
+  assert(FromVal->getType() == ToType && "Didn't convert right?");
+  return FromVal;
+}
+
+/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer
+/// or vector value "Old" at the offset specified by Offset.
+///
+/// This happens when we are converting an "integer union" to a
+/// single integer scalar, or when we are converting a "vector union" to a
+/// vector with insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.
+///
+/// NonConstantIdx is an index value if there was a GEP with a non-constant
+/// index value.  If this is 0 then all GEPs used to find this insert address
+/// are constant.
+Value *ConvertToScalarInfo::
+ConvertScalar_InsertValue(Value *SV, Value *Old,
+                          uint64_t Offset, Value* NonConstantIdx,
+                          IRBuilder<> &Builder) {
+  // Convert the stored type to the actual type, shift it left to insert
+  // then 'or' into place.
+  Type *AllocaType = Old->getType();
+  LLVMContext &Context = Old->getContext();
+
+  if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
+    uint64_t VecSize = DL.getTypeAllocSizeInBits(VTy);
+    uint64_t ValSize = DL.getTypeAllocSizeInBits(SV->getType());
+
+    // Changing the whole vector with memset or with an access of a different
+    // vector type?
+    if (ValSize == VecSize)
+        return Builder.CreateBitCast(SV, AllocaType);
+
+    // Must be an element insertion.
+    Type *EltTy = VTy->getElementType();
+    if (SV->getType() != EltTy)
+      SV = Builder.CreateBitCast(SV, EltTy);
+    uint64_t EltSize = DL.getTypeAllocSizeInBits(EltTy);
+    unsigned Elt = Offset/EltSize;
+    Value *Idx;
+    if (NonConstantIdx) {
+      if (Elt)
+        Idx = Builder.CreateAdd(NonConstantIdx,
+                                Builder.getInt32(Elt),
+                                "dyn.offset");
+      else
+        Idx = NonConstantIdx;
+    } else
+      Idx = Builder.getInt32(Elt);
+    return Builder.CreateInsertElement(Old, SV, Idx);
+  }
+
+  // If SV is a first-class aggregate value, insert each value recursively.
+  if (StructType *ST = dyn_cast<StructType>(SV->getType())) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into struct types not supported");
+    const StructLayout &Layout = *DL.getStructLayout(ST);
+    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+      Value *Elt = Builder.CreateExtractValue(SV, i);
+      Old = ConvertScalar_InsertValue(Elt, Old,
+                                      Offset+Layout.getElementOffsetInBits(i),
+                                      nullptr, Builder);
+    }
+    return Old;
+  }
+
+  if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into array types not supported");
+    uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType());
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      Value *Elt = Builder.CreateExtractValue(SV, i);
+      Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, nullptr,
+                                      Builder);
+    }
+    return Old;
+  }
+
+  // If SV is a float, convert it to the appropriate integer type.
+  // If it is a pointer, do the same.
+  unsigned SrcWidth = DL.getTypeSizeInBits(SV->getType());
+  unsigned DestWidth = DL.getTypeSizeInBits(AllocaType);
+  unsigned SrcStoreWidth = DL.getTypeStoreSizeInBits(SV->getType());
+  unsigned DestStoreWidth = DL.getTypeStoreSizeInBits(AllocaType);
+  if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
+    SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth));
+  else if (SV->getType()->isPointerTy())
+    SV = Builder.CreatePtrToInt(SV, DL.getIntPtrType(SV->getType()));
+
+  // Zero extend or truncate the value if needed.
+  if (SV->getType() != AllocaType) {
+    if (SV->getType()->getPrimitiveSizeInBits() <
+             AllocaType->getPrimitiveSizeInBits())
+      SV = Builder.CreateZExt(SV, AllocaType);
+    else {
+      // Truncation may be needed if storing more than the alloca can hold
+      // (undefined behavior).
+      SV = Builder.CreateTrunc(SV, AllocaType);
+      SrcWidth = DestWidth;
+      SrcStoreWidth = DestStoreWidth;
+    }
+  }
+
+  // If this is a big-endian system and the store is narrower than the
+  // full alloca type, we need to do a shift to get the right bits.
+  int ShAmt = 0;
+  if (DL.isBigEndian()) {
+    // On big-endian machines, the lowest bit is stored at the bit offset
+    // from the pointer given by getTypeStoreSizeInBits.  This matters for
+    // integers with a bitwidth that is not a multiple of 8.
+    ShAmt = DestStoreWidth - SrcStoreWidth - Offset;
+  } else {
+    ShAmt = Offset;
+  }
+
+  // Note: we support negative bitwidths (with shr) which are not defined.
+  // We do this to support (f.e.) stores off the end of a structure where
+  // only some bits in the structure are set.
+  APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
+  if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
+    SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt));
+    Mask <<= ShAmt;
+  } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
+    SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt));
+    Mask = Mask.lshr(-ShAmt);
+  }
+
+  // Mask out the bits we are about to insert from the old value, and or
+  // in the new bits.
+  if (SrcWidth != DestWidth) {
+    assert(DestWidth > SrcWidth);
+    Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask");
+    SV = Builder.CreateOr(Old, SV, "ins");
+  }
+  return SV;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SRoA Driver
+//===----------------------------------------------------------------------===//
+
+
+bool SROA::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  bool Changed = performPromotion(F);
+
+  while (1) {
+    bool LocalChange = performScalarRepl(F);
+    if (!LocalChange) break;   // No need to repromote if no scalarrepl
+    Changed = true;
+    LocalChange = performPromotion(F);
+    if (!LocalChange) break;   // No need to re-scalarrepl if no promotion
+  }
+
+  return Changed;
+}
+
+namespace {
+class AllocaPromoter : public LoadAndStorePromoter {
+  AllocaInst *AI;
+  DIBuilder *DIB;
+  SmallVector<DbgDeclareInst *, 4> DDIs;
+  SmallVector<DbgValueInst *, 4> DVIs;
+public:
+  AllocaPromoter(ArrayRef<Instruction*> Insts, SSAUpdater &S,
+                 DIBuilder *DB)
+    : LoadAndStorePromoter(Insts, S), AI(nullptr), DIB(DB) {}
+
+  void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
+    // Remember which alloca we're promoting (for isInstInList).
+    this->AI = AI;
+    if (auto *L = LocalAsMetadata::getIfExists(AI)) {
+      if (auto *DINode = MetadataAsValue::getIfExists(AI->getContext(), L)) {
+        for (User *U : DINode->users())
+          if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+            DDIs.push_back(DDI);
+          else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+            DVIs.push_back(DVI);
+      }
+    }
+
+    LoadAndStorePromoter::run(Insts);
+    AI->eraseFromParent();
+    for (SmallVectorImpl<DbgDeclareInst *>::iterator I = DDIs.begin(),
+           E = DDIs.end(); I != E; ++I) {
+      DbgDeclareInst *DDI = *I;
+      DDI->eraseFromParent();
+    }
+    for (SmallVectorImpl<DbgValueInst *>::iterator I = DVIs.begin(),
+           E = DVIs.end(); I != E; ++I) {
+      DbgValueInst *DVI = *I;
+      DVI->eraseFromParent();
+    }
+  }
+
+  bool isInstInList(Instruction *I,
+                    const SmallVectorImpl<Instruction*> &Insts) const override {
+    if (LoadInst *LI = dyn_cast<LoadInst>(I))
+      return LI->getOperand(0) == AI;
+    return cast<StoreInst>(I)->getPointerOperand() == AI;
+  }
+
+  void updateDebugInfo(Instruction *Inst) const override {
+    for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(),
+           E = DDIs.end(); I != E; ++I) {
+      DbgDeclareInst *DDI = *I;
+      if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+        ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
+      else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+        ConvertDebugDeclareToDebugValue(DDI, LI, *DIB);
+    }
+    for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(),
+           E = DVIs.end(); I != E; ++I) {
+      DbgValueInst *DVI = *I;
+      Value *Arg = nullptr;
+      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        // If an argument is zero extended then use argument directly. The ZExt
+        // may be zapped by an optimization pass in future.
+        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
+          Arg = dyn_cast<Argument>(ZExt->getOperand(0));
+        if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+          Arg = dyn_cast<Argument>(SExt->getOperand(0));
+        if (!Arg)
+          Arg = SI->getOperand(0);
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        Arg = LI->getOperand(0);
+      } else {
+        continue;
+      }
+      DIB->insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(),
+                                   DVI->getExpression(), DVI->getDebugLoc(),
+                                   Inst);
+    }
+  }
+};
+} // end anon namespace
+
+/// isSafeSelectToSpeculate - Select instructions that use an alloca and are
+/// subsequently loaded can be rewritten to load both input pointers and then
+/// select between the result, allowing the load of the alloca to be promoted.
+/// From this:
+///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+///   %V = load i32* %P2
+/// to:
+///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+///   %V2 = load i32* %Other
+///   %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// We can do this to a select if its only uses are loads and if the operand to
+/// the select can be loaded unconditionally.
+static bool isSafeSelectToSpeculate(SelectInst *SI) {
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+  bool TDerefable = isDereferenceablePointer(SI->getTrueValue(), DL);
+  bool FDerefable = isDereferenceablePointer(SI->getFalseValue(), DL);
+
+  for (User *U : SI->users()) {
+    LoadInst *LI = dyn_cast<LoadInst>(U);
+    if (!LI || !LI->isSimple()) return false;
+
+    // Both operands to the select need to be dereferencable, either absolutely
+    // (e.g. allocas) or at this point because we can see other accesses to it.
+    if (!TDerefable &&
+        !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
+                                     LI->getAlignment()))
+      return false;
+    if (!FDerefable &&
+        !isSafeToLoadUnconditionally(SI->getFalseValue(), LI,
+                                     LI->getAlignment()))
+      return false;
+  }
+
+  return true;
+}
+
+/// isSafePHIToSpeculate - PHI instructions that use an alloca and are
+/// subsequently loaded can be rewritten to load both input pointers in the pred
+/// blocks and then PHI the results, allowing the load of the alloca to be
+/// promoted.
+/// From this:
+///   %P2 = phi [i32* %Alloca, i32* %Other]
+///   %V = load i32* %P2
+/// to:
+///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+///   ...
+///   %V2 = load i32* %Other
+///   ...
+///   %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a select if its only uses are loads and if the operand to
+/// the select can be loaded unconditionally.
+static bool isSafePHIToSpeculate(PHINode *PN) {
+  // For now, we can only do this promotion if the load is in the same block as
+  // the PHI, and if there are no stores between the phi and load.
+  // TODO: Allow recursive phi users.
+  // TODO: Allow stores.
+  BasicBlock *BB = PN->getParent();
+  unsigned MaxAlign = 0;
+  for (User *U : PN->users()) {
+    LoadInst *LI = dyn_cast<LoadInst>(U);
+    if (!LI || !LI->isSimple()) return false;
+
+    // For now we only allow loads in the same block as the PHI.  This is a
+    // common case that happens when instcombine merges two loads through a PHI.
+    if (LI->getParent() != BB) return false;
+
+    // Ensure that there are no instructions between the PHI and the load that
+    // could store.
+    for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
+      if (BBI->mayWriteToMemory())
+        return false;
+
+    MaxAlign = std::max(MaxAlign, LI->getAlignment());
+  }
+
+  const DataLayout &DL = PN->getModule()->getDataLayout();
+
+  // Okay, we know that we have one or more loads in the same block as the PHI.
+  // We can transform this if it is safe to push the loads into the predecessor
+  // blocks.  The only thing to watch out for is that we can't put a possibly
+  // trapping load in the predecessor if it is a critical edge.
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *Pred = PN->getIncomingBlock(i);
+    Value *InVal = PN->getIncomingValue(i);
+
+    // If the terminator of the predecessor has side-effects (an invoke),
+    // there is no safe place to put a load in the predecessor.
+    if (Pred->getTerminator()->mayHaveSideEffects())
+      return false;
+
+    // If the value is produced by the terminator of the predecessor
+    // (an invoke), there is no valid place to put a load in the predecessor.
+    if (Pred->getTerminator() == InVal)
+      return false;
+
+    // If the predecessor has a single successor, then the edge isn't critical.
+    if (Pred->getTerminator()->getNumSuccessors() == 1)
+      continue;
+
+    // If this pointer is always safe to load, or if we can prove that there is
+    // already a load in the block, then we can move the load to the pred block.
+    if (isDereferenceablePointer(InVal, DL) ||
+        isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign))
+      continue;
+
+    return false;
+  }
+
+  return true;
+}
+
+
+/// tryToMakeAllocaBePromotable - This returns true if the alloca only has
+/// direct (non-volatile) loads and stores to it.  If the alloca is close but
+/// not quite there, this will transform the code to allow promotion.  As such,
+/// it is a non-pure predicate.
+static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout &DL) {
+  SetVector<Instruction*, SmallVector<Instruction*, 4>,
+            SmallPtrSet<Instruction*, 4> > InstsToRewrite;
+  for (User *U : AI->users()) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      if (!LI->isSimple())
+        return false;
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      if (SI->getOperand(0) == AI || !SI->isSimple())
+        return false;   // Don't allow a store OF the AI, only INTO the AI.
+      continue;
+    }
+
+    if (SelectInst *SI = dyn_cast<SelectInst>(U)) {
+      // If the condition being selected on is a constant, fold the select, yes
+      // this does (rarely) happen early on.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition())) {
+        Value *Result = SI->getOperand(1+CI->isZero());
+        SI->replaceAllUsesWith(Result);
+        SI->eraseFromParent();
+
+        // This is very rare and we just scrambled the use list of AI, start
+        // over completely.
+        return tryToMakeAllocaBePromotable(AI, DL);
+      }
+
+      // If it is safe to turn "load (select c, AI, ptr)" into a select of two
+      // loads, then we can transform this by rewriting the select.
+      if (!isSafeSelectToSpeculate(SI))
+        return false;
+
+      InstsToRewrite.insert(SI);
+      continue;
+    }
+
+    if (PHINode *PN = dyn_cast<PHINode>(U)) {
+      if (PN->use_empty()) {  // Dead PHIs can be stripped.
+        InstsToRewrite.insert(PN);
+        continue;
+      }
+
+      // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads
+      // in the pred blocks, then we can transform this by rewriting the PHI.
+      if (!isSafePHIToSpeculate(PN))
+        return false;
+
+      InstsToRewrite.insert(PN);
+      continue;
+    }
+
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+      if (onlyUsedByLifetimeMarkers(BCI)) {
+        InstsToRewrite.insert(BCI);
+        continue;
+      }
+    }
+
+    return false;
+  }
+
+  // If there are no instructions to rewrite, then all uses are load/stores and
+  // we're done!
+  if (InstsToRewrite.empty())
+    return true;
+
+  // If we have instructions that need to be rewritten for this to be promotable
+  // take care of it now.
+  for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) {
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) {
+      // This could only be a bitcast used by nothing but lifetime intrinsics.
+      for (BitCastInst::user_iterator I = BCI->user_begin(), E = BCI->user_end();
+           I != E;)
+        cast<Instruction>(*I++)->eraseFromParent();
+      BCI->eraseFromParent();
+      continue;
+    }
+
+    if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) {
+      // Selects in InstsToRewrite only have load uses.  Rewrite each as two
+      // loads with a new select.
+      while (!SI->use_empty()) {
+        LoadInst *LI = cast<LoadInst>(SI->user_back());
+
+        IRBuilder<> Builder(LI);
+        LoadInst *TrueLoad =
+          Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t");
+        LoadInst *FalseLoad =
+          Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f");
+
+        // Transfer alignment and AA info if present.
+        TrueLoad->setAlignment(LI->getAlignment());
+        FalseLoad->setAlignment(LI->getAlignment());
+
+        AAMDNodes Tags;
+        LI->getAAMetadata(Tags);
+        if (Tags) {
+          TrueLoad->setAAMetadata(Tags);
+          FalseLoad->setAAMetadata(Tags);
+        }
+
+        Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad);
+        V->takeName(LI);
+        LI->replaceAllUsesWith(V);
+        LI->eraseFromParent();
+      }
+
+      // Now that all the loads are gone, the select is gone too.
+      SI->eraseFromParent();
+      continue;
+    }
+
+    // Otherwise, we have a PHI node which allows us to push the loads into the
+    // predecessors.
+    PHINode *PN = cast<PHINode>(InstsToRewrite[i]);
+    if (PN->use_empty()) {
+      PN->eraseFromParent();
+      continue;
+    }
+
+    Type *LoadTy = cast<PointerType>(PN->getType())->getElementType();
+    PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(),
+                                     PN->getName()+".ld", PN);
+
+    // Get the AA tags and alignment to use from one of the loads.  It doesn't
+    // matter which one we get and if any differ, it doesn't matter.
+    LoadInst *SomeLoad = cast<LoadInst>(PN->user_back());
+
+    AAMDNodes AATags;
+    SomeLoad->getAAMetadata(AATags);
+    unsigned Align = SomeLoad->getAlignment();
+
+    // Rewrite all loads of the PN to use the new PHI.
+    while (!PN->use_empty()) {
+      LoadInst *LI = cast<LoadInst>(PN->user_back());
+      LI->replaceAllUsesWith(NewPN);
+      LI->eraseFromParent();
+    }
+
+    // Inject loads into all of the pred blocks.  Keep track of which blocks we
+    // insert them into in case we have multiple edges from the same block.
+    DenseMap<BasicBlock*, LoadInst*> InsertedLoads;
+
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      BasicBlock *Pred = PN->getIncomingBlock(i);
+      LoadInst *&Load = InsertedLoads[Pred];
+      if (!Load) {
+        Load = new LoadInst(PN->getIncomingValue(i),
+                            PN->getName() + "." + Pred->getName(),
+                            Pred->getTerminator());
+        Load->setAlignment(Align);
+        if (AATags) Load->setAAMetadata(AATags);
+      }
+
+      NewPN->addIncoming(Load, Pred);
+    }
+
+    PN->eraseFromParent();
+  }
+
+  ++NumAdjusted;
+  return true;
+}
+
+bool SROA::performPromotion(Function &F) {
+  std::vector<AllocaInst*> Allocas;
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  DominatorTree *DT = nullptr;
+  if (HasDomTree)
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  AssumptionCache &AC =
+      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+  BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function
+  DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
+  bool Changed = false;
+  SmallVector<Instruction*, 64> Insts;
+  while (1) {
+    Allocas.clear();
+
+    // Find allocas that are safe to promote, by looking at all instructions in
+    // the entry node
+    for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(I))       // Is it an alloca?
+        if (tryToMakeAllocaBePromotable(AI, DL))
+          Allocas.push_back(AI);
+
+    if (Allocas.empty()) break;
+
+    if (HasDomTree)
+      PromoteMemToReg(Allocas, *DT, nullptr, &AC);
+    else {
+      SSAUpdater SSA;
+      for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
+        AllocaInst *AI = Allocas[i];
+
+        // Build list of instructions to promote.
+        for (User *U : AI->users())
+          Insts.push_back(cast<Instruction>(U));
+        AllocaPromoter(Insts, SSA, &DIB).run(AI, Insts);
+        Insts.clear();
+      }
+    }
+    NumPromoted += Allocas.size();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+
+/// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for
+/// SROA.  It must be a struct or array type with a small number of elements.
+bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) {
+  Type *T = AI->getAllocatedType();
+  // Do not promote any struct that has too many members.
+  if (StructType *ST = dyn_cast<StructType>(T))
+    return ST->getNumElements() <= StructMemberThreshold;
+  // Do not promote any array that has too many elements.
+  if (ArrayType *AT = dyn_cast<ArrayType>(T))
+    return AT->getNumElements() <= ArrayElementThreshold;
+  return false;
+}
+
+// performScalarRepl - This algorithm is a simple worklist driven algorithm,
+// which runs on all of the alloca instructions in the entry block, removing
+// them if they are only used by getelementptr instructions.
+//
+bool SROA::performScalarRepl(Function &F) {
+  std::vector<AllocaInst*> WorkList;
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // Scan the entry basic block, adding allocas to the worklist.
+  BasicBlock &BB = F.getEntryBlock();
+  for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I)
+    if (AllocaInst *A = dyn_cast<AllocaInst>(I))
+      WorkList.push_back(A);
+
+  // Process the worklist
+  bool Changed = false;
+  while (!WorkList.empty()) {
+    AllocaInst *AI = WorkList.back();
+    WorkList.pop_back();
+
+    // Handle dead allocas trivially.  These can be formed by SROA'ing arrays
+    // with unused elements.
+    if (AI->use_empty()) {
+      AI->eraseFromParent();
+      Changed = true;
+      continue;
+    }
+
+    // If this alloca is impossible for us to promote, reject it early.
+    if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized())
+      continue;
+
+    // Check to see if we can perform the core SROA transformation.  We cannot
+    // transform the allocation instruction if it is an array allocation
+    // (allocations OF arrays are ok though), and an allocation of a scalar
+    // value cannot be decomposed at all.
+    uint64_t AllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
+
+    // Do not promote [0 x %struct].
+    if (AllocaSize == 0) continue;
+
+    // Do not promote any struct whose size is too big.
+    if (AllocaSize > SRThreshold) continue;
+
+    // If the alloca looks like a good candidate for scalar replacement, and if
+    // all its users can be transformed, then split up the aggregate into its
+    // separate elements.
+    if (ShouldAttemptScalarRepl(AI) && isSafeAllocaToScalarRepl(AI)) {
+      DoScalarReplacement(AI, WorkList);
+      Changed = true;
+      continue;
+    }
+
+    // If we can turn this aggregate value (potentially with casts) into a
+    // simple scalar value that can be mem2reg'd into a register value.
+    // IsNotTrivial tracks whether this is something that mem2reg could have
+    // promoted itself.  If so, we don't want to transform it needlessly.  Note
+    // that we can't just check based on the type: the alloca may be of an i32
+    // but that has pointer arithmetic to set byte 3 of it or something.
+    if (AllocaInst *NewAI =
+            ConvertToScalarInfo((unsigned)AllocaSize, DL, ScalarLoadThreshold)
+                .TryConvert(AI)) {
+      NewAI->takeName(AI);
+      AI->eraseFromParent();
+      ++NumConverted;
+      Changed = true;
+      continue;
+    }
+
+    // Otherwise, couldn't process this alloca.
+  }
+
+  return Changed;
+}
+
+/// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl
+/// predicate, do SROA now.
+void SROA::DoScalarReplacement(AllocaInst *AI,
+                               std::vector<AllocaInst*> &WorkList) {
+  DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n');
+  SmallVector<AllocaInst*, 32> ElementAllocas;
+  if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+    ElementAllocas.reserve(ST->getNumContainedTypes());
+    for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) {
+      AllocaInst *NA = new AllocaInst(ST->getContainedType(i), nullptr,
+                                      AI->getAlignment(),
+                                      AI->getName() + "." + Twine(i), AI);
+      ElementAllocas.push_back(NA);
+      WorkList.push_back(NA);  // Add to worklist for recursive processing
+    }
+  } else {
+    ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
+    ElementAllocas.reserve(AT->getNumElements());
+    Type *ElTy = AT->getElementType();
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      AllocaInst *NA = new AllocaInst(ElTy, nullptr, AI->getAlignment(),
+                                      AI->getName() + "." + Twine(i), AI);
+      ElementAllocas.push_back(NA);
+      WorkList.push_back(NA);  // Add to worklist for recursive processing
+    }
+  }
+
+  // Now that we have created the new alloca instructions, rewrite all the
+  // uses of the old alloca.
+  RewriteForScalarRepl(AI, AI, 0, ElementAllocas);
+
+  // Now erase any instructions that were made dead while rewriting the alloca.
+  DeleteDeadInstructions();
+  AI->eraseFromParent();
+
+  ++NumReplaced;
+}
+
+/// DeleteDeadInstructions - Erase instructions on the DeadInstrs list,
+/// recursively including all their operands that become trivially dead.
+void SROA::DeleteDeadInstructions() {
+  while (!DeadInsts.empty()) {
+    Instruction *I = cast<Instruction>(DeadInsts.pop_back_val());
+
+    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
+      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
+        // Zero out the operand and see if it becomes trivially dead.
+        // (But, don't add allocas to the dead instruction list -- they are
+        // already on the worklist and will be deleted separately.)
+        *OI = nullptr;
+        if (isInstructionTriviallyDead(U) && !isa<AllocaInst>(U))
+          DeadInsts.push_back(U);
+      }
+
+    I->eraseFromParent();
+  }
+}
+
+/// isSafeForScalarRepl - Check if instruction I is a safe use with regard to
+/// performing scalar replacement of alloca AI.  The results are flagged in
+/// the Info parameter.  Offset indicates the position within AI that is
+/// referenced by this instruction.
+void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
+                               AllocaInfo &Info) {
+  const DataLayout &DL = I->getModule()->getDataLayout();
+  for (Use &U : I->uses()) {
+    Instruction *User = cast<Instruction>(U.getUser());
+
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+      isSafeForScalarRepl(BC, Offset, Info);
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+      uint64_t GEPOffset = Offset;
+      isSafeGEP(GEPI, GEPOffset, Info);
+      if (!Info.isUnsafe)
+        isSafeForScalarRepl(GEPI, GEPOffset, Info);
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+      ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
+      if (!Length || Length->isNegative())
+        return MarkUnsafe(Info, User);
+
+      isSafeMemAccess(Offset, Length->getZExtValue(), nullptr,
+                      U.getOperandNo() == 0, Info, MI,
+                      true /*AllowWholeAccess*/);
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      if (!LI->isSimple())
+        return MarkUnsafe(Info, User);
+      Type *LIType = LI->getType();
+      isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info,
+                      LI, true /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      // Store is ok if storing INTO the pointer, not storing the pointer
+      if (!SI->isSimple() || SI->getOperand(0) == I)
+        return MarkUnsafe(Info, User);
+
+      Type *SIType = SI->getOperand(0)->getType();
+      isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info,
+                      SI, true /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+          II->getIntrinsicID() != Intrinsic::lifetime_end)
+        return MarkUnsafe(Info, User);
+    } else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
+      isSafePHISelectUseForScalarRepl(User, Offset, Info);
+    } else {
+      return MarkUnsafe(Info, User);
+    }
+    if (Info.isUnsafe) return;
+  }
+}
+
+
+/// isSafePHIUseForScalarRepl - If we see a PHI node or select using a pointer
+/// derived from the alloca, we can often still split the alloca into elements.
+/// This is useful if we have a large alloca where one element is phi'd
+/// together somewhere: we can SRoA and promote all the other elements even if
+/// we end up not being able to promote this one.
+///
+/// All we require is that the uses of the PHI do not index into other parts of
+/// the alloca.  The most important use case for this is single load and stores
+/// that are PHI'd together, which can happen due to code sinking.
+void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
+                                           AllocaInfo &Info) {
+  // If we've already checked this PHI, don't do it again.
+  if (PHINode *PN = dyn_cast<PHINode>(I))
+    if (!Info.CheckedPHIs.insert(PN).second)
+      return;
+
+  const DataLayout &DL = I->getModule()->getDataLayout();
+  for (User *U : I->users()) {
+    Instruction *UI = cast<Instruction>(U);
+
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(UI)) {
+      isSafePHISelectUseForScalarRepl(BC, Offset, Info);
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) {
+      // Only allow "bitcast" GEPs for simplicity.  We could generalize this,
+      // but would have to prove that we're staying inside of an element being
+      // promoted.
+      if (!GEPI->hasAllZeroIndices())
+        return MarkUnsafe(Info, UI);
+      isSafePHISelectUseForScalarRepl(GEPI, Offset, Info);
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
+      if (!LI->isSimple())
+        return MarkUnsafe(Info, UI);
+      Type *LIType = LI->getType();
+      isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info,
+                      LI, false /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+      // Store is ok if storing INTO the pointer, not storing the pointer
+      if (!SI->isSimple() || SI->getOperand(0) == I)
+        return MarkUnsafe(Info, UI);
+
+      Type *SIType = SI->getOperand(0)->getType();
+      isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info,
+                      SI, false /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+    } else if (isa<PHINode>(UI) || isa<SelectInst>(UI)) {
+      isSafePHISelectUseForScalarRepl(UI, Offset, Info);
+    } else {
+      return MarkUnsafe(Info, UI);
+    }
+    if (Info.isUnsafe) return;
+  }
+}
+
+/// isSafeGEP - Check if a GEP instruction can be handled for scalar
+/// replacement.  It is safe when all the indices are constant, in-bounds
+/// references, and when the resulting offset corresponds to an element within
+/// the alloca type.  The results are flagged in the Info parameter.  Upon
+/// return, Offset is adjusted as specified by the GEP indices.
+void SROA::isSafeGEP(GetElementPtrInst *GEPI,
+                     uint64_t &Offset, AllocaInfo &Info) {
+  gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI);
+  if (GEPIt == E)
+    return;
+  bool NonConstant = false;
+  unsigned NonConstantIdxSize = 0;
+
+  // Walk through the GEP type indices, checking the types that this indexes
+  // into.
+  for (; GEPIt != E; ++GEPIt) {
+    // Ignore struct elements, no extra checking needed for these.
+    if ((*GEPIt)->isStructTy())
+      continue;
+
+    ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand());
+    if (!IdxVal)
+      return MarkUnsafe(Info, GEPI);
+  }
+
+  // Compute the offset due to this GEP and check if the alloca has a
+  // component element at that offset.
+  SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
+  // If this GEP is non-constant then the last operand must have been a
+  // dynamic index into a vector.  Pop this now as it has no impact on the
+  // constant part of the offset.
+  if (NonConstant)
+    Indices.pop_back();
+
+  const DataLayout &DL = GEPI->getModule()->getDataLayout();
+  Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices);
+  if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, NonConstantIdxSize,
+                        DL))
+    MarkUnsafe(Info, GEPI);
+}
+
+/// isHomogeneousAggregate - Check if type T is a struct or array containing
+/// elements of the same type (which is always true for arrays).  If so,
+/// return true with NumElts and EltTy set to the number of elements and the
+/// element type, respectively.
+static bool isHomogeneousAggregate(Type *T, unsigned &NumElts,
+                                   Type *&EltTy) {
+  if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
+    NumElts = AT->getNumElements();
+    EltTy = (NumElts == 0 ? nullptr : AT->getElementType());
+    return true;
+  }
+  if (StructType *ST = dyn_cast<StructType>(T)) {
+    NumElts = ST->getNumContainedTypes();
+    EltTy = (NumElts == 0 ? nullptr : ST->getContainedType(0));
+    for (unsigned n = 1; n < NumElts; ++n) {
+      if (ST->getContainedType(n) != EltTy)
+        return false;
+    }
+    return true;
+  }
+  return false;
+}
+
+/// isCompatibleAggregate - Check if T1 and T2 are either the same type or are
+/// "homogeneous" aggregates with the same element type and number of elements.
+static bool isCompatibleAggregate(Type *T1, Type *T2) {
+  if (T1 == T2)
+    return true;
+
+  unsigned NumElts1, NumElts2;
+  Type *EltTy1, *EltTy2;
+  if (isHomogeneousAggregate(T1, NumElts1, EltTy1) &&
+      isHomogeneousAggregate(T2, NumElts2, EltTy2) &&
+      NumElts1 == NumElts2 &&
+      EltTy1 == EltTy2)
+    return true;
+
+  return false;
+}
+
+/// isSafeMemAccess - Check if a load/store/memcpy operates on the entire AI
+/// alloca or has an offset and size that corresponds to a component element
+/// within it.  The offset checked here may have been formed from a GEP with a
+/// pointer bitcasted to a different type.
+///
+/// If AllowWholeAccess is true, then this allows uses of the entire alloca as a
+/// unit.  If false, it only allows accesses known to be in a single element.
+void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
+                           Type *MemOpType, bool isStore,
+                           AllocaInfo &Info, Instruction *TheAccess,
+                           bool AllowWholeAccess) {
+  const DataLayout &DL = TheAccess->getModule()->getDataLayout();
+  // Check if this is a load/store of the entire alloca.
+  if (Offset == 0 && AllowWholeAccess &&
+      MemSize == DL.getTypeAllocSize(Info.AI->getAllocatedType())) {
+    // This can be safe for MemIntrinsics (where MemOpType is 0) and integer
+    // loads/stores (which are essentially the same as the MemIntrinsics with
+    // regard to copying padding between elements).  But, if an alloca is
+    // flagged as both a source and destination of such operations, we'll need
+    // to check later for padding between elements.
+    if (!MemOpType || MemOpType->isIntegerTy()) {
+      if (isStore)
+        Info.isMemCpyDst = true;
+      else
+        Info.isMemCpySrc = true;
+      return;
+    }
+    // This is also safe for references using a type that is compatible with
+    // the type of the alloca, so that loads/stores can be rewritten using
+    // insertvalue/extractvalue.
+    if (isCompatibleAggregate(MemOpType, Info.AI->getAllocatedType())) {
+      Info.hasSubelementAccess = true;
+      return;
+    }
+  }
+  // Check if the offset/size correspond to a component within the alloca type.
+  Type *T = Info.AI->getAllocatedType();
+  if (TypeHasComponent(T, Offset, MemSize, DL)) {
+    Info.hasSubelementAccess = true;
+    return;
+  }
+
+  return MarkUnsafe(Info, TheAccess);
+}
+
+/// TypeHasComponent - Return true if T has a component type with the
+/// specified offset and size.  If Size is zero, do not check the size.
+bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size,
+                            const DataLayout &DL) {
+  Type *EltTy;
+  uint64_t EltSize;
+  if (StructType *ST = dyn_cast<StructType>(T)) {
+    const StructLayout *Layout = DL.getStructLayout(ST);
+    unsigned EltIdx = Layout->getElementContainingOffset(Offset);
+    EltTy = ST->getContainedType(EltIdx);
+    EltSize = DL.getTypeAllocSize(EltTy);
+    Offset -= Layout->getElementOffset(EltIdx);
+  } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
+    EltTy = AT->getElementType();
+    EltSize = DL.getTypeAllocSize(EltTy);
+    if (Offset >= AT->getNumElements() * EltSize)
+      return false;
+    Offset %= EltSize;
+  } else if (VectorType *VT = dyn_cast<VectorType>(T)) {
+    EltTy = VT->getElementType();
+    EltSize = DL.getTypeAllocSize(EltTy);
+    if (Offset >= VT->getNumElements() * EltSize)
+      return false;
+    Offset %= EltSize;
+  } else {
+    return false;
+  }
+  if (Offset == 0 && (Size == 0 || EltSize == Size))
+    return true;
+  // Check if the component spans multiple elements.
+  if (Offset + Size > EltSize)
+    return false;
+  return TypeHasComponent(EltTy, Offset, Size, DL);
+}
+
+/// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite
+/// the instruction I, which references it, to use the separate elements.
+/// Offset indicates the position within AI that is referenced by this
+/// instruction.
+void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
+                                SmallVectorImpl<AllocaInst *> &NewElts) {
+  const DataLayout &DL = I->getModule()->getDataLayout();
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) {
+    Use &TheUse = *UI++;
+    Instruction *User = cast<Instruction>(TheUse.getUser());
+
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+      RewriteBitCast(BC, AI, Offset, NewElts);
+      continue;
+    }
+
+    if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+      RewriteGEP(GEPI, AI, Offset, NewElts);
+      continue;
+    }
+
+    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+      ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
+      uint64_t MemSize = Length->getZExtValue();
+      if (Offset == 0 && MemSize == DL.getTypeAllocSize(AI->getAllocatedType()))
+        RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts);
+      // Otherwise the intrinsic can only touch a single element and the
+      // address operand will be updated, so nothing else needs to be done.
+      continue;
+    }
+
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        RewriteLifetimeIntrinsic(II, AI, Offset, NewElts);
+      }
+      continue;
+    }
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      Type *LIType = LI->getType();
+
+      if (isCompatibleAggregate(LIType, AI->getAllocatedType())) {
+        // Replace:
+        //   %res = load { i32, i32 }* %alloc
+        // with:
+        //   %load.0 = load i32* %alloc.0
+        //   %insert.0 insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0
+        //   %load.1 = load i32* %alloc.1
+        //   %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1
+        // (Also works for arrays instead of structs)
+        Value *Insert = UndefValue::get(LIType);
+        IRBuilder<> Builder(LI);
+        for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+          Value *Load = Builder.CreateLoad(NewElts[i], "load");
+          Insert = Builder.CreateInsertValue(Insert, Load, i, "insert");
+        }
+        LI->replaceAllUsesWith(Insert);
+        DeadInsts.push_back(LI);
+      } else if (LIType->isIntegerTy() &&
+                 DL.getTypeAllocSize(LIType) ==
+                     DL.getTypeAllocSize(AI->getAllocatedType())) {
+        // If this is a load of the entire alloca to an integer, rewrite it.
+        RewriteLoadUserOfWholeAlloca(LI, AI, NewElts);
+      }
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      Value *Val = SI->getOperand(0);
+      Type *SIType = Val->getType();
+      if (isCompatibleAggregate(SIType, AI->getAllocatedType())) {
+        // Replace:
+        //   store { i32, i32 } %val, { i32, i32 }* %alloc
+        // with:
+        //   %val.0 = extractvalue { i32, i32 } %val, 0
+        //   store i32 %val.0, i32* %alloc.0
+        //   %val.1 = extractvalue { i32, i32 } %val, 1
+        //   store i32 %val.1, i32* %alloc.1
+        // (Also works for arrays instead of structs)
+        IRBuilder<> Builder(SI);
+        for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+          Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName());
+          Builder.CreateStore(Extract, NewElts[i]);
+        }
+        DeadInsts.push_back(SI);
+      } else if (SIType->isIntegerTy() &&
+                 DL.getTypeAllocSize(SIType) ==
+                     DL.getTypeAllocSize(AI->getAllocatedType())) {
+        // If this is a store of the entire alloca from an integer, rewrite it.
+        RewriteStoreUserOfWholeAlloca(SI, AI, NewElts);
+      }
+      continue;
+    }
+
+    if (isa<SelectInst>(User) || isa<PHINode>(User)) {
+      // If we have a PHI user of the alloca itself (as opposed to a GEP or
+      // bitcast) we have to rewrite it.  GEP and bitcast uses will be RAUW'd to
+      // the new pointer.
+      if (!isa<AllocaInst>(I)) continue;
+
+      assert(Offset == 0 && NewElts[0] &&
+             "Direct alloca use should have a zero offset");
+
+      // If we have a use of the alloca, we know the derived uses will be
+      // utilizing just the first element of the scalarized result.  Insert a
+      // bitcast of the first alloca before the user as required.
+      AllocaInst *NewAI = NewElts[0];
+      BitCastInst *BCI = new BitCastInst(NewAI, AI->getType(), "", NewAI);
+      NewAI->moveBefore(BCI);
+      TheUse = BCI;
+      continue;
+    }
+  }
+}
+
+/// RewriteBitCast - Update a bitcast reference to the alloca being replaced
+/// and recursively continue updating all of its uses.
+void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
+                          SmallVectorImpl<AllocaInst *> &NewElts) {
+  RewriteForScalarRepl(BC, AI, Offset, NewElts);
+  if (BC->getOperand(0) != AI)
+    return;
+
+  // The bitcast references the original alloca.  Replace its uses with
+  // references to the alloca containing offset zero (which is normally at
+  // index zero, but might not be in cases involving structs with elements
+  // of size zero).
+  Type *T = AI->getAllocatedType();
+  uint64_t EltOffset = 0;
+  Type *IdxTy;
+  uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy,
+                                      BC->getModule()->getDataLayout());
+  Instruction *Val = NewElts[Idx];
+  if (Val->getType() != BC->getDestTy()) {
+    Val = new BitCastInst(Val, BC->getDestTy(), "", BC);
+    Val->takeName(BC);
+  }
+  BC->replaceAllUsesWith(Val);
+  DeadInsts.push_back(BC);
+}
+
+/// FindElementAndOffset - Return the index of the element containing Offset
+/// within the specified type, which must be either a struct or an array.
+/// Sets T to the type of the element and Offset to the offset within that
+/// element.  IdxTy is set to the type of the index result to be used in a
+/// GEP instruction.
+uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy,
+                                    const DataLayout &DL) {
+  uint64_t Idx = 0;
+
+  if (StructType *ST = dyn_cast<StructType>(T)) {
+    const StructLayout *Layout = DL.getStructLayout(ST);
+    Idx = Layout->getElementContainingOffset(Offset);
+    T = ST->getContainedType(Idx);
+    Offset -= Layout->getElementOffset(Idx);
+    IdxTy = Type::getInt32Ty(T->getContext());
+    return Idx;
+  } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
+    T = AT->getElementType();
+    uint64_t EltSize = DL.getTypeAllocSize(T);
+    Idx = Offset / EltSize;
+    Offset -= Idx * EltSize;
+    IdxTy = Type::getInt64Ty(T->getContext());
+    return Idx;
+  }
+  VectorType *VT = cast<VectorType>(T);
+  T = VT->getElementType();
+  uint64_t EltSize = DL.getTypeAllocSize(T);
+  Idx = Offset / EltSize;
+  Offset -= Idx * EltSize;
+  IdxTy = Type::getInt64Ty(T->getContext());
+  return Idx;
+}
+
+/// RewriteGEP - Check if this GEP instruction moves the pointer across
+/// elements of the alloca that are being split apart, and if so, rewrite
+/// the GEP to be relative to the new element.
+void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
+                      SmallVectorImpl<AllocaInst *> &NewElts) {
+  uint64_t OldOffset = Offset;
+  const DataLayout &DL = GEPI->getModule()->getDataLayout();
+  SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
+  // If the GEP was dynamic then it must have been a dynamic vector lookup.
+  // In this case, it must be the last GEP operand which is dynamic so keep that
+  // aside until we've found the constant GEP offset then add it back in at the
+  // end.
+  Value* NonConstantIdx = nullptr;
+  if (!GEPI->hasAllConstantIndices())
+    NonConstantIdx = Indices.pop_back_val();
+  Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices);
+
+  RewriteForScalarRepl(GEPI, AI, Offset, NewElts);
+
+  Type *T = AI->getAllocatedType();
+  Type *IdxTy;
+  uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy, DL);
+  if (GEPI->getOperand(0) == AI)
+    OldIdx = ~0ULL; // Force the GEP to be rewritten.
+
+  T = AI->getAllocatedType();
+  uint64_t EltOffset = Offset;
+  uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, DL);
+
+  // If this GEP does not move the pointer across elements of the alloca
+  // being split, then it does not needs to be rewritten.
+  if (Idx == OldIdx)
+    return;
+
+  Type *i32Ty = Type::getInt32Ty(AI->getContext());
+  SmallVector<Value*, 8> NewArgs;
+  NewArgs.push_back(Constant::getNullValue(i32Ty));
+  while (EltOffset != 0) {
+    uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy, DL);
+    NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx));
+  }
+  if (NonConstantIdx) {
+    Type* GepTy = T;
+    // This GEP has a dynamic index.  We need to add "i32 0" to index through
+    // any structs or arrays in the original type until we get to the vector
+    // to index.
+    while (!isa<VectorType>(GepTy)) {
+      NewArgs.push_back(Constant::getNullValue(i32Ty));
+      GepTy = cast<CompositeType>(GepTy)->getTypeAtIndex(0U);
+    }
+    NewArgs.push_back(NonConstantIdx);
+  }
+  Instruction *Val = NewElts[Idx];
+  if (NewArgs.size() > 1) {
+    Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI);
+    Val->takeName(GEPI);
+  }
+  if (Val->getType() != GEPI->getType())
+    Val = new BitCastInst(Val, GEPI->getType(), Val->getName(), GEPI);
+  GEPI->replaceAllUsesWith(Val);
+  DeadInsts.push_back(GEPI);
+}
+
+/// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. Rewrite it
+/// to mark the lifetime of the scalarized memory.
+void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
+                                    uint64_t Offset,
+                                    SmallVectorImpl<AllocaInst *> &NewElts) {
+  ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0));
+  // Put matching lifetime markers on everything from Offset up to
+  // Offset+OldSize.
+  Type *AIType = AI->getAllocatedType();
+  const DataLayout &DL = II->getModule()->getDataLayout();
+  uint64_t NewOffset = Offset;
+  Type *IdxTy;
+  uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy, DL);
+
+  IRBuilder<> Builder(II);
+  uint64_t Size = OldSize->getLimitedValue();
+
+  if (NewOffset) {
+    // Splice the first element and index 'NewOffset' bytes in.  SROA will
+    // split the alloca again later.
+    unsigned AS = AI->getType()->getAddressSpace();
+    Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy(AS));
+    V = Builder.CreateGEP(Builder.getInt8Ty(), V, Builder.getInt64(NewOffset));
+
+    IdxTy = NewElts[Idx]->getAllocatedType();
+    uint64_t EltSize = DL.getTypeAllocSize(IdxTy) - NewOffset;
+    if (EltSize > Size) {
+      EltSize = Size;
+      Size = 0;
+    } else {
+      Size -= EltSize;
+    }
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+      Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize));
+    else
+      Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize));
+    ++Idx;
+  }
+
+  for (; Idx != NewElts.size() && Size; ++Idx) {
+    IdxTy = NewElts[Idx]->getAllocatedType();
+    uint64_t EltSize = DL.getTypeAllocSize(IdxTy);
+    if (EltSize > Size) {
+      EltSize = Size;
+      Size = 0;
+    } else {
+      Size -= EltSize;
+    }
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+      Builder.CreateLifetimeStart(NewElts[Idx],
+                                  Builder.getInt64(EltSize));
+    else
+      Builder.CreateLifetimeEnd(NewElts[Idx],
+                                Builder.getInt64(EltSize));
+  }
+  DeadInsts.push_back(II);
+}
+
+/// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI.
+/// Rewrite it to copy or set the elements of the scalarized memory.
+void
+SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
+                                   AllocaInst *AI,
+                                   SmallVectorImpl<AllocaInst *> &NewElts) {
+  // If this is a memcpy/memmove, construct the other pointer as the
+  // appropriate type.  The "Other" pointer is the pointer that goes to memory
+  // that doesn't have anything to do with the alloca that we are promoting. For
+  // memset, this Value* stays null.
+  Value *OtherPtr = nullptr;
+  unsigned MemAlignment = MI->getAlignment();
+  if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcopy
+    if (Inst == MTI->getRawDest())
+      OtherPtr = MTI->getRawSource();
+    else {
+      assert(Inst == MTI->getRawSource());
+      OtherPtr = MTI->getRawDest();
+    }
+  }
+
+  // If there is an other pointer, we want to convert it to the same pointer
+  // type as AI has, so we can GEP through it safely.
+  if (OtherPtr) {
+    unsigned AddrSpace =
+      cast<PointerType>(OtherPtr->getType())->getAddressSpace();
+
+    // Remove bitcasts and all-zero GEPs from OtherPtr.  This is an
+    // optimization, but it's also required to detect the corner case where
+    // both pointer operands are referencing the same memory, and where
+    // OtherPtr may be a bitcast or GEP that currently being rewritten.  (This
+    // function is only called for mem intrinsics that access the whole
+    // aggregate, so non-zero GEPs are not an issue here.)
+    OtherPtr = OtherPtr->stripPointerCasts();
+
+    // Copying the alloca to itself is a no-op: just delete it.
+    if (OtherPtr == AI || OtherPtr == NewElts[0]) {
+      // This code will run twice for a no-op memcpy -- once for each operand.
+      // Put only one reference to MI on the DeadInsts list.
+      for (SmallVectorImpl<Value *>::const_iterator I = DeadInsts.begin(),
+             E = DeadInsts.end(); I != E; ++I)
+        if (*I == MI) return;
+      DeadInsts.push_back(MI);
+      return;
+    }
+
+    // If the pointer is not the right type, insert a bitcast to the right
+    // type.
+    Type *NewTy =
+      PointerType::get(AI->getType()->getElementType(), AddrSpace);
+
+    if (OtherPtr->getType() != NewTy)
+      OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI);
+  }
+
+  // Process each element of the aggregate.
+  bool SROADest = MI->getRawDest() == Inst;
+
+  Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext()));
+  const DataLayout &DL = MI->getModule()->getDataLayout();
+
+  for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+    // If this is a memcpy/memmove, emit a GEP of the other element address.
+    Value *OtherElt = nullptr;
+    unsigned OtherEltAlign = MemAlignment;
+
+    if (OtherPtr) {
+      Value *Idx[2] = { Zero,
+                      ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) };
+      OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx,
+                                              OtherPtr->getName()+"."+Twine(i),
+                                                   MI);
+      uint64_t EltOffset;
+      PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
+      Type *OtherTy = OtherPtrTy->getElementType();
+      if (StructType *ST = dyn_cast<StructType>(OtherTy)) {
+        EltOffset = DL.getStructLayout(ST)->getElementOffset(i);
+      } else {
+        Type *EltTy = cast<SequentialType>(OtherTy)->getElementType();
+        EltOffset = DL.getTypeAllocSize(EltTy) * i;
+      }
+
+      // The alignment of the other pointer is the guaranteed alignment of the
+      // element, which is affected by both the known alignment of the whole
+      // mem intrinsic and the alignment of the element.  If the alignment of
+      // the memcpy (f.e.) is 32 but the element is at a 4-byte offset, then the
+      // known alignment is just 4 bytes.
+      OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset);
+    }
+
+    Value *EltPtr = NewElts[i];
+    Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType();
+
+    // If we got down to a scalar, insert a load or store as appropriate.
+    if (EltTy->isSingleValueType()) {
+      if (isa<MemTransferInst>(MI)) {
+        if (SROADest) {
+          // From Other to Alloca.
+          Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI);
+          new StoreInst(Elt, EltPtr, MI);
+        } else {
+          // From Alloca to Other.
+          Value *Elt = new LoadInst(EltPtr, "tmp", MI);
+          new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI);
+        }
+        continue;
+      }
+      assert(isa<MemSetInst>(MI));
+
+      // If the stored element is zero (common case), just store a null
+      // constant.
+      Constant *StoreVal;
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getArgOperand(1))) {
+        if (CI->isZero()) {
+          StoreVal = Constant::getNullValue(EltTy);  // 0.0, null, 0, <0,0>
+        } else {
+          // If EltTy is a vector type, get the element type.
+          Type *ValTy = EltTy->getScalarType();
+
+          // Construct an integer with the right value.
+          unsigned EltSize = DL.getTypeSizeInBits(ValTy);
+          APInt OneVal(EltSize, CI->getZExtValue());
+          APInt TotalVal(OneVal);
+          // Set each byte.
+          for (unsigned i = 0; 8*i < EltSize; ++i) {
+            TotalVal = TotalVal.shl(8);
+            TotalVal |= OneVal;
+          }
+
+          // Convert the integer value to the appropriate type.
+          StoreVal = ConstantInt::get(CI->getContext(), TotalVal);
+          if (ValTy->isPointerTy())
+            StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy);
+          else if (ValTy->isFloatingPointTy())
+            StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy);
+          assert(StoreVal->getType() == ValTy && "Type mismatch!");
+
+          // If the requested value was a vector constant, create it.
+          if (EltTy->isVectorTy()) {
+            unsigned NumElts = cast<VectorType>(EltTy)->getNumElements();
+            StoreVal = ConstantVector::getSplat(NumElts, StoreVal);
+          }
+        }
+        new StoreInst(StoreVal, EltPtr, MI);
+        continue;
+      }
+      // Otherwise, if we're storing a byte variable, use a memset call for
+      // this element.
+    }
+
+    unsigned EltSize = DL.getTypeAllocSize(EltTy);
+    if (!EltSize)
+      continue;
+
+    IRBuilder<> Builder(MI);
+
+    // Finally, insert the meminst for this element.
+    if (isa<MemSetInst>(MI)) {
+      Builder.CreateMemSet(EltPtr, MI->getArgOperand(1), EltSize,
+                           MI->isVolatile());
+    } else {
+      assert(isa<MemTransferInst>(MI));
+      Value *Dst = SROADest ? EltPtr : OtherElt;  // Dest ptr
+      Value *Src = SROADest ? OtherElt : EltPtr;  // Src ptr
+
+      if (isa<MemCpyInst>(MI))
+        Builder.CreateMemCpy(Dst, Src, EltSize, OtherEltAlign,MI->isVolatile());
+      else
+        Builder.CreateMemMove(Dst, Src, EltSize,OtherEltAlign,MI->isVolatile());
+    }
+  }
+  DeadInsts.push_back(MI);
+}
+
+/// RewriteStoreUserOfWholeAlloca - We found a store of an integer that
+/// overwrites the entire allocation.  Extract out the pieces of the stored
+/// integer and store them individually.
+void
+SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
+                                    SmallVectorImpl<AllocaInst *> &NewElts) {
+  // Extract each element out of the integer according to its structure offset
+  // and store the element value to the individual alloca.
+  Value *SrcVal = SI->getOperand(0);
+  Type *AllocaEltTy = AI->getAllocatedType();
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+  uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy);
+
+  IRBuilder<> Builder(SI);
+
+  // Handle tail padding by extending the operand
+  if (DL.getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
+    SrcVal = Builder.CreateZExt(SrcVal,
+                            IntegerType::get(SI->getContext(), AllocaSizeBits));
+
+  DEBUG(dbgs() << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << '\n' << *SI
+               << '\n');
+
+  // There are two forms here: AI could be an array or struct.  Both cases
+  // have different ways to compute the element offset.
+  if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+    const StructLayout *Layout = DL.getStructLayout(EltSTy);
+
+    for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+      // Get the number of bits to shift SrcVal to get the value.
+      Type *FieldTy = EltSTy->getElementType(i);
+      uint64_t Shift = Layout->getElementOffsetInBits(i);
+
+      if (DL.isBigEndian())
+        Shift = AllocaSizeBits - Shift - DL.getTypeAllocSizeInBits(FieldTy);
+
+      Value *EltVal = SrcVal;
+      if (Shift) {
+        Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+        EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
+      }
+
+      // Truncate down to an integer of the right size.
+      uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy);
+
+      // Ignore zero sized fields like {}, they obviously contain no data.
+      if (FieldSizeBits == 0) continue;
+
+      if (FieldSizeBits != AllocaSizeBits)
+        EltVal = Builder.CreateTrunc(EltVal,
+                             IntegerType::get(SI->getContext(), FieldSizeBits));
+      Value *DestField = NewElts[i];
+      if (EltVal->getType() == FieldTy) {
+        // Storing to an integer field of this size, just do it.
+      } else if (FieldTy->isFloatingPointTy() || FieldTy->isVectorTy()) {
+        // Bitcast to the right element type (for fp/vector values).
+        EltVal = Builder.CreateBitCast(EltVal, FieldTy);
+      } else {
+        // Otherwise, bitcast the dest pointer (for aggregates).
+        DestField = Builder.CreateBitCast(DestField,
+                                     PointerType::getUnqual(EltVal->getType()));
+      }
+      new StoreInst(EltVal, DestField, SI);
+    }
+
+  } else {
+    ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
+    Type *ArrayEltTy = ATy->getElementType();
+    uint64_t ElementOffset = DL.getTypeAllocSizeInBits(ArrayEltTy);
+    uint64_t ElementSizeBits = DL.getTypeSizeInBits(ArrayEltTy);
+
+    uint64_t Shift;
+
+    if (DL.isBigEndian())
+      Shift = AllocaSizeBits-ElementOffset;
+    else
+      Shift = 0;
+
+    for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+      // Ignore zero sized fields like {}, they obviously contain no data.
+      if (ElementSizeBits == 0) continue;
+
+      Value *EltVal = SrcVal;
+      if (Shift) {
+        Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+        EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
+      }
+
+      // Truncate down to an integer of the right size.
+      if (ElementSizeBits != AllocaSizeBits)
+        EltVal = Builder.CreateTrunc(EltVal,
+                                     IntegerType::get(SI->getContext(),
+                                                      ElementSizeBits));
+      Value *DestField = NewElts[i];
+      if (EltVal->getType() == ArrayEltTy) {
+        // Storing to an integer field of this size, just do it.
+      } else if (ArrayEltTy->isFloatingPointTy() ||
+                 ArrayEltTy->isVectorTy()) {
+        // Bitcast to the right element type (for fp/vector values).
+        EltVal = Builder.CreateBitCast(EltVal, ArrayEltTy);
+      } else {
+        // Otherwise, bitcast the dest pointer (for aggregates).
+        DestField = Builder.CreateBitCast(DestField,
+                                     PointerType::getUnqual(EltVal->getType()));
+      }
+      new StoreInst(EltVal, DestField, SI);
+
+      if (DL.isBigEndian())
+        Shift -= ElementOffset;
+      else
+        Shift += ElementOffset;
+    }
+  }
+
+  DeadInsts.push_back(SI);
+}
+
+/// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to
+/// an integer.  Load the individual pieces to form the aggregate value.
+void
+SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
+                                   SmallVectorImpl<AllocaInst *> &NewElts) {
+  // Extract each element out of the NewElts according to its structure offset
+  // and form the result value.
+  Type *AllocaEltTy = AI->getAllocatedType();
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+  uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy);
+
+  DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI
+               << '\n');
+
+  // There are two forms here: AI could be an array or struct.  Both cases
+  // have different ways to compute the element offset.
+  const StructLayout *Layout = nullptr;
+  uint64_t ArrayEltBitOffset = 0;
+  if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+    Layout = DL.getStructLayout(EltSTy);
+  } else {
+    Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
+    ArrayEltBitOffset = DL.getTypeAllocSizeInBits(ArrayEltTy);
+  }
+
+  Value *ResultVal =
+    Constant::getNullValue(IntegerType::get(LI->getContext(), AllocaSizeBits));
+
+  for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+    // Load the value from the alloca.  If the NewElt is an aggregate, cast
+    // the pointer to an integer of the same size before doing the load.
+    Value *SrcField = NewElts[i];
+    Type *FieldTy =
+      cast<PointerType>(SrcField->getType())->getElementType();
+    uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy);
+
+    // Ignore zero sized fields like {}, they obviously contain no data.
+    if (FieldSizeBits == 0) continue;
+
+    IntegerType *FieldIntTy = IntegerType::get(LI->getContext(),
+                                                     FieldSizeBits);
+    if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() &&
+        !FieldTy->isVectorTy())
+      SrcField = new BitCastInst(SrcField,
+                                 PointerType::getUnqual(FieldIntTy),
+                                 "", LI);
+    SrcField = new LoadInst(SrcField, "sroa.load.elt", LI);
+
+    // If SrcField is a fp or vector of the right size but that isn't an
+    // integer type, bitcast to an integer so we can shift it.
+    if (SrcField->getType() != FieldIntTy)
+      SrcField = new BitCastInst(SrcField, FieldIntTy, "", LI);
+
+    // Zero extend the field to be the same size as the final alloca so that
+    // we can shift and insert it.
+    if (SrcField->getType() != ResultVal->getType())
+      SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI);
+
+    // Determine the number of bits to shift SrcField.
+    uint64_t Shift;
+    if (Layout) // Struct case.
+      Shift = Layout->getElementOffsetInBits(i);
+    else  // Array case.
+      Shift = i*ArrayEltBitOffset;
+
+    if (DL.isBigEndian())
+      Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth();
+
+    if (Shift) {
+      Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift);
+      SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI);
+    }
+
+    // Don't create an 'or x, 0' on the first iteration.
+    if (!isa<Constant>(ResultVal) ||
+        !cast<Constant>(ResultVal)->isNullValue())
+      ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI);
+    else
+      ResultVal = SrcField;
+  }
+
+  // Handle tail padding by truncating the result
+  if (DL.getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
+    ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI);
+
+  LI->replaceAllUsesWith(ResultVal);
+  DeadInsts.push_back(LI);
+}
+
+/// HasPadding - Return true if the specified type has any structure or
+/// alignment padding in between the elements that would be split apart
+/// by SROA; return false otherwise.
+static bool HasPadding(Type *Ty, const DataLayout &DL) {
+  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    Ty = ATy->getElementType();
+    return DL.getTypeSizeInBits(Ty) != DL.getTypeAllocSizeInBits(Ty);
+  }
+
+  // SROA currently handles only Arrays and Structs.
+  StructType *STy = cast<StructType>(Ty);
+  const StructLayout *SL = DL.getStructLayout(STy);
+  unsigned PrevFieldBitOffset = 0;
+  for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+    unsigned FieldBitOffset = SL->getElementOffsetInBits(i);
+
+    // Check to see if there is any padding between this element and the
+    // previous one.
+    if (i) {
+      unsigned PrevFieldEnd =
+        PrevFieldBitOffset+DL.getTypeSizeInBits(STy->getElementType(i-1));
+      if (PrevFieldEnd < FieldBitOffset)
+        return true;
+    }
+    PrevFieldBitOffset = FieldBitOffset;
+  }
+  // Check for tail padding.
+  if (unsigned EltCount = STy->getNumElements()) {
+    unsigned PrevFieldEnd = PrevFieldBitOffset +
+      DL.getTypeSizeInBits(STy->getElementType(EltCount-1));
+    if (PrevFieldEnd < SL->getSizeInBits())
+      return true;
+  }
+  return false;
+}
+
+/// isSafeStructAllocaToScalarRepl - Check to see if the specified allocation of
+/// an aggregate can be broken down into elements.  Return 0 if not, 3 if safe,
+/// or 1 if safe after canonicalization has been performed.
+bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
+  // Loop over the use list of the alloca.  We can only transform it if all of
+  // the users are safe to transform.
+  AllocaInfo Info(AI);
+
+  isSafeForScalarRepl(AI, 0, Info);
+  if (Info.isUnsafe) {
+    DEBUG(dbgs() << "Cannot transform: " << *AI << '\n');
+    return false;
+  }
+
+  const DataLayout &DL = AI->getModule()->getDataLayout();
+
+  // Okay, we know all the users are promotable.  If the aggregate is a memcpy
+  // source and destination, we have to be careful.  In particular, the memcpy
+  // could be moving around elements that live in structure padding of the LLVM
+  // types, but may actually be used.  In these cases, we refuse to promote the
+  // struct.
+  if (Info.isMemCpySrc && Info.isMemCpyDst &&
+      HasPadding(AI->getAllocatedType(), DL))
+    return false;
+
+  // If the alloca never has an access to just *part* of it, but is accessed
+  // via loads and stores, then we should use ConvertToScalarInfo to promote
+  // the alloca instead of promoting each piece at a time and inserting fission
+  // and fusion code.
+  if (!Info.hasSubelementAccess && Info.hasALoadOrStore) {
+    // If the struct/array just has one element, use basic SRoA.
+    if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+      if (ST->getNumElements() > 1) return false;
+    } else {
+      if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1)
+        return false;
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
new file mode 100644
index 0000000..054bacd
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -0,0 +1,678 @@
+//===--- Scalarizer.cpp - Scalarize vector operations ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts vector operations into scalar operations, in order
+// to expose optimization opportunities on the individual scalar operations.
+// It is mainly intended for targets that do not have vector units, but it
+// may also be useful for revectorizing code to different vector widths.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarizer"
+
+namespace {
+// Used to store the scattered form of a vector.
+typedef SmallVector<Value *, 8> ValueVector;
+
+// Used to map a vector Value to its scattered form.  We use std::map
+// because we want iterators to persist across insertion and because the
+// values are relatively large.
+typedef std::map<Value *, ValueVector> ScatterMap;
+
+// Lists Instructions that have been replaced with scalar implementations,
+// along with a pointer to their scattered forms.
+typedef SmallVector<std::pair<Instruction *, ValueVector *>, 16> GatherList;
+
+// Provides a very limited vector-like interface for lazily accessing one
+// component of a scattered vector or vector pointer.
+class Scatterer {
+public:
+  Scatterer() {}
+
+  // Scatter V into Size components.  If new instructions are needed,
+  // insert them before BBI in BB.  If Cache is nonnull, use it to cache
+  // the results.
+  Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+            ValueVector *cachePtr = nullptr);
+
+  // Return component I, creating a new Value for it if necessary.
+  Value *operator[](unsigned I);
+
+  // Return the number of components.
+  unsigned size() const { return Size; }
+
+private:
+  BasicBlock *BB;
+  BasicBlock::iterator BBI;
+  Value *V;
+  ValueVector *CachePtr;
+  PointerType *PtrTy;
+  ValueVector Tmp;
+  unsigned Size;
+};
+
+// FCmpSpliiter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
+// called Name that compares X and Y in the same way as FCI.
+struct FCmpSplitter {
+  FCmpSplitter(FCmpInst &fci) : FCI(fci) {}
+  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+                    const Twine &Name) const {
+    return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name);
+  }
+  FCmpInst &FCI;
+};
+
+// ICmpSpliiter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
+// called Name that compares X and Y in the same way as ICI.
+struct ICmpSplitter {
+  ICmpSplitter(ICmpInst &ici) : ICI(ici) {}
+  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+                    const Twine &Name) const {
+    return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name);
+  }
+  ICmpInst &ICI;
+};
+
+// BinarySpliiter(BO)(Builder, X, Y, Name) uses Builder to create
+// a binary operator like BO called Name with operands X and Y.
+struct BinarySplitter {
+  BinarySplitter(BinaryOperator &bo) : BO(bo) {}
+  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+                    const Twine &Name) const {
+    return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name);
+  }
+  BinaryOperator &BO;
+};
+
+// Information about a load or store that we're scalarizing.
+struct VectorLayout {
+  VectorLayout() : VecTy(nullptr), ElemTy(nullptr), VecAlign(0), ElemSize(0) {}
+
+  // Return the alignment of element I.
+  uint64_t getElemAlign(unsigned I) {
+    return MinAlign(VecAlign, I * ElemSize);
+  }
+
+  // The type of the vector.
+  VectorType *VecTy;
+
+  // The type of each element.
+  Type *ElemTy;
+
+  // The alignment of the vector.
+  uint64_t VecAlign;
+
+  // The size of each element.
+  uint64_t ElemSize;
+};
+
+class Scalarizer : public FunctionPass,
+                   public InstVisitor<Scalarizer, bool> {
+public:
+  static char ID;
+
+  Scalarizer() :
+    FunctionPass(ID) {
+    initializeScalarizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+  // InstVisitor methods.  They return true if the instruction was scalarized,
+  // false if nothing changed.
+  bool visitInstruction(Instruction &) { return false; }
+  bool visitSelectInst(SelectInst &SI);
+  bool visitICmpInst(ICmpInst &);
+  bool visitFCmpInst(FCmpInst &);
+  bool visitBinaryOperator(BinaryOperator &);
+  bool visitGetElementPtrInst(GetElementPtrInst &);
+  bool visitCastInst(CastInst &);
+  bool visitBitCastInst(BitCastInst &);
+  bool visitShuffleVectorInst(ShuffleVectorInst &);
+  bool visitPHINode(PHINode &);
+  bool visitLoadInst(LoadInst &);
+  bool visitStoreInst(StoreInst &);
+
+  static void registerOptions() {
+    // This is disabled by default because having separate loads and stores
+    // makes it more likely that the -combiner-alias-analysis limits will be
+    // reached.
+    OptionRegistry::registerOption<bool, Scalarizer,
+                                 &Scalarizer::ScalarizeLoadStore>(
+        "scalarize-load-store",
+        "Allow the scalarizer pass to scalarize loads and store", false);
+  }
+
+private:
+  Scatterer scatter(Instruction *, Value *);
+  void gather(Instruction *, const ValueVector &);
+  bool canTransferMetadata(unsigned Kind);
+  void transferMetadata(Instruction *, const ValueVector &);
+  bool getVectorLayout(Type *, unsigned, VectorLayout &, const DataLayout &);
+  bool finish();
+
+  template<typename T> bool splitBinary(Instruction &, const T &);
+
+  ScatterMap Scattered;
+  GatherList Gathered;
+  unsigned ParallelLoopAccessMDKind;
+  bool ScalarizeLoadStore;
+};
+
+char Scalarizer::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer",
+                             "Scalarize vector operations", false, false)
+
+Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+                     ValueVector *cachePtr)
+  : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) {
+  Type *Ty = V->getType();
+  PtrTy = dyn_cast<PointerType>(Ty);
+  if (PtrTy)
+    Ty = PtrTy->getElementType();
+  Size = Ty->getVectorNumElements();
+  if (!CachePtr)
+    Tmp.resize(Size, nullptr);
+  else if (CachePtr->empty())
+    CachePtr->resize(Size, nullptr);
+  else
+    assert(Size == CachePtr->size() && "Inconsistent vector sizes");
+}
+
+// Return component I, creating a new Value for it if necessary.
+Value *Scatterer::operator[](unsigned I) {
+  ValueVector &CV = (CachePtr ? *CachePtr : Tmp);
+  // Try to reuse a previous value.
+  if (CV[I])
+    return CV[I];
+  IRBuilder<> Builder(BB, BBI);
+  if (PtrTy) {
+    if (!CV[0]) {
+      Type *Ty =
+        PointerType::get(PtrTy->getElementType()->getVectorElementType(),
+                         PtrTy->getAddressSpace());
+      CV[0] = Builder.CreateBitCast(V, Ty, V->getName() + ".i0");
+    }
+    if (I != 0)
+      CV[I] = Builder.CreateConstGEP1_32(nullptr, CV[0], I,
+                                         V->getName() + ".i" + Twine(I));
+  } else {
+    // Search through a chain of InsertElementInsts looking for element I.
+    // Record other elements in the cache.  The new V is still suitable
+    // for all uncached indices.
+    for (;;) {
+      InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
+      if (!Insert)
+        break;
+      ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
+      if (!Idx)
+        break;
+      unsigned J = Idx->getZExtValue();
+      V = Insert->getOperand(0);
+      if (I == J) {
+        CV[J] = Insert->getOperand(1);
+        return CV[J];
+      } else if (!CV[J]) {
+        // Only cache the first entry we find for each index we're not actively
+        // searching for. This prevents us from going too far up the chain and
+        // caching incorrect entries.
+        CV[J] = Insert->getOperand(1);
+      }
+    }
+    CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
+                                         V->getName() + ".i" + Twine(I));
+  }
+  return CV[I];
+}
+
+bool Scalarizer::doInitialization(Module &M) {
+  ParallelLoopAccessMDKind =
+      M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+  ScalarizeLoadStore =
+      M.getContext().getOption<bool, Scalarizer, &Scalarizer::ScalarizeLoadStore>();
+  return false;
+}
+
+bool Scalarizer::runOnFunction(Function &F) {
+  assert(Gathered.empty() && Scattered.empty());
+  for (BasicBlock &BB : F) {
+    for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) {
+      Instruction *I = &*II;
+      bool Done = visit(I);
+      ++II;
+      if (Done && I->getType()->isVoidTy())
+        I->eraseFromParent();
+    }
+  }
+  return finish();
+}
+
+// Return a scattered form of V that can be accessed by Point.  V must be a
+// vector or a pointer to a vector.
+Scatterer Scalarizer::scatter(Instruction *Point, Value *V) {
+  if (Argument *VArg = dyn_cast<Argument>(V)) {
+    // Put the scattered form of arguments in the entry block,
+    // so that it can be used everywhere.
+    Function *F = VArg->getParent();
+    BasicBlock *BB = &F->getEntryBlock();
+    return Scatterer(BB, BB->begin(), V, &Scattered[V]);
+  }
+  if (Instruction *VOp = dyn_cast<Instruction>(V)) {
+    // Put the scattered form of an instruction directly after the
+    // instruction.
+    BasicBlock *BB = VOp->getParent();
+    return Scatterer(BB, std::next(BasicBlock::iterator(VOp)),
+                     V, &Scattered[V]);
+  }
+  // In the fallback case, just put the scattered before Point and
+  // keep the result local to Point.
+  return Scatterer(Point->getParent(), Point->getIterator(), V);
+}
+
+// Replace Op with the gathered form of the components in CV.  Defer the
+// deletion of Op and creation of the gathered form to the end of the pass,
+// so that we can avoid creating the gathered form if all uses of Op are
+// replaced with uses of CV.
+void Scalarizer::gather(Instruction *Op, const ValueVector &CV) {
+  // Since we're not deleting Op yet, stub out its operands, so that it
+  // doesn't make anything live unnecessarily.
+  for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I)
+    Op->setOperand(I, UndefValue::get(Op->getOperand(I)->getType()));
+
+  transferMetadata(Op, CV);
+
+  // If we already have a scattered form of Op (created from ExtractElements
+  // of Op itself), replace them with the new form.
+  ValueVector &SV = Scattered[Op];
+  if (!SV.empty()) {
+    for (unsigned I = 0, E = SV.size(); I != E; ++I) {
+      Instruction *Old = cast<Instruction>(SV[I]);
+      CV[I]->takeName(Old);
+      Old->replaceAllUsesWith(CV[I]);
+      Old->eraseFromParent();
+    }
+  }
+  SV = CV;
+  Gathered.push_back(GatherList::value_type(Op, &SV));
+}
+
+// Return true if it is safe to transfer the given metadata tag from
+// vector to scalar instructions.
+bool Scalarizer::canTransferMetadata(unsigned Tag) {
+  return (Tag == LLVMContext::MD_tbaa
+          || Tag == LLVMContext::MD_fpmath
+          || Tag == LLVMContext::MD_tbaa_struct
+          || Tag == LLVMContext::MD_invariant_load
+          || Tag == LLVMContext::MD_alias_scope
+          || Tag == LLVMContext::MD_noalias
+          || Tag == ParallelLoopAccessMDKind);
+}
+
+// Transfer metadata from Op to the instructions in CV if it is known
+// to be safe to do so.
+void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  Op->getAllMetadataOtherThanDebugLoc(MDs);
+  for (unsigned I = 0, E = CV.size(); I != E; ++I) {
+    if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
+      for (SmallVectorImpl<std::pair<unsigned, MDNode *>>::iterator
+               MI = MDs.begin(),
+               ME = MDs.end();
+           MI != ME; ++MI)
+        if (canTransferMetadata(MI->first))
+          New->setMetadata(MI->first, MI->second);
+      New->setDebugLoc(Op->getDebugLoc());
+    }
+  }
+}
+
+// Try to fill in Layout from Ty, returning true on success.  Alignment is
+// the alignment of the vector, or 0 if the ABI default should be used.
+bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment,
+                                 VectorLayout &Layout, const DataLayout &DL) {
+  // Make sure we're dealing with a vector.
+  Layout.VecTy = dyn_cast<VectorType>(Ty);
+  if (!Layout.VecTy)
+    return false;
+
+  // Check that we're dealing with full-byte elements.
+  Layout.ElemTy = Layout.VecTy->getElementType();
+  if (DL.getTypeSizeInBits(Layout.ElemTy) !=
+      DL.getTypeStoreSizeInBits(Layout.ElemTy))
+    return false;
+
+  if (Alignment)
+    Layout.VecAlign = Alignment;
+  else
+    Layout.VecAlign = DL.getABITypeAlignment(Layout.VecTy);
+  Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy);
+  return true;
+}
+
+// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name)
+// to create an instruction like I with operands X and Y and name Name.
+template<typename Splitter>
+bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) {
+  VectorType *VT = dyn_cast<VectorType>(I.getType());
+  if (!VT)
+    return false;
+
+  unsigned NumElems = VT->getNumElements();
+  IRBuilder<> Builder(&I);
+  Scatterer Op0 = scatter(&I, I.getOperand(0));
+  Scatterer Op1 = scatter(&I, I.getOperand(1));
+  assert(Op0.size() == NumElems && "Mismatched binary operation");
+  assert(Op1.size() == NumElems && "Mismatched binary operation");
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned Elem = 0; Elem < NumElems; ++Elem)
+    Res[Elem] = Split(Builder, Op0[Elem], Op1[Elem],
+                      I.getName() + ".i" + Twine(Elem));
+  gather(&I, Res);
+  return true;
+}
+
+bool Scalarizer::visitSelectInst(SelectInst &SI) {
+  VectorType *VT = dyn_cast<VectorType>(SI.getType());
+  if (!VT)
+    return false;
+
+  unsigned NumElems = VT->getNumElements();
+  IRBuilder<> Builder(&SI);
+  Scatterer Op1 = scatter(&SI, SI.getOperand(1));
+  Scatterer Op2 = scatter(&SI, SI.getOperand(2));
+  assert(Op1.size() == NumElems && "Mismatched select");
+  assert(Op2.size() == NumElems && "Mismatched select");
+  ValueVector Res;
+  Res.resize(NumElems);
+
+  if (SI.getOperand(0)->getType()->isVectorTy()) {
+    Scatterer Op0 = scatter(&SI, SI.getOperand(0));
+    assert(Op0.size() == NumElems && "Mismatched select");
+    for (unsigned I = 0; I < NumElems; ++I)
+      Res[I] = Builder.CreateSelect(Op0[I], Op1[I], Op2[I],
+                                    SI.getName() + ".i" + Twine(I));
+  } else {
+    Value *Op0 = SI.getOperand(0);
+    for (unsigned I = 0; I < NumElems; ++I)
+      Res[I] = Builder.CreateSelect(Op0, Op1[I], Op2[I],
+                                    SI.getName() + ".i" + Twine(I));
+  }
+  gather(&SI, Res);
+  return true;
+}
+
+bool Scalarizer::visitICmpInst(ICmpInst &ICI) {
+  return splitBinary(ICI, ICmpSplitter(ICI));
+}
+
+bool Scalarizer::visitFCmpInst(FCmpInst &FCI) {
+  return splitBinary(FCI, FCmpSplitter(FCI));
+}
+
+bool Scalarizer::visitBinaryOperator(BinaryOperator &BO) {
+  return splitBinary(BO, BinarySplitter(BO));
+}
+
+bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+  VectorType *VT = dyn_cast<VectorType>(GEPI.getType());
+  if (!VT)
+    return false;
+
+  IRBuilder<> Builder(&GEPI);
+  unsigned NumElems = VT->getNumElements();
+  unsigned NumIndices = GEPI.getNumIndices();
+
+  Scatterer Base = scatter(&GEPI, GEPI.getOperand(0));
+
+  SmallVector<Scatterer, 8> Ops;
+  Ops.resize(NumIndices);
+  for (unsigned I = 0; I < NumIndices; ++I)
+    Ops[I] = scatter(&GEPI, GEPI.getOperand(I + 1));
+
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned I = 0; I < NumElems; ++I) {
+    SmallVector<Value *, 8> Indices;
+    Indices.resize(NumIndices);
+    for (unsigned J = 0; J < NumIndices; ++J)
+      Indices[J] = Ops[J][I];
+    Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices,
+                               GEPI.getName() + ".i" + Twine(I));
+    if (GEPI.isInBounds())
+      if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I]))
+        NewGEPI->setIsInBounds();
+  }
+  gather(&GEPI, Res);
+  return true;
+}
+
+bool Scalarizer::visitCastInst(CastInst &CI) {
+  VectorType *VT = dyn_cast<VectorType>(CI.getDestTy());
+  if (!VT)
+    return false;
+
+  unsigned NumElems = VT->getNumElements();
+  IRBuilder<> Builder(&CI);
+  Scatterer Op0 = scatter(&CI, CI.getOperand(0));
+  assert(Op0.size() == NumElems && "Mismatched cast");
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned I = 0; I < NumElems; ++I)
+    Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(),
+                                CI.getName() + ".i" + Twine(I));
+  gather(&CI, Res);
+  return true;
+}
+
+bool Scalarizer::visitBitCastInst(BitCastInst &BCI) {
+  VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy());
+  VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy());
+  if (!DstVT || !SrcVT)
+    return false;
+
+  unsigned DstNumElems = DstVT->getNumElements();
+  unsigned SrcNumElems = SrcVT->getNumElements();
+  IRBuilder<> Builder(&BCI);
+  Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));
+  ValueVector Res;
+  Res.resize(DstNumElems);
+
+  if (DstNumElems == SrcNumElems) {
+    for (unsigned I = 0; I < DstNumElems; ++I)
+      Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(),
+                                     BCI.getName() + ".i" + Twine(I));
+  } else if (DstNumElems > SrcNumElems) {
+    // <M x t1> -> <N*M x t2>.  Convert each t1 to <N x t2> and copy the
+    // individual elements to the destination.
+    unsigned FanOut = DstNumElems / SrcNumElems;
+    Type *MidTy = VectorType::get(DstVT->getElementType(), FanOut);
+    unsigned ResI = 0;
+    for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) {
+      Value *V = Op0[Op0I];
+      Instruction *VI;
+      // Look through any existing bitcasts before converting to <N x t2>.
+      // In the best case, the resulting conversion might be a no-op.
+      while ((VI = dyn_cast<Instruction>(V)) &&
+             VI->getOpcode() == Instruction::BitCast)
+        V = VI->getOperand(0);
+      V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast");
+      Scatterer Mid = scatter(&BCI, V);
+      for (unsigned MidI = 0; MidI < FanOut; ++MidI)
+        Res[ResI++] = Mid[MidI];
+    }
+  } else {
+    // <N*M x t1> -> <M x t2>.  Convert each group of <N x t1> into a t2.
+    unsigned FanIn = SrcNumElems / DstNumElems;
+    Type *MidTy = VectorType::get(SrcVT->getElementType(), FanIn);
+    unsigned Op0I = 0;
+    for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
+      Value *V = UndefValue::get(MidTy);
+      for (unsigned MidI = 0; MidI < FanIn; ++MidI)
+        V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI),
+                                        BCI.getName() + ".i" + Twine(ResI)
+                                        + ".upto" + Twine(MidI));
+      Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(),
+                                        BCI.getName() + ".i" + Twine(ResI));
+    }
+  }
+  gather(&BCI, Res);
+  return true;
+}
+
+bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+  VectorType *VT = dyn_cast<VectorType>(SVI.getType());
+  if (!VT)
+    return false;
+
+  unsigned NumElems = VT->getNumElements();
+  Scatterer Op0 = scatter(&SVI, SVI.getOperand(0));
+  Scatterer Op1 = scatter(&SVI, SVI.getOperand(1));
+  ValueVector Res;
+  Res.resize(NumElems);
+
+  for (unsigned I = 0; I < NumElems; ++I) {
+    int Selector = SVI.getMaskValue(I);
+    if (Selector < 0)
+      Res[I] = UndefValue::get(VT->getElementType());
+    else if (unsigned(Selector) < Op0.size())
+      Res[I] = Op0[Selector];
+    else
+      Res[I] = Op1[Selector - Op0.size()];
+  }
+  gather(&SVI, Res);
+  return true;
+}
+
+bool Scalarizer::visitPHINode(PHINode &PHI) {
+  VectorType *VT = dyn_cast<VectorType>(PHI.getType());
+  if (!VT)
+    return false;
+
+  unsigned NumElems = VT->getNumElements();
+  IRBuilder<> Builder(&PHI);
+  ValueVector Res;
+  Res.resize(NumElems);
+
+  unsigned NumOps = PHI.getNumOperands();
+  for (unsigned I = 0; I < NumElems; ++I)
+    Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps,
+                               PHI.getName() + ".i" + Twine(I));
+
+  for (unsigned I = 0; I < NumOps; ++I) {
+    Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I));
+    BasicBlock *IncomingBlock = PHI.getIncomingBlock(I);
+    for (unsigned J = 0; J < NumElems; ++J)
+      cast<PHINode>(Res[J])->addIncoming(Op[J], IncomingBlock);
+  }
+  gather(&PHI, Res);
+  return true;
+}
+
+bool Scalarizer::visitLoadInst(LoadInst &LI) {
+  if (!ScalarizeLoadStore)
+    return false;
+  if (!LI.isSimple())
+    return false;
+
+  VectorLayout Layout;
+  if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout,
+                       LI.getModule()->getDataLayout()))
+    return false;
+
+  unsigned NumElems = Layout.VecTy->getNumElements();
+  IRBuilder<> Builder(&LI);
+  Scatterer Ptr = scatter(&LI, LI.getPointerOperand());
+  ValueVector Res;
+  Res.resize(NumElems);
+
+  for (unsigned I = 0; I < NumElems; ++I)
+    Res[I] = Builder.CreateAlignedLoad(Ptr[I], Layout.getElemAlign(I),
+                                       LI.getName() + ".i" + Twine(I));
+  gather(&LI, Res);
+  return true;
+}
+
+bool Scalarizer::visitStoreInst(StoreInst &SI) {
+  if (!ScalarizeLoadStore)
+    return false;
+  if (!SI.isSimple())
+    return false;
+
+  VectorLayout Layout;
+  Value *FullValue = SI.getValueOperand();
+  if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout,
+                       SI.getModule()->getDataLayout()))
+    return false;
+
+  unsigned NumElems = Layout.VecTy->getNumElements();
+  IRBuilder<> Builder(&SI);
+  Scatterer Ptr = scatter(&SI, SI.getPointerOperand());
+  Scatterer Val = scatter(&SI, FullValue);
+
+  ValueVector Stores;
+  Stores.resize(NumElems);
+  for (unsigned I = 0; I < NumElems; ++I) {
+    unsigned Align = Layout.getElemAlign(I);
+    Stores[I] = Builder.CreateAlignedStore(Val[I], Ptr[I], Align);
+  }
+  transferMetadata(&SI, Stores);
+  return true;
+}
+
+// Delete the instructions that we scalarized.  If a full vector result
+// is still needed, recreate it using InsertElements.
+bool Scalarizer::finish() {
+  // The presence of data in Gathered or Scattered indicates changes
+  // made to the Function.
+  if (Gathered.empty() && Scattered.empty())
+    return false;
+  for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end();
+       GMI != GME; ++GMI) {
+    Instruction *Op = GMI->first;
+    ValueVector &CV = *GMI->second;
+    if (!Op->use_empty()) {
+      // The value is still needed, so recreate it using a series of
+      // InsertElements.
+      Type *Ty = Op->getType();
+      Value *Res = UndefValue::get(Ty);
+      BasicBlock *BB = Op->getParent();
+      unsigned Count = Ty->getVectorNumElements();
+      IRBuilder<> Builder(Op);
+      if (isa<PHINode>(Op))
+        Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+      for (unsigned I = 0; I < Count; ++I)
+        Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
+                                          Op->getName() + ".upto" + Twine(I));
+      Res->takeName(Op);
+      Op->replaceAllUsesWith(Res);
+    }
+    Op->eraseFromParent();
+  }
+  Gathered.clear();
+  Scattered.clear();
+  return true;
+}
+
+FunctionPass *llvm::createScalarizerPass() {
+  return new Scalarizer();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
new file mode 100644
index 0000000..86a10d2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -0,0 +1,1265 @@
+//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loop unrolling may create many similar GEPs for array accesses.
+// e.g., a 2-level loop
+//
+// float a[32][32]; // global variable
+//
+// for (int i = 0; i < 2; ++i) {
+//   for (int j = 0; j < 2; ++j) {
+//     ...
+//     ... = a[x + i][y + j];
+//     ...
+//   }
+// }
+//
+// will probably be unrolled to:
+//
+// gep %a, 0, %x, %y; load
+// gep %a, 0, %x, %y + 1; load
+// gep %a, 0, %x + 1, %y; load
+// gep %a, 0, %x + 1, %y + 1; load
+//
+// LLVM's GVN does not use partial redundancy elimination yet, and is thus
+// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
+// significant slowdown in targets with limited addressing modes. For instance,
+// because the PTX target does not support the reg+reg addressing mode, the
+// NVPTX backend emits PTX code that literally computes the pointer address of
+// each GEP, wasting tons of registers. It emits the following PTX for the
+// first load and similar PTX for other loads.
+//
+// mov.u32         %r1, %x;
+// mov.u32         %r2, %y;
+// mul.wide.u32    %rl2, %r1, 128;
+// mov.u64         %rl3, a;
+// add.s64         %rl4, %rl3, %rl2;
+// mul.wide.u32    %rl5, %r2, 4;
+// add.s64         %rl6, %rl4, %rl5;
+// ld.global.f32   %f1, [%rl6];
+//
+// To reduce the register pressure, the optimization implemented in this file
+// merges the common part of a group of GEPs, so we can compute each pointer
+// address by adding a simple offset to the common part, saving many registers.
+//
+// It works by splitting each GEP into a variadic base and a constant offset.
+// The variadic base can be computed once and reused by multiple GEPs, and the
+// constant offsets can be nicely folded into the reg+immediate addressing mode
+// (supported by most targets) without using any extra register.
+//
+// For instance, we transform the four GEPs and four loads in the above example
+// into:
+//
+// base = gep a, 0, x, y
+// load base
+// laod base + 1  * sizeof(float)
+// load base + 32 * sizeof(float)
+// load base + 33 * sizeof(float)
+//
+// Given the transformed IR, a backend that supports the reg+immediate
+// addressing mode can easily fold the pointer arithmetics into the loads. For
+// example, the NVPTX backend can easily fold the pointer arithmetics into the
+// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
+//
+// mov.u32         %r1, %tid.x;
+// mov.u32         %r2, %tid.y;
+// mul.wide.u32    %rl2, %r1, 128;
+// mov.u64         %rl3, a;
+// add.s64         %rl4, %rl3, %rl2;
+// mul.wide.u32    %rl5, %r2, 4;
+// add.s64         %rl6, %rl4, %rl5;
+// ld.global.f32   %f1, [%rl6]; // so far the same as unoptimized PTX
+// ld.global.f32   %f2, [%rl6+4]; // much better
+// ld.global.f32   %f3, [%rl6+128]; // much better
+// ld.global.f32   %f4, [%rl6+132]; // much better
+//
+// Another improvement enabled by the LowerGEP flag is to lower a GEP with
+// multiple indices to either multiple GEPs with a single index or arithmetic
+// operations (depending on whether the target uses alias analysis in codegen).
+// Such transformation can have following benefits:
+// (1) It can always extract constants in the indices of structure type.
+// (2) After such Lowering, there are more optimization opportunities such as
+//     CSE, LICM and CGP.
+//
+// E.g. The following GEPs have multiple indices:
+//  BB1:
+//    %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
+//    load %p
+//    ...
+//  BB2:
+//    %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2
+//    load %p2
+//    ...
+//
+// We can not do CSE for to the common part related to index "i64 %i". Lowering
+// GEPs can achieve such goals.
+// If the target does not use alias analysis in codegen, this pass will
+// lower a GEP with multiple indices into arithmetic operations:
+//  BB1:
+//    %1 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
+//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
+//    %3 = add i64 %1, %2                          ; CSE opportunity
+//    %4 = mul i64 %j1, length_of_struct
+//    %5 = add i64 %3, %4
+//    %6 = add i64 %3, struct_field_3              ; Constant offset
+//    %p = inttoptr i64 %6 to i32*
+//    load %p
+//    ...
+//  BB2:
+//    %7 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
+//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
+//    %9 = add i64 %7, %8                          ; CSE opportunity
+//    %10 = mul i64 %j2, length_of_struct
+//    %11 = add i64 %9, %10
+//    %12 = add i64 %11, struct_field_2            ; Constant offset
+//    %p = inttoptr i64 %12 to i32*
+//    load %p2
+//    ...
+//
+// If the target uses alias analysis in codegen, this pass will lower a GEP
+// with multiple indices into multiple GEPs with a single index:
+//  BB1:
+//    %1 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
+//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
+//    %3 = getelementptr i8* %1, i64 %2            ; CSE opportunity
+//    %4 = mul i64 %j1, length_of_struct
+//    %5 = getelementptr i8* %3, i64 %4
+//    %6 = getelementptr i8* %5, struct_field_3    ; Constant offset
+//    %p = bitcast i8* %6 to i32*
+//    load %p
+//    ...
+//  BB2:
+//    %7 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
+//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
+//    %9 = getelementptr i8* %7, i64 %8            ; CSE opportunity
+//    %10 = mul i64 %j2, length_of_struct
+//    %11 = getelementptr i8* %9, i64 %10
+//    %12 = getelementptr i8* %11, struct_field_2  ; Constant offset
+//    %p2 = bitcast i8* %12 to i32*
+//    load %p2
+//    ...
+//
+// Lowering GEPs can also benefit other passes such as LICM and CGP.
+// LICM (Loop Invariant Code Motion) can not hoist/sink a GEP of multiple
+// indices if one of the index is variant. If we lower such GEP into invariant
+// parts and variant parts, LICM can hoist/sink those invariant parts.
+// CGP (CodeGen Prepare) tries to sink address calculations that match the
+// target's addressing modes. A GEP with multiple indices may not match and will
+// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of
+// them. So we end up with a better addressing mode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
+    "disable-separate-const-offset-from-gep", cl::init(false),
+    cl::desc("Do not separate the constant offset from a GEP instruction"),
+    cl::Hidden);
+// Setting this flag may emit false positives when the input module already
+// contains dead instructions. Therefore, we set it only in unit tests that are
+// free of dead code.
+static cl::opt<bool>
+    VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
+                     cl::desc("Verify this pass produces no dead code"),
+                     cl::Hidden);
+
+namespace {
+
+/// \brief A helper class for separating a constant offset from a GEP index.
+///
+/// In real programs, a GEP index may be more complicated than a simple addition
+/// of something and a constant integer which can be trivially splitted. For
+/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
+/// constant offset, so that we can separate the index to (a << 3) + b and 5.
+///
+/// Therefore, this class looks into the expression that computes a given GEP
+/// index, and tries to find a constant integer that can be hoisted to the
+/// outermost level of the expression as an addition. Not every constant in an
+/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
+/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
+/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
+class ConstantOffsetExtractor {
+public:
+  /// Extracts a constant offset from the given GEP index. It returns the
+  /// new index representing the remainder (equal to the original index minus
+  /// the constant offset), or nullptr if we cannot extract a constant offset.
+  /// \p Idx The given GEP index
+  /// \p GEP The given GEP
+  /// \p UserChainTail Outputs the tail of UserChain so that we can
+  ///                  garbage-collect unused instructions in UserChain.
+  static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
+                        User *&UserChainTail, const DominatorTree *DT);
+  /// Looks for a constant offset from the given GEP index without extracting
+  /// it. It returns the numeric value of the extracted constant offset (0 if
+  /// failed). The meaning of the arguments are the same as Extract.
+  static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
+                      const DominatorTree *DT);
+
+private:
+  ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
+      : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
+  }
+  /// Searches the expression that computes V for a non-zero constant C s.t.
+  /// V can be reassociated into the form V' + C. If the searching is
+  /// successful, returns C and update UserChain as a def-use chain from C to V;
+  /// otherwise, UserChain is empty.
+  ///
+  /// \p V            The given expression
+  /// \p SignExtended Whether V will be sign-extended in the computation of the
+  ///                 GEP index
+  /// \p ZeroExtended Whether V will be zero-extended in the computation of the
+  ///                 GEP index
+  /// \p NonNegative  Whether V is guaranteed to be non-negative. For example,
+  ///                 an index of an inbounds GEP is guaranteed to be
+  ///                 non-negative. Levaraging this, we can better split
+  ///                 inbounds GEPs.
+  APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+  /// A helper function to look into both operands of a binary operator.
+  APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
+                            bool ZeroExtended);
+  /// After finding the constant offset C from the GEP index I, we build a new
+  /// index I' s.t. I' + C = I. This function builds and returns the new
+  /// index I' according to UserChain produced by function "find".
+  ///
+  /// The building conceptually takes two steps:
+  /// 1) iteratively distribute s/zext towards the leaves of the expression tree
+  /// that computes I
+  /// 2) reassociate the expression tree to the form I' + C.
+  ///
+  /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
+  /// sext to a, b and 5 so that we have
+  ///   sext(a) + (sext(b) + 5).
+  /// Then, we reassociate it to
+  ///   (sext(a) + sext(b)) + 5.
+  /// Given this form, we know I' is sext(a) + sext(b).
+  Value *rebuildWithoutConstOffset();
+  /// After the first step of rebuilding the GEP index without the constant
+  /// offset, distribute s/zext to the operands of all operators in UserChain.
+  /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
+  /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
+  ///
+  /// The function also updates UserChain to point to new subexpressions after
+  /// distributing s/zext. e.g., the old UserChain of the above example is
+  /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+  /// and the new UserChain is
+  /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+  ///   zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
+  ///
+  /// \p ChainIndex The index to UserChain. ChainIndex is initially
+  ///               UserChain.size() - 1, and is decremented during
+  ///               the recursion.
+  Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+  /// Reassociates the GEP index to the form I' + C and returns I'.
+  Value *removeConstOffset(unsigned ChainIndex);
+  /// A helper function to apply ExtInsts, a list of s/zext, to value V.
+  /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+  /// returns "sext i32 (zext i16 V to i32) to i64".
+  Value *applyExts(Value *V);
+
+  /// A helper function that returns whether we can trace into the operands
+  /// of binary operator BO for a constant offset.
+  ///
+  /// \p SignExtended Whether BO is surrounded by sext
+  /// \p ZeroExtended Whether BO is surrounded by zext
+  /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
+  ///                array index.
+  bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
+                    bool NonNegative);
+
+  /// The path from the constant offset to the old GEP index. e.g., if the GEP
+  /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
+  /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
+  /// UserChain[2] will be the entire expression "a * b + (c + 5)".
+  ///
+  /// This path helps to rebuild the new GEP index.
+  SmallVector<User *, 8> UserChain;
+  /// A data structure used in rebuildWithoutConstOffset. Contains all
+  /// sext/zext instructions along UserChain.
+  SmallVector<CastInst *, 16> ExtInsts;
+  Instruction *IP;  /// Insertion position of cloned instructions.
+  const DataLayout &DL;
+  const DominatorTree *DT;
+};
+
+/// \brief A pass that tries to split every GEP in the function into a variadic
+/// base and a constant offset. It is a FunctionPass because searching for the
+/// constant offset may inspect other basic blocks.
+class SeparateConstOffsetFromGEP : public FunctionPass {
+public:
+  static char ID;
+  SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
+                             bool LowerGEP = false)
+      : FunctionPass(ID), DL(nullptr), DT(nullptr), TM(TM), LowerGEP(LowerGEP) {
+    initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.setPreservesCFG();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+  }
+
+  bool doInitialization(Module &M) override {
+    DL = &M.getDataLayout();
+    return false;
+  }
+  bool runOnFunction(Function &F) override;
+
+private:
+  /// Tries to split the given GEP into a variadic base and a constant offset,
+  /// and returns true if the splitting succeeds.
+  bool splitGEP(GetElementPtrInst *GEP);
+  /// Lower a GEP with multiple indices into multiple GEPs with a single index.
+  /// Function splitGEP already split the original GEP into a variadic part and
+  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+  /// variadic part into a set of GEPs with a single index and applies
+  /// AccumulativeByteOffset to it.
+  /// \p Variadic                  The variadic part of the original GEP.
+  /// \p AccumulativeByteOffset    The constant offset.
+  void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
+                              int64_t AccumulativeByteOffset);
+  /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
+  /// Function splitGEP already split the original GEP into a variadic part and
+  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+  /// variadic part into a set of arithmetic operations and applies
+  /// AccumulativeByteOffset to it.
+  /// \p Variadic                  The variadic part of the original GEP.
+  /// \p AccumulativeByteOffset    The constant offset.
+  void lowerToArithmetics(GetElementPtrInst *Variadic,
+                          int64_t AccumulativeByteOffset);
+  /// Finds the constant offset within each index and accumulates them. If
+  /// LowerGEP is true, it finds in indices of both sequential and structure
+  /// types, otherwise it only finds in sequential indices. The output
+  /// NeedsExtraction indicates whether we successfully find a non-zero constant
+  /// offset.
+  int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+  /// Canonicalize array indices to pointer-size integers. This helps to
+  /// simplify the logic of splitting a GEP. For example, if a + b is a
+  /// pointer-size integer, we have
+  ///   gep base, a + b = gep (gep base, a), b
+  /// However, this equality may not hold if the size of a + b is smaller than
+  /// the pointer size, because LLVM conceptually sign-extends GEP indices to
+  /// pointer size before computing the address
+  /// (http://llvm.org/docs/LangRef.html#id181).
+  ///
+  /// This canonicalization is very likely already done in clang and
+  /// instcombine. Therefore, the program will probably remain the same.
+  ///
+  /// Returns true if the module changes.
+  ///
+  /// Verified in @i32_add in split-gep.ll
+  bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+  /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
+  /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
+  /// the constant offset. After extraction, it becomes desirable to reunion the
+  /// distributed sexts. For example,
+  ///
+  ///                              &a[sext(i +nsw (j +nsw 5)]
+  ///   => distribute              &a[sext(i) +nsw (sext(j) +nsw 5)]
+  ///   => constant extraction     &a[sext(i) + sext(j)] + 5
+  ///   => reunion                 &a[sext(i +nsw j)] + 5
+  bool reuniteExts(Function &F);
+  /// A helper that reunites sexts in an instruction.
+  bool reuniteExts(Instruction *I);
+  /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
+  Instruction *findClosestMatchingDominator(const SCEV *Key,
+                                            Instruction *Dominatee);
+  /// Verify F is free of dead code.
+  void verifyNoDeadCode(Function &F);
+
+  bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
+  // Swap the index operand of two GEP.
+  void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
+  // Check if it is safe to swap operand of two GEP.
+  bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
+                            Loop *CurLoop);
+
+  const DataLayout *DL;
+  DominatorTree *DT;
+  ScalarEvolution *SE;
+  const TargetMachine *TM;
+
+  LoopInfo *LI;
+  TargetLibraryInfo *TLI;
+  /// Whether to lower a GEP with multiple indices into arithmetic operations or
+  /// multiple GEPs with a single index.
+  bool LowerGEP;
+  DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs;
+};
+}  // anonymous namespace
+
+char SeparateConstOffsetFromGEP::ID = 0;
+INITIALIZE_PASS_BEGIN(
+    SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+    "Split GEPs to a variadic base and a constant offset for better CSE", false,
+    false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+    SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
+    "Split GEPs to a variadic base and a constant offset for better CSE", false,
+    false)
+
+FunctionPass *
+llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM,
+                                           bool LowerGEP) {
+  return new SeparateConstOffsetFromGEP(TM, LowerGEP);
+}
+
+bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
+                                            bool ZeroExtended,
+                                            BinaryOperator *BO,
+                                            bool NonNegative) {
+  // We only consider ADD, SUB and OR, because a non-zero constant found in
+  // expressions composed of these operations can be easily hoisted as a
+  // constant offset by reassociation.
+  if (BO->getOpcode() != Instruction::Add &&
+      BO->getOpcode() != Instruction::Sub &&
+      BO->getOpcode() != Instruction::Or) {
+    return false;
+  }
+
+  Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
+  // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
+  // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+  if (BO->getOpcode() == Instruction::Or &&
+      !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
+    return false;
+
+  // In addition, tracing into BO requires that its surrounding s/zext (if
+  // any) is distributable to both operands.
+  //
+  // Suppose BO = A op B.
+  //  SignExtended | ZeroExtended | Distributable?
+  // --------------+--------------+----------------------------------
+  //       0       |      0       | true because no s/zext exists
+  //       0       |      1       | zext(BO) == zext(A) op zext(B)
+  //       1       |      0       | sext(BO) == sext(A) op sext(B)
+  //       1       |      1       | zext(sext(BO)) ==
+  //               |              |     zext(sext(A)) op zext(sext(B))
+  if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
+    // If a + b >= 0 and (a >= 0 or b >= 0), then
+    //   sext(a + b) = sext(a) + sext(b)
+    // even if the addition is not marked nsw.
+    //
+    // Leveraging this invarient, we can trace into an sext'ed inbound GEP
+    // index if the constant offset is non-negative.
+    //
+    // Verified in @sext_add in split-gep.ll.
+    if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
+      if (!ConstLHS->isNegative())
+        return true;
+    }
+    if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
+      if (!ConstRHS->isNegative())
+        return true;
+    }
+  }
+
+  // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
+  // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
+  if (BO->getOpcode() == Instruction::Add ||
+      BO->getOpcode() == Instruction::Sub) {
+    if (SignExtended && !BO->hasNoSignedWrap())
+      return false;
+    if (ZeroExtended && !BO->hasNoUnsignedWrap())
+      return false;
+  }
+
+  return true;
+}
+
+APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
+                                                   bool SignExtended,
+                                                   bool ZeroExtended) {
+  // BO being non-negative does not shed light on whether its operands are
+  // non-negative. Clear the NonNegative flag here.
+  APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
+                              /* NonNegative */ false);
+  // If we found a constant offset in the left operand, stop and return that.
+  // This shortcut might cause us to miss opportunities of combining the
+  // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
+  // However, such cases are probably already handled by -instcombine,
+  // given this pass runs after the standard optimizations.
+  if (ConstantOffset != 0) return ConstantOffset;
+  ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
+                        /* NonNegative */ false);
+  // If U is a sub operator, negate the constant offset found in the right
+  // operand.
+  if (BO->getOpcode() == Instruction::Sub)
+    ConstantOffset = -ConstantOffset;
+  return ConstantOffset;
+}
+
+APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
+                                    bool ZeroExtended, bool NonNegative) {
+  // TODO(jingyue): We could trace into integer/pointer casts, such as
+  // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
+  // integers because it gives good enough results for our benchmarks.
+  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+
+  // We cannot do much with Values that are not a User, such as an Argument.
+  User *U = dyn_cast<User>(V);
+  if (U == nullptr) return APInt(BitWidth, 0);
+
+  APInt ConstantOffset(BitWidth, 0);
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    // Hooray, we found it!
+    ConstantOffset = CI->getValue();
+  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
+    // Trace into subexpressions for more hoisting opportunities.
+    if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
+      ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+  } else if (isa<SExtInst>(V)) {
+    ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
+                          ZeroExtended, NonNegative).sext(BitWidth);
+  } else if (isa<ZExtInst>(V)) {
+    // As an optimization, we can clear the SignExtended flag because
+    // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
+    //
+    // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
+    ConstantOffset =
+        find(U->getOperand(0), /* SignExtended */ false,
+             /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
+  }
+
+  // If we found a non-zero constant offset, add it to the path for
+  // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
+  // help this optimization.
+  if (ConstantOffset != 0)
+    UserChain.push_back(U);
+  return ConstantOffset;
+}
+
+Value *ConstantOffsetExtractor::applyExts(Value *V) {
+  Value *Current = V;
+  // ExtInsts is built in the use-def order. Therefore, we apply them to V
+  // in the reversed order.
+  for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
+    if (Constant *C = dyn_cast<Constant>(Current)) {
+      // If Current is a constant, apply s/zext using ConstantExpr::getCast.
+      // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
+      Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
+    } else {
+      Instruction *Ext = (*I)->clone();
+      Ext->setOperand(0, Current);
+      Ext->insertBefore(IP);
+      Current = Ext;
+    }
+  }
+  return Current;
+}
+
+Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
+  distributeExtsAndCloneChain(UserChain.size() - 1);
+  // Remove all nullptrs (used to be s/zext) from UserChain.
+  unsigned NewSize = 0;
+  for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) {
+    if (*I != nullptr) {
+      UserChain[NewSize] = *I;
+      NewSize++;
+    }
+  }
+  UserChain.resize(NewSize);
+  return removeConstOffset(UserChain.size() - 1);
+}
+
+Value *
+ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+  User *U = UserChain[ChainIndex];
+  if (ChainIndex == 0) {
+    assert(isa<ConstantInt>(U));
+    // If U is a ConstantInt, applyExts will return a ConstantInt as well.
+    return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+  }
+
+  if (CastInst *Cast = dyn_cast<CastInst>(U)) {
+    assert((isa<SExtInst>(Cast) || isa<ZExtInst>(Cast)) &&
+           "We only traced into two types of CastInst: sext and zext");
+    ExtInsts.push_back(Cast);
+    UserChain[ChainIndex] = nullptr;
+    return distributeExtsAndCloneChain(ChainIndex - 1);
+  }
+
+  // Function find only trace into BinaryOperator and CastInst.
+  BinaryOperator *BO = cast<BinaryOperator>(U);
+  // OpNo = which operand of BO is UserChain[ChainIndex - 1]
+  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+  Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
+  Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+
+  BinaryOperator *NewBO = nullptr;
+  if (OpNo == 0) {
+    NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
+                                   BO->getName(), IP);
+  } else {
+    NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
+                                   BO->getName(), IP);
+  }
+  return UserChain[ChainIndex] = NewBO;
+}
+
+Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
+  if (ChainIndex == 0) {
+    assert(isa<ConstantInt>(UserChain[ChainIndex]));
+    return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
+  }
+
+  BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
+  assert(BO->getNumUses() <= 1 &&
+         "distributeExtsAndCloneChain clones each BinaryOperator in "
+         "UserChain, so no one should be used more than "
+         "once");
+
+  unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+  assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
+  Value *NextInChain = removeConstOffset(ChainIndex - 1);
+  Value *TheOther = BO->getOperand(1 - OpNo);
+
+  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+  // sub-expression to be just TheOther.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
+    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+      return TheOther;
+  }
+
+  BinaryOperator::BinaryOps NewOp = BO->getOpcode();
+  if (BO->getOpcode() == Instruction::Or) {
+    // Rebuild "or" as "add", because "or" may be invalid for the new
+    // epxression.
+    //
+    // For instance, given
+    //   a | (b + 5) where a and b + 5 have no common bits,
+    // we can extract 5 as the constant offset.
+    //
+    // However, reusing the "or" in the new index would give us
+    //   (a | b) + 5
+    // which does not equal a | (b + 5).
+    //
+    // Replacing the "or" with "add" is fine, because
+    //   a | (b + 5) = a + (b + 5) = (a + b) + 5
+    NewOp = Instruction::Add;
+  }
+
+  BinaryOperator *NewBO;
+  if (OpNo == 0) {
+    NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
+  } else {
+    NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
+  }
+  NewBO->takeName(BO);
+  return NewBO;
+}
+
+Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
+                                        User *&UserChainTail,
+                                        const DominatorTree *DT) {
+  ConstantOffsetExtractor Extractor(GEP, DT);
+  // Find a non-zero constant offset first.
+  APInt ConstantOffset =
+      Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+                     GEP->isInBounds());
+  if (ConstantOffset == 0) {
+    UserChainTail = nullptr;
+    return nullptr;
+  }
+  // Separates the constant offset from the GEP index.
+  Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
+  UserChainTail = Extractor.UserChain.back();
+  return IdxWithoutConstOffset;
+}
+
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
+                                      const DominatorTree *DT) {
+  // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
+  return ConstantOffsetExtractor(GEP, DT)
+      .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+            GEP->isInBounds())
+      .getSExtValue();
+}
+
+bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
+    GetElementPtrInst *GEP) {
+  bool Changed = false;
+  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+  gep_type_iterator GTI = gep_type_begin(*GEP);
+  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
+       I != E; ++I, ++GTI) {
+    // Skip struct member indices which must be i32.
+    if (isa<SequentialType>(*GTI)) {
+      if ((*I)->getType() != IntPtrTy) {
+        *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
+
+int64_t
+SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
+                                                 bool &NeedsExtraction) {
+  NeedsExtraction = false;
+  int64_t AccumulativeByteOffset = 0;
+  gep_type_iterator GTI = gep_type_begin(*GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      // Tries to extract a constant offset from this GEP index.
+      int64_t ConstantOffset =
+          ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
+      if (ConstantOffset != 0) {
+        NeedsExtraction = true;
+        // A GEP may have multiple indices.  We accumulate the extracted
+        // constant offset to a byte offset, and later offset the remainder of
+        // the original GEP with this byte offset.
+        AccumulativeByteOffset +=
+            ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
+      }
+    } else if (LowerGEP) {
+      StructType *StTy = cast<StructType>(*GTI);
+      uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
+      // Skip field 0 as the offset is always 0.
+      if (Field != 0) {
+        NeedsExtraction = true;
+        AccumulativeByteOffset +=
+            DL->getStructLayout(StTy)->getElementOffset(Field);
+      }
+    }
+  }
+  return AccumulativeByteOffset;
+}
+
+void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
+    GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
+  IRBuilder<> Builder(Variadic);
+  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+  Type *I8PtrTy =
+      Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
+  Value *ResultPtr = Variadic->getOperand(0);
+  Loop *L = LI->getLoopFor(Variadic->getParent());
+  // Check if the base is not loop invariant or used more than once.
+  bool isSwapCandidate =
+      L && L->isLoopInvariant(ResultPtr) &&
+      !hasMoreThanOneUseInLoop(ResultPtr, L);
+  Value *FirstResult = nullptr;
+
+  if (ResultPtr->getType() != I8PtrTy)
+    ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+
+  gep_type_iterator GTI = gep_type_begin(*Variadic);
+  // Create an ugly GEP for each sequential index. We don't create GEPs for
+  // structure indices, as they are accumulated in the constant offset index.
+  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      Value *Idx = Variadic->getOperand(I);
+      // Skip zero indices.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+        if (CI->isZero())
+          continue;
+
+      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+                                DL->getTypeAllocSize(GTI.getIndexedType()));
+      // Scale the index by element size.
+      if (ElementSize != 1) {
+        if (ElementSize.isPowerOf2()) {
+          Idx = Builder.CreateShl(
+              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+        } else {
+          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+        }
+      }
+      // Create an ugly GEP with a single index for each index.
+      ResultPtr =
+          Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
+      if (FirstResult == nullptr)
+        FirstResult = ResultPtr;
+    }
+  }
+
+  // Create a GEP with the constant offset index.
+  if (AccumulativeByteOffset != 0) {
+    Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
+    ResultPtr =
+        Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
+  } else
+    isSwapCandidate = false;
+
+  // If we created a GEP with constant index, and the base is loop invariant,
+  // then we swap the first one with it, so LICM can move constant GEP out
+  // later.
+  GetElementPtrInst *FirstGEP = dyn_cast<GetElementPtrInst>(FirstResult);
+  GetElementPtrInst *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
+  if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
+    swapGEPOperand(FirstGEP, SecondGEP);
+
+  if (ResultPtr->getType() != Variadic->getType())
+    ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
+
+  Variadic->replaceAllUsesWith(ResultPtr);
+  Variadic->eraseFromParent();
+}
+
+void
+SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
+                                               int64_t AccumulativeByteOffset) {
+  IRBuilder<> Builder(Variadic);
+  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+  Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
+  gep_type_iterator GTI = gep_type_begin(*Variadic);
+  // Create ADD/SHL/MUL arithmetic operations for each sequential indices. We
+  // don't create arithmetics for structure indices, as they are accumulated
+  // in the constant offset index.
+  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      Value *Idx = Variadic->getOperand(I);
+      // Skip zero indices.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+        if (CI->isZero())
+          continue;
+
+      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+                                DL->getTypeAllocSize(GTI.getIndexedType()));
+      // Scale the index by element size.
+      if (ElementSize != 1) {
+        if (ElementSize.isPowerOf2()) {
+          Idx = Builder.CreateShl(
+              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+        } else {
+          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+        }
+      }
+      // Create an ADD for each index.
+      ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
+    }
+  }
+
+  // Create an ADD for the constant offset index.
+  if (AccumulativeByteOffset != 0) {
+    ResultPtr = Builder.CreateAdd(
+        ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
+  }
+
+  ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
+  Variadic->replaceAllUsesWith(ResultPtr);
+  Variadic->eraseFromParent();
+}
+
+bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
+  // Skip vector GEPs.
+  if (GEP->getType()->isVectorTy())
+    return false;
+
+  // The backend can already nicely handle the case where all indices are
+  // constant.
+  if (GEP->hasAllConstantIndices())
+    return false;
+
+  bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
+
+  bool NeedsExtraction;
+  int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+
+  if (!NeedsExtraction)
+    return Changed;
+  // If LowerGEP is disabled, before really splitting the GEP, check whether the
+  // backend supports the addressing mode we are about to produce. If no, this
+  // splitting probably won't be beneficial.
+  // If LowerGEP is enabled, even the extracted constant offset can not match
+  // the addressing mode, we can still do optimizations to other lowered parts
+  // of variable indices. Therefore, we don't check for addressing modes in that
+  // case.
+  if (!LowerGEP) {
+    TargetTransformInfo &TTI =
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+            *GEP->getParent()->getParent());
+    unsigned AddrSpace = GEP->getPointerAddressSpace();
+    if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),
+                                   /*BaseGV=*/nullptr, AccumulativeByteOffset,
+                                   /*HasBaseReg=*/true, /*Scale=*/0,
+                                   AddrSpace)) {
+      return Changed;
+    }
+  }
+
+  // Remove the constant offset in each sequential index. The resultant GEP
+  // computes the variadic base.
+  // Notice that we don't remove struct field indices here. If LowerGEP is
+  // disabled, a structure index is not accumulated and we still use the old
+  // one. If LowerGEP is enabled, a structure index is accumulated in the
+  // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later
+  // handle the constant offset and won't need a new structure index.
+  gep_type_iterator GTI = gep_type_begin(*GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      // Splits this GEP index into a variadic part and a constant offset, and
+      // uses the variadic part as the new index.
+      Value *OldIdx = GEP->getOperand(I);
+      User *UserChainTail;
+      Value *NewIdx =
+          ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
+      if (NewIdx != nullptr) {
+        // Switches to the index with the constant offset removed.
+        GEP->setOperand(I, NewIdx);
+        // After switching to the new index, we can garbage-collect UserChain
+        // and the old index if they are not used.
+        RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
+        RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
+      }
+    }
+  }
+
+  // Clear the inbounds attribute because the new index may be off-bound.
+  // e.g.,
+  //
+  //   b     = add i64 a, 5
+  //   addr  = gep inbounds float, float* p, i64 b
+  //
+  // is transformed to:
+  //
+  //   addr2 = gep float, float* p, i64 a ; inbounds removed
+  //   addr  = gep inbounds float, float* addr2, i64 5
+  //
+  // If a is -4, although the old index b is in bounds, the new index a is
+  // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
+  // inbounds keyword is not present, the offsets are added to the base
+  // address with silently-wrapping two's complement arithmetic".
+  // Therefore, the final code will be a semantically equivalent.
+  //
+  // TODO(jingyue): do some range analysis to keep as many inbounds as
+  // possible. GEPs with inbounds are more friendly to alias analysis.
+  bool GEPWasInBounds = GEP->isInBounds();
+  GEP->setIsInBounds(false);
+
+  // Lowers a GEP to either GEPs with a single index or arithmetic operations.
+  if (LowerGEP) {
+    // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
+    // arithmetic operations if the target uses alias analysis in codegen.
+    if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA())
+      lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
+    else
+      lowerToArithmetics(GEP, AccumulativeByteOffset);
+    return true;
+  }
+
+  // No need to create another GEP if the accumulative byte offset is 0.
+  if (AccumulativeByteOffset == 0)
+    return true;
+
+  // Offsets the base with the accumulative byte offset.
+  //
+  //   %gep                        ; the base
+  //   ... %gep ...
+  //
+  // => add the offset
+  //
+  //   %gep2                       ; clone of %gep
+  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+  //   %gep                        ; will be removed
+  //   ... %gep ...
+  //
+  // => replace all uses of %gep with %new.gep and remove %gep
+  //
+  //   %gep2                       ; clone of %gep
+  //   %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+  //   ... %new.gep ...
+  //
+  // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
+  // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
+  // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
+  // type of %gep.
+  //
+  //   %gep2                       ; clone of %gep
+  //   %0       = bitcast %gep2 to i8*
+  //   %uglygep = gep %0, <offset>
+  //   %new.gep = bitcast %uglygep to <type of %gep>
+  //   ... %new.gep ...
+  Instruction *NewGEP = GEP->clone();
+  NewGEP->insertBefore(GEP);
+
+  // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned =
+  // unsigned.. Therefore, we cast ElementTypeSizeOfGEP to signed because it is
+  // used with unsigned integers later.
+  int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
+      DL->getTypeAllocSize(GEP->getType()->getElementType()));
+  Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+  if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
+    // Very likely. As long as %gep is natually aligned, the byte offset we
+    // extracted should be a multiple of sizeof(*%gep).
+    int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
+    NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
+                                       ConstantInt::get(IntPtrTy, Index, true),
+                                       GEP->getName(), GEP);
+    // Inherit the inbounds attribute of the original GEP.
+    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+  } else {
+    // Unlikely but possible. For example,
+    // #pragma pack(1)
+    // struct S {
+    //   int a[3];
+    //   int64 b[8];
+    // };
+    // #pragma pack()
+    //
+    // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
+    // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
+    // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
+    // sizeof(int64).
+    //
+    // Emit an uglygep in this case.
+    Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
+                                       GEP->getPointerAddressSpace());
+    NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
+    NewGEP = GetElementPtrInst::Create(
+        Type::getInt8Ty(GEP->getContext()), NewGEP,
+        ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
+        GEP);
+    // Inherit the inbounds attribute of the original GEP.
+    cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+    if (GEP->getType() != I8PtrTy)
+      NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
+  }
+
+  GEP->replaceAllUsesWith(NewGEP);
+  GEP->eraseFromParent();
+
+  return true;
+}
+
+bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  if (DisableSeparateConstOffsetFromGEP)
+    return false;
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  bool Changed = false;
+  for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;)
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
+        Changed |= splitGEP(GEP);
+    // No need to split GEP ConstantExprs because all its indices are constant
+    // already.
+  }
+
+  Changed |= reuniteExts(F);
+
+  if (VerifyNoDeadCode)
+    verifyNoDeadCode(F);
+
+  return Changed;
+}
+
+Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
+    const SCEV *Key, Instruction *Dominatee) {
+  auto Pos = DominatingExprs.find(Key);
+  if (Pos == DominatingExprs.end())
+    return nullptr;
+
+  auto &Candidates = Pos->second;
+  // Because we process the basic blocks in pre-order of the dominator tree, a
+  // candidate that doesn't dominate the current instruction won't dominate any
+  // future instruction either. Therefore, we pop it out of the stack. This
+  // optimization makes the algorithm O(n).
+  while (!Candidates.empty()) {
+    Instruction *Candidate = Candidates.back();
+    if (DT->dominates(Candidate, Dominatee))
+      return Candidate;
+    Candidates.pop_back();
+  }
+  return nullptr;
+}
+
+bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
+  if (!SE->isSCEVable(I->getType()))
+    return false;
+
+  //   Dom: LHS+RHS
+  //   I: sext(LHS)+sext(RHS)
+  // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
+  // TODO: handle zext
+  Value *LHS = nullptr, *RHS = nullptr;
+  if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) ||
+      match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
+    if (LHS->getType() == RHS->getType()) {
+      const SCEV *Key =
+          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+      if (auto *Dom = findClosestMatchingDominator(Key, I)) {
+        Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
+        NewSExt->takeName(I);
+        I->replaceAllUsesWith(NewSExt);
+        RecursivelyDeleteTriviallyDeadInstructions(I);
+        return true;
+      }
+    }
+  }
+
+  // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
+  if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) ||
+      match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
+    if (isKnownNotFullPoison(I)) {
+      const SCEV *Key =
+          SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+      DominatingExprs[Key].push_back(I);
+    }
+  }
+  return false;
+}
+
+bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
+  bool Changed = false;
+  DominatingExprs.clear();
+  for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT);
+       Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) {
+    BasicBlock *BB = Node->getBlock();
+    for (auto I = BB->begin(); I != BB->end(); ) {
+      Instruction *Cur = &*I++;
+      Changed |= reuniteExts(Cur);
+    }
+  }
+  return Changed;
+}
+
+void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
+  for (auto &B : F) {
+    for (auto &I : B) {
+      if (isInstructionTriviallyDead(&I)) {
+        std::string ErrMessage;
+        raw_string_ostream RSO(ErrMessage);
+        RSO << "Dead instruction detected!\n" << I << "\n";
+        llvm_unreachable(RSO.str().c_str());
+      }
+    }
+  }
+}
+
+bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
+    GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
+  if (!FirstGEP || !FirstGEP->hasOneUse())
+    return false;
+
+  if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
+    return false;
+
+  if (FirstGEP == SecondGEP)
+    return false;
+
+  unsigned FirstNum = FirstGEP->getNumOperands();
+  unsigned SecondNum = SecondGEP->getNumOperands();
+  // Give up if the number of operands are not 2.
+  if (FirstNum != SecondNum || FirstNum != 2)
+    return false;
+
+  Value *FirstBase = FirstGEP->getOperand(0);
+  Value *SecondBase = SecondGEP->getOperand(0);
+  Value *FirstOffset = FirstGEP->getOperand(1);
+  // Give up if the index of the first GEP is loop invariant.
+  if (CurLoop->isLoopInvariant(FirstOffset))
+    return false;
+
+  // Give up if base doesn't have same type.
+  if (FirstBase->getType() != SecondBase->getType())
+    return false;
+
+  Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
+
+  // Check if the second operand of first GEP has constant coefficient.
+  // For an example, for the following code,  we won't gain anything by
+  // hoisting the second GEP out because the second GEP can be folded away.
+  //   %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
+  //   %67 = shl i64 %scevgep.sum.ur159, 2
+  //   %uglygep160 = getelementptr i8* %65, i64 %67
+  //   %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
+
+  // Skip constant shift instruction which may be generated by Splitting GEPs.
+  if (FirstOffsetDef && FirstOffsetDef->isShift() &&
+      isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
+    FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
+
+  // Give up if FirstOffsetDef is an Add or Sub with constant.
+  // Because it may not profitable at all due to constant folding.
+  if (FirstOffsetDef)
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
+      unsigned opc = BO->getOpcode();
+      if ((opc == Instruction::Add || opc == Instruction::Sub) &&
+          (isa<ConstantInt>(BO->getOperand(0)) ||
+           isa<ConstantInt>(BO->getOperand(1))))
+        return false;
+    }
+  return true;
+}
+
+bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
+  int UsesInLoop = 0;
+  for (User *U : V->users()) {
+    if (Instruction *User = dyn_cast<Instruction>(U))
+      if (L->contains(User))
+        if (++UsesInLoop > 1)
+          return true;
+  }
+  return false;
+}
+
+void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
+                                                GetElementPtrInst *Second) {
+  Value *Offset1 = First->getOperand(1);
+  Value *Offset2 = Second->getOperand(1);
+  First->setOperand(1, Offset2);
+  Second->setOperand(1, Offset1);
+
+  // We changed p+o+c to p+c+o, p+c may not be inbound anymore.
+  const DataLayout &DAL = First->getModule()->getDataLayout();
+  APInt Offset(DAL.getPointerSizeInBits(
+                   cast<PointerType>(First->getType())->getAddressSpace()),
+               0);
+  Value *NewBase =
+      First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
+  uint64_t ObjectSize;
+  if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
+     Offset.ugt(ObjectSize)) {
+    First->setIsInBounds(false);
+    Second->setIsInBounds(false);
+  } else
+    First->setIsInBounds(true);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
new file mode 100644
index 0000000..63c8836
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -0,0 +1,239 @@
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations.  For example:
+//
+//   * Removes basic blocks with no predecessors.
+//   * Merges a basic block into its predecessor if there is only one and the
+//     predecessor only has one successor.
+//   * Eliminates PHI nodes for basic blocks with a single predecessor.
+//   * Eliminates a basic block that only contains an unconditional branch.
+//   * Changes invoke instructions to nounwind functions to be calls.
+//   * Change things like "if (x) if (y)" into "if (x&y)".
+//   * etc..
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "simplifycfg"
+
+static cl::opt<unsigned>
+UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1),
+   cl::desc("Control the number of bonus instructions (default = 1)"));
+
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
+/// If we have more than one empty (other than phi node) return blocks,
+/// merge them together to promote recursive block merging.
+static bool mergeEmptyReturnBlocks(Function &F) {
+  bool Changed = false;
+
+  BasicBlock *RetBlock = nullptr;
+
+  // Scan all the blocks in the function, looking for empty return blocks.
+  for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
+    BasicBlock &BB = *BBI++;
+
+    // Only look at return blocks.
+    ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
+    if (!Ret) continue;
+
+    // Only look at the block if it is empty or the only other thing in it is a
+    // single PHI node that is the operand to the return.
+    if (Ret != &BB.front()) {
+      // Check for something else in the block.
+      BasicBlock::iterator I(Ret);
+      --I;
+      // Skip over debug info.
+      while (isa<DbgInfoIntrinsic>(I) && I != BB.begin())
+        --I;
+      if (!isa<DbgInfoIntrinsic>(I) &&
+          (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 ||
+           Ret->getOperand(0) != &*I))
+        continue;
+    }
+
+    // If this is the first returning block, remember it and keep going.
+    if (!RetBlock) {
+      RetBlock = &BB;
+      continue;
+    }
+
+    // Otherwise, we found a duplicate return block.  Merge the two.
+    Changed = true;
+
+    // Case when there is no input to the return or when the returned values
+    // agree is trivial.  Note that they can't agree if there are phis in the
+    // blocks.
+    if (Ret->getNumOperands() == 0 ||
+        Ret->getOperand(0) ==
+          cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
+      BB.replaceAllUsesWith(RetBlock);
+      BB.eraseFromParent();
+      continue;
+    }
+
+    // If the canonical return block has no PHI node, create one now.
+    PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
+    if (!RetBlockPHI) {
+      Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0);
+      pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock);
+      RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
+                                    std::distance(PB, PE), "merge",
+                                    &RetBlock->front());
+
+      for (pred_iterator PI = PB; PI != PE; ++PI)
+        RetBlockPHI->addIncoming(InVal, *PI);
+      RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
+    }
+
+    // Turn BB into a block that just unconditionally branches to the return
+    // block.  This handles the case when the two return blocks have a common
+    // predecessor but that return different things.
+    RetBlockPHI->addIncoming(Ret->getOperand(0), &BB);
+    BB.getTerminator()->eraseFromParent();
+    BranchInst::Create(RetBlock, &BB);
+  }
+
+  return Changed;
+}
+
+/// Call SimplifyCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
+                                   AssumptionCache *AC,
+                                   unsigned BonusInstThreshold) {
+  bool Changed = false;
+  bool LocalChange = true;
+  while (LocalChange) {
+    LocalChange = false;
+
+    // Loop over all of the basic blocks and remove them if they are unneeded.
+    for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
+      if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC)) {
+        LocalChange = true;
+        ++NumSimpl;
+      }
+    }
+    Changed |= LocalChange;
+  }
+  return Changed;
+}
+
+static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
+                                AssumptionCache *AC, int BonusInstThreshold) {
+  bool EverChanged = removeUnreachableBlocks(F);
+  EverChanged |= mergeEmptyReturnBlocks(F);
+  EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+
+  // If neither pass changed anything, we're done.
+  if (!EverChanged) return false;
+
+  // iterativelySimplifyCFG can (rarely) make some loops dead.  If this happens,
+  // removeUnreachableBlocks is needed to nuke them, which means we should
+  // iterate between the two optimizations.  We structure the code like this to
+  // avoid rerunning iterativelySimplifyCFG if the second pass of
+  // removeUnreachableBlocks doesn't do anything.
+  if (!removeUnreachableBlocks(F))
+    return true;
+
+  do {
+    EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+    EverChanged |= removeUnreachableBlocks(F);
+  } while (EverChanged);
+
+  return true;
+}
+
+SimplifyCFGPass::SimplifyCFGPass()
+    : BonusInstThreshold(UserBonusInstThreshold) {}
+
+SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
+    : BonusInstThreshold(BonusInstThreshold) {}
+
+PreservedAnalyses SimplifyCFGPass::run(Function &F,
+                                       AnalysisManager<Function> *AM) {
+  auto &TTI = AM->getResult<TargetIRAnalysis>(F);
+  auto &AC = AM->getResult<AssumptionAnalysis>(F);
+
+  if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold))
+    return PreservedAnalyses::none();
+
+  return PreservedAnalyses::all();
+}
+
+namespace {
+struct CFGSimplifyPass : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  unsigned BonusInstThreshold;
+  std::function<bool(const Function &)> PredicateFtor;
+
+  CFGSimplifyPass(int T = -1,
+                  std::function<bool(const Function &)> Ftor = nullptr)
+      : FunctionPass(ID), PredicateFtor(Ftor) {
+    BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
+    initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override {
+    if (PredicateFtor && !PredicateFtor(F))
+      return false;
+
+    if (skipOptnoneFunction(F))
+      return false;
+
+    AssumptionCache *AC =
+        &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    const TargetTransformInfo &TTI =
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+};
+}
+
+char CFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+                    false)
+
+// Public interface to the CFGSimplification pass
+FunctionPass *
+llvm::createCFGSimplificationPass(int Threshold,
+                                  std::function<bool(const Function &)> Ftor) {
+  return new CFGSimplifyPass(Threshold, Ftor);
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
new file mode 100644
index 0000000..64109b2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -0,0 +1,290 @@
+//===-- Sink.cpp - Code Sinking -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions into successor blocks, when possible, so that
+// they aren't executed on paths where their results aren't needed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "sink"
+
+STATISTIC(NumSunk, "Number of instructions sunk");
+STATISTIC(NumSinkIter, "Number of sinking iterations");
+
+namespace {
+  class Sinking : public FunctionPass {
+    DominatorTree *DT;
+    LoopInfo *LI;
+    AliasAnalysis *AA;
+
+  public:
+    static char ID; // Pass identification
+    Sinking() : FunctionPass(ID) {
+      initializeSinkingPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+      FunctionPass::getAnalysisUsage(AU);
+      AU.addRequired<AAResultsWrapperPass>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+    }
+  private:
+    bool ProcessBlock(BasicBlock &BB);
+    bool SinkInstruction(Instruction *I, SmallPtrSetImpl<Instruction*> &Stores);
+    bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const;
+    bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const;
+  };
+} // end anonymous namespace
+
+char Sinking::ID = 0;
+INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false)
+
+FunctionPass *llvm::createSinkingPass() { return new Sinking(); }
+
+/// AllUsesDominatedByBlock - Return true if all uses of the specified value
+/// occur in blocks dominated by the specified block.
+bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
+                                      BasicBlock *BB) const {
+  // Ignoring debug uses is necessary so debug info doesn't affect the code.
+  // This may leave a referencing dbg_value in the original block, before
+  // the definition of the vreg.  Dwarf generator handles this although the
+  // user might not get the right info at runtime.
+  for (Use &U : Inst->uses()) {
+    // Determine the block of the use.
+    Instruction *UseInst = cast<Instruction>(U.getUser());
+    BasicBlock *UseBlock = UseInst->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
+      // PHI nodes use the operand in the predecessor block, not the block with
+      // the PHI.
+      unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+      UseBlock = PN->getIncomingBlock(Num);
+    }
+    // Check that it dominates.
+    if (!DT->dominates(BB, UseBlock))
+      return false;
+  }
+  return true;
+}
+
+bool Sinking::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  bool MadeChange, EverMadeChange = false;
+
+  do {
+    MadeChange = false;
+    DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
+    // Process all basic blocks.
+    for (Function::iterator I = F.begin(), E = F.end();
+         I != E; ++I)
+      MadeChange |= ProcessBlock(*I);
+    EverMadeChange |= MadeChange;
+    NumSinkIter++;
+  } while (MadeChange);
+
+  return EverMadeChange;
+}
+
+bool Sinking::ProcessBlock(BasicBlock &BB) {
+  // Can't sink anything out of a block that has less than two successors.
+  if (BB.getTerminator()->getNumSuccessors() <= 1) return false;
+
+  // Don't bother sinking code out of unreachable blocks. In addition to being
+  // unprofitable, it can also lead to infinite looping, because in an
+  // unreachable loop there may be nowhere to stop.
+  if (!DT->isReachableFromEntry(&BB)) return false;
+
+  bool MadeChange = false;
+
+  // Walk the basic block bottom-up.  Remember if we saw a store.
+  BasicBlock::iterator I = BB.end();
+  --I;
+  bool ProcessedBegin = false;
+  SmallPtrSet<Instruction *, 8> Stores;
+  do {
+    Instruction *Inst = &*I; // The instruction to sink.
+
+    // Predecrement I (if it's not begin) so that it isn't invalidated by
+    // sinking.
+    ProcessedBegin = I == BB.begin();
+    if (!ProcessedBegin)
+      --I;
+
+    if (isa<DbgInfoIntrinsic>(Inst))
+      continue;
+
+    if (SinkInstruction(Inst, Stores))
+      ++NumSunk, MadeChange = true;
+
+    // If we just processed the first instruction in the block, we're done.
+  } while (!ProcessedBegin);
+
+  return MadeChange;
+}
+
+static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
+                         SmallPtrSetImpl<Instruction *> &Stores) {
+
+  if (Inst->mayWriteToMemory()) {
+    Stores.insert(Inst);
+    return false;
+  }
+
+  if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
+    MemoryLocation Loc = MemoryLocation::get(L);
+    for (Instruction *S : Stores)
+      if (AA->getModRefInfo(S, Loc) & MRI_Mod)
+        return false;
+  }
+
+  if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst) || Inst->isEHPad() ||
+      Inst->mayThrow())
+    return false;
+
+  // Convergent operations cannot be made control-dependent on additional
+  // values.
+  if (auto CS = CallSite(Inst)) {
+    if (CS.hasFnAttr(Attribute::Convergent))
+      return false;
+  }
+
+  return true;
+}
+
+/// IsAcceptableTarget - Return true if it is possible to sink the instruction
+/// in the specified basic block.
+bool Sinking::IsAcceptableTarget(Instruction *Inst,
+                                 BasicBlock *SuccToSinkTo) const {
+  assert(Inst && "Instruction to be sunk is null");
+  assert(SuccToSinkTo && "Candidate sink target is null");
+
+  // It is not possible to sink an instruction into its own block.  This can
+  // happen with loops.
+  if (Inst->getParent() == SuccToSinkTo)
+    return false;
+
+  // It's never legal to sink an instruction into a block which terminates in an
+  // EH-pad.
+  if (SuccToSinkTo->getTerminator()->isExceptional())
+    return false;
+
+  // If the block has multiple predecessors, this would introduce computation
+  // on different code paths.  We could split the critical edge, but for now we
+  // just punt.
+  // FIXME: Split critical edges if not backedges.
+  if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
+    // We cannot sink a load across a critical edge - there may be stores in
+    // other code paths.
+    if (!isSafeToSpeculativelyExecute(Inst))
+      return false;
+
+    // We don't want to sink across a critical edge if we don't dominate the
+    // successor. We could be introducing calculations to new code paths.
+    if (!DT->dominates(Inst->getParent(), SuccToSinkTo))
+      return false;
+
+    // Don't sink instructions into a loop.
+    Loop *succ = LI->getLoopFor(SuccToSinkTo);
+    Loop *cur = LI->getLoopFor(Inst->getParent());
+    if (succ != nullptr && succ != cur)
+      return false;
+  }
+
+  // Finally, check that all the uses of the instruction are actually
+  // dominated by the candidate
+  return AllUsesDominatedByBlock(Inst, SuccToSinkTo);
+}
+
+/// SinkInstruction - Determine whether it is safe to sink the specified machine
+/// instruction out of its current block into a successor.
+bool Sinking::SinkInstruction(Instruction *Inst,
+                              SmallPtrSetImpl<Instruction *> &Stores) {
+
+  // Don't sink static alloca instructions.  CodeGen assumes allocas outside the
+  // entry block are dynamically sized stack objects.
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+    if (AI->isStaticAlloca())
+      return false;
+
+  // Check if it's safe to move the instruction.
+  if (!isSafeToMove(Inst, AA, Stores))
+    return false;
+
+  // FIXME: This should include support for sinking instructions within the
+  // block they are currently in to shorten the live ranges.  We often get
+  // instructions sunk into the top of a large block, but it would be better to
+  // also sink them down before their first use in the block.  This xform has to
+  // be careful not to *increase* register pressure though, e.g. sinking
+  // "x = y + z" down if it kills y and z would increase the live ranges of y
+  // and z and only shrink the live range of x.
+
+  // SuccToSinkTo - This is the successor to sink this instruction to, once we
+  // decide.
+  BasicBlock *SuccToSinkTo = nullptr;
+
+  // Instructions can only be sunk if all their uses are in blocks
+  // dominated by one of the successors.
+  // Look at all the postdominators and see if we can sink it in one.
+  DomTreeNode *DTN = DT->getNode(Inst->getParent());
+  for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
+      I != E && SuccToSinkTo == nullptr; ++I) {
+    BasicBlock *Candidate = (*I)->getBlock();
+    if ((*I)->getIDom()->getBlock() == Inst->getParent() &&
+        IsAcceptableTarget(Inst, Candidate))
+      SuccToSinkTo = Candidate;
+  }
+
+  // If no suitable postdominator was found, look at all the successors and
+  // decide which one we should sink to, if any.
+  for (succ_iterator I = succ_begin(Inst->getParent()),
+      E = succ_end(Inst->getParent()); I != E && !SuccToSinkTo; ++I) {
+    if (IsAcceptableTarget(Inst, *I))
+      SuccToSinkTo = *I;
+  }
+
+  // If we couldn't find a block to sink to, ignore this instruction.
+  if (!SuccToSinkTo)
+    return false;
+
+  DEBUG(dbgs() << "Sink" << *Inst << " (";
+        Inst->getParent()->printAsOperand(dbgs(), false);
+        dbgs() << " -> ";
+        SuccToSinkTo->printAsOperand(dbgs(), false);
+        dbgs() << ")\n");
+
+  // Move the instruction.
+  Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
new file mode 100644
index 0000000..147d615
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -0,0 +1,243 @@
+//===- SpeculativeExecution.cpp ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists instructions to enable speculative execution on
+// targets where branches are expensive. This is aimed at GPUs. It
+// currently works on simple if-then and if-then-else
+// patterns.
+//
+// Removing branches is not the only motivation for this
+// pass. E.g. consider this code and assume that there is no
+// addressing mode for multiplying by sizeof(*a):
+//
+//   if (b > 0)
+//     c = a[i + 1]
+//   if (d > 0)
+//     e = a[i + 2]
+//
+// turns into
+//
+//   p = &a[i + 1];
+//   if (b > 0)
+//     c = *p;
+//   q = &a[i + 2];
+//   if (d > 0)
+//     e = *q;
+//
+// which could later be optimized to
+//
+//   r = &a[i];
+//   if (b > 0)
+//     c = r[1];
+//   if (d > 0)
+//     e = r[2];
+//
+// Later passes sink back much of the speculated code that did not enable
+// further optimization.
+//
+// This pass is more aggressive than the function SpeculativeyExecuteBB in
+// SimplifyCFG. SimplifyCFG will not speculate if no selects are introduced and
+// it will speculate at most one instruction. It also will not speculate if
+// there is a value defined in the if-block that is only used in the then-block.
+// These restrictions make sense since the speculation in SimplifyCFG seems
+// aimed at introducing cheap selects, while this pass is intended to do more
+// aggressive speculation while counting on later passes to either capitalize on
+// that or clean it up.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "speculative-execution"
+
+// The risk that speculation will not pay off increases with the
+// number of instructions speculated, so we put a limit on that.
+static cl::opt<unsigned> SpecExecMaxSpeculationCost(
+    "spec-exec-max-speculation-cost", cl::init(7), cl::Hidden,
+    cl::desc("Speculative execution is not applied to basic blocks where "
+             "the cost of the instructions to speculatively execute "
+             "exceeds this limit."));
+
+// Speculating just a few instructions from a larger block tends not
+// to be profitable and this limit prevents that. A reason for that is
+// that small basic blocks are more likely to be candidates for
+// further optimization.
+static cl::opt<unsigned> SpecExecMaxNotHoisted(
+    "spec-exec-max-not-hoisted", cl::init(5), cl::Hidden,
+    cl::desc("Speculative execution is not applied to basic blocks where the "
+             "number of instructions that would not be speculatively executed "
+             "exceeds this limit."));
+
+namespace {
+class SpeculativeExecution : public FunctionPass {
+ public:
+  static char ID;
+  SpeculativeExecution(): FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnFunction(Function &F) override;
+
+ private:
+  bool runOnBasicBlock(BasicBlock &B);
+  bool considerHoistingFromTo(BasicBlock &FromBlock, BasicBlock &ToBlock);
+
+  const TargetTransformInfo *TTI = nullptr;
+};
+} // namespace
+
+char SpeculativeExecution::ID = 0;
+INITIALIZE_PASS_BEGIN(SpeculativeExecution, "speculative-execution",
+                      "Speculatively execute instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(SpeculativeExecution, "speculative-execution",
+                      "Speculatively execute instructions", false, false)
+
+void SpeculativeExecution::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetTransformInfoWrapperPass>();
+}
+
+bool SpeculativeExecution::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  bool Changed = false;
+  for (auto& B : F) {
+    Changed |= runOnBasicBlock(B);
+  }
+  return Changed;
+}
+
+bool SpeculativeExecution::runOnBasicBlock(BasicBlock &B) {
+  BranchInst *BI = dyn_cast<BranchInst>(B.getTerminator());
+  if (BI == nullptr)
+    return false;
+
+  if (BI->getNumSuccessors() != 2)
+    return false;
+  BasicBlock &Succ0 = *BI->getSuccessor(0);
+  BasicBlock &Succ1 = *BI->getSuccessor(1);
+
+  if (&B == &Succ0 || &B == &Succ1 || &Succ0 == &Succ1) {
+    return false;
+  }
+
+  // Hoist from if-then (triangle).
+  if (Succ0.getSinglePredecessor() != nullptr &&
+      Succ0.getSingleSuccessor() == &Succ1) {
+    return considerHoistingFromTo(Succ0, B);
+  }
+
+  // Hoist from if-else (triangle).
+  if (Succ1.getSinglePredecessor() != nullptr &&
+      Succ1.getSingleSuccessor() == &Succ0) {
+    return considerHoistingFromTo(Succ1, B);
+  }
+
+  // Hoist from if-then-else (diamond), but only if it is equivalent to
+  // an if-else or if-then due to one of the branches doing nothing.
+  if (Succ0.getSinglePredecessor() != nullptr &&
+      Succ1.getSinglePredecessor() != nullptr &&
+      Succ1.getSingleSuccessor() != nullptr &&
+      Succ1.getSingleSuccessor() != &B &&
+      Succ1.getSingleSuccessor() == Succ0.getSingleSuccessor()) {
+    // If a block has only one instruction, then that is a terminator
+    // instruction so that the block does nothing. This does happen.
+    if (Succ1.size() == 1) // equivalent to if-then
+      return considerHoistingFromTo(Succ0, B);
+    if (Succ0.size() == 1) // equivalent to if-else
+      return considerHoistingFromTo(Succ1, B);
+  }
+
+  return false;
+}
+
+static unsigned ComputeSpeculationCost(const Instruction *I,
+                                       const TargetTransformInfo &TTI) {
+  switch (Operator::getOpcode(I)) {
+    case Instruction::GetElementPtr:
+    case Instruction::Add:
+    case Instruction::Mul:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Select:
+    case Instruction::Shl:
+    case Instruction::Sub:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::Xor:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return TTI.getUserCost(I);
+
+    default:
+      return UINT_MAX; // Disallow anything not whitelisted.
+  }
+}
+
+bool SpeculativeExecution::considerHoistingFromTo(BasicBlock &FromBlock,
+                                                  BasicBlock &ToBlock) {
+  SmallSet<const Instruction *, 8> NotHoisted;
+  const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](User *U) {
+    for (Value* V : U->operand_values()) {
+      if (Instruction *I = dyn_cast<Instruction>(V)) {
+        if (NotHoisted.count(I) > 0)
+          return false;
+      }
+    }
+    return true;
+  };
+
+  unsigned TotalSpeculationCost = 0;
+  for (auto& I : FromBlock) {
+    const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
+    if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
+        AllPrecedingUsesFromBlockHoisted(&I)) {
+      TotalSpeculationCost += Cost;
+      if (TotalSpeculationCost > SpecExecMaxSpeculationCost)
+        return false;  // too much to hoist
+    } else {
+      NotHoisted.insert(&I);
+      if (NotHoisted.size() > SpecExecMaxNotHoisted)
+        return false; // too much left behind
+    }
+  }
+
+  if (TotalSpeculationCost == 0)
+    return false; // nothing to hoist
+
+  for (auto I = FromBlock.begin(); I != FromBlock.end();) {
+    // We have to increment I before moving Current as moving Current
+    // changes the list that I is iterating through.
+    auto Current = I;
+    ++I;
+    if (!NotHoisted.count(&*Current)) {
+      Current->moveBefore(ToBlock.getTerminator());
+    }
+  }
+  return true;
+}
+
+namespace llvm {
+
+FunctionPass *createSpeculativeExecutionPass() {
+  return new SpeculativeExecution();
+}
+
+}  // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
new file mode 100644
index 0000000..1faa65e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -0,0 +1,724 @@
+//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled loop.
+// It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidates in the following forms:
+//
+// Form 1: B + i * S
+// Form 2: (B + i) * S
+// Form 3: &B[i * S]
+//
+// where S is an integer variable, and i is a constant integer. If we found two
+// candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2
+// in a simpler way with respect to S1. For example,
+//
+// S1: X = B + i * S
+// S2: Y = B + i' * S   => X + (i' - i) * S
+//
+// S1: X = (B + i) * S
+// S2: Y = (B + i') * S => X + (i' - i) * S
+//
+// S1: X = &B[i * S]
+// S2: Y = &B[i' * S]   => &X[(i' - i) * S]
+//
+// Note: (i' - i) * S is folded to the extent possible.
+//
+// This rewriting is in general a good idea. The code patterns we focus on
+// usually come from loop unrolling, so (i' - i) * S is likely the same
+// across iterations and can be reused. When that happens, the optimized form
+// takes only one add starting from the second iteration.
+//
+// When such rewriting is possible, we call S1 a "basis" of S2. When S2 has
+// multiple bases, we choose to rewrite S2 with respect to its "immediate"
+// basis, the basis that is the closest ancestor in the dominator tree.
+//
+// TODO:
+//
+// - Floating point arithmetics when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+//   sensitive to ILP may want to disable it. Having SLSR to consider ILP is
+//   left as future work.
+//
+// - When (i' - i) is constant but i and i' are not, we could still perform
+//   SLSR.
+#include <vector>
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+namespace {
+
+class StraightLineStrengthReduce : public FunctionPass {
+public:
+  // SLSR candidate. Such a candidate must be in one of the forms described in
+  // the header comments.
+  struct Candidate : public ilist_node<Candidate> {
+    enum Kind {
+      Invalid, // reserved for the default constructor
+      Add,     // B + i * S
+      Mul,     // (B + i) * S
+      GEP,     // &B[..][i * S][..]
+    };
+
+    Candidate()
+        : CandidateKind(Invalid), Base(nullptr), Index(nullptr),
+          Stride(nullptr), Ins(nullptr), Basis(nullptr) {}
+    Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+              Instruction *I)
+        : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I),
+          Basis(nullptr) {}
+    Kind CandidateKind;
+    const SCEV *Base;
+    // Note that Index and Stride of a GEP candidate do not necessarily have the
+    // same integer type. In that case, during rewriting, Stride will be
+    // sign-extended or truncated to Index's type.
+    ConstantInt *Index;
+    Value *Stride;
+    // The instruction this candidate corresponds to. It helps us to rewrite a
+    // candidate with respect to its immediate basis. Note that one instruction
+    // can correspond to multiple candidates depending on how you associate the
+    // expression. For instance,
+    //
+    // (a + 1) * (b + 2)
+    //
+    // can be treated as
+    //
+    // <Base: a, Index: 1, Stride: b + 2>
+    //
+    // or
+    //
+    // <Base: b, Index: 2, Stride: a + 1>
+    Instruction *Ins;
+    // Points to the immediate basis of this candidate, or nullptr if we cannot
+    // find any basis for this candidate.
+    Candidate *Basis;
+  };
+
+  static char ID;
+
+  StraightLineStrengthReduce()
+      : FunctionPass(ID), DL(nullptr), DT(nullptr), TTI(nullptr) {
+    initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    // We do not modify the shape of the CFG.
+    AU.setPreservesCFG();
+  }
+
+  bool doInitialization(Module &M) override {
+    DL = &M.getDataLayout();
+    return false;
+  }
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+  // share the same base and stride.
+  bool isBasisFor(const Candidate &Basis, const Candidate &C);
+  // Returns whether the candidate can be folded into an addressing mode.
+  bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
+                  const DataLayout *DL);
+  // Returns true if C is already in a simplest form and not worth being
+  // rewritten.
+  bool isSimplestForm(const Candidate &C);
+  // Checks whether I is in a candidate form. If so, adds all the matching forms
+  // to Candidates, and tries to find the immediate basis for each of them.
+  void allocateCandidatesAndFindBasis(Instruction *I);
+  // Allocate candidates and find bases for Add instructions.
+  void allocateCandidatesAndFindBasisForAdd(Instruction *I);
+  // Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
+  // candidate.
+  void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
+                                            Instruction *I);
+  // Allocate candidates and find bases for Mul instructions.
+  void allocateCandidatesAndFindBasisForMul(Instruction *I);
+  // Splits LHS into Base + Index and, if succeeds, calls
+  // allocateCandidatesAndFindBasis.
+  void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
+                                            Instruction *I);
+  // Allocate candidates and find bases for GetElementPtr instructions.
+  void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
+  // A helper function that scales Idx with ElementSize before invoking
+  // allocateCandidatesAndFindBasis.
+  void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
+                                            Value *S, uint64_t ElementSize,
+                                            Instruction *I);
+  // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
+  // basis.
+  void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
+                                      ConstantInt *Idx, Value *S,
+                                      Instruction *I);
+  // Rewrites candidate C with respect to Basis.
+  void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+  // A helper function that factors ArrayIdx to a product of a stride and a
+  // constant index, and invokes allocateCandidatesAndFindBasis with the
+  // factorings.
+  void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
+                        GetElementPtrInst *GEP);
+  // Emit code that computes the "bump" from Basis to C. If the candidate is a
+  // GEP and the bump is not divisible by the element size of the GEP, this
+  // function sets the BumpWithUglyGEP flag to notify its caller to bump the
+  // basis using an ugly GEP.
+  static Value *emitBump(const Candidate &Basis, const Candidate &C,
+                         IRBuilder<> &Builder, const DataLayout *DL,
+                         bool &BumpWithUglyGEP);
+
+  const DataLayout *DL;
+  DominatorTree *DT;
+  ScalarEvolution *SE;
+  TargetTransformInfo *TTI;
+  ilist<Candidate> Candidates;
+  // Temporarily holds all instructions that are unlinked (but not deleted) by
+  // rewriteCandidateWithBasis. These instructions will be actually removed
+  // after all rewriting finishes.
+  std::vector<Instruction *> UnlinkedInstructions;
+};
+}  // anonymous namespace
+
+char StraightLineStrengthReduce::ID = 0;
+INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
+                      "Straight line strength reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
+                    "Straight line strength reduction", false, false)
+
+FunctionPass *llvm::createStraightLineStrengthReducePass() {
+  return new StraightLineStrengthReduce();
+}
+
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+                                            const Candidate &C) {
+  return (Basis.Ins != C.Ins && // skip the same instruction
+          // They must have the same type too. Basis.Base == C.Base doesn't
+          // guarantee their types are the same (PR23975).
+          Basis.Ins->getType() == C.Ins->getType() &&
+          // Basis must dominate C in order to rewrite C with respect to Basis.
+          DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
+          // They share the same base, stride, and candidate kind.
+          Basis.Base == C.Base && Basis.Stride == C.Stride &&
+          Basis.CandidateKind == C.CandidateKind);
+}
+
+// TODO: use TTI->getGEPCost.
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+                          const TargetTransformInfo *TTI,
+                          const DataLayout *DL) {
+  GlobalVariable *BaseGV = nullptr;
+  int64_t BaseOffset = 0;
+  bool HasBaseReg = false;
+  int64_t Scale = 0;
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand()))
+    BaseGV = GV;
+  else
+    HasBaseReg = true;
+
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+      if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) {
+        BaseOffset += ConstIdx->getSExtValue() * ElementSize;
+      } else {
+        // Needs scale register.
+        if (Scale != 0) {
+          // No addressing mode takes two scale registers.
+          return false;
+        }
+        Scale = ElementSize;
+      }
+    } else {
+      StructType *STy = cast<StructType>(*GTI);
+      uint64_t Field = cast<ConstantInt>(*I)->getZExtValue();
+      BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field);
+    }
+  }
+
+  unsigned AddrSpace = GEP->getPointerAddressSpace();
+  return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV,
+                                    BaseOffset, HasBaseReg, Scale, AddrSpace);
+}
+
+// Returns whether (Base + Index * Stride) can be folded to an addressing mode.
+static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride,
+                          TargetTransformInfo *TTI) {
+  return TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true,
+                                    Index->getSExtValue());
+}
+
+bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
+                                            TargetTransformInfo *TTI,
+                                            const DataLayout *DL) {
+  if (C.CandidateKind == Candidate::Add)
+    return isAddFoldable(C.Base, C.Index, C.Stride, TTI);
+  if (C.CandidateKind == Candidate::GEP)
+    return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI, DL);
+  return false;
+}
+
+// Returns true if GEP has zero or one non-zero index.
+static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) {
+  unsigned NumNonZeroIndices = 0;
+  for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) {
+    ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I);
+    if (ConstIdx == nullptr || !ConstIdx->isZero())
+      ++NumNonZeroIndices;
+  }
+  return NumNonZeroIndices <= 1;
+}
+
+bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) {
+  if (C.CandidateKind == Candidate::Add) {
+    // B + 1 * S or B + (-1) * S
+    return C.Index->isOne() || C.Index->isMinusOne();
+  }
+  if (C.CandidateKind == Candidate::Mul) {
+    // (B + 0) * S
+    return C.Index->isZero();
+  }
+  if (C.CandidateKind == Candidate::GEP) {
+    // (char*)B + S or (char*)B - S
+    return ((C.Index->isOne() || C.Index->isMinusOne()) &&
+            hasOnlyOneNonZeroIndex(cast<GetElementPtrInst>(C.Ins)));
+  }
+  return false;
+}
+
+// TODO: We currently implement an algorithm whose time complexity is linear in
+// the number of existing candidates. However, we could do better by using
+// ScopedHashTable. Specifically, while traversing the dominator tree, we could
+// maintain all the candidates that dominate the basic block being traversed in
+// a ScopedHashTable. This hash table is indexed by the base and the stride of
+// a candidate. Therefore, finding the immediate basis of a candidate boils down
+// to one hash-table look up.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+    Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+    Instruction *I) {
+  Candidate C(CT, B, Idx, S, I);
+  // SLSR can complicate an instruction in two cases:
+  //
+  // 1. If we can fold I into an addressing mode, computing I is likely free or
+  // takes only one instruction.
+  //
+  // 2. I is already in a simplest form. For example, when
+  //      X = B + 8 * S
+  //      Y = B + S,
+  //    rewriting Y to X - 7 * S is probably a bad idea.
+  //
+  // In the above cases, we still add I to the candidate list so that I can be
+  // the basis of other candidates, but we leave I's basis blank so that I
+  // won't be rewritten.
+  if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) {
+    // Try to compute the immediate basis of C.
+    unsigned NumIterations = 0;
+    // Limit the scan radius to avoid running in quadratice time.
+    static const unsigned MaxNumIterations = 50;
+    for (auto Basis = Candidates.rbegin();
+         Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+         ++Basis, ++NumIterations) {
+      if (isBasisFor(*Basis, C)) {
+        C.Basis = &(*Basis);
+        break;
+      }
+    }
+  }
+  // Regardless of whether we find a basis for C, we need to push C to the
+  // candidate list so that it can be the basis of other candidates.
+  Candidates.push_back(C);
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+    Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+    allocateCandidatesAndFindBasisForAdd(I);
+    break;
+  case Instruction::Mul:
+    allocateCandidatesAndFindBasisForMul(I);
+    break;
+  case Instruction::GetElementPtr:
+    allocateCandidatesAndFindBasisForGEP(cast<GetElementPtrInst>(I));
+    break;
+  }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+    Instruction *I) {
+  // Try matching B + i * S.
+  if (!isa<IntegerType>(I->getType()))
+    return;
+
+  assert(I->getNumOperands() == 2 && "isn't I an add?");
+  Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+  allocateCandidatesAndFindBasisForAdd(LHS, RHS, I);
+  if (LHS != RHS)
+    allocateCandidatesAndFindBasisForAdd(RHS, LHS, I);
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+    Value *LHS, Value *RHS, Instruction *I) {
+  Value *S = nullptr;
+  ConstantInt *Idx = nullptr;
+  if (match(RHS, m_Mul(m_Value(S), m_ConstantInt(Idx)))) {
+    // I = LHS + RHS = LHS + Idx * S
+    allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
+  } else if (match(RHS, m_Shl(m_Value(S), m_ConstantInt(Idx)))) {
+    // I = LHS + RHS = LHS + (S << Idx) = LHS + S * (1 << Idx)
+    APInt One(Idx->getBitWidth(), 1);
+    Idx = ConstantInt::get(Idx->getContext(), One << Idx->getValue());
+    allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
+  } else {
+    // At least, I = LHS + 1 * RHS
+    ConstantInt *One = ConstantInt::get(cast<IntegerType>(I->getType()), 1);
+    allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), One, RHS,
+                                   I);
+  }
+}
+
+// Returns true if A matches B + C where C is constant.
+static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) {
+  return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) ||
+          match(A, m_Add(m_ConstantInt(C), m_Value(B))));
+}
+
+// Returns true if A matches B | C where C is constant.
+static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) {
+  return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) ||
+          match(A, m_Or(m_ConstantInt(C), m_Value(B))));
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+    Value *LHS, Value *RHS, Instruction *I) {
+  Value *B = nullptr;
+  ConstantInt *Idx = nullptr;
+  if (matchesAdd(LHS, B, Idx)) {
+    // If LHS is in the form of "Base + Index", then I is in the form of
+    // "(Base + Index) * RHS".
+    allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+  } else if (matchesOr(LHS, B, Idx) && haveNoCommonBitsSet(B, Idx, *DL)) {
+    // If LHS is in the form of "Base | Index" and Base and Index have no common
+    // bits set, then
+    //   Base | Index = Base + Index
+    // and I is thus in the form of "(Base + Index) * RHS".
+    allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+  } else {
+    // Otherwise, at least try the form (LHS + 0) * RHS.
+    ConstantInt *Zero = ConstantInt::get(cast<IntegerType>(I->getType()), 0);
+    allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(LHS), Zero, RHS,
+                                   I);
+  }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+    Instruction *I) {
+  // Try matching (B + i) * S.
+  // TODO: we could extend SLSR to float and vector types.
+  if (!isa<IntegerType>(I->getType()))
+    return;
+
+  assert(I->getNumOperands() == 2 && "isn't I a mul?");
+  Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+  allocateCandidatesAndFindBasisForMul(LHS, RHS, I);
+  if (LHS != RHS) {
+    // Symmetrically, try to split RHS to Base + Index.
+    allocateCandidatesAndFindBasisForMul(RHS, LHS, I);
+  }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+    const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
+    Instruction *I) {
+  // I = B + sext(Idx *nsw S) * ElementSize
+  //   = B + (sext(Idx) * sext(S)) * ElementSize
+  //   = B + (sext(Idx) * ElementSize) * sext(S)
+  // Casting to IntegerType is safe because we skipped vector GEPs.
+  IntegerType *IntPtrTy = cast<IntegerType>(DL->getIntPtrType(I->getType()));
+  ConstantInt *ScaledIdx = ConstantInt::get(
+      IntPtrTy, Idx->getSExtValue() * (int64_t)ElementSize, true);
+  allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
+}
+
+void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
+                                                  const SCEV *Base,
+                                                  uint64_t ElementSize,
+                                                  GetElementPtrInst *GEP) {
+  // At least, ArrayIdx = ArrayIdx *nsw 1.
+  allocateCandidatesAndFindBasisForGEP(
+      Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1),
+      ArrayIdx, ElementSize, GEP);
+  Value *LHS = nullptr;
+  ConstantInt *RHS = nullptr;
+  // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
+  // itself. This would allow us to handle the shl case for free. However,
+  // matching SCEVs has two issues:
+  //
+  // 1. this would complicate rewriting because the rewriting procedure
+  // would have to translate SCEVs back to IR instructions. This translation
+  // is difficult when LHS is further evaluated to a composite SCEV.
+  //
+  // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
+  // to strip nsw/nuw flags which are critical for SLSR to trace into
+  // sext'ed multiplication.
+  if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) {
+    // SLSR is currently unsafe if i * S may overflow.
+    // GEP = Base + sext(LHS *nsw RHS) * ElementSize
+    allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP);
+  } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) {
+    // GEP = Base + sext(LHS <<nsw RHS) * ElementSize
+    //     = Base + sext(LHS *nsw (1 << RHS)) * ElementSize
+    APInt One(RHS->getBitWidth(), 1);
+    ConstantInt *PowerOf2 =
+        ConstantInt::get(RHS->getContext(), One << RHS->getValue());
+    allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP);
+  }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+    GetElementPtrInst *GEP) {
+  // TODO: handle vector GEPs
+  if (GEP->getType()->isVectorTy())
+    return;
+
+  SmallVector<const SCEV *, 4> IndexExprs;
+  for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+    IndexExprs.push_back(SE->getSCEV(*I));
+
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
+    if (!isa<SequentialType>(*GTI++))
+      continue;
+
+    const SCEV *OrigIndexExpr = IndexExprs[I - 1];
+    IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType());
+
+    // The base of this candidate is GEP's base plus the offsets of all
+    // indices except this current one.
+    const SCEV *BaseExpr = SE->getGEPExpr(GEP->getSourceElementType(),
+                                          SE->getSCEV(GEP->getPointerOperand()),
+                                          IndexExprs, GEP->isInBounds());
+    Value *ArrayIdx = GEP->getOperand(I);
+    uint64_t ElementSize = DL->getTypeAllocSize(*GTI);
+    factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
+    // When ArrayIdx is the sext of a value, we try to factor that value as
+    // well.  Handling this case is important because array indices are
+    // typically sign-extended to the pointer size.
+    Value *TruncatedArrayIdx = nullptr;
+    if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))))
+      factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP);
+
+    IndexExprs[I - 1] = OrigIndexExpr;
+  }
+}
+
+// A helper function that unifies the bitwidth of A and B.
+static void unifyBitWidth(APInt &A, APInt &B) {
+  if (A.getBitWidth() < B.getBitWidth())
+    A = A.sext(B.getBitWidth());
+  else if (A.getBitWidth() > B.getBitWidth())
+    B = B.sext(A.getBitWidth());
+}
+
+Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
+                                            const Candidate &C,
+                                            IRBuilder<> &Builder,
+                                            const DataLayout *DL,
+                                            bool &BumpWithUglyGEP) {
+  APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
+  unifyBitWidth(Idx, BasisIdx);
+  APInt IndexOffset = Idx - BasisIdx;
+
+  BumpWithUglyGEP = false;
+  if (Basis.CandidateKind == Candidate::GEP) {
+    APInt ElementSize(
+        IndexOffset.getBitWidth(),
+        DL->getTypeAllocSize(
+            cast<GetElementPtrInst>(Basis.Ins)->getType()->getElementType()));
+    APInt Q, R;
+    APInt::sdivrem(IndexOffset, ElementSize, Q, R);
+    if (R.getSExtValue() == 0)
+      IndexOffset = Q;
+    else
+      BumpWithUglyGEP = true;
+  }
+
+  // Compute Bump = C - Basis = (i' - i) * S.
+  // Common case 1: if (i' - i) is 1, Bump = S.
+  if (IndexOffset.getSExtValue() == 1)
+    return C.Stride;
+  // Common case 2: if (i' - i) is -1, Bump = -S.
+  if (IndexOffset.getSExtValue() == -1)
+    return Builder.CreateNeg(C.Stride);
+
+  // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
+  // have different bit widths.
+  IntegerType *DeltaType =
+      IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
+  Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
+  if (IndexOffset.isPowerOf2()) {
+    // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i).
+    ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
+    return Builder.CreateShl(ExtendedStride, Exponent);
+  }
+  if ((-IndexOffset).isPowerOf2()) {
+    // If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i' - i).
+    ConstantInt *Exponent =
+        ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
+    return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent));
+  }
+  Constant *Delta = ConstantInt::get(DeltaType, IndexOffset);
+  return Builder.CreateMul(ExtendedStride, Delta);
+}
+
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+    const Candidate &C, const Candidate &Basis) {
+  assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
+         C.Stride == Basis.Stride);
+  // We run rewriteCandidateWithBasis on all candidates in a post-order, so the
+  // basis of a candidate cannot be unlinked before the candidate.
+  assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked");
+
+  // An instruction can correspond to multiple candidates. Therefore, instead of
+  // simply deleting an instruction when we rewrite it, we mark its parent as
+  // nullptr (i.e. unlink it) so that we can skip the candidates whose
+  // instruction is already rewritten.
+  if (!C.Ins->getParent())
+    return;
+
+  IRBuilder<> Builder(C.Ins);
+  bool BumpWithUglyGEP;
+  Value *Bump = emitBump(Basis, C, Builder, DL, BumpWithUglyGEP);
+  Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
+  switch (C.CandidateKind) {
+  case Candidate::Add:
+  case Candidate::Mul:
+    // C = Basis + Bump
+    if (BinaryOperator::isNeg(Bump)) {
+      // If Bump is a neg instruction, emit C = Basis - (-Bump).
+      Reduced =
+          Builder.CreateSub(Basis.Ins, BinaryOperator::getNegArgument(Bump));
+      // We only use the negative argument of Bump, and Bump itself may be
+      // trivially dead.
+      RecursivelyDeleteTriviallyDeadInstructions(Bump);
+    } else {
+      // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
+      // usually unsound, e.g.,
+      //
+      // X = (-2 +nsw 1) *nsw INT_MAX
+      // Y = (-2 +nsw 3) *nsw INT_MAX
+      //   =>
+      // Y = X + 2 * INT_MAX
+      //
+      // Neither + and * in the resultant expression are nsw.
+      Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+    }
+    break;
+  case Candidate::GEP:
+    {
+      Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
+      bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
+      if (BumpWithUglyGEP) {
+        // C = (char *)Basis + Bump
+        unsigned AS = Basis.Ins->getType()->getPointerAddressSpace();
+        Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS);
+        Reduced = Builder.CreateBitCast(Basis.Ins, CharTy);
+        if (InBounds)
+          Reduced =
+              Builder.CreateInBoundsGEP(Builder.getInt8Ty(), Reduced, Bump);
+        else
+          Reduced = Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump);
+        Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType());
+      } else {
+        // C = gep Basis, Bump
+        // Canonicalize bump to pointer size.
+        Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy);
+        if (InBounds)
+          Reduced = Builder.CreateInBoundsGEP(nullptr, Basis.Ins, Bump);
+        else
+          Reduced = Builder.CreateGEP(nullptr, Basis.Ins, Bump);
+      }
+    }
+    break;
+  default:
+    llvm_unreachable("C.CandidateKind is invalid");
+  };
+  Reduced->takeName(C.Ins);
+  C.Ins->replaceAllUsesWith(Reduced);
+  // Unlink C.Ins so that we can skip other candidates also corresponding to
+  // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+  C.Ins->removeFromParent();
+  UnlinkedInstructions.push_back(C.Ins);
+}
+
+bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  // Traverse the dominator tree in the depth-first order. This order makes sure
+  // all bases of a candidate are in Candidates when we process it.
+  for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT);
+       node != GraphTraits<DominatorTree *>::nodes_end(DT); ++node) {
+    for (auto &I : *node->getBlock())
+      allocateCandidatesAndFindBasis(&I);
+  }
+
+  // Rewrite candidates in the reverse depth-first order. This order makes sure
+  // a candidate being rewritten is not a basis for any other candidate.
+  while (!Candidates.empty()) {
+    const Candidate &C = Candidates.back();
+    if (C.Basis != nullptr) {
+      rewriteCandidateWithBasis(C, *C.Basis);
+    }
+    Candidates.pop_back();
+  }
+
+  // Delete all unlink instructions.
+  for (auto *UnlinkedInst : UnlinkedInstructions) {
+    for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) {
+      Value *Op = UnlinkedInst->getOperand(I);
+      UnlinkedInst->setOperand(I, nullptr);
+      RecursivelyDeleteTriviallyDeadInstructions(Op);
+    }
+    delete UnlinkedInst;
+  }
+  bool Ret = !UnlinkedInstructions.empty();
+  UnlinkedInstructions.clear();
+  return Ret;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
new file mode 100644
index 0000000..662513c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -0,0 +1,953 @@
+//===-- StructurizeCFG.cpp ------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "structurizecfg"
+
+namespace {
+
+// Definition of the complex types used in this pass.
+
+typedef std::pair<BasicBlock *, Value *> BBValuePair;
+
+typedef SmallVector<RegionNode*, 8> RNVector;
+typedef SmallVector<BasicBlock*, 8> BBVector;
+typedef SmallVector<BranchInst*, 8> BranchVector;
+typedef SmallVector<BBValuePair, 2> BBValueVector;
+
+typedef SmallPtrSet<BasicBlock *, 8> BBSet;
+
+typedef MapVector<PHINode *, BBValueVector> PhiMap;
+typedef MapVector<BasicBlock *, BBVector> BB2BBVecMap;
+
+typedef DenseMap<DomTreeNode *, unsigned> DTN2UnsignedMap;
+typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap;
+typedef DenseMap<BasicBlock *, Value *> BBPredicates;
+typedef DenseMap<BasicBlock *, BBPredicates> PredMap;
+typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap;
+
+// The name for newly created blocks.
+
+static const char *const FlowBlockName = "Flow";
+
+/// @brief Find the nearest common dominator for multiple BasicBlocks
+///
+/// Helper class for StructurizeCFG
+/// TODO: Maybe move into common code
+class NearestCommonDominator {
+  DominatorTree *DT;
+
+  DTN2UnsignedMap IndexMap;
+
+  BasicBlock *Result;
+  unsigned ResultIndex;
+  bool ExplicitMentioned;
+
+public:
+  /// \brief Start a new query
+  NearestCommonDominator(DominatorTree *DomTree) {
+    DT = DomTree;
+    Result = nullptr;
+  }
+
+  /// \brief Add BB to the resulting dominator
+  void addBlock(BasicBlock *BB, bool Remember = true) {
+    DomTreeNode *Node = DT->getNode(BB);
+
+    if (!Result) {
+      unsigned Numbering = 0;
+      for (;Node;Node = Node->getIDom())
+        IndexMap[Node] = ++Numbering;
+      Result = BB;
+      ResultIndex = 1;
+      ExplicitMentioned = Remember;
+      return;
+    }
+
+    for (;Node;Node = Node->getIDom())
+      if (IndexMap.count(Node))
+        break;
+      else
+        IndexMap[Node] = 0;
+
+    assert(Node && "Dominator tree invalid!");
+
+    unsigned Numbering = IndexMap[Node];
+    if (Numbering > ResultIndex) {
+      Result = Node->getBlock();
+      ResultIndex = Numbering;
+      ExplicitMentioned = Remember && (Result == BB);
+    } else if (Numbering == ResultIndex) {
+      ExplicitMentioned |= Remember;
+    }
+  }
+
+  /// \brief Is "Result" one of the BBs added with "Remember" = True?
+  bool wasResultExplicitMentioned() {
+    return ExplicitMentioned;
+  }
+
+  /// \brief Get the query result
+  BasicBlock *getResult() {
+    return Result;
+  }
+};
+
+/// @brief Transforms the control flow graph on one single entry/exit region
+/// at a time.
+///
+/// After the transform all "If"/"Then"/"Else" style control flow looks like
+/// this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 |
+/// | /
+/// |/
+/// 3
+/// ||   Where:
+/// | |  1 = "If" block, calculates the condition
+/// 4 |  2 = "Then" subregion, runs if the condition is true
+/// | /  3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
+/// |/   4 = "Else" optional subregion, runs if the condition is false
+/// 5    5 = "End" block, also rejoins the control flow
+/// \endverbatim
+///
+/// Control flow is expressed as a branch where the true exit goes into the
+/// "Then"/"Else" region, while the false exit skips the region
+/// The condition for the optional "Else" region is expressed as a PHI node.
+/// The incomming values of the PHI node are true for the "If" edge and false
+/// for the "Then" edge.
+///
+/// Additionally to that even complicated loops look like this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 ^  Where:
+/// | /  1 = "Entry" block
+/// |/   2 = "Loop" optional subregion, with all exits at "Flow" block
+/// 3    3 = "Flow" block, with back edge to entry block
+/// |
+/// \endverbatim
+///
+/// The back edge of the "Flow" block is always on the false side of the branch
+/// while the true side continues the general flow. So the loop condition
+/// consist of a network of PHI nodes where the true incoming values expresses
+/// breaks and the false values expresses continue states.
+class StructurizeCFG : public RegionPass {
+  Type *Boolean;
+  ConstantInt *BoolTrue;
+  ConstantInt *BoolFalse;
+  UndefValue *BoolUndef;
+
+  Function *Func;
+  Region *ParentRegion;
+
+  DominatorTree *DT;
+  LoopInfo *LI;
+
+  RNVector Order;
+  BBSet Visited;
+
+  BBPhiMap DeletedPhis;
+  BB2BBVecMap AddedPhis;
+
+  PredMap Predicates;
+  BranchVector Conditions;
+
+  BB2BBMap Loops;
+  PredMap LoopPreds;
+  BranchVector LoopConds;
+
+  RegionNode *PrevNode;
+
+  void orderNodes();
+
+  void analyzeLoops(RegionNode *N);
+
+  Value *invert(Value *Condition);
+
+  Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
+
+  void gatherPredicates(RegionNode *N);
+
+  void collectInfos();
+
+  void insertConditions(bool Loops);
+
+  void delPhiValues(BasicBlock *From, BasicBlock *To);
+
+  void addPhiValues(BasicBlock *From, BasicBlock *To);
+
+  void setPhiValues();
+
+  void killTerminator(BasicBlock *BB);
+
+  void changeExit(RegionNode *Node, BasicBlock *NewExit,
+                  bool IncludeDominator);
+
+  BasicBlock *getNextFlow(BasicBlock *Dominator);
+
+  BasicBlock *needPrefix(bool NeedEmpty);
+
+  BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);
+
+  void setPrevNode(BasicBlock *BB);
+
+  bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);
+
+  bool isPredictableTrue(RegionNode *Node);
+
+  void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+  void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+  void createFlow();
+
+  void rebuildSSA();
+
+public:
+  static char ID;
+
+  StructurizeCFG() :
+    RegionPass(ID) {
+    initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
+  }
+
+  using Pass::doInitialization;
+  bool doInitialization(Region *R, RGPassManager &RGM) override;
+
+  bool runOnRegion(Region *R, RGPassManager &RGM) override;
+
+  const char *getPassName() const override {
+    return "Structurize control flow";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequiredID(LowerSwitchID);
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    RegionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char StructurizeCFG::ID = 0;
+
+INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
+INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
+                    false, false)
+
+/// \brief Initialize the types and constants used in the pass
+bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
+  LLVMContext &Context = R->getEntry()->getContext();
+
+  Boolean = Type::getInt1Ty(Context);
+  BoolTrue = ConstantInt::getTrue(Context);
+  BoolFalse = ConstantInt::getFalse(Context);
+  BoolUndef = UndefValue::get(Boolean);
+
+  return false;
+}
+
+/// \brief Build up the general order of nodes
+void StructurizeCFG::orderNodes() {
+  RNVector TempOrder;
+  ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
+  TempOrder.append(RPOT.begin(), RPOT.end());
+
+  std::map<Loop*, unsigned> LoopBlocks;
+
+
+  // The reverse post-order traversal of the list gives us an ordering close
+  // to what we want.  The only problem with it is that sometimes backedges
+  // for outer loops will be visited before backedges for inner loops.
+  for (RegionNode *RN : TempOrder) {
+    BasicBlock *BB = RN->getEntry();
+    Loop *Loop = LI->getLoopFor(BB);
+    if (!LoopBlocks.count(Loop)) {
+      LoopBlocks[Loop] = 1;
+      continue;
+    }
+    LoopBlocks[Loop]++;
+  }
+
+  unsigned CurrentLoopDepth = 0;
+  Loop *CurrentLoop = nullptr;
+  BBSet TempVisited;
+  for (RNVector::iterator I = TempOrder.begin(), E = TempOrder.end(); I != E; ++I) {
+    BasicBlock *BB = (*I)->getEntry();
+    unsigned LoopDepth = LI->getLoopDepth(BB);
+
+    if (std::find(Order.begin(), Order.end(), *I) != Order.end())
+      continue;
+
+    if (LoopDepth < CurrentLoopDepth) {
+      // Make sure we have visited all blocks in this loop before moving back to
+      // the outer loop.
+
+      RNVector::iterator LoopI = I;
+      while(LoopBlocks[CurrentLoop]) {
+        LoopI++;
+        BasicBlock *LoopBB = (*LoopI)->getEntry();
+        if (LI->getLoopFor(LoopBB) == CurrentLoop) {
+          LoopBlocks[CurrentLoop]--;
+          Order.push_back(*LoopI);
+        }
+      }
+    }
+
+    CurrentLoop = LI->getLoopFor(BB);
+    if (CurrentLoop) {
+      LoopBlocks[CurrentLoop]--;
+    }
+
+    CurrentLoopDepth = LoopDepth;
+    Order.push_back(*I);
+  }
+
+  // This pass originally used a post-order traversal and then operated on
+  // the list in reverse. Now that we are using a reverse post-order traversal
+  // rather than re-working the whole pass to operate on the list in order,
+  // we just reverse the list and continue to operate on it in reverse.
+  std::reverse(Order.begin(), Order.end());
+}
+
+/// \brief Determine the end of the loops
+void StructurizeCFG::analyzeLoops(RegionNode *N) {
+  if (N->isSubRegion()) {
+    // Test for exit as back edge
+    BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
+    if (Visited.count(Exit))
+      Loops[Exit] = N->getEntry();
+
+  } else {
+    // Test for sucessors as back edge
+    BasicBlock *BB = N->getNodeAs<BasicBlock>();
+    BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+    for (BasicBlock *Succ : Term->successors())
+      if (Visited.count(Succ))
+        Loops[Succ] = BB;
+  }
+}
+
+/// \brief Invert the given condition
+Value *StructurizeCFG::invert(Value *Condition) {
+  // First: Check if it's a constant
+  if (Condition == BoolTrue)
+    return BoolFalse;
+
+  if (Condition == BoolFalse)
+    return BoolTrue;
+
+  if (Condition == BoolUndef)
+    return BoolUndef;
+
+  // Second: If the condition is already inverted, return the original value
+  if (match(Condition, m_Not(m_Value(Condition))))
+    return Condition;
+
+  if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
+    // Third: Check all the users for an invert
+    BasicBlock *Parent = Inst->getParent();
+    for (User *U : Condition->users())
+      if (Instruction *I = dyn_cast<Instruction>(U))
+        if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
+          return I;
+
+    // Last option: Create a new instruction
+    return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
+  }
+
+  if (Argument *Arg = dyn_cast<Argument>(Condition)) {
+    BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock();
+    return BinaryOperator::CreateNot(Condition,
+                                     Arg->getName() + ".inv",
+                                     EntryBlock.getTerminator());
+  }
+
+  llvm_unreachable("Unhandled condition to invert");
+}
+
+/// \brief Build the condition for one edge
+Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
+                                      bool Invert) {
+  Value *Cond = Invert ? BoolFalse : BoolTrue;
+  if (Term->isConditional()) {
+    Cond = Term->getCondition();
+
+    if (Idx != (unsigned)Invert)
+      Cond = invert(Cond);
+  }
+  return Cond;
+}
+
+/// \brief Analyze the predecessors of each block and build up predicates
+void StructurizeCFG::gatherPredicates(RegionNode *N) {
+  RegionInfo *RI = ParentRegion->getRegionInfo();
+  BasicBlock *BB = N->getEntry();
+  BBPredicates &Pred = Predicates[BB];
+  BBPredicates &LPred = LoopPreds[BB];
+
+  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+       PI != PE; ++PI) {
+
+    // Ignore it if it's a branch from outside into our region entry
+    if (!ParentRegion->contains(*PI))
+      continue;
+
+    Region *R = RI->getRegionFor(*PI);
+    if (R == ParentRegion) {
+
+      // It's a top level block in our region
+      BranchInst *Term = cast<BranchInst>((*PI)->getTerminator());
+      for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+        BasicBlock *Succ = Term->getSuccessor(i);
+        if (Succ != BB)
+          continue;
+
+        if (Visited.count(*PI)) {
+          // Normal forward edge
+          if (Term->isConditional()) {
+            // Try to treat it like an ELSE block
+            BasicBlock *Other = Term->getSuccessor(!i);
+            if (Visited.count(Other) && !Loops.count(Other) &&
+                !Pred.count(Other) && !Pred.count(*PI)) {
+
+              Pred[Other] = BoolFalse;
+              Pred[*PI] = BoolTrue;
+              continue;
+            }
+          }
+          Pred[*PI] = buildCondition(Term, i, false);
+
+        } else {
+          // Back edge
+          LPred[*PI] = buildCondition(Term, i, true);
+        }
+      }
+
+    } else {
+
+      // It's an exit from a sub region
+      while (R->getParent() != ParentRegion)
+        R = R->getParent();
+
+      // Edge from inside a subregion to its entry, ignore it
+      if (*R == *N)
+        continue;
+
+      BasicBlock *Entry = R->getEntry();
+      if (Visited.count(Entry))
+        Pred[Entry] = BoolTrue;
+      else
+        LPred[Entry] = BoolFalse;
+    }
+  }
+}
+
+/// \brief Collect various loop and predicate infos
+void StructurizeCFG::collectInfos() {
+  // Reset predicate
+  Predicates.clear();
+
+  // and loop infos
+  Loops.clear();
+  LoopPreds.clear();
+
+  // Reset the visited nodes
+  Visited.clear();
+
+  for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend();
+       OI != OE; ++OI) {
+
+    DEBUG(dbgs() << "Visiting: " <<
+                    ((*OI)->isSubRegion() ? "SubRegion with entry: " : "") <<
+                    (*OI)->getEntry()->getName() << " Loop Depth: " << LI->getLoopDepth((*OI)->getEntry()) << "\n");
+
+    // Analyze all the conditions leading to a node
+    gatherPredicates(*OI);
+
+    // Remember that we've seen this node
+    Visited.insert((*OI)->getEntry());
+
+    // Find the last back edges
+    analyzeLoops(*OI);
+  }
+}
+
+/// \brief Insert the missing branch conditions
+void StructurizeCFG::insertConditions(bool Loops) {
+  BranchVector &Conds = Loops ? LoopConds : Conditions;
+  Value *Default = Loops ? BoolTrue : BoolFalse;
+  SSAUpdater PhiInserter;
+
+  for (BranchInst *Term : Conds) {
+    assert(Term->isConditional());
+
+    BasicBlock *Parent = Term->getParent();
+    BasicBlock *SuccTrue = Term->getSuccessor(0);
+    BasicBlock *SuccFalse = Term->getSuccessor(1);
+
+    PhiInserter.Initialize(Boolean, "");
+    PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
+    PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
+
+    BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
+
+    NearestCommonDominator Dominator(DT);
+    Dominator.addBlock(Parent, false);
+
+    Value *ParentValue = nullptr;
+    for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
+         PI != PE; ++PI) {
+
+      if (PI->first == Parent) {
+        ParentValue = PI->second;
+        break;
+      }
+      PhiInserter.AddAvailableValue(PI->first, PI->second);
+      Dominator.addBlock(PI->first);
+    }
+
+    if (ParentValue) {
+      Term->setCondition(ParentValue);
+    } else {
+      if (!Dominator.wasResultExplicitMentioned())
+        PhiInserter.AddAvailableValue(Dominator.getResult(), Default);
+
+      Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
+    }
+  }
+}
+
+/// \brief Remove all PHI values coming from "From" into "To" and remember
+/// them in DeletedPhis
+void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
+  PhiMap &Map = DeletedPhis[To];
+  for (BasicBlock::iterator I = To->begin(), E = To->end();
+       I != E && isa<PHINode>(*I);) {
+
+    PHINode &Phi = cast<PHINode>(*I++);
+    while (Phi.getBasicBlockIndex(From) != -1) {
+      Value *Deleted = Phi.removeIncomingValue(From, false);
+      Map[&Phi].push_back(std::make_pair(From, Deleted));
+    }
+  }
+}
+
+/// \brief Add a dummy PHI value as soon as we knew the new predecessor
+void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
+  for (BasicBlock::iterator I = To->begin(), E = To->end();
+       I != E && isa<PHINode>(*I);) {
+
+    PHINode &Phi = cast<PHINode>(*I++);
+    Value *Undef = UndefValue::get(Phi.getType());
+    Phi.addIncoming(Undef, From);
+  }
+  AddedPhis[To].push_back(From);
+}
+
+/// \brief Add the real PHI value as soon as everything is set up
+void StructurizeCFG::setPhiValues() {
+  SSAUpdater Updater;
+  for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end();
+       AI != AE; ++AI) {
+
+    BasicBlock *To = AI->first;
+    BBVector &From = AI->second;
+
+    if (!DeletedPhis.count(To))
+      continue;
+
+    PhiMap &Map = DeletedPhis[To];
+    for (PhiMap::iterator PI = Map.begin(), PE = Map.end();
+         PI != PE; ++PI) {
+
+      PHINode *Phi = PI->first;
+      Value *Undef = UndefValue::get(Phi->getType());
+      Updater.Initialize(Phi->getType(), "");
+      Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+      Updater.AddAvailableValue(To, Undef);
+
+      NearestCommonDominator Dominator(DT);
+      Dominator.addBlock(To, false);
+      for (BBValueVector::iterator VI = PI->second.begin(),
+           VE = PI->second.end(); VI != VE; ++VI) {
+
+        Updater.AddAvailableValue(VI->first, VI->second);
+        Dominator.addBlock(VI->first);
+      }
+
+      if (!Dominator.wasResultExplicitMentioned())
+        Updater.AddAvailableValue(Dominator.getResult(), Undef);
+
+      for (BBVector::iterator FI = From.begin(), FE = From.end();
+           FI != FE; ++FI) {
+
+        int Idx = Phi->getBasicBlockIndex(*FI);
+        assert(Idx != -1);
+        Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI));
+      }
+    }
+
+    DeletedPhis.erase(To);
+  }
+  assert(DeletedPhis.empty());
+}
+
+/// \brief Remove phi values from all successors and then remove the terminator.
+void StructurizeCFG::killTerminator(BasicBlock *BB) {
+  TerminatorInst *Term = BB->getTerminator();
+  if (!Term)
+    return;
+
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+       SI != SE; ++SI) {
+
+    delPhiValues(BB, *SI);
+  }
+
+  Term->eraseFromParent();
+}
+
+/// \brief Let node exit(s) point to NewExit
+void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
+                                bool IncludeDominator) {
+  if (Node->isSubRegion()) {
+    Region *SubRegion = Node->getNodeAs<Region>();
+    BasicBlock *OldExit = SubRegion->getExit();
+    BasicBlock *Dominator = nullptr;
+
+    // Find all the edges from the sub region to the exit
+    for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit);
+         I != E;) {
+
+      BasicBlock *BB = *I++;
+      if (!SubRegion->contains(BB))
+        continue;
+
+      // Modify the edges to point to the new exit
+      delPhiValues(BB, OldExit);
+      BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
+      addPhiValues(BB, NewExit);
+
+      // Find the new dominator (if requested)
+      if (IncludeDominator) {
+        if (!Dominator)
+          Dominator = BB;
+        else
+          Dominator = DT->findNearestCommonDominator(Dominator, BB);
+      }
+    }
+
+    // Change the dominator (if requested)
+    if (Dominator)
+      DT->changeImmediateDominator(NewExit, Dominator);
+
+    // Update the region info
+    SubRegion->replaceExit(NewExit);
+
+  } else {
+    BasicBlock *BB = Node->getNodeAs<BasicBlock>();
+    killTerminator(BB);
+    BranchInst::Create(NewExit, BB);
+    addPhiValues(BB, NewExit);
+    if (IncludeDominator)
+      DT->changeImmediateDominator(NewExit, BB);
+  }
+}
+
+/// \brief Create a new flow node and update dominator tree and region info
+BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
+  LLVMContext &Context = Func->getContext();
+  BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
+                       Order.back()->getEntry();
+  BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
+                                        Func, Insert);
+  DT->addNewBlock(Flow, Dominator);
+  ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
+  return Flow;
+}
+
+/// \brief Create a new or reuse the previous node as flow node
+BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
+  BasicBlock *Entry = PrevNode->getEntry();
+
+  if (!PrevNode->isSubRegion()) {
+    killTerminator(Entry);
+    if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
+      return Entry;
+
+  }
+
+  // create a new flow node
+  BasicBlock *Flow = getNextFlow(Entry);
+
+  // and wire it up
+  changeExit(PrevNode, Flow, true);
+  PrevNode = ParentRegion->getBBNode(Flow);
+  return Flow;
+}
+
+/// \brief Returns the region exit if possible, otherwise just a new flow node
+BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
+                                        bool ExitUseAllowed) {
+  if (Order.empty() && ExitUseAllowed) {
+    BasicBlock *Exit = ParentRegion->getExit();
+    DT->changeImmediateDominator(Exit, Flow);
+    addPhiValues(Flow, Exit);
+    return Exit;
+  }
+  return getNextFlow(Flow);
+}
+
+/// \brief Set the previous node
+void StructurizeCFG::setPrevNode(BasicBlock *BB) {
+  PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
+                                        : nullptr;
+}
+
+/// \brief Does BB dominate all the predicates of Node ?
+bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
+  BBPredicates &Preds = Predicates[Node->getEntry()];
+  for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
+       PI != PE; ++PI) {
+
+    if (!DT->dominates(BB, PI->first))
+      return false;
+  }
+  return true;
+}
+
+/// \brief Can we predict that this node will always be called?
+bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
+  BBPredicates &Preds = Predicates[Node->getEntry()];
+  bool Dominated = false;
+
+  // Regionentry is always true
+  if (!PrevNode)
+    return true;
+
+  for (BBPredicates::iterator I = Preds.begin(), E = Preds.end();
+       I != E; ++I) {
+
+    if (I->second != BoolTrue)
+      return false;
+
+    if (!Dominated && DT->dominates(I->first, PrevNode->getEntry()))
+      Dominated = true;
+  }
+
+  // TODO: The dominator check is too strict
+  return Dominated;
+}
+
+/// Take one node from the order vector and wire it up
+void StructurizeCFG::wireFlow(bool ExitUseAllowed,
+                              BasicBlock *LoopEnd) {
+  RegionNode *Node = Order.pop_back_val();
+  Visited.insert(Node->getEntry());
+
+  if (isPredictableTrue(Node)) {
+    // Just a linear flow
+    if (PrevNode) {
+      changeExit(PrevNode, Node->getEntry(), true);
+    }
+    PrevNode = Node;
+
+  } else {
+    // Insert extra prefix node (or reuse last one)
+    BasicBlock *Flow = needPrefix(false);
+
+    // Insert extra postfix node (or use exit instead)
+    BasicBlock *Entry = Node->getEntry();
+    BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
+
+    // let it point to entry and next block
+    Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
+    addPhiValues(Flow, Entry);
+    DT->changeImmediateDominator(Entry, Flow);
+
+    PrevNode = Node;
+    while (!Order.empty() && !Visited.count(LoopEnd) &&
+           dominatesPredicates(Entry, Order.back())) {
+      handleLoops(false, LoopEnd);
+    }
+
+    changeExit(PrevNode, Next, false);
+    setPrevNode(Next);
+  }
+}
+
+void StructurizeCFG::handleLoops(bool ExitUseAllowed,
+                                 BasicBlock *LoopEnd) {
+  RegionNode *Node = Order.back();
+  BasicBlock *LoopStart = Node->getEntry();
+
+  if (!Loops.count(LoopStart)) {
+    wireFlow(ExitUseAllowed, LoopEnd);
+    return;
+  }
+
+  if (!isPredictableTrue(Node))
+    LoopStart = needPrefix(true);
+
+  LoopEnd = Loops[Node->getEntry()];
+  wireFlow(false, LoopEnd);
+  while (!Visited.count(LoopEnd)) {
+    handleLoops(false, LoopEnd);
+  }
+
+  // If the start of the loop is the entry block, we can't branch to it so
+  // insert a new dummy entry block.
+  Function *LoopFunc = LoopStart->getParent();
+  if (LoopStart == &LoopFunc->getEntryBlock()) {
+    LoopStart->setName("entry.orig");
+
+    BasicBlock *NewEntry =
+      BasicBlock::Create(LoopStart->getContext(),
+                         "entry",
+                         LoopFunc,
+                         LoopStart);
+    BranchInst::Create(LoopStart, NewEntry);
+  }
+
+  // Create an extra loop end node
+  LoopEnd = needPrefix(false);
+  BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
+  LoopConds.push_back(BranchInst::Create(Next, LoopStart,
+                                         BoolUndef, LoopEnd));
+  addPhiValues(LoopEnd, LoopStart);
+  setPrevNode(Next);
+}
+
+/// After this function control flow looks like it should be, but
+/// branches and PHI nodes only have undefined conditions.
+void StructurizeCFG::createFlow() {
+  BasicBlock *Exit = ParentRegion->getExit();
+  bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);
+
+  DeletedPhis.clear();
+  AddedPhis.clear();
+  Conditions.clear();
+  LoopConds.clear();
+
+  PrevNode = nullptr;
+  Visited.clear();
+
+  while (!Order.empty()) {
+    handleLoops(EntryDominatesExit, nullptr);
+  }
+
+  if (PrevNode)
+    changeExit(PrevNode, Exit, EntryDominatesExit);
+  else
+    assert(EntryDominatesExit);
+}
+
+/// Handle a rare case where the disintegrated nodes instructions
+/// no longer dominate all their uses. Not sure if this is really nessasary
+void StructurizeCFG::rebuildSSA() {
+  SSAUpdater Updater;
+  for (auto *BB : ParentRegion->blocks())
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
+         II != IE; ++II) {
+
+      bool Initialized = false;
+      for (auto I = II->use_begin(), E = II->use_end(); I != E;) {
+        Use &U = *I++;
+        Instruction *User = cast<Instruction>(U.getUser());
+        if (User->getParent() == BB) {
+          continue;
+
+        } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+          if (UserPN->getIncomingBlock(U) == BB)
+            continue;
+        }
+
+        if (DT->dominates(&*II, User))
+          continue;
+
+        if (!Initialized) {
+          Value *Undef = UndefValue::get(II->getType());
+          Updater.Initialize(II->getType(), "");
+          Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+          Updater.AddAvailableValue(BB, &*II);
+          Initialized = true;
+        }
+        Updater.RewriteUseAfterInsertions(U);
+      }
+    }
+}
+
+/// \brief Run the transformation for each region found
+bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
+  if (R->isTopLevelRegion())
+    return false;
+
+  Func = R->getEntry()->getParent();
+  ParentRegion = R;
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+  orderNodes();
+  collectInfos();
+  createFlow();
+  insertConditions(false);
+  insertConditions(true);
+  setPhiValues();
+  rebuildSSA();
+
+  // Cleanup
+  Order.clear();
+  Visited.clear();
+  DeletedPhis.clear();
+  AddedPhis.clear();
+  Predicates.clear();
+  Conditions.clear();
+  Loops.clear();
+  LoopPreds.clear();
+  LoopConds.clear();
+
+  return true;
+}
+
+/// \brief Create the pass
+Pass *llvm::createStructurizeCFGPass() {
+  return new StructurizeCFG();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
new file mode 100644
index 0000000..4e84d72
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -0,0 +1,851 @@
+//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file transforms calls of the current function (self recursion) followed
+// by a return instruction with a branch to the entry of the function, creating
+// a loop.  This pass also implements the following extensions to the basic
+// algorithm:
+//
+//  1. Trivial instructions between the call and return do not prevent the
+//     transformation from taking place, though currently the analysis cannot
+//     support moving any really useful instructions (only dead ones).
+//  2. This pass transforms functions that are prevented from being tail
+//     recursive by an associative and commutative expression to use an
+//     accumulator variable, thus compiling the typical naive factorial or
+//     'fib' implementation into efficient code.
+//  3. TRE is performed if the function returns void, if the return
+//     returns the result returned by the call, or if the function returns a
+//     run-time constant on all exits from the function.  It is possible, though
+//     unlikely, that the return returns something else (like constant 0), and
+//     can still be TRE'd.  It can be TRE'd if ALL OTHER return instructions in
+//     the function return the exact same value.
+//  4. If it can prove that callees do not access their caller stack frame,
+//     they are marked as eligible for tail call elimination (by the code
+//     generator).
+//
+// There are several improvements that could be made:
+//
+//  1. If the function has any alloca instructions, these instructions will be
+//     moved out of the entry block of the function, causing them to be
+//     evaluated each time through the tail recursion.  Safely keeping allocas
+//     in the entry block requires analysis to proves that the tail-called
+//     function does not read or write the stack object.
+//  2. Tail recursion is only performed if the call immediately precedes the
+//     return instruction.  It's possible that there could be a jump between
+//     the call and the return.
+//  3. There can be intervening operations between the call and the return that
+//     prevent the TRE from occurring.  For example, there could be GEP's and
+//     stores to memory that will not be read or written by the call.  This
+//     requires some substantial analysis (such as with DSA) to prove safe to
+//     move ahead of the call, but doing so could allow many more TREs to be
+//     performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
+//  4. The algorithm we use to detect if callees access their caller stack
+//     frames is very primitive.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "tailcallelim"
+
+STATISTIC(NumEliminated, "Number of tail calls removed");
+STATISTIC(NumRetDuped,   "Number of return duplicated");
+STATISTIC(NumAccumAdded, "Number of accumulators introduced");
+
+namespace {
+  struct TailCallElim : public FunctionPass {
+    const TargetTransformInfo *TTI;
+
+    static char ID; // Pass identification, replacement for typeid
+    TailCallElim() : FunctionPass(ID) {
+      initializeTailCallElimPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+    bool runOnFunction(Function &F) override;
+
+  private:
+    bool runTRE(Function &F);
+    bool markTails(Function &F, bool &AllCallsAreTailCalls);
+
+    CallInst *FindTRECandidate(Instruction *I,
+                               bool CannotTailCallElimCallsMarkedTail);
+    bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
+                                    BasicBlock *&OldEntry,
+                                    bool &TailCallsAreMarkedTail,
+                                    SmallVectorImpl<PHINode *> &ArgumentPHIs,
+                                    bool CannotTailCallElimCallsMarkedTail);
+    bool FoldReturnAndProcessPred(BasicBlock *BB,
+                                  ReturnInst *Ret, BasicBlock *&OldEntry,
+                                  bool &TailCallsAreMarkedTail,
+                                  SmallVectorImpl<PHINode *> &ArgumentPHIs,
+                                  bool CannotTailCallElimCallsMarkedTail);
+    bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry,
+                               bool &TailCallsAreMarkedTail,
+                               SmallVectorImpl<PHINode *> &ArgumentPHIs,
+                               bool CannotTailCallElimCallsMarkedTail);
+    bool CanMoveAboveCall(Instruction *I, CallInst *CI);
+    Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI);
+  };
+}
+
+char TailCallElim::ID = 0;
+INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim",
+                      "Tail Call Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(TailCallElim, "tailcallelim",
+                    "Tail Call Elimination", false, false)
+
+// Public interface to the TailCallElimination pass
+FunctionPass *llvm::createTailCallEliminationPass() {
+  return new TailCallElim();
+}
+
+void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetTransformInfoWrapperPass>();
+  AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+/// \brief Scan the specified function for alloca instructions.
+/// If it contains any dynamic allocas, returns false.
+static bool CanTRE(Function &F) {
+  // Because of PR962, we don't TRE dynamic allocas.
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+        if (!AI->isStaticAlloca())
+          return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+bool TailCallElim::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
+    return false;
+
+  bool AllCallsAreTailCalls = false;
+  bool Modified = markTails(F, AllCallsAreTailCalls);
+  if (AllCallsAreTailCalls)
+    Modified |= runTRE(F);
+  return Modified;
+}
+
+namespace {
+struct AllocaDerivedValueTracker {
+  // Start at a root value and walk its use-def chain to mark calls that use the
+  // value or a derived value in AllocaUsers, and places where it may escape in
+  // EscapePoints.
+  void walk(Value *Root) {
+    SmallVector<Use *, 32> Worklist;
+    SmallPtrSet<Use *, 32> Visited;
+
+    auto AddUsesToWorklist = [&](Value *V) {
+      for (auto &U : V->uses()) {
+        if (!Visited.insert(&U).second)
+          continue;
+        Worklist.push_back(&U);
+      }
+    };
+
+    AddUsesToWorklist(Root);
+
+    while (!Worklist.empty()) {
+      Use *U = Worklist.pop_back_val();
+      Instruction *I = cast<Instruction>(U->getUser());
+
+      switch (I->getOpcode()) {
+      case Instruction::Call:
+      case Instruction::Invoke: {
+        CallSite CS(I);
+        bool IsNocapture =
+            CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U));
+        callUsesLocalStack(CS, IsNocapture);
+        if (IsNocapture) {
+          // If the alloca-derived argument is passed in as nocapture, then it
+          // can't propagate to the call's return. That would be capturing.
+          continue;
+        }
+        break;
+      }
+      case Instruction::Load: {
+        // The result of a load is not alloca-derived (unless an alloca has
+        // otherwise escaped, but this is a local analysis).
+        continue;
+      }
+      case Instruction::Store: {
+        if (U->getOperandNo() == 0)
+          EscapePoints.insert(I);
+        continue;  // Stores have no users to analyze.
+      }
+      case Instruction::BitCast:
+      case Instruction::GetElementPtr:
+      case Instruction::PHI:
+      case Instruction::Select:
+      case Instruction::AddrSpaceCast:
+        break;
+      default:
+        EscapePoints.insert(I);
+        break;
+      }
+
+      AddUsesToWorklist(I);
+    }
+  }
+
+  void callUsesLocalStack(CallSite CS, bool IsNocapture) {
+    // Add it to the list of alloca users.
+    AllocaUsers.insert(CS.getInstruction());
+
+    // If it's nocapture then it can't capture this alloca.
+    if (IsNocapture)
+      return;
+
+    // If it can write to memory, it can leak the alloca value.
+    if (!CS.onlyReadsMemory())
+      EscapePoints.insert(CS.getInstruction());
+  }
+
+  SmallPtrSet<Instruction *, 32> AllocaUsers;
+  SmallPtrSet<Instruction *, 32> EscapePoints;
+};
+}
+
+bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) {
+  if (F.callsFunctionThatReturnsTwice())
+    return false;
+  AllCallsAreTailCalls = true;
+
+  // The local stack holds all alloca instructions and all byval arguments.
+  AllocaDerivedValueTracker Tracker;
+  for (Argument &Arg : F.args()) {
+    if (Arg.hasByValAttr())
+      Tracker.walk(&Arg);
+  }
+  for (auto &BB : F) {
+    for (auto &I : BB)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+        Tracker.walk(AI);
+  }
+
+  bool Modified = false;
+
+  // Track whether a block is reachable after an alloca has escaped. Blocks that
+  // contain the escaping instruction will be marked as being visited without an
+  // escaped alloca, since that is how the block began.
+  enum VisitType {
+    UNVISITED,
+    UNESCAPED,
+    ESCAPED
+  };
+  DenseMap<BasicBlock *, VisitType> Visited;
+
+  // We propagate the fact that an alloca has escaped from block to successor.
+  // Visit the blocks that are propagating the escapedness first. To do this, we
+  // maintain two worklists.
+  SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
+
+  // We may enter a block and visit it thinking that no alloca has escaped yet,
+  // then see an escape point and go back around a loop edge and come back to
+  // the same block twice. Because of this, we defer setting tail on calls when
+  // we first encounter them in a block. Every entry in this list does not
+  // statically use an alloca via use-def chain analysis, but may find an alloca
+  // through other means if the block turns out to be reachable after an escape
+  // point.
+  SmallVector<CallInst *, 32> DeferredTails;
+
+  BasicBlock *BB = &F.getEntryBlock();
+  VisitType Escaped = UNESCAPED;
+  do {
+    for (auto &I : *BB) {
+      if (Tracker.EscapePoints.count(&I))
+        Escaped = ESCAPED;
+
+      CallInst *CI = dyn_cast<CallInst>(&I);
+      if (!CI || CI->isTailCall())
+        continue;
+
+      bool IsNoTail = CI->isNoTailCall();
+
+      if (!IsNoTail && CI->doesNotAccessMemory()) {
+        // A call to a readnone function whose arguments are all things computed
+        // outside this function can be marked tail. Even if you stored the
+        // alloca address into a global, a readnone function can't load the
+        // global anyhow.
+        //
+        // Note that this runs whether we know an alloca has escaped or not. If
+        // it has, then we can't trust Tracker.AllocaUsers to be accurate.
+        bool SafeToTail = true;
+        for (auto &Arg : CI->arg_operands()) {
+          if (isa<Constant>(Arg.getUser()))
+            continue;
+          if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
+            if (!A->hasByValAttr())
+              continue;
+          SafeToTail = false;
+          break;
+        }
+        if (SafeToTail) {
+          emitOptimizationRemark(
+              F.getContext(), "tailcallelim", F, CI->getDebugLoc(),
+              "marked this readnone call a tail call candidate");
+          CI->setTailCall();
+          Modified = true;
+          continue;
+        }
+      }
+
+      if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
+        DeferredTails.push_back(CI);
+      } else {
+        AllCallsAreTailCalls = false;
+      }
+    }
+
+    for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) {
+      auto &State = Visited[SuccBB];
+      if (State < Escaped) {
+        State = Escaped;
+        if (State == ESCAPED)
+          WorklistEscaped.push_back(SuccBB);
+        else
+          WorklistUnescaped.push_back(SuccBB);
+      }
+    }
+
+    if (!WorklistEscaped.empty()) {
+      BB = WorklistEscaped.pop_back_val();
+      Escaped = ESCAPED;
+    } else {
+      BB = nullptr;
+      while (!WorklistUnescaped.empty()) {
+        auto *NextBB = WorklistUnescaped.pop_back_val();
+        if (Visited[NextBB] == UNESCAPED) {
+          BB = NextBB;
+          Escaped = UNESCAPED;
+          break;
+        }
+      }
+    }
+  } while (BB);
+
+  for (CallInst *CI : DeferredTails) {
+    if (Visited[CI->getParent()] != ESCAPED) {
+      // If the escape point was part way through the block, calls after the
+      // escape point wouldn't have been put into DeferredTails.
+      emitOptimizationRemark(F.getContext(), "tailcallelim", F,
+                             CI->getDebugLoc(),
+                             "marked this call a tail call candidate");
+      CI->setTailCall();
+      Modified = true;
+    } else {
+      AllCallsAreTailCalls = false;
+    }
+  }
+
+  return Modified;
+}
+
+bool TailCallElim::runTRE(Function &F) {
+  // If this function is a varargs function, we won't be able to PHI the args
+  // right, so don't even try to convert it...
+  if (F.getFunctionType()->isVarArg()) return false;
+
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  BasicBlock *OldEntry = nullptr;
+  bool TailCallsAreMarkedTail = false;
+  SmallVector<PHINode*, 8> ArgumentPHIs;
+  bool MadeChange = false;
+
+  // If false, we cannot perform TRE on tail calls marked with the 'tail'
+  // attribute, because doing so would cause the stack size to increase (real
+  // TRE would deallocate variable sized allocas, TRE doesn't).
+  bool CanTRETailMarkedCall = CanTRE(F);
+
+  // Change any tail recursive calls to loops.
+  //
+  // FIXME: The code generator produces really bad code when an 'escaping
+  // alloca' is changed from being a static alloca to being a dynamic alloca.
+  // Until this is resolved, disable this transformation if that would ever
+  // happen.  This bug is PR962.
+  for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
+    BasicBlock *BB = &*BBI++; // FoldReturnAndProcessPred may delete BB.
+    if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
+      bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+                                          ArgumentPHIs, !CanTRETailMarkedCall);
+      if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
+        Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
+                                          TailCallsAreMarkedTail, ArgumentPHIs,
+                                          !CanTRETailMarkedCall);
+      MadeChange |= Change;
+    }
+  }
+
+  // If we eliminated any tail recursions, it's possible that we inserted some
+  // silly PHI nodes which just merge an initial value (the incoming operand)
+  // with themselves.  Check to see if we did and clean up our mess if so.  This
+  // occurs when a function passes an argument straight through to its tail
+  // call.
+  for (PHINode *PN : ArgumentPHIs) {
+    // If the PHI Node is a dynamic constant, replace it with the value it is.
+    if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+      PN->replaceAllUsesWith(PNV);
+      PN->eraseFromParent();
+    }
+  }
+
+  return MadeChange;
+}
+
+
+/// Return true if it is safe to move the specified
+/// instruction from after the call to before the call, assuming that all
+/// instructions between the call and this instruction are movable.
+///
+bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
+  // FIXME: We can move load/store/call/free instructions above the call if the
+  // call does not mod/ref the memory location being processed.
+  if (I->mayHaveSideEffects())  // This also handles volatile loads.
+    return false;
+
+  if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+    // Loads may always be moved above calls without side effects.
+    if (CI->mayHaveSideEffects()) {
+      // Non-volatile loads may be moved above a call with side effects if it
+      // does not write to memory and the load provably won't trap.
+      // FIXME: Writes to memory only matter if they may alias the pointer
+      // being loaded from.
+      if (CI->mayWriteToMemory() ||
+          !isSafeToLoadUnconditionally(L->getPointerOperand(), L,
+                                       L->getAlignment()))
+        return false;
+    }
+  }
+
+  // Otherwise, if this is a side-effect free instruction, check to make sure
+  // that it does not use the return value of the call.  If it doesn't use the
+  // return value of the call, it must only use things that are defined before
+  // the call, or movable instructions between the call and the instruction
+  // itself.
+  return std::find(I->op_begin(), I->op_end(), CI) == I->op_end();
+}
+
+/// Return true if the specified value is the same when the return would exit
+/// as it was when the initial iteration of the recursive function was executed.
+///
+/// We currently handle static constants and arguments that are not modified as
+/// part of the recursion.
+static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
+  if (isa<Constant>(V)) return true; // Static constants are always dyn consts
+
+  // Check to see if this is an immutable argument, if so, the value
+  // will be available to initialize the accumulator.
+  if (Argument *Arg = dyn_cast<Argument>(V)) {
+    // Figure out which argument number this is...
+    unsigned ArgNo = 0;
+    Function *F = CI->getParent()->getParent();
+    for (Function::arg_iterator AI = F->arg_begin(); &*AI != Arg; ++AI)
+      ++ArgNo;
+
+    // If we are passing this argument into call as the corresponding
+    // argument operand, then the argument is dynamically constant.
+    // Otherwise, we cannot transform this function safely.
+    if (CI->getArgOperand(ArgNo) == Arg)
+      return true;
+  }
+
+  // Switch cases are always constant integers. If the value is being switched
+  // on and the return is only reachable from one of its cases, it's
+  // effectively constant.
+  if (BasicBlock *UniquePred = RI->getParent()->getUniquePredecessor())
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(UniquePred->getTerminator()))
+      if (SI->getCondition() == V)
+        return SI->getDefaultDest() != RI->getParent();
+
+  // Not a constant or immutable argument, we can't safely transform.
+  return false;
+}
+
+/// Check to see if the function containing the specified tail call consistently
+/// returns the same runtime-constant value at all exit points except for
+/// IgnoreRI. If so, return the returned value.
+static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
+  Function *F = CI->getParent()->getParent();
+  Value *ReturnedValue = nullptr;
+
+  for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) {
+    ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator());
+    if (RI == nullptr || RI == IgnoreRI) continue;
+
+    // We can only perform this transformation if the value returned is
+    // evaluatable at the start of the initial invocation of the function,
+    // instead of at the end of the evaluation.
+    //
+    Value *RetOp = RI->getOperand(0);
+    if (!isDynamicConstant(RetOp, CI, RI))
+      return nullptr;
+
+    if (ReturnedValue && RetOp != ReturnedValue)
+      return nullptr;     // Cannot transform if differing values are returned.
+    ReturnedValue = RetOp;
+  }
+  return ReturnedValue;
+}
+
+/// If the specified instruction can be transformed using accumulator recursion
+/// elimination, return the constant which is the start of the accumulator
+/// value.  Otherwise return null.
+Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
+                                                      CallInst *CI) {
+  if (!I->isAssociative() || !I->isCommutative()) return nullptr;
+  assert(I->getNumOperands() == 2 &&
+         "Associative/commutative operations should have 2 args!");
+
+  // Exactly one operand should be the result of the call instruction.
+  if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
+      (I->getOperand(0) != CI && I->getOperand(1) != CI))
+    return nullptr;
+
+  // The only user of this instruction we allow is a single return instruction.
+  if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
+    return nullptr;
+
+  // Ok, now we have to check all of the other return instructions in this
+  // function.  If they return non-constants or differing values, then we cannot
+  // transform the function safely.
+  return getCommonReturnValue(cast<ReturnInst>(I->user_back()), CI);
+}
+
+static Instruction *FirstNonDbg(BasicBlock::iterator I) {
+  while (isa<DbgInfoIntrinsic>(I))
+    ++I;
+  return &*I;
+}
+
+CallInst*
+TailCallElim::FindTRECandidate(Instruction *TI,
+                               bool CannotTailCallElimCallsMarkedTail) {
+  BasicBlock *BB = TI->getParent();
+  Function *F = BB->getParent();
+
+  if (&BB->front() == TI) // Make sure there is something before the terminator.
+    return nullptr;
+
+  // Scan backwards from the return, checking to see if there is a tail call in
+  // this block.  If so, set CI to it.
+  CallInst *CI = nullptr;
+  BasicBlock::iterator BBI(TI);
+  while (true) {
+    CI = dyn_cast<CallInst>(BBI);
+    if (CI && CI->getCalledFunction() == F)
+      break;
+
+    if (BBI == BB->begin())
+      return nullptr;          // Didn't find a potential tail call.
+    --BBI;
+  }
+
+  // If this call is marked as a tail call, and if there are dynamic allocas in
+  // the function, we cannot perform this optimization.
+  if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+    return nullptr;
+
+  // As a special case, detect code like this:
+  //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
+  // and disable this xform in this case, because the code generator will
+  // lower the call to fabs into inline code.
+  if (BB == &F->getEntryBlock() &&
+      FirstNonDbg(BB->front().getIterator()) == CI &&
+      FirstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
+      !TTI->isLoweredToCall(CI->getCalledFunction())) {
+    // A single-block function with just a call and a return. Check that
+    // the arguments match.
+    CallSite::arg_iterator I = CallSite(CI).arg_begin(),
+                           E = CallSite(CI).arg_end();
+    Function::arg_iterator FI = F->arg_begin(),
+                           FE = F->arg_end();
+    for (; I != E && FI != FE; ++I, ++FI)
+      if (*I != &*FI) break;
+    if (I == E && FI == FE)
+      return nullptr;
+  }
+
+  return CI;
+}
+
+bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
+                                       BasicBlock *&OldEntry,
+                                       bool &TailCallsAreMarkedTail,
+                                       SmallVectorImpl<PHINode *> &ArgumentPHIs,
+                                       bool CannotTailCallElimCallsMarkedTail) {
+  // If we are introducing accumulator recursion to eliminate operations after
+  // the call instruction that are both associative and commutative, the initial
+  // value for the accumulator is placed in this variable.  If this value is set
+  // then we actually perform accumulator recursion elimination instead of
+  // simple tail recursion elimination.  If the operation is an LLVM instruction
+  // (eg: "add") then it is recorded in AccumulatorRecursionInstr.  If not, then
+  // we are handling the case when the return instruction returns a constant C
+  // which is different to the constant returned by other return instructions
+  // (which is recorded in AccumulatorRecursionEliminationInitVal).  This is a
+  // special case of accumulator recursion, the operation being "return C".
+  Value *AccumulatorRecursionEliminationInitVal = nullptr;
+  Instruction *AccumulatorRecursionInstr = nullptr;
+
+  // Ok, we found a potential tail call.  We can currently only transform the
+  // tail call if all of the instructions between the call and the return are
+  // movable to above the call itself, leaving the call next to the return.
+  // Check that this is the case now.
+  BasicBlock::iterator BBI(CI);
+  for (++BBI; &*BBI != Ret; ++BBI) {
+    if (CanMoveAboveCall(&*BBI, CI)) continue;
+
+    // If we can't move the instruction above the call, it might be because it
+    // is an associative and commutative operation that could be transformed
+    // using accumulator recursion elimination.  Check to see if this is the
+    // case, and if so, remember the initial accumulator value for later.
+    if ((AccumulatorRecursionEliminationInitVal =
+             CanTransformAccumulatorRecursion(&*BBI, CI))) {
+      // Yes, this is accumulator recursion.  Remember which instruction
+      // accumulates.
+      AccumulatorRecursionInstr = &*BBI;
+    } else {
+      return false;   // Otherwise, we cannot eliminate the tail recursion!
+    }
+  }
+
+  // We can only transform call/return pairs that either ignore the return value
+  // of the call and return void, ignore the value of the call and return a
+  // constant, return the value returned by the tail call, or that are being
+  // accumulator recursion variable eliminated.
+  if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
+      !isa<UndefValue>(Ret->getReturnValue()) &&
+      AccumulatorRecursionEliminationInitVal == nullptr &&
+      !getCommonReturnValue(nullptr, CI)) {
+    // One case remains that we are able to handle: the current return
+    // instruction returns a constant, and all other return instructions
+    // return a different constant.
+    if (!isDynamicConstant(Ret->getReturnValue(), CI, Ret))
+      return false; // Current return instruction does not return a constant.
+    // Check that all other return instructions return a common constant.  If
+    // so, record it in AccumulatorRecursionEliminationInitVal.
+    AccumulatorRecursionEliminationInitVal = getCommonReturnValue(Ret, CI);
+    if (!AccumulatorRecursionEliminationInitVal)
+      return false;
+  }
+
+  BasicBlock *BB = Ret->getParent();
+  Function *F = BB->getParent();
+
+  emitOptimizationRemark(F->getContext(), "tailcallelim", *F, CI->getDebugLoc(),
+                         "transforming tail recursion to loop");
+
+  // OK! We can transform this tail call.  If this is the first one found,
+  // create the new entry block, allowing us to branch back to the old entry.
+  if (!OldEntry) {
+    OldEntry = &F->getEntryBlock();
+    BasicBlock *NewEntry = BasicBlock::Create(F->getContext(), "", F, OldEntry);
+    NewEntry->takeName(OldEntry);
+    OldEntry->setName("tailrecurse");
+    BranchInst::Create(OldEntry, NewEntry);
+
+    // If this tail call is marked 'tail' and if there are any allocas in the
+    // entry block, move them up to the new entry block.
+    TailCallsAreMarkedTail = CI->isTailCall();
+    if (TailCallsAreMarkedTail)
+      // Move all fixed sized allocas from OldEntry to NewEntry.
+      for (BasicBlock::iterator OEBI = OldEntry->begin(), E = OldEntry->end(),
+             NEBI = NewEntry->begin(); OEBI != E; )
+        if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+          if (isa<ConstantInt>(AI->getArraySize()))
+            AI->moveBefore(&*NEBI);
+
+    // Now that we have created a new block, which jumps to the entry
+    // block, insert a PHI node for each argument of the function.
+    // For now, we initialize each PHI to only have the real arguments
+    // which are passed in.
+    Instruction *InsertPos = &OldEntry->front();
+    for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+         I != E; ++I) {
+      PHINode *PN = PHINode::Create(I->getType(), 2,
+                                    I->getName() + ".tr", InsertPos);
+      I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+      PN->addIncoming(&*I, NewEntry);
+      ArgumentPHIs.push_back(PN);
+    }
+  }
+
+  // If this function has self recursive calls in the tail position where some
+  // are marked tail and some are not, only transform one flavor or another.  We
+  // have to choose whether we move allocas in the entry block to the new entry
+  // block or not, so we can't make a good choice for both.  NOTE: We could do
+  // slightly better here in the case that the function has no entry block
+  // allocas.
+  if (TailCallsAreMarkedTail && !CI->isTailCall())
+    return false;
+
+  // Ok, now that we know we have a pseudo-entry block WITH all of the
+  // required PHI nodes, add entries into the PHI node for the actual
+  // parameters passed into the tail-recursive call.
+  for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+    ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
+
+  // If we are introducing an accumulator variable to eliminate the recursion,
+  // do so now.  Note that we _know_ that no subsequent tail recursion
+  // eliminations will happen on this function because of the way the
+  // accumulator recursion predicate is set up.
+  //
+  if (AccumulatorRecursionEliminationInitVal) {
+    Instruction *AccRecInstr = AccumulatorRecursionInstr;
+    // Start by inserting a new PHI node for the accumulator.
+    pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry);
+    PHINode *AccPN = PHINode::Create(
+        AccumulatorRecursionEliminationInitVal->getType(),
+        std::distance(PB, PE) + 1, "accumulator.tr", &OldEntry->front());
+
+    // Loop over all of the predecessors of the tail recursion block.  For the
+    // real entry into the function we seed the PHI with the initial value,
+    // computed earlier.  For any other existing branches to this block (due to
+    // other tail recursions eliminated) the accumulator is not modified.
+    // Because we haven't added the branch in the current block to OldEntry yet,
+    // it will not show up as a predecessor.
+    for (pred_iterator PI = PB; PI != PE; ++PI) {
+      BasicBlock *P = *PI;
+      if (P == &F->getEntryBlock())
+        AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, P);
+      else
+        AccPN->addIncoming(AccPN, P);
+    }
+
+    if (AccRecInstr) {
+      // Add an incoming argument for the current block, which is computed by
+      // our associative and commutative accumulator instruction.
+      AccPN->addIncoming(AccRecInstr, BB);
+
+      // Next, rewrite the accumulator recursion instruction so that it does not
+      // use the result of the call anymore, instead, use the PHI node we just
+      // inserted.
+      AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+    } else {
+      // Add an incoming argument for the current block, which is just the
+      // constant returned by the current return instruction.
+      AccPN->addIncoming(Ret->getReturnValue(), BB);
+    }
+
+    // Finally, rewrite any return instructions in the program to return the PHI
+    // node instead of the "initval" that they do currently.  This loop will
+    // actually rewrite the return value we are destroying, but that's ok.
+    for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI)
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()))
+        RI->setOperand(0, AccPN);
+    ++NumAccumAdded;
+  }
+
+  // Now that all of the PHI nodes are in place, remove the call and
+  // ret instructions, replacing them with an unconditional branch.
+  BranchInst *NewBI = BranchInst::Create(OldEntry, Ret);
+  NewBI->setDebugLoc(CI->getDebugLoc());
+
+  BB->getInstList().erase(Ret);  // Remove return.
+  BB->getInstList().erase(CI);   // Remove call.
+  ++NumEliminated;
+  return true;
+}
+
+bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
+                                       ReturnInst *Ret, BasicBlock *&OldEntry,
+                                       bool &TailCallsAreMarkedTail,
+                                       SmallVectorImpl<PHINode *> &ArgumentPHIs,
+                                       bool CannotTailCallElimCallsMarkedTail) {
+  bool Change = false;
+
+  // If the return block contains nothing but the return and PHI's,
+  // there might be an opportunity to duplicate the return in its
+  // predecessors and perform TRC there. Look for predecessors that end
+  // in unconditional branch and recursive call(s).
+  SmallVector<BranchInst*, 8> UncondBranchPreds;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    BasicBlock *Pred = *PI;
+    TerminatorInst *PTI = Pred->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(PTI))
+      if (BI->isUnconditional())
+        UncondBranchPreds.push_back(BI);
+  }
+
+  while (!UncondBranchPreds.empty()) {
+    BranchInst *BI = UncondBranchPreds.pop_back_val();
+    BasicBlock *Pred = BI->getParent();
+    if (CallInst *CI = FindTRECandidate(BI, CannotTailCallElimCallsMarkedTail)){
+      DEBUG(dbgs() << "FOLDING: " << *BB
+            << "INTO UNCOND BRANCH PRED: " << *Pred);
+      ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
+
+      // Cleanup: if all predecessors of BB have been eliminated by
+      // FoldReturnIntoUncondBranch, delete it.  It is important to empty it,
+      // because the ret instruction in there is still using a value which
+      // EliminateRecursiveTailCall will attempt to remove.
+      if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
+        BB->eraseFromParent();
+
+      EliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
+                                 ArgumentPHIs,
+                                 CannotTailCallElimCallsMarkedTail);
+      ++NumRetDuped;
+      Change = true;
+    }
+  }
+
+  return Change;
+}
+
+bool
+TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
+                                    bool &TailCallsAreMarkedTail,
+                                    SmallVectorImpl<PHINode *> &ArgumentPHIs,
+                                    bool CannotTailCallElimCallsMarkedTail) {
+  CallInst *CI = FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail);
+  if (!CI)
+    return false;
+
+  return EliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
+                                    ArgumentPHIs,
+                                    CannotTailCallElimCallsMarkedTail);
+}