Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
27 files changed, 4989 insertions, 2583 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index a097308..a3eb07a9 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -83,7 +83,7 @@ bool ADCE::runOnFunction(Function& F) { I->dropAllReferences(); } - for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(), + for (SmallVectorImpl<Instruction *>::iterator I = worklist.begin(), E = worklist.end(); I != E; ++I) { ++NumRemoved; (*I)->eraseFromParent(); diff --git a/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp deleted file mode 100644 index e755008..0000000 --- a/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp +++ /dev/null @@ -1,152 +0,0 @@ -//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a very simple profile guided basic block placement -// algorithm. The idea is to put frequently executed blocks together at the -// start of the function, and hopefully increase the number of fall-through -// conditional branches. If there is no profile information for a particular -// function, this pass basically orders blocks in depth-first order -// -// The algorithm implemented here is basically "Algo1" from "Profile Guided Code -// Positioning" by Pettis and Hansen, except that it uses basic block counts -// instead of edge counts. This should be improved in many ways, but is very -// simple for now. -// -// Basically we "place" the entry block, then loop over all successors in a DFO, -// placing the most frequently executed successor until we run out of blocks. I -// told you this was _extremely_ simplistic. :) This is also much slower than it -// could be. When it becomes important, this pass will be rewritten to use a -// better algorithm, and then we can worry about efficiency. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "block-placement" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/ProfileInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include <set> -using namespace llvm; - -STATISTIC(NumMoved, "Number of basic blocks moved"); - -namespace { - struct BlockPlacement : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - BlockPlacement() : FunctionPass(ID) { - initializeBlockPlacementPass(*PassRegistry::getPassRegistry()); - } - - virtual bool runOnFunction(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequired<ProfileInfo>(); - //AU.addPreserved<ProfileInfo>(); // Does this work? - } - private: - /// PI - The profile information that is guiding us. - /// - ProfileInfo *PI; - - /// NumMovedBlocks - Every time we move a block, increment this counter. - /// - unsigned NumMovedBlocks; - - /// PlacedBlocks - Every time we place a block, remember it so we don't get - /// into infinite loops. - std::set<BasicBlock*> PlacedBlocks; - - /// InsertPos - This an iterator to the next place we want to insert a - /// block. 
- Function::iterator InsertPos; - - /// PlaceBlocks - Recursively place the specified blocks and any unplaced - /// successors. - void PlaceBlocks(BasicBlock *BB); - }; -} - -char BlockPlacement::ID = 0; -INITIALIZE_PASS_BEGIN(BlockPlacement, "block-placement", - "Profile Guided Basic Block Placement", false, false) -INITIALIZE_AG_DEPENDENCY(ProfileInfo) -INITIALIZE_PASS_END(BlockPlacement, "block-placement", - "Profile Guided Basic Block Placement", false, false) - -FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); } - -bool BlockPlacement::runOnFunction(Function &F) { - PI = &getAnalysis<ProfileInfo>(); - - NumMovedBlocks = 0; - InsertPos = F.begin(); - - // Recursively place all blocks. - PlaceBlocks(F.begin()); - - PlacedBlocks.clear(); - NumMoved += NumMovedBlocks; - return NumMovedBlocks != 0; -} - - -/// PlaceBlocks - Recursively place the specified blocks and any unplaced -/// successors. -void BlockPlacement::PlaceBlocks(BasicBlock *BB) { - assert(!PlacedBlocks.count(BB) && "Already placed this block!"); - PlacedBlocks.insert(BB); - - // Place the specified block. - if (&*InsertPos != BB) { - // Use splice to move the block into the right place. This avoids having to - // remove the block from the function then readd it, which causes a bunch of - // symbol table traffic that is entirely pointless. - Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList(); - Blocks.splice(InsertPos, Blocks, BB); - - ++NumMovedBlocks; - } else { - // This block is already in the right place, we don't have to do anything. - ++InsertPos; - } - - // Keep placing successors until we run out of ones to place. Note that this - // loop is very inefficient (N^2) for blocks with many successors, like switch - // statements. FIXME! - while (1) { - // Okay, now place any unplaced successors. - succ_iterator SI = succ_begin(BB), E = succ_end(BB); - - // Scan for the first unplaced successor. - for (; SI != E && PlacedBlocks.count(*SI); ++SI) - /*empty*/; - if (SI == E) return; // No more successors to place. - - double MaxExecutionCount = PI->getExecutionCount(*SI); - BasicBlock *MaxSuccessor = *SI; - - // Scan for more frequently executed successors - for (; SI != E; ++SI) - if (!PlacedBlocks.count(*SI)) { - double Count = PI->getExecutionCount(*SI); - if (Count > MaxExecutionCount || - // Prefer to not disturb the code. - (Count == MaxExecutionCount && *SI == &*InsertPos)) { - MaxExecutionCount = Count; - MaxSuccessor = *SI; - } - } - - // Now that we picked the maximally executed successor, place it. - PlaceBlocks(MaxSuccessor); - } -} diff --git a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp index f0d29c8..007e9b7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -22,7 +22,6 @@ #include "llvm/Analysis/DominatorInternals.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ProfileInfo.h" #include "llvm/Assembly/Writer.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -76,10 +75,10 @@ namespace { class CodeGenPrepare : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining /// transformation profitability. 
+ const TargetMachine *TM; const TargetLowering *TLI; const TargetLibraryInfo *TLInfo; DominatorTree *DT; - ProfileInfo *PFI; /// CurInstIterator - As we scan instructions optimizing them, this is the /// next instruction to optimize. Xforms that can invalidate this should @@ -100,8 +99,8 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - explicit CodeGenPrepare(const TargetLowering *tli = 0) - : FunctionPass(ID), TLI(tli) { + explicit CodeGenPrepare(const TargetMachine *TM = 0) + : FunctionPass(ID), TM(TM), TLI(0) { initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F); @@ -110,7 +109,6 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); - AU.addPreserved<ProfileInfo>(); AU.addRequired<TargetLibraryInfo>(); } @@ -139,17 +137,17 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare", "Optimize for code generation", false, false) -FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) { - return new CodeGenPrepare(TLI); +FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) { + return new CodeGenPrepare(TM); } bool CodeGenPrepare::runOnFunction(Function &F) { bool EverMadeChange = false; ModifiedDT = false; + if (TM) TLI = TM->getTargetLowering(); TLInfo = &getAnalysis<TargetLibraryInfo>(); DT = getAnalysisIfAvailable<DominatorTree>(); - PFI = getAnalysisIfAvailable<ProfileInfo>(); OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); @@ -205,7 +203,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); DeleteDeadBlock(BB); - + for (SmallVectorImpl<BasicBlock*>::iterator II = Successors.begin(), IE = Successors.end(); II != IE; ++II) if (pred_begin(*II) == pred_end(*II)) @@ -440,10 +438,6 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { DT->changeImmediateDominator(DestBB, NewIDom); DT->eraseNode(BB); } - if (PFI) { - PFI->replaceAllUses(BB, DestBB); - PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB)); - } BB->eraseFromParent(); ++NumBlocksElim; @@ -830,7 +824,7 @@ struct ExtAddrMode : public TargetLowering::AddrMode { ExtAddrMode() : BaseReg(0), ScaledReg(0) {} void print(raw_ostream &OS) const; void dump() const; - + bool operator==(const ExtAddrMode& O) const { return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) && (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) && @@ -838,10 +832,12 @@ struct ExtAddrMode : public TargetLowering::AddrMode { } }; +#ifndef NDEBUG static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) { AM.print(OS); return OS; } +#endif void ExtAddrMode::print(raw_ostream &OS) const { bool NeedPlus = false; @@ -866,7 +862,6 @@ void ExtAddrMode::print(raw_ostream &OS) const { OS << (NeedPlus ? " + " : "") << Scale << "*"; WriteAsOperand(OS, ScaledReg, /*PrintType=*/false); - NeedPlus = true; } OS << ']'; @@ -891,16 +886,16 @@ class AddressingModeMatcher { /// the memory instruction that we're computing this address for. Type *AccessTy; Instruction *MemoryInst; - + /// AddrMode - This is the addressing mode that we're building up. This is /// part of the return value of this addressing mode matching stuff. ExtAddrMode &AddrMode; - + /// IgnoreProfitability - This is set to true when we should not do /// profitability checks. When true, IsProfitableToFoldIntoAddressingMode /// always returns true. 
bool IgnoreProfitability; - + AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI, const TargetLowering &T, Type *AT, Instruction *MI, ExtAddrMode &AM) @@ -908,7 +903,7 @@ class AddressingModeMatcher { IgnoreProfitability = false; } public: - + /// Match - Find the maximal addressing mode that a load/store of V can fold, /// give an access type of AccessTy. This returns a list of involved /// instructions in AddrModeInsts. @@ -918,7 +913,7 @@ public: const TargetLowering &TLI) { ExtAddrMode Result; - bool Success = + bool Success = AddressingModeMatcher(AddrModeInsts, TLI, AccessTy, MemoryInst, Result).MatchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -943,11 +938,11 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, // mode. Just process that directly. if (Scale == 1) return MatchAddr(ScaleReg, Depth); - + // If the scale is 0, it takes nothing to add this. if (Scale == 0) return true; - + // If we already have a scale of this value, we can add to it, otherwise, we // need an available scale field. if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) @@ -966,7 +961,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, // It was legal, so commit it. AddrMode = TestAddrMode; - + // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now // to see if ScaleReg is actually X+C. If so, we can turn this into adding // X*Scale + C*Scale to addr mode. @@ -975,7 +970,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { TestAddrMode.ScaledReg = AddLHS; TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; - + // If this addressing mode is legal, commit it and remember that we folded // this instruction. if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) { @@ -1026,7 +1021,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth) { // Avoid exponential behavior on extremely deep expression trees. if (Depth >= 5) return false; - + switch (Opcode) { case Instruction::PtrToInt: // PtrToInt is always a noop, as we know that the int type is pointer sized. @@ -1034,7 +1029,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, case Instruction::IntToPtr: // This inttoptr is a no-op if the integer type is pointer sized. if (TLI.getValueType(AddrInst->getOperand(0)->getType()) == - TLI.getPointerTy()) + TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace())) return MatchAddr(AddrInst->getOperand(0), Depth); return false; case Instruction::BitCast: @@ -1055,16 +1050,16 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, if (MatchAddr(AddrInst->getOperand(1), Depth+1) && MatchAddr(AddrInst->getOperand(0), Depth+1)) return true; - + // Restore the old addr mode info. AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); - + // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. if (MatchAddr(AddrInst->getOperand(0), Depth+1) && MatchAddr(AddrInst->getOperand(1), Depth+1)) return true; - + // Otherwise we definitely can't merge the ADD in. 
AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); @@ -1081,7 +1076,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, int64_t Scale = RHS->getSExtValue(); if (Opcode == Instruction::Shl) Scale = 1LL << Scale; - + return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth); } case Instruction::GetElementPtr: { @@ -1089,7 +1084,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, // one variable offset. int VariableOperand = -1; unsigned VariableScale = 0; - + int64_t ConstantOffset = 0; const DataLayout *TD = TLI.getDataLayout(); gep_type_iterator GTI = gep_type_begin(AddrInst); @@ -1107,14 +1102,14 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, // We only allow one variable index at the moment. if (VariableOperand != -1) return false; - + // Remember the variable index. VariableOperand = i; VariableScale = TypeSize; } } } - + // A common case is for the GEP to only do a constant offset. In this case, // just add it to the disp field and check validity. if (VariableOperand == -1) { @@ -1208,7 +1203,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { AddrModeInsts.push_back(I); return true; } - + // It isn't profitable to do this, roll back. //cerr << "NOT FOLDING: " << *I; AddrMode = BackupAddrMode; @@ -1254,7 +1249,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI)); for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - + // Compute the constraint code and ConstraintType to use. TLI.ComputeConstraintToUse(OpInfo, SDValue()); @@ -1279,7 +1274,7 @@ static bool FindAllMemoryUses(Instruction *I, // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I)) return false; - + // If this is an obviously unfoldable instruction, bail out. if (!MightBeFoldableInst(I)) return true; @@ -1293,24 +1288,24 @@ static bool FindAllMemoryUses(Instruction *I, MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo())); continue; } - + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { unsigned opNo = UI.getOperandNo(); if (opNo == 0) return true; // Storing addr, not into addr. MemoryUses.push_back(std::make_pair(SI, opNo)); continue; } - + if (CallInst *CI = dyn_cast<CallInst>(U)) { InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); if (!IA) return true; - + // If this is a memory operand, we're cool, otherwise bail out. if (!IsOperandAMemoryOperand(CI, IA, I, TLI)) return true; continue; } - + if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts, TLI)) return true; @@ -1328,17 +1323,17 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, // If Val is either of the known-live values, we know it is live! if (Val == 0 || Val == KnownLive1 || Val == KnownLive2) return true; - + // All values other than instructions and arguments (e.g. constants) are live. if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true; - + // If Val is a constant sized alloca in the entry block, it is live, this is // true because it is just a reference to the stack/frame pointer, which is // live for the whole function. if (AllocaInst *AI = dyn_cast<AllocaInst>(Val)) if (AI->isStaticAlloca()) return true; - + // Check to see if this value is already used in the memory instruction's // block. 
If so, it's already live into the block at the very least, so we // can reasonably fold it. @@ -1370,7 +1365,7 @@ bool AddressingModeMatcher:: IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter) { if (IgnoreProfitability) return true; - + // AMBefore is the addressing mode before this instruction was folded into it, // and AMAfter is the addressing mode after the instruction was folded. Get // the set of registers referenced by AMAfter and subtract out those @@ -1381,7 +1376,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // BaseReg and ScaleReg (global addresses are always available, as are any // folded immediates). Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; - + // If the BaseReg or ScaledReg was referenced by the previous addrmode, their // lifetime wasn't extended by adding this instruction. if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) @@ -1402,7 +1397,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, SmallPtrSet<Instruction*, 16> ConsideredInsts; if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI)) return false; // Has a non-memory, non-foldable use! - + // Now that we know that all uses of this instruction are part of a chain of // computation involving only operations that could theoretically be folded // into a memory use, loop over each of these uses and see if they could @@ -1411,15 +1406,14 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) { Instruction *User = MemoryUses[i].first; unsigned OpNo = MemoryUses[i].second; - + // Get the access type of this use. If the use isn't a pointer, we don't // know what it accesses. Value *Address = User->getOperand(OpNo); if (!Address->getType()->isPointerTy()) return false; - Type *AddressAccessTy = - cast<PointerType>(Address->getType())->getElementType(); - + Type *AddressAccessTy = Address->getType()->getPointerElementType(); + // Do a match against the root of this address, ignoring profitability. This // will tell us if the addressing mode for the memory operation will // *actually* cover the shared instruction. @@ -1434,10 +1428,10 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(), I) == MatchedAddrModeInsts.end()) return false; - + MatchedAddrModeInsts.clear(); } - + return true; } @@ -1572,9 +1566,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); - Type *IntPtrTy = - TLI->getDataLayout()->getIntPtrType(AccessTy->getContext()); - + Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType()); Value *Result = 0; // Start with the base register. Do this first so that subsequent address @@ -1893,7 +1885,8 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { // It is possible for very late stage optimizations (such as SimplifyCFG) // to introduce PHI nodes too late to be cleaned up. If we detect such a // trivial PHI, go ahead and zap it here. - if (Value *V = SimplifyInstruction(P)) { + if (Value *V = SimplifyInstruction(P, TLI ? 
TLI->getDataLayout() : 0, + TLInfo, DT)) { P->replaceAllUsesWith(V); P->eraseFromParent(); ++NumPHIsElim; diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 3c08634..5266894 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -72,11 +72,6 @@ namespace { } namespace llvm { -// SimpleValue is POD. -template<> struct isPodLike<SimpleValue> { - static const bool value = true; -}; - template<> struct DenseMapInfo<SimpleValue> { static inline SimpleValue getEmptyKey() { return DenseMapInfo<Instruction*>::getEmptyKey(); @@ -220,11 +215,6 @@ namespace { } namespace llvm { - // CallValue is POD. - template<> struct isPodLike<CallValue> { - static const bool value = true; - }; - template<> struct DenseMapInfo<CallValue> { static inline CallValue getEmptyKey() { return DenseMapInfo<Instruction*>::getEmptyKey(); diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp new file mode 100644 index 0000000..e7de07f --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -0,0 +1,79 @@ +//===- FlattenCFGPass.cpp - CFG Flatten Pass ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements flattening of CFG. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "flattencfg" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Pass.h" +#include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/Local.h" +using namespace llvm; + +namespace { +struct FlattenCFGPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid +public: + FlattenCFGPass() : FunctionPass(ID) { + initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + } + +private: + AliasAnalysis *AA; +}; +} + +char FlattenCFGPass::ID = 0; +INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, + false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, + false) + +// Public interface to the FlattenCFG pass +FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); } + +/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function, +/// iterating until no more changes are made. +static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { + bool Changed = false; + bool LocalChange = true; + while (LocalChange) { + LocalChange = false; + + // Loop over all of the basic blocks and remove them if they are unneeded... + // + for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { + if (FlattenCFG(BBIt++, AA)) { + LocalChange = true; + } + } + Changed |= LocalChange; + } + return Changed; +} + +bool FlattenCFGPass::runOnFunction(Function &F) { + AA = &getAnalysis<AliasAnalysis>(); + bool EverChanged = false; + // iterativelyFlattenCFG can make some blocks dead. 
+ while (iterativelyFlattenCFG(F, AA)) { + removeUnreachableBlocks(F); + EverChanged = true; + } + return EverChanged; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index f350b9b..6af269d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -21,8 +21,10 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -45,6 +47,7 @@ #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <vector> using namespace llvm; using namespace PatternMatch; @@ -505,7 +508,9 @@ namespace { enum ValType { SimpleVal, // A simple offsetted value that is accessed. LoadVal, // A value produced by a load. - MemIntrin // A memory intrinsic which is loaded from. + MemIntrin, // A memory intrinsic which is loaded from. + UndefVal // A UndefValue representing a value from dead block (which + // is not yet physically removed from the CFG). }; /// V - The value that is live out of the block. @@ -543,10 +548,20 @@ namespace { Res.Offset = Offset; return Res; } - + + static AvailableValueInBlock getUndef(BasicBlock *BB) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(0); + Res.Val.setInt(UndefVal); + Res.Offset = 0; + return Res; + } + bool isSimpleValue() const { return Val.getInt() == SimpleVal; } bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } + bool isUndefValue() const { return Val.getInt() == UndefVal; } Value *getSimpleValue() const { assert(isSimpleValue() && "Wrong accessor"); @@ -574,6 +589,7 @@ namespace { DominatorTree *DT; const DataLayout *TD; const TargetLibraryInfo *TLI; + SetVector<BasicBlock *> DeadBlocks; ValueTable VN; @@ -692,9 +708,13 @@ namespace { void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); + BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); unsigned replaceAllDominatedUsesWith(Value *From, Value *To, const BasicBlockEdge &Root); bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); + bool processFoldableCondBr(BranchInst *BI); + void addDeadBlock(BasicBlock *BB); + void assignValNumForDeadCode(); }; char GVN::ID = 0; @@ -1068,14 +1088,15 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, if (Offset == -1) return Offset; + unsigned AS = Src->getType()->getPointerAddressSpace(); // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. 
Src = ConstantExpr::getBitCast(Src, - llvm::Type::getInt8PtrTy(Src->getContext())); + Type::getInt8PtrTy(Src->getContext(), AS)); Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); - Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); + Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); if (ConstantFoldLoadFromConstPtr(Src, &TD)) return Offset; return -1; @@ -1152,7 +1173,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize*8); DestPTy = PointerType::get(DestPTy, - cast<PointerType>(PtrVal->getType())->getAddressSpace()); + PtrVal->getType()->getPointerAddressSpace()); Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc()); PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); LoadInst *NewLoad = Builder.CreateLoad(PtrVal); @@ -1227,15 +1248,16 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, // Otherwise, this is a memcpy/memmove from a constant global. MemTransferInst *MTI = cast<MemTransferInst>(SrcInst); Constant *Src = cast<Constant>(MTI->getSource()); + unsigned AS = Src->getType()->getPointerAddressSpace(); // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. Src = ConstantExpr::getBitCast(Src, - llvm::Type::getInt8PtrTy(Src->getContext())); + Type::getInt8PtrTy(Src->getContext(), AS)); Constant *OffsetCst = - ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); + ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); - Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); + Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); return ConstantFoldLoadFromConstPtr(Src, &TD); } @@ -1250,8 +1272,10 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, // just use the dominating value directly. if (ValuesPerBlock.size() == 1 && gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB, - LI->getParent())) + LI->getParent())) { + assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominate this block"); return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn); + } // Otherwise, we have to construct SSA form. SmallVector<PHINode*, 8> NewPHIs; @@ -1321,7 +1345,7 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c << *getCoercedLoadValue() << '\n' << *Res << '\n' << "\n\n\n"); } - } else { + } else if (isMemIntrinValue()) { const DataLayout *TD = gvn.getDataLayout(); assert(TD && "Need target data to handle type mismatch case"); Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, @@ -1329,6 +1353,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset << " " << *getMemIntrinValue() << '\n' << *Res << '\n' << "\n\n\n"); + } else { + assert(isUndefValue() && "Should be UndefVal"); + DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";); + return UndefValue::get(LoadTy); } return Res; } @@ -1352,6 +1380,13 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); + if (DeadBlocks.count(DepBB)) { + // Dead dependent mem-op disguise as a load evaluating the same value + // as the load in question. 
+ ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB)); + continue; + } + if (!DepInfo.isDef() && !DepInfo.isClobber()) { UnavailableBlocks.push_back(DepBB); continue; @@ -1513,7 +1548,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) FullyAvailableBlocks[UnavailableBlocks[i]] = false; - SmallVector<std::pair<TerminatorInst*, unsigned>, 4> NeedToSplit; + SmallVector<BasicBlock *, 4> CriticalEdgePred; for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E; ++PI) { BasicBlock *Pred = *PI; @@ -1536,20 +1571,14 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return false; } - unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB); - NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum)); + CriticalEdgePred.push_back(Pred); } } - if (!NeedToSplit.empty()) { - toSplit.append(NeedToSplit.begin(), NeedToSplit.end()); - return false; - } - // Decide whether PRE is profitable for this load. unsigned NumUnavailablePreds = PredLoads.size(); assert(NumUnavailablePreds != 0 && - "Fully available value should be eliminated above!"); + "Fully available value should already be eliminated!"); // If this load is unavailable in multiple predecessors, reject it. // FIXME: If we could restructure the CFG, we could make a common pred with @@ -1558,6 +1587,17 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (NumUnavailablePreds != 1) return false; + // Split critical edges, and update the unavailable predecessors accordingly. + for (SmallVectorImpl<BasicBlock *>::iterator I = CriticalEdgePred.begin(), + E = CriticalEdgePred.end(); I != E; I++) { + BasicBlock *OrigPred = *I; + BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB); + PredLoads.erase(OrigPred); + PredLoads[NewPred] = 0; + DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->" + << LoadBB->getName() << '\n'); + } + // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; SmallVector<Instruction*, 8> NewInsts; @@ -1594,7 +1634,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (MD) MD->removeInstruction(I); I->eraseFromParent(); } - return false; + // HINT:Don't revert the edge-splitting as following transformation may + // also need to split these critial edges. + return !CriticalEdgePred.empty(); } // Okay, we can eliminate this load by inserting a reload in the predecessor @@ -2181,11 +2223,13 @@ bool GVN::processInstruction(Instruction *I) { // For conditional branches, we can perform simple conditional propagation on // the condition value itself. if (BranchInst *BI = dyn_cast<BranchInst>(I)) { - if (!BI->isConditional() || isa<Constant>(BI->getCondition())) + if (!BI->isConditional()) return false; - Value *BranchCond = BI->getCondition(); + if (isa<Constant>(BI->getCondition())) + return processFoldableCondBr(BI); + Value *BranchCond = BI->getCondition(); BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); // Avoid multiple edges early. @@ -2297,25 +2341,30 @@ bool GVN::runOnFunction(Function& F) { while (ShouldContinue) { DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n"); ShouldContinue = iterateOnFunction(F); - if (splitCriticalEdges()) - ShouldContinue = true; Changed |= ShouldContinue; ++Iteration; } if (EnablePRE) { + // Fabricate val-num for dead-code in order to suppress assertion in + // performPRE(). 
+ assignValNumForDeadCode(); bool PREChanged = true; while (PREChanged) { PREChanged = performPRE(F); Changed |= PREChanged; } } + // FIXME: Should perform GVN again after PRE does something. PRE can move // computations into blocks where they become fully redundant. Note that // we can't do this until PRE's critical edge splitting updates memdep. // Actually, when this happens, we should just fully integrate PRE into GVN. cleanupGlobalSets(); + // Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each + // iteration. + DeadBlocks.clear(); return Changed; } @@ -2326,6 +2375,9 @@ bool GVN::processBlock(BasicBlock *BB) { // (and incrementing BI before processing an instruction). assert(InstrsToErase.empty() && "We expect InstrsToErase to be empty across iterations"); + if (DeadBlocks.count(BB)) + return false; + bool ChangedFunction = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); @@ -2344,7 +2396,7 @@ bool GVN::processBlock(BasicBlock *BB) { if (!AtStart) --BI; - for (SmallVector<Instruction*, 4>::iterator I = InstrsToErase.begin(), + for (SmallVectorImpl<Instruction *>::iterator I = InstrsToErase.begin(), E = InstrsToErase.end(); I != E; ++I) { DEBUG(dbgs() << "GVN removed: " << **I << '\n'); if (MD) MD->removeInstruction(*I); @@ -2543,6 +2595,15 @@ bool GVN::performPRE(Function &F) { return Changed; } +/// Split the critical edge connecting the given two blocks, and return +/// the block inserted to the critical edge. +BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { + BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this); + if (MD) + MD->invalidateCachedPredecessors(); + return BB; +} + /// splitCriticalEdges - Split critical edges found during the previous /// iteration that may enable further optimization. bool GVN::splitCriticalEdges() { @@ -2569,9 +2630,18 @@ bool GVN::iterateOnFunction(Function &F) { RE = RPOT.end(); RI != RE; ++RI) Changed |= processBlock(*RI); #else + // Save the blocks this function have before transformation begins. GVN may + // split critical edge, and hence may invalidate the RPO/DT iterator. + // + std::vector<BasicBlock *> BBVect; + BBVect.reserve(256); for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()), DE = df_end(DT->getRootNode()); DI != DE; ++DI) - Changed |= processBlock(DI->getBlock()); + BBVect.push_back(DI->getBlock()); + + for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end(); + I != E; I++) + Changed |= processBlock(*I); #endif return Changed; @@ -2601,3 +2671,133 @@ void GVN::verifyRemoved(const Instruction *Inst) const { } } } + +// BB is declared dead, which implied other blocks become dead as well. This +// function is to add all these blocks to "DeadBlocks". For the dead blocks' +// live successors, update their phi nodes by replacing the operands +// corresponding to dead blocks with UndefVal. +// +void GVN::addDeadBlock(BasicBlock *BB) { + SmallVector<BasicBlock *, 4> NewDead; + SmallSetVector<BasicBlock *, 4> DF; + + NewDead.push_back(BB); + while (!NewDead.empty()) { + BasicBlock *D = NewDead.pop_back_val(); + if (DeadBlocks.count(D)) + continue; + + // All blocks dominated by D are dead. + SmallVector<BasicBlock *, 8> Dom; + DT->getDescendants(D, Dom); + DeadBlocks.insert(Dom.begin(), Dom.end()); + + // Figure out the dominance-frontier(D). 
+ for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(), + E = Dom.end(); I != E; I++) { + BasicBlock *B = *I; + for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) { + BasicBlock *S = *SI; + if (DeadBlocks.count(S)) + continue; + + bool AllPredDead = true; + for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++) + if (!DeadBlocks.count(*PI)) { + AllPredDead = false; + break; + } + + if (!AllPredDead) { + // S could be proved dead later on. That is why we don't update phi + // operands at this moment. + DF.insert(S); + } else { + // While S is not dominated by D, it is dead by now. This could take + // place if S already have a dead predecessor before D is declared + // dead. + NewDead.push_back(S); + } + } + } + } + + // For the dead blocks' live successors, update their phi nodes by replacing + // the operands corresponding to dead blocks with UndefVal. + for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end(); + I != E; I++) { + BasicBlock *B = *I; + if (DeadBlocks.count(B)) + continue; + + SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B)); + for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(), + PE = Preds.end(); PI != PE; PI++) { + BasicBlock *P = *PI; + + if (!DeadBlocks.count(P)) + continue; + + if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) { + if (BasicBlock *S = splitCriticalEdges(P, B)) + DeadBlocks.insert(P = S); + } + + for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) { + PHINode &Phi = cast<PHINode>(*II); + Phi.setIncomingValue(Phi.getBasicBlockIndex(P), + UndefValue::get(Phi.getType())); + } + } + } +} + +// If the given branch is recognized as a foldable branch (i.e. conditional +// branch with constant condition), it will perform following analyses and +// transformation. +// 1) If the dead out-coming edge is a critical-edge, split it. Let +// R be the target of the dead out-coming edge. +// 1) Identify the set of dead blocks implied by the branch's dead outcoming +// edge. The result of this step will be {X| X is dominated by R} +// 2) Identify those blocks which haves at least one dead prodecessor. The +// result of this step will be dominance-frontier(R). +// 3) Update the PHIs in DF(R) by replacing the operands corresponding to +// dead blocks with "UndefVal" in an hope these PHIs will optimized away. +// +// Return true iff *NEW* dead code are found. +bool GVN::processFoldableCondBr(BranchInst *BI) { + if (!BI || BI->isUnconditional()) + return false; + + ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition()); + if (!Cond) + return false; + + BasicBlock *DeadRoot = Cond->getZExtValue() ? + BI->getSuccessor(1) : BI->getSuccessor(0); + if (DeadBlocks.count(DeadRoot)) + return false; + + if (!DeadRoot->getSinglePredecessor()) + DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot); + + addDeadBlock(DeadRoot); + return true; +} + +// performPRE() will trigger assert if it come across an instruciton without +// associated val-num. As it normally has far more live instructions than dead +// instructions, it makes more sense just to "fabricate" a val-number for the +// dead code than checking if instruction involved is dead or not. 
+void GVN::assignValNumForDeadCode() { + for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(), + E = DeadBlocks.end(); I != E; I++) { + BasicBlock *BB = *I; + for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); + II != EE; II++) { + Instruction *Inst = &*II; + unsigned ValNum = VN.lookup_or_add(Inst); + addToLeaderTable(ValNum, Inst, BB); + } + } +} diff --git a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp index 4796eb2..954e545 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp @@ -72,15 +72,13 @@ using namespace llvm; static cl::opt<bool> EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden, - cl::desc("Enable global merge pass on constants"), - cl::init(false)); + cl::desc("Enable global merge pass on constants"), + cl::init(false)); STATISTIC(NumMerged , "Number of globals merged"); namespace { class GlobalMerge : public FunctionPass { - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// target type sizes. - const TargetLowering *TLI; + const TargetMachine *TM; bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const; @@ -104,8 +102,8 @@ namespace { public: static char ID; // Pass identification, replacement for typeid. - explicit GlobalMerge(const TargetLowering *tli = 0) - : FunctionPass(ID), TLI(tli) { + explicit GlobalMerge(const TargetMachine *TM = 0) + : FunctionPass(ID), TM(TM) { initializeGlobalMergePass(*PassRegistry::getPassRegistry()); } @@ -144,6 +142,7 @@ INITIALIZE_PASS(GlobalMerge, "global-merge", bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const { + const TargetLowering *TLI = TM->getTargetLowering(); const DataLayout *TD = TLI->getDataLayout(); // FIXME: Infer the maximum possible offset depending on the actual users @@ -234,6 +233,7 @@ void GlobalMerge::setMustKeepGlobalVariables(Module &M) { bool GlobalMerge::doInitialization(Module &M) { DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals, BSSGlobals; + const TargetLowering *TLI = TM->getTargetLowering(); const DataLayout *TD = TLI->getDataLayout(); unsigned MaxOffset = TLI->getMaximalGlobalOffset(); bool Changed = false; @@ -305,6 +305,6 @@ bool GlobalMerge::doFinalization(Module &M) { return false; } -Pass *llvm::createGlobalMergePass(const TargetLowering *tli) { - return new GlobalMerge(tli); +Pass *llvm::createGlobalMergePass(const TargetMachine *TM) { + return new GlobalMerge(TM); } diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 8e76c78..235aaaa 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -532,7 +532,8 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // and varies predictably *inside* the loop. Evaluate the value it // contains when the loop exits, if possible. 
const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!SE->isLoopInvariant(ExitValue, L)) + if (!SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) continue; // Computing the value outside of the loop brings no benefit if : @@ -1479,8 +1480,14 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, if (IndVar->getType()->isPointerTy() && !IVCount->getType()->isPointerTy()) { + // IVOffset will be the new GEP offset that is interpreted by GEP as a + // signed value. IVCount on the other hand represents the loop trip count, + // which is an unsigned value. FindLoopCounter only allows induction + // variables that have a positive unit stride of one. This means we don't + // have to handle the case of negative offsets (yet) and just need to zero + // extend IVCount. Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType()); - const SCEV *IVOffset = SE->getTruncateOrSignExtend(IVCount, OfsTy); + const SCEV *IVOffset = SE->getTruncateOrZeroExtend(IVCount, OfsTy); // Expand the code for the iteration count. assert(SE->isLoopInvariant(IVOffset, L) && @@ -1492,7 +1499,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter"); // We could handle pointer IVs other than i8*, but we need to compensate for // gep index scaling. See canExpandBackedgeTakenCount comments. - assert(SE->getSizeOfExpr( + assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()), cast<PointerType>(GEPBase->getType())->getElementType())->isOne() && "unit stride pointer IV must be i8*"); @@ -1506,9 +1513,10 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc). // // Valid Cases: (1) both integers is most common; (2) both may be pointers - // for simple memset-style loops; (3) IVInit is an integer and IVCount is a - // pointer may occur when enable-iv-rewrite generates a canonical IV on top - // of case #2. + // for simple memset-style loops. + // + // IVInit integer and IVCount pointer would only occur if a canonical IV + // were generated on top of case #2, which is not expected. const SCEV *IVLimit = 0; // For unit stride, IVCount = Start + BECount with 2's complement overflow. @@ -1552,44 +1560,23 @@ LinearFunctionTestReplace(Loop *L, SCEVExpander &Rewriter) { assert(canExpandBackedgeTakenCount(L, SE) && "precondition"); - // LFTR can ignore IV overflow and truncate to the width of - // BECount. This avoids materializing the add(zext(add)) expression. - Type *CntTy = BackedgeTakenCount->getType(); - + // Initialize CmpIndVar and IVCount to their preincremented values. + Value *CmpIndVar = IndVar; const SCEV *IVCount = BackedgeTakenCount; // If the exiting block is the same as the backedge block, we prefer to // compare against the post-incremented value, otherwise we must compare // against the preincremented value. - Value *CmpIndVar; if (L->getExitingBlock() == L->getLoopLatch()) { // Add one to the "backedge-taken" count to get the trip count. - // If this addition may overflow, we have to be more pessimistic and - // cast the induction variable before doing the add. 
- const SCEV *N = - SE->getAddExpr(IVCount, SE->getConstant(IVCount->getType(), 1)); - if (CntTy == IVCount->getType()) - IVCount = N; - else { - const SCEV *Zero = SE->getConstant(IVCount->getType(), 0); - if ((isa<SCEVConstant>(N) && !N->isZero()) || - SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { - // No overflow. Cast the sum. - IVCount = SE->getTruncateOrZeroExtend(N, CntTy); - } else { - // Potential overflow. Cast before doing the add. - IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy); - IVCount = SE->getAddExpr(IVCount, SE->getConstant(CntTy, 1)); - } - } + // This addition may overflow, which is valid as long as the comparison is + // truncated to BackedgeTakenCount->getType(). + IVCount = SE->getAddExpr(BackedgeTakenCount, + SE->getConstant(BackedgeTakenCount->getType(), 1)); // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock()); - } else { - // We must use the preincremented value... - IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy); - CmpIndVar = IndVar; } Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE); @@ -1612,12 +1599,40 @@ LinearFunctionTestReplace(Loop *L, << " IVCount:\t" << *IVCount << "\n"); IRBuilder<> Builder(BI); - if (SE->getTypeSizeInBits(CmpIndVar->getType()) - > SE->getTypeSizeInBits(ExitCnt->getType())) { - CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(), - "lftr.wideiv"); - } + // LFTR can ignore IV overflow and truncate to the width of + // BECount. This avoids materializing the add(zext(add)) expression. + unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType()); + unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType()); + if (CmpIndVarSize > ExitCntSize) { + const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); + const SCEV *ARStart = AR->getStart(); + const SCEV *ARStep = AR->getStepRecurrence(*SE); + // For constant IVCount, avoid truncation. + if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) { + const APInt &Start = cast<SCEVConstant>(ARStart)->getValue()->getValue(); + APInt Count = cast<SCEVConstant>(IVCount)->getValue()->getValue(); + // Note that the post-inc value of BackedgeTakenCount may have overflowed + // above such that IVCount is now zero. 
+ if (IVCount != BackedgeTakenCount && Count == 0) { + Count = APInt::getMaxValue(Count.getBitWidth()).zext(CmpIndVarSize); + ++Count; + } + else + Count = Count.zext(CmpIndVarSize); + APInt NewLimit; + if (cast<SCEVConstant>(ARStep)->getValue()->isNegative()) + NewLimit = Start - Count; + else + NewLimit = Start + Count; + ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit); + + DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n"); + } else { + CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(), + "lftr.wideiv"); + } + } Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond"); Value *OrigCond = BI->getCondition(); // It's tempting to use replaceAllUsesWith here to fully replace the old diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b61c5ba..b3ec2fc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" @@ -129,6 +130,7 @@ namespace { bool ProcessBranchOnXOR(BinaryOperator *BO); bool SimplifyPartiallyRedundantLoad(LoadInst *LI); + bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB); }; } @@ -775,7 +777,11 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { return true; } } + } + + if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB)) + return true; } // Check for some cases that are worth simplifying. Right now we want to look @@ -821,7 +827,6 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { return false; } - /// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant /// load instruction, eliminate it by replacing it with a PHI node. This is an /// important optimization that encourages jump threading, and needs to be run @@ -836,6 +841,12 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (LoadBB->getSinglePredecessor()) return false; + // If the load is defined in a landing pad, it can't be partially redundant, + // because the edges between the invoke and the landing pad cannot have other + // instructions between them. + if (LoadBB->isLandingPad()) + return false; + Value *LoadedPtr = LI->getOperand(0); // If the loaded operand is defined in the LoadBB, it can't be available. @@ -1615,4 +1626,80 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, return true; } +/// TryToUnfoldSelect - Look for blocks of the form +/// bb1: +/// %a = select +/// br bb +/// +/// bb2: +/// %p = phi [%a, %bb] ... +/// %c = icmp %p +/// br i1 %c +/// +/// And expand the select into a branch structure if one of its arms allows %c +/// to be folded. This later enables threading from bb1 over bb2. 
+bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { + BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); + PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0)); + Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1)); + + if (!CondBr || !CondBr->isConditional() || !CondLHS || + CondLHS->getParent() != BB) + return false; + + for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) { + BasicBlock *Pred = CondLHS->getIncomingBlock(I); + SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I)); + // Look if one of the incoming values is a select in the corresponding + // predecessor. + if (!SI || SI->getParent() != Pred || !SI->hasOneUse()) + continue; + + BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator()); + if (!PredTerm || !PredTerm->isUnconditional()) + continue; + + // Now check if one of the select values would allow us to constant fold the + // terminator in BB. We don't do the transform if both sides fold, those + // cases will be threaded in any case. + LazyValueInfo::Tristate LHSFolds = + LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1), + CondRHS, Pred, BB); + LazyValueInfo::Tristate RHSFolds = + LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2), + CondRHS, Pred, BB); + if ((LHSFolds != LazyValueInfo::Unknown || + RHSFolds != LazyValueInfo::Unknown) && + LHSFolds != RHSFolds) { + // Expand the select. + // + // Pred -- + // | v + // | NewBB + // | | + // |----- + // v + // BB + BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold", + BB->getParent(), BB); + // Move the unconditional branch to NewBB. + PredTerm->removeFromParent(); + NewBB->getInstList().insert(NewBB->end(), PredTerm); + // Create a conditional branch and update PHI nodes. + BranchInst::Create(NewBB, BB, SI->getCondition(), Pred); + CondLHS->setIncomingValue(I, SI->getFalseValue()); + CondLHS->addIncoming(SI->getTrueValue(), NewBB); + // The select is now dead. + SI->eraseFromParent(); + + // Update any other PHI nodes in BB. + for (BasicBlock::iterator BI = BB->begin(); + PHINode *Phi = dyn_cast<PHINode>(BI); ++BI) + if (Phi != CondLHS) + Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB); + return true; + } + } + return false; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 0b62050..9e39d2e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -51,8 +51,8 @@ namespace { } private: - bool isLoopDead(Loop *L, SmallVector<BasicBlock*, 4> &exitingBlocks, - SmallVector<BasicBlock*, 4> &exitBlocks, + bool isLoopDead(Loop *L, SmallVectorImpl<BasicBlock *> &exitingBlocks, + SmallVectorImpl<BasicBlock *> &exitBlocks, bool &Changed, BasicBlock *Preheader); }; @@ -77,8 +77,8 @@ Pass *llvm::createLoopDeletionPass() { /// checked for unique exit and exiting blocks, and that the code is in LCSSA /// form. bool LoopDeletion::isLoopDead(Loop *L, - SmallVector<BasicBlock*, 4> &exitingBlocks, - SmallVector<BasicBlock*, 4> &exitBlocks, + SmallVectorImpl<BasicBlock *> &exitingBlocks, + SmallVectorImpl<BasicBlock *> &exitBlocks, bool &Changed, BasicBlock *Preheader) { BasicBlock *exitBlock = exitBlocks[0]; @@ -209,7 +209,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Move all of the block's children to be children of the preheader, which // allows us to remove the domtree entry for the block. 
ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end()); - for (SmallVector<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(), + for (SmallVectorImpl<DomTreeNode *>::iterator DI = ChildNodes.begin(), DE = ChildNodes.end(); DI != DE; ++DI) { DT.changeImmediateDominator(*DI, DT[preheader]); } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 8258719..952b76b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -81,7 +81,7 @@ namespace { /// Return the condition of the branch terminating the given basic block. static Value *getBrCondtion(BasicBlock *); - /// Derive the precondition block (i.e the block that guards the loop + /// Derive the precondition block (i.e the block that guards the loop /// preheader) from the given preheader. static BasicBlock *getPrecondBb(BasicBlock *PreHead); }; @@ -111,7 +111,7 @@ namespace { /// beween a variable and zero, and if the variable is non-zero, the /// control yeilds to the loop entry. If the branch matches the behavior, /// the variable involved in the comparion is returned. This function will - /// be called to see if the precondition and postcondition of the loop + /// be called to see if the precondition and postcondition of the loop /// are in desirable form. Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const; @@ -274,11 +274,11 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, // //===----------------------------------------------------------------------===// -// This fucntion will return true iff the given block contains nothing but goto. -// A typical usage of this function is to check if the preheader fucntion is -// "almost" empty such that generated intrinsic function can be moved across -// preheader and to be placed at the end of the preconditiona block without -// concerning of breaking data dependence. +// This function will return true iff the given block contains nothing but goto. +// A typical usage of this function is to check if the preheader function is +// "almost" empty such that generated intrinsic functions can be moved across +// the preheader and be placed at the end of the precondition block without +// the concern of breaking data dependence. bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { if (BranchInst *Br = getBranch(BB)) { return Br->isUnconditional() && BB->size() == 1; @@ -314,7 +314,7 @@ bool NclPopcountRecognize::preliminaryScreen() { if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) return false; - // Counting population are usually conducted by few arithmetic instrutions. + // Counting population are usually conducted by few arithmetic instructions. // Such instructions can be easilly "absorbed" by vacant slots in a // non-compact loop. Therefore, recognizing popcount idiom only makes sense // in a compact loop. 
@@ -339,7 +339,7 @@ bool NclPopcountRecognize::preliminaryScreen() { PreCondBB = LIRUtil::getPrecondBb(PreHead); if (!PreCondBB) return false; - + return true; } @@ -504,7 +504,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, // Assuming before transformation, the loop is following: // if (x) // the precondition // do { cnt++; x &= x - 1; } while(x); - + // Step 1: Insert the ctpop instruction at the end of the precondition block IRBuilderTy Builder(PreCondBr); Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; @@ -611,7 +611,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, SE->forgetLoop(CurLoop); } -CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, +CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, Value *Val, DebugLoc DL) { Value *Ops[] = { Val }; Type *Tys[] = { Val->getType() }; @@ -667,13 +667,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() { if (!getDataLayout()) return false; - // set DT + // set DT (void)getDominatorTree(); LoopInfo &LI = getAnalysis<LoopInfo>(); TLI = &getAnalysis<TargetLibraryInfo>(); - // set TLI + // set TLI (void)getTargetLibraryInfo(); SmallVector<BasicBlock*, 8> ExitBlocks; @@ -953,6 +953,8 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = 0; + unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); + // If we're allowed to form a memset, and the stored value would be acceptable // for memset, use it. if (SplatValue && TLI->has(LibFunc::memset) && @@ -961,8 +963,10 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, CurLoop->isLoopInvariant(SplatValue)) { // Keep and use SplatValue. PatternValue = 0; - } else if (TLI->has(LibFunc::memset_pattern16) && + } else if (DestAS == 0 && + TLI->has(LibFunc::memset_pattern16) && (PatternValue = getMemSetPatternValue(StoredVal, *TD))) { + // Don't create memset_pattern16s with address spaces. // It looks like we can use PatternValue! SplatValue = 0; } else { @@ -978,20 +982,20 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE, "loop-idiom"); + Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); + // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this // would be unsafe to do if there is anything else in the loop that may read // or write to the aliased location. Check for any overlap by generating the // base pointer and checking the region. - unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace(); Value *BasePtr = - Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace), + Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy, Preheader->getTerminator()); - if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){ + StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) { Expander.clear(); // If we generated new code for the base pointer, clean up. deleteIfDeadInstruction(BasePtr, *SE, TLI); @@ -1002,27 +1006,35 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. 
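As a concrete illustration of the memset path (not part of the patch; function names invented), here is a unit-stride loop storing a bytewise-splattable value, followed by the rough shape of what processLoopStridedStore emits for it using the (BECount + 1) * StoreSize byte count expanded in the code that follows:

#include <string.h>

// A strided store of a splattable value with unit stride.
void zero_words(int *p, unsigned long n) {
  for (unsigned long i = 0; i != n; ++i)
    p[i] = 0;
}

// Rough result: for n > 0 the backedge-taken count is n - 1 and StoreSize is
// sizeof(int), so the call covers (BECount + 1) * StoreSize = n * sizeof(int) bytes.
void zero_words_after(int *p, unsigned long n) {
  if (n)
    memset(p, 0, n * sizeof(int));
}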
- Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); + Type *IntPtr = Builder.getIntPtrTy(TD, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), SCEV::FlagNUW); - if (StoreSize != 1) + if (StoreSize != 1) { NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), SCEV::FlagNUW); + } Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); CallInst *NewCall; - if (SplatValue) - NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment); - else { + if (SplatValue) { + NewCall = Builder.CreateMemSet(BasePtr, + SplatValue, + NumBytes, + StoreAlignment); + } else { + // Everything is emitted in default address space + Type *Int8PtrTy = DestInt8PtrTy; + Module *M = TheStore->getParent()->getParent()->getParent(); Value *MSP = M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), - Builder.getInt8PtrTy(), - Builder.getInt8PtrTy(), IntPtr, + Int8PtrTy, + Int8PtrTy, + IntPtr, (void*)0); // Otherwise we should form a memset_pattern16. PatternValue is known to be @@ -1032,7 +1044,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, PatternValue, ".memset_pattern"); GV->setUnnamedAddr(true); // Ok to merge these. GV->setAlignment(16); - Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy()); + Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes); } @@ -1108,17 +1120,17 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = TD->getIntPtrType(SI->getContext()); - BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); + Type *IntPtrTy = Builder.getIntPtrTy(TD, SI->getPointerAddressSpace()); + BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), + const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), SCEV::FlagNUW); if (StoreSize != 1) - NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), + NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); CallInst *NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp new file mode 100644 index 0000000..335af81 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -0,0 +1,1184 @@ +//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements a simple loop reroller. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-reroll" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; + +STATISTIC(NumRerolledLoops, "Number of rerolled loops"); + +static cl::opt<unsigned> +MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, + cl::desc("The maximum increment for loop rerolling")); + +// This loop re-rolling transformation aims to transform loops like this: +// +// int foo(int a); +// void bar(int *x) { +// for (int i = 0; i < 500; i += 3) { +// foo(i); +// foo(i+1); +// foo(i+2); +// } +// } +// +// into a loop like this: +// +// void bar(int *x) { +// for (int i = 0; i < 500; ++i) +// foo(i); +// } +// +// It does this by looking for loops that, besides the latch code, are composed +// of isomorphic DAGs of instructions, with each DAG rooted at some increment +// to the induction variable, and where each DAG is isomorphic to the DAG +// rooted at the induction variable (excepting the sub-DAGs which root the +// other induction-variable increments). In other words, we're looking for loop +// bodies of the form: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// f(%iv) +// %iv.1 = add %iv, 1 <-- a root increment +// f(%iv.1) +// %iv.2 = add %iv, 2 <-- a root increment +// f(%iv.2) +// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment +// f(%iv.scale_m_1) +// ... +// %iv.next = add %iv, scale +// %cmp = icmp(%iv, ...) +// br %cmp, header, exit +// +// where each f(i) is a set of instructions that, collectively, are a function +// only of i (and other loop-invariant values). +// +// As a special case, we can also reroll loops like this: +// +// int foo(int); +// void bar(int *x) { +// for (int i = 0; i < 500; ++i) { +// x[3*i] = foo(0); +// x[3*i+1] = foo(0); +// x[3*i+2] = foo(0); +// } +// } +// +// into this: +// +// void bar(int *x) { +// for (int i = 0; i < 1500; ++i) +// x[i] = foo(0); +// } +// +// in which case, we're looking for inputs like this: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// %scaled.iv = mul %iv, scale +// f(%scaled.iv) +// %scaled.iv.1 = add %scaled.iv, 1 +// f(%scaled.iv.1) +// %scaled.iv.2 = add %scaled.iv, 2 +// f(%scaled.iv.2) +// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 +// f(%scaled.iv.scale_m_1) +// ... +// %iv.next = add %iv, 1 +// %cmp = icmp(%iv, ...) 
+// br %cmp, header, exit + +namespace { + class LoopReroll : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopReroll() : LoopPass(ID) { + initializeLoopRerollPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + AU.addRequired<ScalarEvolution>(); + AU.addRequired<TargetLibraryInfo>(); + } + +protected: + AliasAnalysis *AA; + LoopInfo *LI; + ScalarEvolution *SE; + DataLayout *DL; + TargetLibraryInfo *TLI; + DominatorTree *DT; + + typedef SmallVector<Instruction *, 16> SmallInstructionVector; + typedef SmallSet<Instruction *, 16> SmallInstructionSet; + + // A chain of isomorphic instructions, indentified by a single-use PHI, + // representing a reduction. Only the last value may be used outside the + // loop. + struct SimpleLoopReduction { + SimpleLoopReduction(Instruction *P, Loop *L) + : Valid(false), Instructions(1, P) { + assert(isa<PHINode>(P) && "First reduction instruction must be a PHI"); + add(L); + } + + bool valid() const { + return Valid; + } + + Instruction *getPHI() const { + assert(Valid && "Using invalid reduction"); + return Instructions.front(); + } + + Instruction *getReducedValue() const { + assert(Valid && "Using invalid reduction"); + return Instructions.back(); + } + + Instruction *get(size_t i) const { + assert(Valid && "Using invalid reduction"); + return Instructions[i+1]; + } + + Instruction *operator [] (size_t i) const { return get(i); } + + // The size, ignoring the initial PHI. + size_t size() const { + assert(Valid && "Using invalid reduction"); + return Instructions.size()-1; + } + + typedef SmallInstructionVector::iterator iterator; + typedef SmallInstructionVector::const_iterator const_iterator; + + iterator begin() { + assert(Valid && "Using invalid reduction"); + return llvm::next(Instructions.begin()); + } + + const_iterator begin() const { + assert(Valid && "Using invalid reduction"); + return llvm::next(Instructions.begin()); + } + + iterator end() { return Instructions.end(); } + const_iterator end() const { return Instructions.end(); } + + protected: + bool Valid; + SmallInstructionVector Instructions; + + void add(Loop *L); + }; + + // The set of all reductions, and state tracking of possible reductions + // during loop instruction processing. + struct ReductionTracker { + typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector; + + // Add a new possible reduction. + void addSLR(SimpleLoopReduction &SLR) { + PossibleReds.push_back(SLR); + } + + // Setup to track possible reductions corresponding to the provided + // rerolling scale. Only reductions with a number of non-PHI instructions + // that is divisible by the scale are considered. Three instructions sets + // are filled in: + // - A set of all possible instructions in eligible reductions. + // - A set of all PHIs in eligible reductions + // - A set of all reduced values (last instructions) in eligible reductions. 
+ void restrictToScale(uint64_t Scale, + SmallInstructionSet &PossibleRedSet, + SmallInstructionSet &PossibleRedPHISet, + SmallInstructionSet &PossibleRedLastSet) { + PossibleRedIdx.clear(); + PossibleRedIter.clear(); + Reds.clear(); + + for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) + if (PossibleReds[i].size() % Scale == 0) { + PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); + PossibleRedPHISet.insert(PossibleReds[i].getPHI()); + + PossibleRedSet.insert(PossibleReds[i].getPHI()); + PossibleRedIdx[PossibleReds[i].getPHI()] = i; + for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), + JE = PossibleReds[i].end(); J != JE; ++J) { + PossibleRedSet.insert(*J); + PossibleRedIdx[*J] = i; + } + } + } + + // The functions below are used while processing the loop instructions. + + // Are the two instructions both from reductions, and furthermore, from + // the same reduction? + bool isPairInSame(Instruction *J1, Instruction *J2) { + DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1); + if (J1I != PossibleRedIdx.end()) { + DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2); + if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) + return true; + } + + return false; + } + + // The two provided instructions, the first from the base iteration, and + // the second from iteration i, form a matched pair. If these are part of + // a reduction, record that fact. + void recordPair(Instruction *J1, Instruction *J2, unsigned i) { + if (PossibleRedIdx.count(J1)) { + assert(PossibleRedIdx.count(J2) && + "Recording reduction vs. non-reduction instruction?"); + + PossibleRedIter[J1] = 0; + PossibleRedIter[J2] = i; + + int Idx = PossibleRedIdx[J1]; + assert(Idx == PossibleRedIdx[J2] && + "Recording pair from different reductions?"); + Reds.insert(Idx); + } + } + + // The functions below can be called after we've finished processing all + // instructions in the loop, and we know which reductions were selected. + + // Is the provided instruction the PHI of a reduction selected for + // rerolling? + bool isSelectedPHI(Instruction *J) { + if (!isa<PHINode>(J)) + return false; + + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + if (cast<Instruction>(J) == PossibleReds[i].getPHI()) + return true; + } + + return false; + } + + bool validateSelected(); + void replaceSelected(); + + protected: + // The vector of all possible reductions (for any scale). 
+ SmallReductionVector PossibleReds; + + DenseMap<Instruction *, int> PossibleRedIdx; + DenseMap<Instruction *, int> PossibleRedIter; + DenseSet<int> Reds; + }; + + void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); + void collectPossibleReductions(Loop *L, + ReductionTracker &Reductions); + void collectInLoopUserSet(Loop *L, + const SmallInstructionVector &Roots, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + void collectInLoopUserSet(Loop *L, + Instruction * Root, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale, + Instruction *&IV, + SmallInstructionVector &LoopIncs); + bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV, + SmallVector<SmallInstructionVector, 32> &Roots, + SmallInstructionSet &AllRoots, + SmallInstructionVector &LoopIncs); + bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, + ReductionTracker &Reductions); + }; +} + +char LoopReroll::ID = 0; +INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) + +Pass *llvm::createLoopRerollPass() { + return new LoopReroll; +} + +// Returns true if the provided instruction is used outside the given loop. +// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in +// non-loop blocks to be outside the loop. +static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { + for (Value::use_iterator UI = I->use_begin(), + UIE = I->use_end(); UI != UIE; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (!L->contains(User)) + return true; + } + + return false; +} + +// Collect the list of loop induction variables with respect to which it might +// be possible to reroll the loop. +void LoopReroll::collectPossibleIVs(Loop *L, + SmallInstructionVector &PossibleIVs) { + BasicBlock *Header = L->getHeader(); + for (BasicBlock::iterator I = Header->begin(), + IE = Header->getFirstInsertionPt(); I != IE; ++I) { + if (!isa<PHINode>(I)) + continue; + if (!I->getType()->isIntegerTy()) + continue; + + if (const SCEVAddRecExpr *PHISCEV = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { + if (PHISCEV->getLoop() != L) + continue; + if (!PHISCEV->isAffine()) + continue; + if (const SCEVConstant *IncSCEV = + dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { + if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) + continue; + if (IncSCEV->getValue()->uge(MaxInc)) + continue; + + DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << + *PHISCEV << "\n"); + PossibleIVs.push_back(I); + } + } + } +} + +// Add the remainder of the reduction-variable chain to the instruction vector +// (the initial PHINode has already been added). If successful, the object is +// marked as valid. +void LoopReroll::SimpleLoopReduction::add(Loop *L) { + assert(!Valid && "Cannot add to an already-valid chain"); + + // The reduction variable must be a chain of single-use instructions + // (including the PHI), except for the last value (which is used by the PHI + // and also outside the loop). 
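A minimal sketch (not part of the patch; names invented) of the chain shape SimpleLoopReduction::add accepts: the reduction PHI feeds a chain of single-use, same-operation instructions, and only the last one is used both by the PHI around the backedge and outside the loop.

// In SSA form: %s.phi -> %add1 (single use) -> %add2, where %add2 feeds the
// PHI and the value returned after the loop.
int sum_pairs(const int *a, int n) {   // assumes n is a multiple of 2
  int s = 0;
  for (int i = 0; i < n; i += 2) {
    s += a[i];
    s += a[i + 1];
  }
  return s;
}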
+ Instruction *C = Instructions.front(); + + do { + C = cast<Instruction>(*C->use_begin()); + if (C->hasOneUse()) { + if (!C->isBinaryOp()) + return; + + if (!(isa<PHINode>(Instructions.back()) || + C->isSameOperationAs(Instructions.back()))) + return; + + Instructions.push_back(C); + } + } while (C->hasOneUse()); + + if (Instructions.size() < 2 || + !C->isSameOperationAs(Instructions.back()) || + C->use_begin() == C->use_end()) + return; + + // C is now the (potential) last instruction in the reduction chain. + for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end(); + UI != UIE; ++UI) { + // The only in-loop user can be the initial PHI. + if (L->contains(cast<Instruction>(*UI))) + if (cast<Instruction>(*UI ) != Instructions.front()) + return; + } + + Instructions.push_back(C); + Valid = true; +} + +// Collect the vector of possible reduction variables. +void LoopReroll::collectPossibleReductions(Loop *L, + ReductionTracker &Reductions) { + BasicBlock *Header = L->getHeader(); + for (BasicBlock::iterator I = Header->begin(), + IE = Header->getFirstInsertionPt(); I != IE; ++I) { + if (!isa<PHINode>(I)) + continue; + if (!I->getType()->isSingleValueType()) + continue; + + SimpleLoopReduction SLR(I, L); + if (!SLR.valid()) + continue; + + DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " << + SLR.size() << " chained instructions)\n"); + Reductions.addSLR(SLR); + } +} + +// Collect the set of all users of the provided root instruction. This set of +// users contains not only the direct users of the root instruction, but also +// all users of those users, and so on. There are two exceptions: +// +// 1. Instructions in the set of excluded instructions are never added to the +// use set (even if they are users). This is used, for example, to exclude +// including root increments in the use set of the primary IV. +// +// 2. Instructions in the set of final instructions are added to the use set +// if they are users, but their users are not added. This is used, for +// example, to prevent a reduction update from forcing all later reduction +// updates into the use set. +void LoopReroll::collectInLoopUserSet(Loop *L, + Instruction *Root, const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users) { + SmallInstructionVector Queue(1, Root); + while (!Queue.empty()) { + Instruction *I = Queue.pop_back_val(); + if (!Users.insert(I).second) + continue; + + if (!Final.count(I)) + for (Value::use_iterator UI = I->use_begin(), + UIE = I->use_end(); UI != UIE; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (PHINode *PN = dyn_cast<PHINode>(User)) { + // Ignore "wrap-around" uses to PHIs of this loop's header. + if (PN->getIncomingBlock(UI) == L->getHeader()) + continue; + } + + if (L->contains(User) && !Exclude.count(User)) { + Queue.push_back(User); + } + } + + // We also want to collect single-user "feeder" values. + for (User::op_iterator OI = I->op_begin(), + OIE = I->op_end(); OI != OIE; ++OI) { + if (Instruction *Op = dyn_cast<Instruction>(*OI)) + if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) && + !Final.count(Op)) + Queue.push_back(Op); + } + } +} + +// Collect all of the users of all of the provided root instructions (combined +// into a single set). 
+void LoopReroll::collectInLoopUserSet(Loop *L, + const SmallInstructionVector &Roots, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users) { + for (SmallInstructionVector::const_iterator I = Roots.begin(), + IE = Roots.end(); I != IE; ++I) + collectInLoopUserSet(L, *I, Exclude, Final, Users); +} + +static bool isSimpleLoadStore(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->isSimple(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isSimple(); + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) + return !MI->isVolatile(); + return false; +} + +// Recognize loops that are setup like this: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// %scaled.iv = mul %iv, scale +// f(%scaled.iv) +// %scaled.iv.1 = add %scaled.iv, 1 +// f(%scaled.iv.1) +// %scaled.iv.2 = add %scaled.iv, 2 +// f(%scaled.iv.2) +// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 +// f(%scaled.iv.scale_m_1) +// ... +// %iv.next = add %iv, 1 +// %cmp = icmp(%iv, ...) +// br %cmp, header, exit +// +// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs. +bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale, + Instruction *&IV, + SmallInstructionVector &LoopIncs) { + // This is a special case: here we're looking for all uses (except for + // the increment) to be multiplied by a common factor. The increment must + // be by one. This is to capture loops like: + // for (int i = 0; i < 500; ++i) { + // foo(3*i); foo(3*i+1); foo(3*i+2); + // } + if (RealIV->getNumUses() != 2) + return false; + const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV)); + Instruction *User1 = cast<Instruction>(*RealIV->use_begin()), + *User2 = cast<Instruction>(*llvm::next(RealIV->use_begin())); + if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType())) + return false; + const SCEVAddRecExpr *User1SCEV = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)), + *User2SCEV = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2)); + if (!User1SCEV || !User1SCEV->isAffine() || + !User2SCEV || !User2SCEV->isAffine()) + return false; + + // We assume below that User1 is the scale multiply and User2 is the + // increment. If this can't be true, then swap them. + if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) { + std::swap(User1, User2); + std::swap(User1SCEV, User2SCEV); + } + + if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE)) + return false; + assert(User2SCEV->getStepRecurrence(*SE)->isOne() && + "Invalid non-unit step for multiplicative scaling"); + LoopIncs.push_back(User2); + + if (const SCEVConstant *MulScale = + dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) { + // Make sure that both the start and step have the same multiplier. + if (RealIVSCEV->getStart()->getType() != MulScale->getType()) + return false; + if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) != + User1SCEV->getStart()) + return false; + + ConstantInt *MulScaleCI = MulScale->getValue(); + if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc)) + return false; + Scale = MulScaleCI->getZExtValue(); + IV = User1; + } else + return false; + + DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n"); + return true; +} + +// Collect all root increments with respect to the provided induction variable +// (normally the PHI, but sometimes a multiply). A root increment is an +// instruction, normally an add, with a positive constant less than Scale. 
In a +// rerollable loop, each of these increments is the root of an instruction +// graph isomorphic to the others. Also, we collect the final induction +// increment (the increment equal to the Scale), and its users in LoopIncs. +bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, + Instruction *IV, + SmallVector<SmallInstructionVector, 32> &Roots, + SmallInstructionSet &AllRoots, + SmallInstructionVector &LoopIncs) { + for (Value::use_iterator UI = IV->use_begin(), + UIE = IV->use_end(); UI != UIE; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (!SE->isSCEVable(User->getType())) + continue; + if (User->getType() != IV->getType()) + continue; + if (!L->contains(User)) + continue; + if (hasUsesOutsideLoop(User, L)) + continue; + + if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV( + SE->getSCEV(User), SE->getSCEV(IV)))) { + uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); + if (Idx > 0 && Idx < Scale) { + Roots[Idx-1].push_back(User); + AllRoots.insert(User); + } else if (Idx == Scale && Inc > 1) { + LoopIncs.push_back(User); + } + } + } + + if (Roots[0].empty()) + return false; + bool AllSame = true; + for (unsigned i = 1; i < Scale-1; ++i) + if (Roots[i].size() != Roots[0].size()) { + AllSame = false; + break; + } + + if (!AllSame) + return false; + + return true; +} + +// Validate the selected reductions. All iterations must have an isomorphic +// part of the reduction chain and, for non-associative reductions, the chain +// entries must appear in order. +bool LoopReroll::ReductionTracker::validateSelected() { + // For a non-associative reduction, the chain entries must appear in order. + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int PrevIter = 0, BaseCount = 0, Count = 0; + for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), + JE = PossibleReds[i].end(); J != JE; ++J) { + // Note that all instructions in the chain must have been found because + // all instructions in the function must have been assigned to some + // iteration. + int Iter = PossibleRedIter[*J]; + if (Iter != PrevIter && Iter != PrevIter + 1 && + !PossibleReds[i].getReducedValue()->isAssociative()) { + DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << + *J << "\n"); + return false; + } + + if (Iter != PrevIter) { + if (Count != BaseCount) { + DEBUG(dbgs() << "LRR: Iteration " << PrevIter << + " reduction use count " << Count << + " is not equal to the base use count " << + BaseCount << "\n"); + return false; + } + + Count = 0; + } + + ++Count; + if (Iter == 0) + ++BaseCount; + + PrevIter = Iter; + } + } + + return true; +} + +// For all selected reductions, remove all parts except those in the first +// iteration (and the PHI). Replace outside uses of the reduced value with uses +// of the first-iteration reduced value (in other words, reroll the selected +// reductions). +void LoopReroll::ReductionTracker::replaceSelected() { + // Fixup reductions to refer to the last instruction associated with the + // first iteration (not the last). + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int j = 0; + for (int e = PossibleReds[i].size(); j != e; ++j) + if (PossibleRedIter[PossibleReds[i][j]] != 0) { + --j; + break; + } + + // Replace users with the new end-of-chain value. 
+ SmallInstructionVector Users; + for (Value::use_iterator UI = + PossibleReds[i].getReducedValue()->use_begin(), + UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI) + Users.push_back(cast<Instruction>(*UI)); + + for (SmallInstructionVector::iterator J = Users.begin(), + JE = Users.end(); J != JE; ++J) + (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(), + PossibleReds[i][j]); + } +} + +// Reroll the provided loop with respect to the provided induction variable. +// Generally, we're looking for a loop like this: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// f(%iv) +// %iv.1 = add %iv, 1 <-- a root increment +// f(%iv.1) +// %iv.2 = add %iv, 2 <-- a root increment +// f(%iv.2) +// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment +// f(%iv.scale_m_1) +// ... +// %iv.next = add %iv, scale +// %cmp = icmp(%iv, ...) +// br %cmp, header, exit +// +// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of +// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can +// be intermixed with eachother. The restriction imposed by this algorithm is +// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1), +// etc. be the same. +// +// First, we collect the use set of %iv, excluding the other increment roots. +// This gives us f(%iv). Then we iterate over the loop instructions (scale-1) +// times, having collected the use set of f(%iv.(i+1)), during which we: +// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to +// the next unmatched instruction in f(%iv.(i+1)). +// - Ensure that both matched instructions don't have any external users +// (with the exception of last-in-chain reduction instructions). +// - Track the (aliasing) write set, and other side effects, of all +// instructions that belong to future iterations that come before the matched +// instructions. If the matched instructions read from that write set, then +// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in +// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly, +// if any of these future instructions had side effects (could not be +// speculatively executed), and so do the matched instructions, when we +// cannot reorder those side-effect-producing instructions, and rerolling +// fails. +// +// Finally, we make sure that all loop instructions are either loop increment +// roots, belong to simple latch code, parts of validated reductions, part of +// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions +// have been validated), then we reroll the loop. +bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, + const SCEV *IterCount, + ReductionTracker &Reductions) { + const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); + uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> + getValue()->getZExtValue(); + // The collection of loop increment instructions. + SmallInstructionVector LoopIncs; + uint64_t Scale = Inc; + + // The effective induction variable, IV, is normally also the real induction + // variable. When we're dealing with a loop like: + // for (int i = 0; i < 500; ++i) + // x[3*i] = ...; + // x[3*i+1] = ...; + // x[3*i+2] = ...; + // then the real IV is still i, but the effective IV is (3*i). 
+ Instruction *RealIV = IV; + if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs)) + return false; + + assert(Scale <= MaxInc && "Scale is too large"); + assert(Scale > 1 && "Scale must be at least 2"); + + // The set of increment instructions for each increment value. + SmallVector<SmallInstructionVector, 32> Roots(Scale-1); + SmallInstructionSet AllRoots; + if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs)) + return false; + + DEBUG(dbgs() << "LRR: Found all root induction increments for: " << + *RealIV << "\n"); + + // An array of just the possible reductions for this scale factor. When we + // collect the set of all users of some root instructions, these reduction + // instructions are treated as 'final' (their uses are not considered). + // This is important because we don't want the root use set to search down + // the reduction chain. + SmallInstructionSet PossibleRedSet; + SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet; + Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet, + PossibleRedLastSet); + + // We now need to check for equivalence of the use graph of each root with + // that of the primary induction variable (excluding the roots). Our goal + // here is not to solve the full graph isomorphism problem, but rather to + // catch common cases without a lot of work. As a result, we will assume + // that the relative order of the instructions in each unrolled iteration + // is the same (although we will not make an assumption about how the + // different iterations are intermixed). Note that while the order must be + // the same, the instructions may not be in the same basic block. + SmallInstructionSet Exclude(AllRoots); + Exclude.insert(LoopIncs.begin(), LoopIncs.end()); + + DenseSet<Instruction *> BaseUseSet; + collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet); + + DenseSet<Instruction *> AllRootUses; + std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1); + + bool MatchFailed = false; + for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) { + DenseSet<Instruction *> &RootUseSet = RootUseSets[i]; + collectInLoopUserSet(L, Roots[i], SmallInstructionSet(), + PossibleRedSet, RootUseSet); + + DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() << + " vs. iteration increment " << (i+1) << + " use set size: " << RootUseSet.size() << "\n"); + + if (BaseUseSet.size() != RootUseSet.size()) { + MatchFailed = true; + break; + } + + // In addition to regular aliasing information, we need to look for + // instructions from later (future) iterations that have side effects + // preventing us from reordering them past other instructions with side + // effects. + bool FutureSideEffects = false; + AliasSetTracker AST(*AA); + + // The map between instructions in f(%iv.(i+1)) and f(%iv). + DenseMap<Value *, Value *> BaseMap; + + assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops"); + for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(), + JE = Header->end(); J1 != JE && !MatchFailed; ++J1) { + if (cast<Instruction>(J1) == RealIV) + continue; + if (cast<Instruction>(J1) == IV) + continue; + if (!BaseUseSet.count(J1)) + continue; + if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs. + continue; + + while (J2 != JE && (!RootUseSet.count(J2) || + std::find(Roots[i].begin(), Roots[i].end(), J2) != + Roots[i].end())) { + // As we iterate through the instructions, instructions that don't + // belong to previous iterations (or the base case), must belong to + // future iterations. 
We want to track the alias set of writes from + // previous iterations. + if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) && + !AllRootUses.count(J2)) { + if (J2->mayWriteToMemory()) + AST.add(J2); + + // Note: This is specifically guarded by a check on isa<PHINode>, + // which while a valid (somewhat arbitrary) micro-optimization, is + // needed because otherwise isSafeToSpeculativelyExecute returns + // false on PHI nodes. + if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL)) + FutureSideEffects = true; + } + + ++J2; + } + + if (!J1->isSameOperationAs(J2)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << "\n"); + MatchFailed = true; + break; + } + + // Make sure that this instruction, which is in the use set of this + // root instruction, does not also belong to the base set or the set of + // some previous root instruction. + if (BaseUseSet.count(J2) || AllRootUses.count(J2)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (prev. case overlap)\n"); + MatchFailed = true; + break; + } + + // Make sure that we don't alias with any instruction in the alias set + // tracker. If we do, then we depend on a future iteration, and we + // can't reroll. + if (J2->mayReadFromMemory()) { + for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end(); + K != KE && !MatchFailed; ++K) { + if (K->aliasesUnknownInst(J2, *AA)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (depends on future store)\n"); + MatchFailed = true; + break; + } + } + } + + // If we've past an instruction from a future iteration that may have + // side effects, and this instruction might also, then we can't reorder + // them, and this matching fails. As an exception, we allow the alias + // set tracker to handle regular (simple) load/store dependencies. + if (FutureSideEffects && + ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) || + (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << + " (side effects prevent reordering)\n"); + MatchFailed = true; + break; + } + + // For instructions that are part of a reduction, if the operation is + // associative, then don't bother matching the operands (because we + // already know that the instructions are isomorphic, and the order + // within the iteration does not matter). For non-associative reductions, + // we do need to match the operands, because we need to reject + // out-of-order instructions within an iteration! + // For example (assume floating-point addition), we need to reject this: + // x += a[i]; x += b[i]; + // x += a[i+1]; x += b[i+1]; + // x += b[i+2]; x += a[i+2]; + bool InReduction = Reductions.isPairInSame(J1, J2); + + if (!(InReduction && J1->isAssociative())) { + bool Swapped = false, SomeOpMatched = false;; + for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { + Value *Op2 = J2->getOperand(j); + + // If this is part of a reduction (and the operation is not + // associatve), then we match all operands, but not those that are + // part of the reduction. 
+ if (InReduction) + if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) + if (Reductions.isPairInSame(J2, Op2I)) + continue; + + DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2); + if (BMI != BaseMap.end()) + Op2 = BMI->second; + else if (std::find(Roots[i].begin(), Roots[i].end(), + (Instruction*) Op2) != Roots[i].end()) + Op2 = IV; + + if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) { + // If we've not already decided to swap the matched operands, and + // we've not already matched our first operand (note that we could + // have skipped matching the first operand because it is part of a + // reduction above), and the instruction is commutative, then try + // the swapped match. + if (!Swapped && J1->isCommutative() && !SomeOpMatched && + J1->getOperand(!j) == Op2) { + Swapped = true; + } else { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (operand " << j << ")\n"); + MatchFailed = true; + break; + } + } + + SomeOpMatched = true; + } + } + + if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) || + (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (uses outside loop)\n"); + MatchFailed = true; + break; + } + + if (!MatchFailed) + BaseMap.insert(std::pair<Value *, Value *>(J2, J1)); + + AllRootUses.insert(J2); + Reductions.recordPair(J1, J2, i+1); + + ++J2; + } + } + + if (MatchFailed) + return false; + + DEBUG(dbgs() << "LRR: Matched all iteration increments for " << + *RealIV << "\n"); + + DenseSet<Instruction *> LoopIncUseSet; + collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(), + SmallInstructionSet(), LoopIncUseSet); + DEBUG(dbgs() << "LRR: Loop increment set size: " << + LoopIncUseSet.size() << "\n"); + + // Make sure that all instructions in the loop have been included in some + // use set. + for (BasicBlock::iterator J = Header->begin(), JE = Header->end(); + J != JE; ++J) { + if (isa<DbgInfoIntrinsic>(J)) + continue; + if (cast<Instruction>(J) == RealIV) + continue; + if (cast<Instruction>(J) == IV) + continue; + if (BaseUseSet.count(J) || AllRootUses.count(J) || + (LoopIncUseSet.count(J) && (J->isTerminator() || + isSafeToSpeculativelyExecute(J, DL)))) + continue; + + if (AllRoots.count(J)) + continue; + + if (Reductions.isSelectedPHI(J)) + continue; + + DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV << + " unprocessed instruction found: " << *J << "\n"); + MatchFailed = true; + break; + } + + if (MatchFailed) + return false; + + DEBUG(dbgs() << "LRR: all instructions processed from " << + *RealIV << "\n"); + + if (!Reductions.validateSelected()) + return false; + + // At this point, we've validated the rerolling, and we're committed to + // making changes! + + Reductions.replaceSelected(); + + // Remove instructions associated with non-base iterations. + for (BasicBlock::reverse_iterator J = Header->rbegin(); + J != Header->rend();) { + if (AllRootUses.count(&*J)) { + Instruction *D = &*J; + DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); + D->eraseFromParent(); + continue; + } + + ++J; + } + + // Insert the new induction variable. + const SCEV *Start = RealIVSCEV->getStart(); + if (Inc == 1) + Start = SE->getMulExpr(Start, + SE->getConstant(Start->getType(), Scale)); + const SCEVAddRecExpr *H = + cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start, + SE->getConstant(RealIVSCEV->getType(), 1), + L, SCEV::FlagAnyWrap)); + { // Limit the lifetime of SCEVExpander. 
+ SCEVExpander Expander(*SE, "reroll"); + PHINode *NewIV = + cast<PHINode>(Expander.expandCodeFor(H, IV->getType(), + Header->begin())); + for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(), + JE = BaseUseSet.end(); J != JE; ++J) + (*J)->replaceUsesOfWith(IV, NewIV); + + if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { + if (LoopIncUseSet.count(BI)) { + const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); + if (Inc == 1) + ICSCEV = + SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale)); + Value *IC; + if (isa<SCEVConstant>(ICSCEV)) { + IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), BI); + } else { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + Preheader = InsertPreheaderForLoop(L, this); + + IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), + Preheader->getTerminator()); + } + + Value *NewIVNext = NewIV->getIncomingValueForBlock(Header); + Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIVNext, IC, + "exitcond"); + BI->setCondition(Cond); + + if (BI->getSuccessor(1) != Header) + BI->swapSuccessors(); + } + } + } + + SimplifyInstructionsInBlock(Header, DL, TLI); + DeleteDeadPHIs(Header, TLI); + ++NumRerolledLoops; + return true; +} + +bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { + AA = &getAnalysis<AliasAnalysis>(); + LI = &getAnalysis<LoopInfo>(); + SE = &getAnalysis<ScalarEvolution>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + DL = getAnalysisIfAvailable<DataLayout>(); + DT = &getAnalysis<DominatorTree>(); + + BasicBlock *Header = L->getHeader(); + DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << + "] Loop %" << Header->getName() << " (" << + L->getNumBlocks() << " block(s))\n"); + + bool Changed = false; + + // For now, we'll handle only single BB loops. + if (L->getNumBlocks() > 1) + return Changed; + + if (!SE->hasLoopInvariantBackedgeTakenCount(L)) + return Changed; + + const SCEV *LIBETC = SE->getBackedgeTakenCount(L); + const SCEV *IterCount = + SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); + DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); + + // First, we need to find the induction variable with respect to which we can + // reroll (there may be several possible options). + SmallInstructionVector PossibleIVs; + collectPossibleIVs(L, PossibleIVs); + + if (PossibleIVs.empty()) { + DEBUG(dbgs() << "LRR: No possible IVs found\n"); + return Changed; + } + + ReductionTracker Reductions; + collectPossibleReductions(L, Reductions); + + // For each possible IV, collect the associated possible set of 'root' nodes + // (i+1, i+2, etc.). + for (SmallInstructionVector::iterator I = PossibleIVs.begin(), + IE = PossibleIVs.end(); I != IE; ++I) + if (reroll(*I, L, Header, IterCount, Reductions)) { + Changed = true; + break; + } + + return Changed; +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 73e44d7..eff5268 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -774,6 +774,16 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { } namespace { +class LSRUse; +} +// Check if it is legal to fold 2 base registers. +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F); +// Get the cost of the scaling factor used in F for LU. 
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F); + +namespace { /// Cost - This class is used to measure and compare candidate formulae. class Cost { @@ -785,11 +795,12 @@ class Cost { unsigned NumBaseAdds; unsigned ImmCost; unsigned SetupCost; + unsigned ScaleCost; public: Cost() : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0), - SetupCost(0) {} + SetupCost(0), ScaleCost(0) {} bool operator<(const Cost &Other) const; @@ -799,9 +810,9 @@ public: // Once any of the metrics loses, they must all remain losers. bool isValid() { return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds - | ImmCost | SetupCost) != ~0u) + | ImmCost | SetupCost | ScaleCost) != ~0u) || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds - & ImmCost & SetupCost) == ~0u); + & ImmCost & SetupCost & ScaleCost) == ~0u); } #endif @@ -810,12 +821,14 @@ public: return NumRegs == ~0u; } - void RateFormula(const Formula &F, + void RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); void print(raw_ostream &OS) const; @@ -900,12 +913,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, } } -void Cost::RateFormula(const Formula &F, +void Cost::RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs) { // Tally up the registers. if (const SCEV *ScaledReg = F.ScaledReg) { @@ -932,7 +947,12 @@ void Cost::RateFormula(const Formula &F, // Determine how many (unfolded) adds we'll need inside the loop. size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); if (NumBaseParts > 1) - NumBaseAdds += NumBaseParts - 1; + // Do not count the base and a possible second register if the target + // allows to fold 2 registers. + NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F)); + + // Accumulate non-free scaling amounts. + ScaleCost += getScalingFactorCost(TTI, LU, F); // Tally up the non-zero immediates. for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), @@ -955,6 +975,7 @@ void Cost::Loose() { NumBaseAdds = ~0u; ImmCost = ~0u; SetupCost = ~0u; + ScaleCost = ~0u; } /// operator< - Choose the lower cost. @@ -967,6 +988,8 @@ bool Cost::operator<(const Cost &Other) const { return NumIVMuls < Other.NumIVMuls; if (NumBaseAdds != Other.NumBaseAdds) return NumBaseAdds < Other.NumBaseAdds; + if (ScaleCost != Other.ScaleCost) + return ScaleCost < Other.ScaleCost; if (ImmCost != Other.ImmCost) return ImmCost < Other.ImmCost; if (SetupCost != Other.SetupCost) @@ -983,6 +1006,8 @@ void Cost::print(raw_ostream &OS) const { if (NumBaseAdds != 0) OS << ", plus " << NumBaseAdds << " base add" << (NumBaseAdds == 1 ? "" : "s"); + if (ScaleCost != 0) + OS << ", plus " << ScaleCost << " scale cost"; if (ImmCost != 0) OS << ", plus " << ImmCost << " imm cost"; if (SetupCost != 0) @@ -1145,6 +1170,13 @@ public: /// may be used. bool AllFixupsOutsideLoop; + /// RigidFormula is set to true to guarantee that this use will be associated + /// with a single formula--the one that initially matched. Some SCEV + /// expressions cannot be expanded. 
This allows LSR to consider the registers + /// used by those expressions without the need to expand them later after + /// changing the formula. + bool RigidFormula; + /// WidestFixupType - This records the widest use type for any fixup using /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different /// max fixup widths to be equivalent, because the narrower one may be relying @@ -1163,6 +1195,7 @@ public: MinOffset(INT64_MAX), MaxOffset(INT64_MIN), AllFixupsOutsideLoop(true), + RigidFormula(false), WidestFixupType(0) {} bool HasFormulaWithSameRegs(const Formula &F) const; @@ -1189,6 +1222,9 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. bool LSRUse::InsertFormula(const Formula &F) { + if (!Formulae.empty() && RigidFormula) + return false; + SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); // Unstable sort by host order ok, because this is only used for uniquifying. @@ -1359,6 +1395,66 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F) { + // If F is used as an Addressing Mode, it may fold one Base plus one + // scaled register. If the scaled register is nil, do as if another + // element of the base regs is a 1-scaled register. + // This is possible if BaseRegs has at least 2 registers. + + // If this is not an address calculation, this is not an addressing mode + // use. + if (LU.Kind != LSRUse::Address) + return false; + + // F is already scaled. + if (F.Scale != 0) + return false; + + // We need to keep one register for the base and one to scale. + if (F.BaseRegs.size() < 2) + return false; + + return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + F.BaseGV, F.BaseOffset, F.HasBaseReg, 1); + } + +static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F) { + if (!F.Scale) + return 0; + assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F) && "Illegal formula in use."); + + switch (LU.Kind) { + case LSRUse::Address: { + // Check the scaling factor cost with both the min and max offsets. + int ScaleCostMinOffset = + TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, + F.BaseOffset + LU.MinOffset, + F.HasBaseReg, F.Scale); + int ScaleCostMaxOffset = + TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, + F.BaseOffset + LU.MaxOffset, + F.HasBaseReg, F.Scale); + + assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 && + "Legal addressing mode has an illegal cost!"); + return std::max(ScaleCostMinOffset, ScaleCostMaxOffset); + } + case LSRUse::ICmpZero: + // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg. + // Therefore, return 0 in case F.Scale == -1. 
+ return F.Scale != -1; + + case LSRUse::Basic: + case LSRUse::Special: + return 0; + } + + llvm_unreachable("Invalid LSRUse Kind!"); +} + static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, @@ -1664,7 +1760,7 @@ void LSRInstance::OptimizeShadowIV() { IVUsers::const_iterator CandidateUI = UI; ++UI; Instruction *ShadowUse = CandidateUI->getUser(); - Type *DestTy = NULL; + Type *DestTy = 0; bool IsSigned = false; /* If shadow use is a int->float cast then insert a second IV @@ -1726,7 +1822,7 @@ void LSRInstance::OptimizeShadowIV() { continue; /* Initialize new IV, double d = 0.0 in above example. */ - ConstantInt *C = NULL; + ConstantInt *C = 0; if (Incr->getOperand(0) == PH) C = dyn_cast<ConstantInt>(Incr->getOperand(1)); else if (Incr->getOperand(1) == PH) @@ -2858,7 +2954,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // x == y --> x - y == 0 const SCEV *N = SE.getSCEV(NV); - if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) { + if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) { // S is normalized, so normalize N before folding it into S // to keep the result normalized. N = TransformForPostIncUse(Normalize, N, CI, 0, @@ -2901,6 +2997,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { /// and loop-computable portions. void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { + // Mark uses whose expressions cannot be expanded. + if (!isSafeToExpand(S, SE)) + LU.RigidFormula = true; + Formula F; F.InitialMatch(S, L, SE); bool Inserted = InsertFormula(LU, LUIdx, F); @@ -3048,7 +3148,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, if (Remainder) Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); } - return NULL; + return 0; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. if (AR->getStart()->isZero()) @@ -3060,7 +3160,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, // does not pertain to this loop. if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); - Remainder = NULL; + Remainder = 0; } if (Remainder != AR->getStart()) { if (!Remainder) @@ -3082,7 +3182,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); if (Remainder) Ops.push_back(SE.getMulExpr(C, Remainder)); - return NULL; + return 0; } } return S; @@ -3607,7 +3707,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { abs64(NewF.BaseOffset)) && (C->getValue()->getValue() + NewF.BaseOffset).countTrailingZeros() >= - CountTrailingZeros_64(NewF.BaseOffset)) + countTrailingZeros<uint64_t>(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. @@ -3690,7 +3790,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // the corresponding bad register from the Regs set. 
Cost CostF; Regs.clear(); - CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, + CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU, &LoserRegs); if (CostF.isLoser()) { // During initial formula generation, undesirable formulae are generated @@ -3726,7 +3826,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { Cost CostBest; Regs.clear(); - CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE, + DT, LU); if (CostF < CostBest) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); @@ -4079,7 +4180,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, // the current best, prune the search at that point. NewCost = CurCost; NewRegs = CurRegs; - NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT); + NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT, + LU); if (NewCost < SolutionCost) { Workspace.push_back(&F); if (Workspace.size() != Uses.size()) { @@ -4266,6 +4368,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF, SCEVExpander &Rewriter, SmallVectorImpl<WeakVH> &DeadInsts) const { const LSRUse &LU = Uses[LF.LUIdx]; + if (LU.RigidFormula) + return LF.OperandValToReplace; // Determine an input position which will be dominated by the operands and // which will dominate the result. diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 80d060b..08ac38d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -49,12 +49,17 @@ namespace { class LoopUnroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopUnroll(int T = -1, int C = -1, int P = -1) : LoopPass(ID) { + LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) { CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T); CurrentCount = (C == -1) ? UnrollCount : unsigned(C); CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P; + CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R; UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); + UserAllowPartial = (P != -1) || + (UnrollAllowPartial.getNumOccurrences() > 0); + UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0); + UserCount = (C != -1) || (UnrollCount.getNumOccurrences() > 0); initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -75,7 +80,11 @@ namespace { unsigned CurrentCount; unsigned CurrentThreshold; bool CurrentAllowPartial; + bool CurrentRuntime; + bool UserCount; // CurrentCount is user-specified. bool UserThreshold; // CurrentThreshold is user-specified. + bool UserAllowPartial; // CurrentAllowPartial is user-specified. + bool UserRuntime; // CurrentRuntime is user-specified. bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -110,8 +119,9 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { - return new LoopUnroll(Threshold, Count, AllowPartial); +Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, + int Runtime) { + return new LoopUnroll(Threshold, Count, AllowPartial, Runtime); } /// ApproximateLoopSize - Approximate the size of the loop. 
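For context on the new Runtime parameter, a minimal usage sketch (not part of the patch; it assumes the legacy PassManager headers of this LLVM branch): passing -1 leaves a setting to its command-line option or the target's unrolling preferences, while an explicit 0/1 marks it user-specified.

#include "llvm/PassManager.h"
#include "llvm/Transforms/Scalar.h"

// Request runtime unrolling explicitly; count and partial unrolling stay at
// their defaults because they are passed as -1 (not user-set).
void addUnrollPass(llvm::PassManagerBase &PM) {
  PM.add(llvm::createLoopUnrollPass(/*Threshold=*/150, /*Count=*/-1,
                                    /*AllowPartial=*/-1, /*Runtime=*/1));
}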
@@ -145,16 +155,24 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { << "] Loop %" << Header->getName() << "\n"); (void)Header; + TargetTransformInfo::UnrollingPreferences UP; + UP.Threshold = CurrentThreshold; + UP.OptSizeThreshold = OptSizeUnrollThreshold; + UP.Count = CurrentCount; + UP.Partial = CurrentAllowPartial; + UP.Runtime = CurrentRuntime; + TTI.getUnrollingPreferences(L, UP); + // Determine the current unrolling threshold. While this is normally set // from UnrollThreshold, it is overridden to a smaller value if the current // function is marked as optimize-for-size, and the unroll threshold was // not user specified. - unsigned Threshold = CurrentThreshold; + unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; if (!UserThreshold && Header->getParent()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize)) - Threshold = OptSizeUnrollThreshold; + Threshold = UP.OptSizeThreshold; // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -167,11 +185,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { TripCount = SE->getSmallConstantTripCount(L, LatchBlock); TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); } + + bool Runtime = UserRuntime ? CurrentRuntime : UP.Runtime; + // Use a default unroll-count if the user doesn't specify a value // and the trip count is a run-time value. The default is different // for run-time or compile-time trip count loops. - unsigned Count = CurrentCount; - if (UnrollRuntime && CurrentCount == 0 && TripCount == 0) + unsigned Count = UserCount ? CurrentCount : UP.Count; + if (Runtime && Count == 0 && TripCount == 0) Count = UnrollRuntimeCount; if (Count == 0) { @@ -204,7 +225,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (TripCount != 1 && Size > Threshold) { DEBUG(dbgs() << " Too large to fully unroll with count: " << Count << " because size: " << Size << ">" << Threshold << "\n"); - if (!CurrentAllowPartial && !(UnrollRuntime && TripCount == 0)) { + bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; + if (!AllowPartial && !(Runtime && TripCount == 0)) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); return false; @@ -215,7 +237,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { while (Count != 0 && TripCount%Count != 0) Count--; } - else if (UnrollRuntime) { + else if (Runtime) { // Reduce unroll count to be a lower power-of-two value while (Count != 0 && Size > Threshold) { Count >>= 1; @@ -231,7 +253,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. - if (!UnrollLoop(L, Count, TripCount, UnrollRuntime, TripMultiple, LI, &LPM)) + if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, &LPM)) return false; return true; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index 0e8199f..c4ebfd5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -87,8 +87,8 @@ namespace { typedef LoopPropsMap::iterator LoopPropsMapIt; LoopPropsMap LoopsProperties; - UnswitchedValsMap* CurLoopInstructions; - LoopProperties* CurrentLoopProperties; + UnswitchedValsMap *CurLoopInstructions; + LoopProperties *CurrentLoopProperties; // Max size of code we can produce on remained iterations. 
unsigned MaxSize; @@ -96,30 +96,30 @@ namespace { public: LUAnalysisCache() : - CurLoopInstructions(NULL), CurrentLoopProperties(NULL), + CurLoopInstructions(0), CurrentLoopProperties(0), MaxSize(Threshold) {} // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. - bool countLoop(const Loop* L, const TargetTransformInfo &TTI); + bool countLoop(const Loop *L, const TargetTransformInfo &TTI); // Clean all data related to given loop. - void forgetLoop(const Loop* L); + void forgetLoop(const Loop *L); // Mark case value as unswitched. // Since SI instruction can be partly unswitched, in order to avoid // extra unswitching in cloned loops keep track all unswitched values. - void setUnswitched(const SwitchInst* SI, const Value* V); + void setUnswitched(const SwitchInst *SI, const Value *V); // Check was this case value unswitched before or not. - bool isUnswitched(const SwitchInst* SI, const Value* V); + bool isUnswitched(const SwitchInst *SI, const Value *V); // Clone all loop-unswitch related loop properties. // Redistribute unswitching quotas. // Note, that new loop data is stored inside the VMap. - void cloneData(const Loop* NewLoop, const Loop* OldLoop, - const ValueToValueMapTy& VMap); + void cloneData(const Loop *NewLoop, const Loop *OldLoop, + const ValueToValueMapTy &VMap); }; class LoopUnswitch : public LoopPass { @@ -151,8 +151,8 @@ namespace { static char ID; // Pass ID, replacement for typeid explicit LoopUnswitch(bool Os = false) : LoopPass(ID), OptimizeForSize(Os), redoLoop(false), - currentLoop(NULL), DT(NULL), loopHeader(NULL), - loopPreheader(NULL) { + currentLoop(0), DT(0), loopHeader(0), + loopPreheader(0) { initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); } @@ -196,7 +196,7 @@ namespace { /// Split all of the edges from inside the loop to their exit blocks. /// Update the appropriate Phi nodes as we do so. - void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks); + void SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks); bool UnswitchIfProfitable(Value *LoopCond, Constant *Val); void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, @@ -212,8 +212,6 @@ namespace { Instruction *InsertPt); void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); - void RemoveBlockIfDead(BasicBlock *BB, - std::vector<Instruction*> &Worklist, Loop *l); void RemoveLoopFromHierarchy(Loop *L); bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0, BasicBlock **LoopExit = 0); @@ -225,12 +223,14 @@ namespace { // it. Returns true if we can unswitch this loop. bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { - std::pair<LoopPropsMapIt, bool> InsertRes = + LoopPropsMapIt PropsIt; + bool Inserted; + llvm::tie(PropsIt, Inserted) = LoopsProperties.insert(std::make_pair(L, LoopProperties())); - LoopProperties& Props = InsertRes.first->second; + LoopProperties &Props = PropsIt->second; - if (InsertRes.second) { + if (Inserted) { // New loop. // Limit the number of instructions to avoid causing significant code @@ -242,8 +242,7 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { // consideration code simplification opportunities and code that can // be shared by the resultant unswitched loops. 
CodeMetrics Metrics; - for (Loop::block_iterator I = L->block_begin(), - E = L->block_end(); + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) Metrics.analyzeBasicBlock(*I, TTI); @@ -253,17 +252,16 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { if (Metrics.notDuplicatable) { DEBUG(dbgs() << "NOT unswitching loop %" - << L->getHeader()->getName() << ", contents cannot be " - << "duplicated!\n"); + << L->getHeader()->getName() << ", contents cannot be " + << "duplicated!\n"); return false; } } if (!Props.CanBeUnswitchedCount) { DEBUG(dbgs() << "NOT unswitching loop %" - << L->getHeader()->getName() << ", cost too high: " - << L->getBlocks().size() << "\n"); - + << L->getHeader()->getName() << ", cost too high: " + << L->getBlocks().size() << "\n"); return false; } @@ -275,41 +273,41 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { } // Clean all data related to given loop. -void LUAnalysisCache::forgetLoop(const Loop* L) { +void LUAnalysisCache::forgetLoop(const Loop *L) { LoopPropsMapIt LIt = LoopsProperties.find(L); if (LIt != LoopsProperties.end()) { - LoopProperties& Props = LIt->second; + LoopProperties &Props = LIt->second; MaxSize += Props.CanBeUnswitchedCount * Props.SizeEstimation; LoopsProperties.erase(LIt); } - CurrentLoopProperties = NULL; - CurLoopInstructions = NULL; + CurrentLoopProperties = 0; + CurLoopInstructions = 0; } // Mark case value as unswitched. // Since SI instruction can be partly unswitched, in order to avoid // extra unswitching in cloned loops keep track all unswitched values. -void LUAnalysisCache::setUnswitched(const SwitchInst* SI, const Value* V) { +void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) { (*CurLoopInstructions)[SI].insert(V); } // Check was this case value unswitched before or not. -bool LUAnalysisCache::isUnswitched(const SwitchInst* SI, const Value* V) { +bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) { return (*CurLoopInstructions)[SI].count(V); } // Clone all loop-unswitch related loop properties. // Redistribute unswitching quotas. // Note, that new loop data is stored inside the VMap. -void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop, - const ValueToValueMapTy& VMap) { +void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop, + const ValueToValueMapTy &VMap) { - LoopProperties& NewLoopProps = LoopsProperties[NewLoop]; - LoopProperties& OldLoopProps = *CurrentLoopProperties; - UnswitchedValsMap& Insts = OldLoopProps.UnswitchedVals; + LoopProperties &NewLoopProps = LoopsProperties[NewLoop]; + LoopProperties &OldLoopProps = *CurrentLoopProperties; + UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals; // Reallocate "can-be-unswitched quota" @@ -324,9 +322,9 @@ void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop, // for new loop switches we clone info about values that was // already unswitched and has redundant successors. 
for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) { - const SwitchInst* OldInst = I->first; - Value* NewI = VMap.lookup(OldInst); - const SwitchInst* NewInst = cast_or_null<SwitchInst>(NewI); + const SwitchInst *OldInst = I->first; + Value *NewI = VMap.lookup(OldInst); + const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI); assert(NewInst && "All instructions that are in SrcBB must be in VMap."); NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst]; @@ -458,14 +456,14 @@ bool LoopUnswitch::processCurrentLoop() { // Find a value to unswitch on: // FIXME: this should chose the most expensive case! // FIXME: scan for a case with a non-critical edge? - Constant *UnswitchVal = NULL; + Constant *UnswitchVal = 0; // Do not process same value again and again. // At this point we have some cases already unswitched and // some not yet unswitched. Let's find the first not yet unswitched one. for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { - Constant* UnswitchValCandidate = i.getCaseValue(); + Constant *UnswitchValCandidate = i.getCaseValue(); if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) { UnswitchVal = UnswitchValCandidate; break; @@ -511,7 +509,8 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, // Already visited. Without more analysis, this could indicate an infinite // loop. return false; - } else if (!L->contains(BB)) { + } + if (!L->contains(BB)) { // Otherwise, this is a loop exit, this is fine so long as this is the // first exit. if (ExitBB != 0) return false; @@ -595,11 +594,11 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, // on already unswitched cases. for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { - BasicBlock* LoopExitCandidate; + BasicBlock *LoopExitCandidate; if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, i.getCaseSuccessor()))) { // Okay, we found a trivial case, remember the value that is trivial. - ConstantInt* CaseVal = i.getCaseValue(); + ConstantInt *CaseVal = i.getCaseValue(); // Check that it was not unswitched before, since already unswitched // trivial vals are looks trivial too. @@ -752,7 +751,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, /// SplitExitEdges - Split all of the edges from inside the loop to their exit /// blocks. Update the appropriate Phi nodes as we do so. void LoopUnswitch::SplitExitEdges(Loop *L, - const SmallVector<BasicBlock *, 8> &ExitBlocks){ + const SmallVectorImpl<BasicBlock *> &ExitBlocks){ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { BasicBlock *ExitBlock = ExitBlocks[i]; @@ -854,9 +853,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // If the successor of the exit block had PHI nodes, add an entry for // NewExit. 
- PHINode *PN; - for (BasicBlock::iterator I = ExitSucc->begin(); isa<PHINode>(I); ++I) { - PN = cast<PHINode>(I); + for (BasicBlock::iterator I = ExitSucc->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) { Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]); ValueToValueMapTy::iterator It = VMap.find(V); if (It != VMap.end()) V = It->second; @@ -864,8 +862,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, } if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { - PN = PHINode::Create(LPad->getType(), 0, "", - ExitSucc->getFirstInsertionPt()); + PHINode *PN = PHINode::Create(LPad->getType(), 0, "", + ExitSucc->getFirstInsertionPt()); for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); I != E; ++I) { @@ -946,117 +944,6 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, ++NumSimplify; } -/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop -/// information, and remove any dead successors it has. -/// -void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, - std::vector<Instruction*> &Worklist, - Loop *L) { - if (pred_begin(BB) != pred_end(BB)) { - // This block isn't dead, since an edge to BB was just removed, see if there - // are any easy simplifications we can do now. - if (BasicBlock *Pred = BB->getSinglePredecessor()) { - // If it has one pred, fold phi nodes in BB. - while (isa<PHINode>(BB->begin())) - ReplaceUsesOfWith(BB->begin(), - cast<PHINode>(BB->begin())->getIncomingValue(0), - Worklist, L, LPM); - - // If this is the header of a loop and the only pred is the latch, we now - // have an unreachable loop. - if (Loop *L = LI->getLoopFor(BB)) - if (loopHeader == BB && L->contains(Pred)) { - // Remove the branch from the latch to the header block, this makes - // the header dead, which will make the latch dead (because the header - // dominates the latch). - LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L); - Pred->getTerminator()->eraseFromParent(); - new UnreachableInst(BB->getContext(), Pred); - - // The loop is now broken, remove it from LI. - RemoveLoopFromHierarchy(L); - - // Reprocess the header, which now IS dead. - RemoveBlockIfDead(BB, Worklist, L); - return; - } - - // If pred ends in a uncond branch, add uncond branch to worklist so that - // the two blocks will get merged. - if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator())) - if (BI->isUnconditional()) - Worklist.push_back(BI); - } - return; - } - - DEBUG(dbgs() << "Nuking dead block: " << *BB); - - // Remove the instructions in the basic block from the worklist. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - RemoveFromWorklist(I, Worklist); - - // Anything that uses the instructions in this basic block should have their - // uses replaced with undefs. - // If I is not void type then replaceAllUsesWith undef. - // This allows ValueHandlers and custom metadata to adjust itself. - if (!I->getType()->isVoidTy()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); - } - - // If this is the edge to the header block for a loop, remove the loop and - // promote all subloops. - if (Loop *BBLoop = LI->getLoopFor(BB)) { - if (BBLoop->getLoopLatch() == BB) { - RemoveLoopFromHierarchy(BBLoop); - if (currentLoop == BBLoop) { - currentLoop = 0; - redoLoop = false; - } - } - } - - // Remove the block from the loop info, which removes it from any loops it - // was in. - LI->removeBlock(BB); - - - // Remove phi node entries in successors for this block. 
- TerminatorInst *TI = BB->getTerminator(); - SmallVector<BasicBlock*, 4> Succs; - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - Succs.push_back(TI->getSuccessor(i)); - TI->getSuccessor(i)->removePredecessor(BB); - } - - // Unique the successors, remove anything with multiple uses. - array_pod_sort(Succs.begin(), Succs.end()); - Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end()); - - // Remove the basic block, including all of the instructions contained in it. - LPM->deleteSimpleAnalysisValue(BB, L); - BB->eraseFromParent(); - // Remove successor blocks here that are not dead, so that we know we only - // have dead blocks in this list. Nondead blocks have a way of becoming dead, - // then getting removed before we revisit them, which is badness. - // - for (unsigned i = 0; i != Succs.size(); ++i) - if (pred_begin(Succs[i]) != pred_end(Succs[i])) { - // One exception is loop headers. If this block was the preheader for a - // loop, then we DO want to visit the loop so the loop gets deleted. - // We know that if the successor is a loop header, that this loop had to - // be the preheader: the case where this was the latch block was handled - // above and headers can only have two predecessors. - if (!LI->isLoopHeader(Succs[i])) { - Succs.erase(Succs.begin()+i); - --i; - } - } - - for (unsigned i = 0, e = Succs.size(); i != e; ++i) - RemoveBlockIfDead(Succs[i], Worklist, L); -} - /// RemoveLoopFromHierarchy - We have discovered that the specified loop has /// become unwrapped, either because the backedge was deleted, or because the /// edge into the header was removed. If the edge into the header from the @@ -1088,7 +975,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, std::vector<Instruction*> Worklist; LLVMContext &Context = Val->getContext(); - // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC // in the loop with the appropriate one directly. if (IsEqual || (isa<ConstantInt>(Val) && @@ -1108,8 +994,8 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Worklist.push_back(U); } - for (std::vector<Instruction*>::iterator UI = Worklist.begin(); - UI != Worklist.end(); ++UI) + for (std::vector<Instruction*>::iterator UI = Worklist.begin(), + UE = Worklist.end(); UI != UE; ++UI) (*UI)->replaceUsesOfWith(LIC, Replacement); SimplifyCode(Worklist, L); @@ -1266,23 +1152,6 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { continue; } - if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){ - // Conditional branch. Turn it into an unconditional branch, then - // remove dead blocks. - continue; // FIXME: Enable. 
- - DEBUG(dbgs() << "Folded branch: " << *BI); - BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue()); - BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue()); - DeadSucc->removePredecessor(BI->getParent(), true); - Worklist.push_back(BranchInst::Create(LiveSucc, BI)); - LPM->deleteSimpleAnalysisValue(BI, L); - BI->eraseFromParent(); - RemoveFromWorklist(BI, Worklist); - ++NumSimplify; - - RemoveBlockIfDead(DeadSucc, Worklist, L); - } continue; } } diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index be0f0e8..9912d3d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -170,14 +170,17 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const { // pessimize the llvm optimizer. // // Since we don't have perfect knowledge here, make some assumptions: assume - // the maximum GPR width is the same size as the pointer size and assume that - // this width can be stored. If so, check to see whether we will end up - // actually reducing the number of stores used. + // the maximum GPR width is the same size as the largest legal integer + // size. If so, check to see whether we will end up actually reducing the + // number of stores used. unsigned Bytes = unsigned(End-Start); - unsigned NumPointerStores = Bytes/TD.getPointerSize(); + unsigned MaxIntSize = TD.getLargestLegalIntTypeSize(); + if (MaxIntSize == 0) + MaxIntSize = 1; + unsigned NumPointerStores = Bytes / MaxIntSize; // Assume the remaining bytes if any are done a byte at a time. - unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize(); + unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize; // If we will reduce the # stores (according to this heuristic), do the // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 @@ -465,7 +468,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); // Zap all the stores. - for (SmallVector<Instruction*, 16>::const_iterator + for (SmallVectorImpl<Instruction *>::const_iterator SI = Range.TheStores.begin(), SE = Range.TheStores.end(); SI != SE; ++SI) { MD->removeInstruction(*SI); @@ -626,8 +629,14 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; Type *StructTy = cast<PointerType>(A->getType())->getElementType(); - uint64_t destSize = TD->getTypeAllocSize(StructTy); + if (!StructTy->isSized()) { + // The call may never return and hence the copy-instruction may never + // be executed, and therefore it's not safe to say "the destination + // has at least <cpyLen> bytes, as implied by the copy-instruction", + return false; + } + uint64_t destSize = TD->getTypeAllocSize(StructTy); if (destSize < srcSize) return false; } else { diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp new file mode 100644 index 0000000..15cee44 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -0,0 +1,156 @@ +//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This pass tries to partially inline the fast path of well-known library +// functions, such as using square-root instructions for cases where sqrt() +// does not need to set errno. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "partially-inline-libcalls" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +namespace { + class PartiallyInlineLibCalls : public FunctionPass { + public: + static char ID; + + PartiallyInlineLibCalls() : + FunctionPass(ID) { + initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnFunction(Function &F); + + private: + /// Optimize calls to sqrt. + bool optimizeSQRT(CallInst *Call, Function *CalledFunc, + BasicBlock &CurrBB, Function::iterator &BB); + }; + + char PartiallyInlineLibCalls::ID = 0; +} + +INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls", + "Partially inline calls to library functions", false, false) + +void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetTransformInfo>(); + FunctionPass::getAnalysisUsage(AU); +} + +bool PartiallyInlineLibCalls::runOnFunction(Function &F) { + bool Changed = false; + Function::iterator CurrBB; + TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>(); + for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) { + CurrBB = BB++; + + for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end(); + II != IE; ++II) { + CallInst *Call = dyn_cast<CallInst>(&*II); + Function *CalledFunc; + + if (!Call || !(CalledFunc = Call->getCalledFunction())) + continue; + + // Skip if function either has local linkage or is not a known library + // function. + LibFunc::Func LibFunc; + if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() || + !TLI->getLibFunc(CalledFunc->getName(), LibFunc)) + continue; + + switch (LibFunc) { + case LibFunc::sqrtf: + case LibFunc::sqrt: + if (TTI->haveFastSqrt(Call->getType()) && + optimizeSQRT(Call, CalledFunc, *CurrBB, BB)) + break; + continue; + default: + continue; + } + + Changed = true; + break; + } + } + + return Changed; +} + +bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call, + Function *CalledFunc, + BasicBlock &CurrBB, + Function::iterator &BB) { + // There is no need to change the IR, since backend will emit sqrt + // instruction if the call has already been marked read-only. + if (Call->onlyReadsMemory()) + return false; + + // Do the following transformation: + // + // (before) + // dst = sqrt(src) + // + // (after) + // v0 = sqrt_noreadmem(src) # native sqrt instruction. + // if (v0 is a NaN) + // v1 = sqrt(src) # library call. + // dst = phi(v0, v1) + // + + // Move all instructions following Call to newly created block JoinBB. + // Create phi and replace all uses. 
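Expressed as source-level pseudocode, the control flow that optimizeSQRT builds below is roughly the following. This is a sketch only; hw_sqrt stands in for the readnone sqrt call that the backend can lower to a native square-root instruction:

double v0 = hw_sqrt(src);   // fast path, never touches errno
double dst;
if (v0 == v0)               // ordered compare: false only when v0 is NaN
  dst = v0;                 // native result is valid
else
  dst = sqrt(src);          // slow path: library call handles errno/negative inputs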
+ BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this); + IRBuilder<> Builder(JoinBB, JoinBB->begin()); + PHINode *Phi = Builder.CreatePHI(Call->getType(), 2); + Call->replaceAllUsesWith(Phi); + + // Create basic block LibCallBB and insert a call to library function sqrt. + BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt", + CurrBB.getParent(), JoinBB); + Builder.SetInsertPoint(LibCallBB); + Instruction *LibCall = Call->clone(); + Builder.Insert(LibCall); + Builder.CreateBr(JoinBB); + + // Add attribute "readnone" so that backend can use a native sqrt instruction + // for this call. Insert a FP compare instruction and a conditional branch + // at the end of CurrBB. + Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone); + CurrBB.getTerminator()->eraseFromParent(); + Builder.SetInsertPoint(&CurrBB); + Value *FCmp = Builder.CreateFCmpOEQ(Call, Call); + Builder.CreateCondBr(FCmp, JoinBB, LibCallBB); + + // Add phi operands. + Phi->addIncoming(Call, &CurrBB); + Phi->addIncoming(LibCall, LibCallBB); + + BB = JoinBB; + return true; +} + +FunctionPass *llvm::createPartiallyInlineLibCallsPass() { + return new PartiallyInlineLibCalls(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index a3c241d..328a9c5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -122,7 +122,6 @@ namespace { class XorOpnd { public: XorOpnd(Value *V); - const XorOpnd &operator=(const XorOpnd &That); bool isInvalid() const { return SymbolicPart == 0; } bool isOrExpr() const { return isOr; } @@ -225,15 +224,6 @@ XorOpnd::XorOpnd(Value *V) { isOr = true; } -const XorOpnd &XorOpnd::operator=(const XorOpnd &That) { - OrigVal = That.OrigVal; - SymbolicPart = That.SymbolicPart; - ConstPart = That.ConstPart; - SymbolicRank = That.SymbolicRank; - isOr = That.isOr; - return *this; -} - char Reassociate::ID = 0; INITIALIZE_PASS(Reassociate, "reassociate", "Reassociate expressions", false, false) @@ -251,21 +241,24 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { } static bool isUnmovableInstruction(Instruction *I) { - if (I->getOpcode() == Instruction::PHI || - I->getOpcode() == Instruction::LandingPad || - I->getOpcode() == Instruction::Alloca || - I->getOpcode() == Instruction::Load || - I->getOpcode() == Instruction::Invoke || - (I->getOpcode() == Instruction::Call && - !isa<DbgInfoIntrinsic>(I)) || - I->getOpcode() == Instruction::UDiv || - I->getOpcode() == Instruction::SDiv || - I->getOpcode() == Instruction::FDiv || - I->getOpcode() == Instruction::URem || - I->getOpcode() == Instruction::SRem || - I->getOpcode() == Instruction::FRem) + switch (I->getOpcode()) { + case Instruction::PHI: + case Instruction::LandingPad: + case Instruction::Alloca: + case Instruction::Load: + case Instruction::Invoke: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: return true; - return false; + case Instruction::Call: + return !isa<DbgInfoIntrinsic>(I); + default: + return false; + } } void Reassociate::BuildRankMap(Function &F) { diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index e30a274..4364720 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -214,7 +214,7 @@ public: /// This returns true if 
the block was not considered live before. bool MarkBlockExecutable(BasicBlock *BB) { if (!BBExecutable.insert(BB)) return false; - DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << "\n"); + DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n'); BBWorkList.push_back(BB); // Add the block to the work list! return true; } @@ -427,7 +427,7 @@ private: // feasible that wasn't before. Revisit the PHI nodes in the block // because they have potentially new operands. DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName() - << " -> " << Dest->getName() << "\n"); + << " -> " << Dest->getName() << '\n'); PHINode *PN; for (BasicBlock::iterator I = Dest->begin(); @@ -439,7 +439,7 @@ private: // getFeasibleSuccessors - Return a vector of booleans to indicate which // successors are reachable from a given terminator instruction. // - void getFeasibleSuccessors(TerminatorInst &TI, SmallVector<bool, 16> &Succs); + void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs); // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. @@ -501,7 +501,7 @@ private: void visitInstruction(Instruction &I) { // If a new instruction is added to LLVM that we don't handle. - dbgs() << "SCCP: Don't know how to handle: " << I; + dbgs() << "SCCP: Don't know how to handle: " << I << '\n'; markAnythingOverdefined(&I); // Just in case } }; @@ -513,7 +513,7 @@ private: // successors are reachable from a given terminator instruction. // void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, - SmallVector<bool, 16> &Succs) { + SmallVectorImpl<bool> &Succs) { Succs.resize(TI.getNumSuccessors()); if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) { if (BI->isUnconditional()) { @@ -1604,7 +1604,7 @@ bool SCCP::runOnFunction(Function &F) { Constant *Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(Inst->getType()); - DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst); + DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n'); // Replaces all of the uses of a variable with uses of the constant. Inst->replaceAllUsesWith(Const); @@ -1812,7 +1812,7 @@ bool IPSCCP::runOnModule(Module &M) { Constant *Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(Inst->getType()); - DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst); + DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n'); // Replaces all of the uses of a variable with uses of the // constant. 
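Several hunks in this change, including getFeasibleSuccessors above and SplitExitEdges earlier, switch parameters from SmallVector<T, N>& to SmallVectorImpl<T>&. SmallVectorImpl<T> is the capacity-erased base of SmallVector, so the callee no longer hard-codes the caller's inline buffer size. A short illustration of the pattern; markAllFeasible is an invented name used here only for the example:

#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Accepts a SmallVector of any inline capacity because SmallVectorImpl<T>
// is the common base class shared by all of them.
static void markAllFeasible(SmallVectorImpl<bool> &Succs, unsigned NumSuccs) {
  Succs.assign(NumSuccs, true);
}

void caller() {
  SmallVector<bool, 16> Succs;   // the "16" stays a caller-side detail
  markAllFeasible(Succs, 4);
}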
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index d073e78..9f3fc83 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -47,6 +47,7 @@ #include "llvm/InstVisitor.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -58,9 +59,9 @@ using namespace llvm; STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); -STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions"); -STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses found"); -STATISTIC(MaxPartitionUsesPerAlloca, "Maximum number of partition uses"); +STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); +STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten"); +STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition"); STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); @@ -110,17 +111,39 @@ typedef llvm::IRBuilder<false, ConstantFolder, } namespace { -/// \brief A common base class for representing a half-open byte range. -struct ByteRange { +/// \brief A used slice of an alloca. +/// +/// This structure represents a slice of an alloca used by some instruction. It +/// stores both the begin and end offsets of this use, a pointer to the use +/// itself, and a flag indicating whether we can classify the use as splittable +/// or not when forming partitions of the alloca. +class Slice { /// \brief The beginning offset of the range. uint64_t BeginOffset; /// \brief The ending offset, not included in the range. uint64_t EndOffset; - ByteRange() : BeginOffset(), EndOffset() {} - ByteRange(uint64_t BeginOffset, uint64_t EndOffset) - : BeginOffset(BeginOffset), EndOffset(EndOffset) {} + /// \brief Storage for both the use of this slice and whether it can be + /// split. + PointerIntPair<Use *, 1, bool> UseAndIsSplittable; + +public: + Slice() : BeginOffset(), EndOffset() {} + Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable) + : BeginOffset(BeginOffset), EndOffset(EndOffset), + UseAndIsSplittable(U, IsSplittable) {} + + uint64_t beginOffset() const { return BeginOffset; } + uint64_t endOffset() const { return EndOffset; } + + bool isSplittable() const { return UseAndIsSplittable.getInt(); } + void makeUnsplittable() { UseAndIsSplittable.setInt(false); } + + Use *getUse() const { return UseAndIsSplittable.getPointer(); } + + bool isDead() const { return getUse() == 0; } + void kill() { UseAndIsSplittable.setPointer(0); } /// \brief Support for ordering ranges. /// @@ -128,173 +151,67 @@ struct ByteRange { /// always increasing, and within equal start offsets, the end offsets are /// decreasing. Thus the spanning range comes first in a cluster with the /// same start position. 
- bool operator<(const ByteRange &RHS) const { - if (BeginOffset < RHS.BeginOffset) return true; - if (BeginOffset > RHS.BeginOffset) return false; - if (EndOffset > RHS.EndOffset) return true; + bool operator<(const Slice &RHS) const { + if (beginOffset() < RHS.beginOffset()) return true; + if (beginOffset() > RHS.beginOffset()) return false; + if (isSplittable() != RHS.isSplittable()) return !isSplittable(); + if (endOffset() > RHS.endOffset()) return true; return false; } /// \brief Support comparison with a single offset to allow binary searches. - friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { - return LHS.BeginOffset < RHSOffset; + friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS, + uint64_t RHSOffset) { + return LHS.beginOffset() < RHSOffset; } - friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, - const ByteRange &RHS) { - return LHSOffset < RHS.BeginOffset; + const Slice &RHS) { + return LHSOffset < RHS.beginOffset(); } - bool operator==(const ByteRange &RHS) const { - return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; + bool operator==(const Slice &RHS) const { + return isSplittable() == RHS.isSplittable() && + beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset(); } - bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } + bool operator!=(const Slice &RHS) const { return !operator==(RHS); } }; - -/// \brief A partition of an alloca. -/// -/// This structure represents a contiguous partition of the alloca. These are -/// formed by examining the uses of the alloca. During formation, they may -/// overlap but once an AllocaPartitioning is built, the Partitions within it -/// are all disjoint. -struct Partition : public ByteRange { - /// \brief Whether this partition is splittable into smaller partitions. - /// - /// We flag partitions as splittable when they are formed entirely due to - /// accesses by trivially splittable operations such as memset and memcpy. - bool IsSplittable; - - /// \brief Test whether a partition has been marked as dead. - bool isDead() const { - if (BeginOffset == UINT64_MAX) { - assert(EndOffset == UINT64_MAX); - return true; - } - return false; - } - - /// \brief Kill a partition. - /// This is accomplished by setting both its beginning and end offset to - /// the maximum possible value. - void kill() { - assert(!isDead() && "He's Dead, Jim!"); - BeginOffset = EndOffset = UINT64_MAX; - } - - Partition() : ByteRange(), IsSplittable() {} - Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) - : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} -}; - -/// \brief A particular use of a partition of the alloca. -/// -/// This structure is used to associate uses of a partition with it. They -/// mark the range of bytes which are referenced by a particular instruction, -/// and includes a handle to the user itself and the pointer value in use. -/// The bounds of these uses are determined by intersecting the bounds of the -/// memory use itself with a particular partition. As a consequence there is -/// intentionally overlap between various uses of the same partition. -class PartitionUse : public ByteRange { - /// \brief Combined storage for both the Use* and split state. 
- PointerIntPair<Use*, 1, bool> UsePtrAndIsSplit; - -public: - PartitionUse() : ByteRange(), UsePtrAndIsSplit() {} - PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U, - bool IsSplit) - : ByteRange(BeginOffset, EndOffset), UsePtrAndIsSplit(U, IsSplit) {} - - /// \brief The use in question. Provides access to both user and used value. - /// - /// Note that this may be null if the partition use is *dead*, that is, it - /// should be ignored. - Use *getUse() const { return UsePtrAndIsSplit.getPointer(); } - - /// \brief Set the use for this partition use range. - void setUse(Use *U) { UsePtrAndIsSplit.setPointer(U); } - - /// \brief Whether this use is split across multiple partitions. - bool isSplit() const { return UsePtrAndIsSplit.getInt(); } -}; -} +} // end anonymous namespace namespace llvm { -template <> struct isPodLike<Partition> : llvm::true_type {}; -template <> struct isPodLike<PartitionUse> : llvm::true_type {}; +template <typename T> struct isPodLike; +template <> struct isPodLike<Slice> { + static const bool value = true; +}; } namespace { -/// \brief Alloca partitioning representation. +/// \brief Representation of the alloca slices. /// -/// This class represents a partitioning of an alloca into slices, and -/// information about the nature of uses of each slice of the alloca. The goal -/// is that this information is sufficient to decide if and how to split the -/// alloca apart and replace slices with scalars. It is also intended that this -/// structure can capture the relevant information needed both to decide about -/// and to enact these transformations. -class AllocaPartitioning { +/// This class represents the slices of an alloca which are formed by its +/// various uses. If a pointer escapes, we can't fully build a representation +/// for the slices used and we reflect that in this structure. The uses are +/// stored, sorted by increasing beginning offset and with unsplittable slices +/// starting at a particular offset before splittable slices. +class AllocaSlices { public: - /// \brief Construct a partitioning of a particular alloca. - /// - /// Construction does most of the work for partitioning the alloca. This - /// performs the necessary walks of users and builds a partitioning from it. - AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); + /// \brief Construct the slices of a particular alloca. + AllocaSlices(const DataLayout &DL, AllocaInst &AI); /// \brief Test whether a pointer to the allocation escapes our analysis. /// - /// If this is true, the partitioning is never fully built and should be + /// If this is true, the slices are never fully built and should be /// ignored. bool isEscaped() const { return PointerEscapingInstr; } - /// \brief Support for iterating over the partitions. + /// \brief Support for iterating over the slices. /// @{ - typedef SmallVectorImpl<Partition>::iterator iterator; - iterator begin() { return Partitions.begin(); } - iterator end() { return Partitions.end(); } + typedef SmallVectorImpl<Slice>::iterator iterator; + iterator begin() { return Slices.begin(); } + iterator end() { return Slices.end(); } - typedef SmallVectorImpl<Partition>::const_iterator const_iterator; - const_iterator begin() const { return Partitions.begin(); } - const_iterator end() const { return Partitions.end(); } - /// @} - - /// \brief Support for iterating over and manipulating a particular - /// partition's uses. 
- /// - /// The iteration support provided for uses is more limited, but also - /// includes some manipulation routines to support rewriting the uses of - /// partitions during SROA. - /// @{ - typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; - use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } - use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } - use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } - use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } - - typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; - const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } - const_use_iterator use_begin(const_iterator I) const { - return Uses[I - begin()].begin(); - } - const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } - const_use_iterator use_end(const_iterator I) const { - return Uses[I - begin()].end(); - } - - unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } - unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } - const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { - return Uses[PIdx][UIdx]; - } - const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { - return Uses[I - begin()][UIdx]; - } - - void use_push_back(unsigned Idx, const PartitionUse &PU) { - Uses[Idx].push_back(PU); - } - void use_push_back(const_iterator I, const PartitionUse &PU) { - Uses[I - begin()].push_back(PU); - } + typedef SmallVectorImpl<Slice>::const_iterator const_iterator; + const_iterator begin() const { return Slices.begin(); } + const_iterator end() const { return Slices.end(); } /// @} /// \brief Allow iterating the dead users for this alloca. @@ -320,66 +237,12 @@ public: dead_op_iterator dead_op_end() const { return DeadOperands.end(); } /// @} - /// \brief MemTransferInst auxiliary data. - /// This struct provides some auxiliary data about memory transfer - /// intrinsics such as memcpy and memmove. These intrinsics can use two - /// different ranges within the same alloca, and provide other challenges to - /// correctly represent. We stash extra data to help us untangle this - /// after the partitioning is complete. - struct MemTransferOffsets { - /// The destination begin and end offsets when the destination is within - /// this alloca. If the end offset is zero the destination is not within - /// this alloca. - uint64_t DestBegin, DestEnd; - - /// The source begin and end offsets when the source is within this alloca. - /// If the end offset is zero, the source is not within this alloca. - uint64_t SourceBegin, SourceEnd; - - /// Flag for whether an alloca is splittable. - bool IsSplittable; - }; - MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { - return MemTransferInstData.lookup(&II); - } - - /// \brief Map from a PHI or select operand back to a partition. - /// - /// When manipulating PHI nodes or selects, they can use more than one - /// partition of an alloca. We store a special mapping to allow finding the - /// partition referenced by each of these operands, if any. - iterator findPartitionForPHIOrSelectOperand(Use *U) { - SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt - = PHIOrSelectOpMap.find(U); - if (MapIt == PHIOrSelectOpMap.end()) - return end(); - - return begin() + MapIt->second.first; - } - - /// \brief Map from a PHI or select operand back to the specific use of - /// a partition. 
- /// - /// Similar to mapping these operands back to the partitions, this maps - /// directly to the use structure of that partition. - use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) { - SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt - = PHIOrSelectOpMap.find(U); - assert(MapIt != PHIOrSelectOpMap.end()); - return Uses[MapIt->second.first].begin() + MapIt->second.second; - } - - /// \brief Compute a common type among the uses of a particular partition. - /// - /// This routines walks all of the uses of a particular partition and tries - /// to find a common type between them. Untyped operations such as memset and - /// memcpy are ignored. - Type *getCommonType(iterator I) const; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; - void printUsers(raw_ostream &OS, const_iterator I, + void printSlice(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; + void printUse(raw_ostream &OS, const_iterator I, + StringRef Indent = " ") const; void print(raw_ostream &OS) const; void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; @@ -387,47 +250,36 @@ public: private: template <typename DerivedT, typename RetT = void> class BuilderBase; - class PartitionBuilder; - friend class AllocaPartitioning::PartitionBuilder; - class UseBuilder; - friend class AllocaPartitioning::UseBuilder; + class SliceBuilder; + friend class AllocaSlices::SliceBuilder; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// \brief Handle to alloca instruction to simplify method interfaces. AllocaInst &AI; #endif - /// \brief The instruction responsible for this alloca having no partitioning. + /// \brief The instruction responsible for this alloca not having a known set + /// of slices. /// /// When an instruction (potentially) escapes the pointer to the alloca, we - /// store a pointer to that here and abort trying to partition the alloca. - /// This will be null if the alloca is partitioned successfully. + /// store a pointer to that here and abort trying to form slices of the + /// alloca. This will be null if the alloca slices are analyzed successfully. Instruction *PointerEscapingInstr; - /// \brief The partitions of the alloca. + /// \brief The slices of the alloca. /// - /// We store a vector of the partitions over the alloca here. This vector is - /// sorted by increasing begin offset, and then by decreasing end offset. See - /// the Partition inner class for more details. Initially (during - /// construction) there are overlaps, but we form a disjoint sequence of - /// partitions while finishing construction and a fully constructed object is - /// expected to always have this as a disjoint space. - SmallVector<Partition, 8> Partitions; - - /// \brief The uses of the partitions. - /// - /// This is essentially a mapping from each partition to a list of uses of - /// that partition. The mapping is done with a Uses vector that has the exact - /// same number of entries as the partition vector. Each entry is itself - /// a vector of the uses. - SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; + /// We store a vector of the slices formed by uses of the alloca here. This + /// vector is sorted by increasing begin offset, and then the unsplittable + /// slices before the splittable ones. See the Slice inner class for more + /// details. 
+ SmallVector<Slice, 8> Slices; /// \brief Instructions which will become dead if we rewrite the alloca. /// - /// Note that these are not separated by partition. This is because we expect - /// a partitioned alloca to be completely rewritten or not rewritten at all. - /// If rewritten, all these instructions can simply be removed and replaced - /// with undef as they come from outside of the allocated space. + /// Note that these are not separated by slice. This is because we expect an + /// alloca to be completely rewritten or not rewritten at all. If rewritten, + /// all these instructions can simply be removed and replaced with undef as + /// they come from outside of the allocated space. SmallVector<Instruction *, 8> DeadUsers; /// \brief Operands which will become dead if we rewrite the alloca. @@ -439,26 +291,6 @@ private: /// want to swap this particular input for undef to simplify the use lists of /// the alloca. SmallVector<Use *, 8> DeadOperands; - - /// \brief The underlying storage for auxiliary memcpy and memset info. - SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; - - /// \brief A side datastructure used when building up the partitions and uses. - /// - /// This mapping is only really used during the initial building of the - /// partitioning so that we can retain information about PHI and select nodes - /// processed. - SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; - - /// \brief Auxiliary information for particular PHI or select operands. - SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; - - /// \brief A utility routine called from the constructor. - /// - /// This does what it says on the tin. It is the key of the alloca partition - /// splitting and merging. After it is called we have the desired disjoint - /// collection of partitions. - void splitAndMergePartitions(); }; } @@ -474,29 +306,35 @@ static Value *foldSelectInst(SelectInst &SI) { return 0; } -/// \brief Builder for the alloca partitioning. +/// \brief Builder for the alloca slices. /// -/// This class builds an alloca partitioning by recursively visiting the uses -/// of an alloca and splitting the partitions for each load and store at each -/// offset. -class AllocaPartitioning::PartitionBuilder - : public PtrUseVisitor<PartitionBuilder> { - friend class PtrUseVisitor<PartitionBuilder>; - friend class InstVisitor<PartitionBuilder>; - typedef PtrUseVisitor<PartitionBuilder> Base; +/// This class builds a set of alloca slices by recursively visiting the uses +/// of an alloca and making a slice for each load and store at each offset. +class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { + friend class PtrUseVisitor<SliceBuilder>; + friend class InstVisitor<SliceBuilder>; + typedef PtrUseVisitor<SliceBuilder> Base; const uint64_t AllocSize; - AllocaPartitioning &P; + AllocaSlices &S; + + SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap; + SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes; - SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; + /// \brief Set to de-duplicate dead instructions found in the use walk. 
+ SmallPtrSet<Instruction *, 4> VisitedDeadInsts; public: - PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P) - : PtrUseVisitor<PartitionBuilder>(DL), - AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), - P(P) {} + SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &S) + : PtrUseVisitor<SliceBuilder>(DL), + AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), S(S) {} private: + void markAsDead(Instruction &I) { + if (VisitedDeadInsts.insert(&I)) + S.DeadUsers.push_back(&I); + } + void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, bool IsSplittable = false) { // Completely skip uses which have a zero size or start either before or @@ -505,9 +343,9 @@ private: DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset << " which has zero size or starts outside of the " << AllocSize << " byte alloca:\n" - << " alloca: " << P.AI << "\n" + << " alloca: " << S.AI << "\n" << " use: " << I << "\n"); - return; + return markAsDead(I); } uint64_t BeginOffset = Offset.getZExtValue(); @@ -523,13 +361,26 @@ private: if (Size > AllocSize - BeginOffset) { DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset << " to remain within the " << AllocSize << " byte alloca:\n" - << " alloca: " << P.AI << "\n" + << " alloca: " << S.AI << "\n" << " use: " << I << "\n"); EndOffset = AllocSize; } - Partition New(BeginOffset, EndOffset, IsSplittable); - P.Partitions.push_back(New); + S.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable)); + } + + void visitBitCastInst(BitCastInst &BC) { + if (BC.use_empty()) + return markAsDead(BC); + + return Base::visitBitCastInst(BC); + } + + void visitGetElementPtrInst(GetElementPtrInst &GEPI) { + if (GEPI.use_empty()) + return markAsDead(GEPI); + + return Base::visitGetElementPtrInst(GEPI); } void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, @@ -580,9 +431,9 @@ private: DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset << " which extends past the end of the " << AllocSize << " byte alloca:\n" - << " alloca: " << P.AI << "\n" + << " alloca: " << S.AI << "\n" << " use: " << SI << "\n"); - return; + return markAsDead(SI); } assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && @@ -597,7 +448,7 @@ private: if ((Length && Length->getValue() == 0) || (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) // Zero-length mem transfer intrinsics can be ignored entirely. - return; + return markAsDead(II); if (!IsOffsetKnown) return PI.setAborted(&II); @@ -613,7 +464,7 @@ private: if ((Length && Length->getValue() == 0) || (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) // Zero-length mem transfer intrinsics can be ignored entirely. - return; + return markAsDead(II); if (!IsOffsetKnown) return PI.setAborted(&II); @@ -622,63 +473,44 @@ private: uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset; - MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; - - // Only intrinsics with a constant length can be split. - Offsets.IsSplittable = Length; + // Check for the special case where the same exact value is used for both + // source and dest. + if (*U == II.getRawDest() && *U == II.getRawSource()) { + // For non-volatile transfers this is a no-op. 
+ if (!II.isVolatile()) + return markAsDead(II); - if (*U == II.getRawDest()) { - Offsets.DestBegin = RawOffset; - Offsets.DestEnd = RawOffset + Size; - } - if (*U == II.getRawSource()) { - Offsets.SourceBegin = RawOffset; - Offsets.SourceEnd = RawOffset + Size; + return insertUse(II, Offset, Size, /*IsSplittable=*/false); } - // If we have set up end offsets for both the source and the destination, - // we have found both sides of this transfer pointing at the same alloca. - bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; - if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { - unsigned PrevIdx = MemTransferPartitionMap[&II]; + // If we have seen both source and destination for a mem transfer, then + // they both point to the same alloca. + bool Inserted; + SmallDenseMap<Instruction *, unsigned>::iterator MTPI; + llvm::tie(MTPI, Inserted) = + MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size())); + unsigned PrevIdx = MTPI->second; + if (!Inserted) { + Slice &PrevP = S.Slices[PrevIdx]; // Check if the begin offsets match and this is a non-volatile transfer. // In that case, we can completely elide the transfer. - if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) { - P.Partitions[PrevIdx].kill(); - return; + if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) { + PrevP.kill(); + return markAsDead(II); } // Otherwise we have an offset transfer within the same alloca. We can't // split those. - P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false; - } else if (SeenBothEnds) { - // Handle the case where this exact use provides both ends of the - // operation. - assert(II.getRawDest() == II.getRawSource()); - - // For non-volatile transfers this is a no-op. - if (!II.isVolatile()) - return; - - // Otherwise just suppress splitting. - Offsets.IsSplittable = false; + PrevP.makeUnsplittable(); } - // Insert the use now that we've fixed up the splittable nature. - insertUse(II, Offset, Size, Offsets.IsSplittable); - - // Setup the mapping from intrinsic to partition of we've not seen both - // ends of this transfer. - if (!SeenBothEnds) { - unsigned NewIdx = P.Partitions.size() - 1; - bool Inserted - = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second; - assert(Inserted && - "Already have intrinsic in map but haven't seen both ends"); - (void)Inserted; - } + insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length); + + // Check that we ended up with a valid index in the map. + assert(S.Slices[PrevIdx].getUse()->getUser() == &II && + "Map index doesn't point back to a slice with this user."); } // Disable SRoA for any intrinsics except for lifetime invariants. @@ -702,7 +534,7 @@ private: Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { // We consider any PHI or select that results in a direct load or store of - // the same offset to be a viable use for partitioning purposes. These uses + // the same offset to be a viable use for slicing purposes. These uses // are considered unsplittable and the size is the maximum loaded or stored // size. SmallPtrSet<Instruction *, 4> Visited; @@ -747,234 +579,36 @@ private: void visitPHINode(PHINode &PN) { if (PN.use_empty()) - return; + return markAsDead(PN); if (!IsOffsetKnown) return PI.setAborted(&PN); // See if we already have computed info on this node. 
- std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN]; - if (PHIInfo.first) { - PHIInfo.second = true; - insertUse(PN, Offset, PHIInfo.first); - return; + uint64_t &PHISize = PHIOrSelectSizes[&PN]; + if (!PHISize) { + // This is a new PHI node, check for an unsafe use of the PHI node. + if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHISize)) + return PI.setAborted(UnsafeI); } - // Check for an unsafe use of the PHI node. - if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first)) - return PI.setAborted(UnsafeI); - - insertUse(PN, Offset, PHIInfo.first); - } - - void visitSelectInst(SelectInst &SI) { - if (SI.use_empty()) - return; - if (Value *Result = foldSelectInst(SI)) { - if (Result == *U) - // If the result of the constant fold will be the pointer, recurse - // through the select as if we had RAUW'ed it. - enqueueUsers(SI); - - return; - } - if (!IsOffsetKnown) - return PI.setAborted(&SI); - - // See if we already have computed info on this node. - std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI]; - if (SelectInfo.first) { - SelectInfo.second = true; - insertUse(SI, Offset, SelectInfo.first); - return; - } - - // Check for an unsafe use of the PHI node. - if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first)) - return PI.setAborted(UnsafeI); - - insertUse(SI, Offset, SelectInfo.first); - } - - /// \brief Disable SROA entirely if there are unhandled users of the alloca. - void visitInstruction(Instruction &I) { - PI.setAborted(&I); - } -}; - -/// \brief Use adder for the alloca partitioning. -/// -/// This class adds the uses of an alloca to all of the partitions which they -/// use. For splittable partitions, this can end up doing essentially a linear -/// walk of the partitions, but the number of steps remains bounded by the -/// total result instruction size: -/// - The number of partitions is a result of the number unsplittable -/// instructions using the alloca. -/// - The number of users of each partition is at worst the total number of -/// splittable instructions using the alloca. -/// Thus we will produce N * M instructions in the end, where N are the number -/// of unsplittable uses and M are the number of splittable. This visitor does -/// the exact same number of updates to the partitioning. -/// -/// In the more common case, this visitor will leverage the fact that the -/// partition space is pre-sorted, and do a logarithmic search for the -/// partition needed, making the total visit a classical ((N + M) * log(N)) -/// complexity operation. -class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> { - friend class PtrUseVisitor<UseBuilder>; - friend class InstVisitor<UseBuilder>; - typedef PtrUseVisitor<UseBuilder> Base; - - const uint64_t AllocSize; - AllocaPartitioning &P; - - /// \brief Set to de-duplicate dead instructions found in the use walk. - SmallPtrSet<Instruction *, 4> VisitedDeadInsts; - -public: - UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) - : PtrUseVisitor<UseBuilder>(TD), - AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), - P(P) {} - -private: - void markAsDead(Instruction &I) { - if (VisitedDeadInsts.insert(&I)) - P.DeadUsers.push_back(&I); - } - - void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) { - // If the use has a zero size or extends outside of the allocation, record - // it as a dead use for elimination later. 
- if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) - return markAsDead(User); - - uint64_t BeginOffset = Offset.getZExtValue(); - uint64_t EndOffset = BeginOffset + Size; - - // Clamp the end offset to the end of the allocation. Note that this is - // formulated to handle even the case where "BeginOffset + Size" overflows. - assert(AllocSize >= BeginOffset); // Established above. - if (Size > AllocSize - BeginOffset) - EndOffset = AllocSize; - - // NB: This only works if we have zero overlapping partitions. - iterator I = std::lower_bound(P.begin(), P.end(), BeginOffset); - if (I != P.begin() && llvm::prior(I)->EndOffset > BeginOffset) - I = llvm::prior(I); - iterator E = P.end(); - bool IsSplit = llvm::next(I) != E && llvm::next(I)->BeginOffset < EndOffset; - for (; I != E && I->BeginOffset < EndOffset; ++I) { - PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset), - std::min(I->EndOffset, EndOffset), U, IsSplit); - P.use_push_back(I, NewPU); - if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser())) - P.PHIOrSelectOpMap[U] - = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1); - } - } - - void visitBitCastInst(BitCastInst &BC) { - if (BC.use_empty()) - return markAsDead(BC); - - return Base::visitBitCastInst(BC); - } - - void visitGetElementPtrInst(GetElementPtrInst &GEPI) { - if (GEPI.use_empty()) - return markAsDead(GEPI); - - return Base::visitGetElementPtrInst(GEPI); - } - - void visitLoadInst(LoadInst &LI) { - assert(IsOffsetKnown); - uint64_t Size = DL.getTypeStoreSize(LI.getType()); - insertUse(LI, Offset, Size); - } - - void visitStoreInst(StoreInst &SI) { - assert(IsOffsetKnown); - uint64_t Size = DL.getTypeStoreSize(SI.getOperand(0)->getType()); - - // If this memory access can be shown to *statically* extend outside the - // bounds of of the allocation, it's behavior is undefined, so simply - // ignore it. Note that this is more strict than the generic clamping - // behavior of insertUse. - if (Offset.isNegative() || Size > AllocSize || - Offset.ugt(AllocSize - Size)) - return markAsDead(SI); - - insertUse(SI, Offset, Size); - } - - void visitMemSetInst(MemSetInst &II) { - ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); - if ((Length && Length->getValue() == 0) || - (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) - return markAsDead(II); - - assert(IsOffsetKnown); - insertUse(II, Offset, Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue()); - } - - void visitMemTransferInst(MemTransferInst &II) { - ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); - if ((Length && Length->getValue() == 0) || - (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) - return markAsDead(II); - - assert(IsOffsetKnown); - uint64_t Size = Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue(); - - const MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; - if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && - Offsets.DestBegin == Offsets.SourceBegin) - return markAsDead(II); // Skip identity transfers without side-effects. 
- - insertUse(II, Offset, Size); - } - - void visitIntrinsicInst(IntrinsicInst &II) { - assert(IsOffsetKnown); - assert(II.getIntrinsicID() == Intrinsic::lifetime_start || - II.getIntrinsicID() == Intrinsic::lifetime_end); - - ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); - insertUse(II, Offset, std::min(Length->getLimitedValue(), - AllocSize - Offset.getLimitedValue())); - } - - void insertPHIOrSelect(Instruction &User, const APInt &Offset) { - uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first; - // For PHI and select operands outside the alloca, we can't nuke the entire // phi or select -- the other side might still be relevant, so we special // case them here and use a separate structure to track the operands // themselves which should be replaced with undef. - if ((Offset.isNegative() && Offset.uge(Size)) || + // FIXME: This should instead be escaped in the event we're instrumenting + // for address sanitization. + if ((Offset.isNegative() && (-Offset).uge(PHISize)) || (!Offset.isNegative() && Offset.uge(AllocSize))) { - P.DeadOperands.push_back(U); + S.DeadOperands.push_back(U); return; } - insertUse(User, Offset, Size); - } - - void visitPHINode(PHINode &PN) { - if (PN.use_empty()) - return markAsDead(PN); - - assert(IsOffsetKnown); - insertPHIOrSelect(PN, Offset); + insertUse(PN, Offset, PHISize); } void visitSelectInst(SelectInst &SI) { if (SI.use_empty()) return markAsDead(SI); - if (Value *Result = foldSelectInst(SI)) { if (Result == *U) // If the result of the constant fold will be the pointer, recurse @@ -983,276 +617,106 @@ private: else // Otherwise the operand to the select is dead, and we can replace it // with undef. - P.DeadOperands.push_back(U); + S.DeadOperands.push_back(U); return; } + if (!IsOffsetKnown) + return PI.setAborted(&SI); - assert(IsOffsetKnown); - insertPHIOrSelect(SI, Offset); - } - - /// \brief Unreachable, we've already visited the alloca once. - void visitInstruction(Instruction &I) { - llvm_unreachable("Unhandled instruction in use builder."); - } -}; - -void AllocaPartitioning::splitAndMergePartitions() { - size_t NumDeadPartitions = 0; - - // Track the range of splittable partitions that we pass when accumulating - // overlapping unsplittable partitions. - uint64_t SplitEndOffset = 0ull; - - Partition New(0ull, 0ull, false); - - for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) { - ++j; - - if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) { - assert(New.BeginOffset == New.EndOffset); - New = Partitions[i]; - } else { - assert(New.IsSplittable); - New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset); - } - assert(New.BeginOffset != New.EndOffset); - - // Scan the overlapping partitions. - while (j != e && New.EndOffset > Partitions[j].BeginOffset) { - // If the new partition we are forming is splittable, stop at the first - // unsplittable partition. - if (New.IsSplittable && !Partitions[j].IsSplittable) - break; - - // Grow the new partition to include any equally splittable range. 'j' is - // always equally splittable when New is splittable, but when New is not - // splittable, we may subsume some (or part of some) splitable partition - // without growing the new one. 
- if (New.IsSplittable == Partitions[j].IsSplittable) { - New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset); - } else { - assert(!New.IsSplittable); - assert(Partitions[j].IsSplittable); - SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); - } - - Partitions[j].kill(); - ++NumDeadPartitions; - ++j; - } - - // If the new partition is splittable, chop off the end as soon as the - // unsplittable subsequent partition starts and ensure we eventually cover - // the splittable area. - if (j != e && New.IsSplittable) { - SplitEndOffset = std::max(SplitEndOffset, New.EndOffset); - New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); + // See if we already have computed info on this node. + uint64_t &SelectSize = PHIOrSelectSizes[&SI]; + if (!SelectSize) { + // This is a new Select, check for an unsafe use of it. + if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectSize)) + return PI.setAborted(UnsafeI); } - // Add the new partition if it differs from the original one and is - // non-empty. We can end up with an empty partition here if it was - // splittable but there is an unsplittable one that starts at the same - // offset. - if (New != Partitions[i]) { - if (New.BeginOffset != New.EndOffset) - Partitions.push_back(New); - // Mark the old one for removal. - Partitions[i].kill(); - ++NumDeadPartitions; + // For PHI and select operands outside the alloca, we can't nuke the entire + // phi or select -- the other side might still be relevant, so we special + // case them here and use a separate structure to track the operands + // themselves which should be replaced with undef. + // FIXME: This should instead be escaped in the event we're instrumenting + // for address sanitization. + if ((Offset.isNegative() && Offset.uge(SelectSize)) || + (!Offset.isNegative() && Offset.uge(AllocSize))) { + S.DeadOperands.push_back(U); + return; } - New.BeginOffset = New.EndOffset; - if (!New.IsSplittable) { - New.EndOffset = std::max(New.EndOffset, SplitEndOffset); - if (j != e && !Partitions[j].IsSplittable) - New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); - New.IsSplittable = true; - // If there is a trailing splittable partition which won't be fused into - // the next splittable partition go ahead and add it onto the partitions - // list. - if (New.BeginOffset < New.EndOffset && - (j == e || !Partitions[j].IsSplittable || - New.EndOffset < Partitions[j].BeginOffset)) { - Partitions.push_back(New); - New.BeginOffset = New.EndOffset = 0ull; - } - } + insertUse(SI, Offset, SelectSize); } - // Re-sort the partitions now that they have been split and merged into - // disjoint set of partitions. Also remove any of the dead partitions we've - // replaced in the process. - std::sort(Partitions.begin(), Partitions.end()); - if (NumDeadPartitions) { - assert(Partitions.back().isDead()); - assert((ptrdiff_t)NumDeadPartitions == - std::count(Partitions.begin(), Partitions.end(), Partitions.back())); + /// \brief Disable SROA entirely if there are unhandled users of the alloca. 
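  ///
  /// Any pointer user we did not explicitly recognize above may read or write
  /// the memory in ways this slice model cannot describe, so the only safe
  /// response is to record the offending instruction and give up on the
  /// alloca entirely.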
+ void visitInstruction(Instruction &I) { + PI.setAborted(&I); } - Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); -} +}; -AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) +AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) AI(AI), #endif PointerEscapingInstr(0) { - PartitionBuilder PB(TD, AI, *this); - PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI); + SliceBuilder PB(DL, AI, *this); + SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); if (PtrI.isEscaped() || PtrI.isAborted()) { // FIXME: We should sink the escape vs. abort info into the caller nicely, - // possibly by just storing the PtrInfo in the AllocaPartitioning. + // possibly by just storing the PtrInfo in the AllocaSlices. PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() : PtrI.getAbortingInst(); assert(PointerEscapingInstr && "Did not track a bad instruction"); return; } + Slices.erase(std::remove_if(Slices.begin(), Slices.end(), + std::mem_fun_ref(&Slice::isDead)), + Slices.end()); + // Sort the uses. This arranges for the offsets to be in ascending order, // and the sizes to be in descending order. - std::sort(Partitions.begin(), Partitions.end()); - - // Remove any partitions from the back which are marked as dead. - while (!Partitions.empty() && Partitions.back().isDead()) - Partitions.pop_back(); - - if (Partitions.size() > 1) { - // Intersect splittability for all partitions with equal offsets and sizes. - // Then remove all but the first so that we have a sequence of non-equal but - // potentially overlapping partitions. - for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E; - I = J) { - ++J; - while (J != E && *I == *J) { - I->IsSplittable &= J->IsSplittable; - ++J; - } - } - Partitions.erase(std::unique(Partitions.begin(), Partitions.end()), - Partitions.end()); - - // Split splittable and merge unsplittable partitions into a disjoint set - // of partitions over the used space of the allocation. - splitAndMergePartitions(); - } - - // Record how many partitions we end up with. - NumAllocaPartitions += Partitions.size(); - MaxPartitionsPerAlloca = std::max<unsigned>(Partitions.size(), MaxPartitionsPerAlloca); - - // Now build up the user lists for each of these disjoint partitions by - // re-walking the recursive users of the alloca. - Uses.resize(Partitions.size()); - UseBuilder UB(TD, AI, *this); - PtrI = UB.visitPtr(AI); - assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!"); - assert(!PtrI.isAborted() && "Early aborted the visit of the pointer."); - - unsigned NumUses = 0; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) - for (unsigned Idx = 0, Size = Uses.size(); Idx != Size; ++Idx) - NumUses += Uses[Idx].size(); -#endif - NumAllocaPartitionUses += NumUses; - MaxPartitionUsesPerAlloca = std::max<unsigned>(NumUses, MaxPartitionUsesPerAlloca); + std::sort(Slices.begin(), Slices.end()); } -Type *AllocaPartitioning::getCommonType(iterator I) const { - Type *Ty = 0; - for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { - Use *U = UI->getUse(); - if (!U) - continue; // Skip dead uses. 
- if (isa<IntrinsicInst>(*U->getUser())) - continue; - if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) - continue; - - Type *UserTy = 0; - if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) - UserTy = LI->getType(); - else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) - UserTy = SI->getValueOperand()->getType(); - else - return 0; // Bail if we have weird uses. - - if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { - // If the type is larger than the partition, skip it. We only encounter - // this for split integer operations where we want to use the type of the - // entity causing the split. - if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8) - continue; - - // If we have found an integer type use covering the alloca, use that - // regardless of the other types, as integers are often used for a "bucket - // of bits" type. - return ITy; - } - - if (Ty && Ty != UserTy) - return 0; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - Ty = UserTy; - } - return Ty; +void AllocaSlices::print(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + printSlice(OS, I, Indent); + printUse(OS, I, Indent); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - -void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, - StringRef Indent) const { - OS << Indent << "partition #" << (I - begin()) - << " [" << I->BeginOffset << "," << I->EndOffset << ")" - << (I->IsSplittable ? " (splittable)" : "") - << (Uses[I - begin()].empty() ? " (zero uses)" : "") - << "\n"; +void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" + << " slice #" << (I - begin()) + << (I->isSplittable() ? " (splittable)" : "") << "\n"; } -void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, - StringRef Indent) const { - for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { - if (!UI->getUse()) - continue; // Skip dead uses. - OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " - << "used by: " << *UI->getUse()->getUser() << "\n"; - if (MemTransferInst *II = - dyn_cast<MemTransferInst>(UI->getUse()->getUser())) { - const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); - bool IsDest; - if (!MTO.IsSplittable) - IsDest = UI->BeginOffset == MTO.DestBegin; - else - IsDest = MTO.DestBegin != 0u; - OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": " - << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin) - << "," << (IsDest ? 
MTO.DestEnd : MTO.SourceEnd) << ")\n"; - } - } +void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + OS << Indent << " used by: " << *I->getUse()->getUser() << "\n"; } -void AllocaPartitioning::print(raw_ostream &OS) const { +void AllocaSlices::print(raw_ostream &OS) const { if (PointerEscapingInstr) { - OS << "No partitioning for alloca: " << AI << "\n" + OS << "Can't analyze slices for alloca: " << AI << "\n" << " A pointer to this alloca escaped by:\n" << " " << *PointerEscapingInstr << "\n"; return; } - OS << "Partitioning of alloca: " << AI << "\n"; - for (const_iterator I = begin(), E = end(); I != E; ++I) { + OS << "Slices of alloca: " << AI << "\n"; + for (const_iterator I = begin(), E = end(); I != E; ++I) print(OS, I); - printUsers(OS, I); - } } -void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); } -void AllocaPartitioning::dump() const { print(dbgs()); } +void AllocaSlices::dump(const_iterator I) const { print(dbgs(), I); } +void AllocaSlices::dump() const { print(dbgs()); } #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - namespace { /// \brief Implementation of LoadAndStorePromoter for promoting allocas. /// @@ -1269,12 +733,13 @@ class AllocaPromoter : public LoadAndStorePromoter { SmallVector<DbgValueInst *, 4> DVIs; public: - AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S, AllocaInst &AI, DIBuilder &DIB) - : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} + : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} void run(const SmallVectorImpl<Instruction*> &Insts) { - // Remember which alloca we're promoting (for isInstInList). + // Retain the debug information attached to the alloca for use when + // rewriting loads and stores. if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { for (Value::use_iterator UI = DebugNode->use_begin(), UE = DebugNode->use_end(); @@ -1286,7 +751,9 @@ public: } LoadAndStorePromoter::run(Insts); - AI.eraseFromParent(); + + // While we have the debug information, clear it off of the alloca. The + // caller takes care of deleting the alloca. while (!DDIs.empty()) DDIs.pop_back_val()->eraseFromParent(); while (!DVIs.empty()) @@ -1295,13 +762,34 @@ public: virtual bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &Insts) const { + Value *Ptr; if (LoadInst *LI = dyn_cast<LoadInst>(I)) - return LI->getOperand(0) == &AI; - return cast<StoreInst>(I)->getPointerOperand() == &AI; + Ptr = LI->getOperand(0); + else + Ptr = cast<StoreInst>(I)->getPointerOperand(); + + // Only used to detect cycles, which will be rare and quickly found as + // we're walking up a chain of defs rather than down through uses. 
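      // The loop below walks up the definition chain, stripping bitcasts and
      // GEPs off the pointer operand; the load or store is treated as
      // belonging to this alloca when that chain bottoms out at &AI. The
      // visited set merely guards against a degenerate cycle of such casts.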
+ SmallPtrSet<Value *, 4> Visited; + + do { + if (Ptr == &AI) + return true; + + if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr)) + Ptr = BCI->getOperand(0); + else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) + Ptr = GEPI->getPointerOperand(); + else + return false; + + } while (Visited.insert(Ptr)); + + return false; } virtual void updateDebugInfo(Instruction *Inst) const { - for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) @@ -1309,7 +797,7 @@ public: else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, DIB); } - for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; Value *Arg = 0; @@ -1360,7 +848,7 @@ class SROA : public FunctionPass { const bool RequiresDomTree; LLVMContext *C; - const DataLayout *TD; + const DataLayout *DL; DominatorTree *DT; /// \brief Worklist of alloca instructions to simplify. @@ -1390,10 +878,25 @@ class SROA : public FunctionPass { /// \brief A collection of alloca instructions we can directly promote. std::vector<AllocaInst *> PromotableAllocas; + /// \brief A worklist of PHIs to speculate prior to promoting allocas. + /// + /// All of these PHIs have been checked for the safety of speculation and by + /// being speculated will allow promoting allocas currently in the promotable + /// queue. + SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs; + + /// \brief A worklist of select instructions to speculate prior to promoting + /// allocas. + /// + /// All of these select instructions have been checked for the safety of + /// speculation and by being speculated will allow promoting allocas + /// currently in the promotable queue. + SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects; + public: SROA(bool RequiresDomTree = true) : FunctionPass(ID), RequiresDomTree(RequiresDomTree), - C(0), TD(0), DT(0) { + C(0), DL(0), DT(0) { initializeSROAPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F); @@ -1404,13 +907,13 @@ public: private: friend class PHIOrSelectSpeculator; - friend class AllocaPartitionRewriter; - friend class AllocaPartitionVectorRewriter; + friend class AllocaSliceRewriter; - bool rewriteAllocaPartition(AllocaInst &AI, - AllocaPartitioning &P, - AllocaPartitioning::iterator PI); - bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P); + bool rewritePartition(AllocaInst &AI, AllocaSlices &S, + AllocaSlices::iterator B, AllocaSlices::iterator E, + int64_t BeginOffset, int64_t EndOffset, + ArrayRef<AllocaSlices::iterator> SplitUses); + bool splitAlloca(AllocaInst &AI, AllocaSlices &S); bool runOnAlloca(AllocaInst &AI); void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); bool promoteAllocas(Function &F); @@ -1429,286 +932,255 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, false) -namespace { -/// \brief Visitor to speculate PHIs and Selects where possible. -class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> { - // Befriend the base class so it can delegate to private visit methods. 
- friend class llvm::InstVisitor<PHIOrSelectSpeculator>; - - const DataLayout &TD; - AllocaPartitioning &P; - SROA &Pass; +/// Walk the range of a partitioning looking for a common type to cover this +/// sequence of slices. +static Type *findCommonType(AllocaSlices::const_iterator B, + AllocaSlices::const_iterator E, + uint64_t EndOffset) { + Type *Ty = 0; + bool IgnoreNonIntegralTypes = false; + for (AllocaSlices::const_iterator I = B; I != E; ++I) { + Use *U = I->getUse(); + if (isa<IntrinsicInst>(*U->getUser())) + continue; + if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) + continue; -public: - PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) - : TD(TD), P(P), Pass(Pass) {} - - /// \brief Visit the users of an alloca partition and rewrite them. - void visitUsers(AllocaPartitioning::const_iterator PI) { - // Note that we need to use an index here as the underlying vector of uses - // may be grown during speculation. However, we never need to re-visit the - // new uses, and so we can use the initial size bound. - for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { - const PartitionUse &PU = P.getUse(PI, Idx); - if (!PU.getUse()) - continue; // Skip dead use. - - visit(cast<Instruction>(PU.getUse()->getUser())); + Type *UserTy = 0; + if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { + UserTy = LI->getType(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { + UserTy = SI->getValueOperand()->getType(); + } else { + IgnoreNonIntegralTypes = true; // Give up on anything but an iN type. + continue; } - } -private: - // By default, skip this instruction. - void visitInstruction(Instruction &I) {} - - /// PHI instructions that use an alloca and are subsequently loaded can be - /// rewritten to load both input pointers in the pred blocks and then PHI the - /// results, allowing the load of the alloca to be promoted. - /// From this: - /// %P2 = phi [i32* %Alloca, i32* %Other] - /// %V = load i32* %P2 - /// to: - /// %V1 = load i32* %Alloca -> will be mem2reg'd - /// ... - /// %V2 = load i32* %Other - /// ... - /// %V = phi [i32 %V1, i32 %V2] - /// - /// We can do this to a select if its only uses are loads and if the operands - /// to the select can be loaded unconditionally. - /// - /// FIXME: This should be hoisted into a generic utility, likely in - /// Transforms/Util/Local.h - bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { - // For now, we can only do this promotion if the load is in the same block - // as the PHI, and if there are no stores between the phi and load. - // TODO: Allow recursive phi users. - // TODO: Allow stores. - BasicBlock *BB = PN.getParent(); - unsigned MaxAlign = 0; - for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); - UI != UE; ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) return false; - - // For now we only allow loads in the same block as the PHI. This is - // a common case that happens when instcombine merges two loads through - // a PHI. - if (LI->getParent() != BB) return false; - - // Ensure that there are no instructions between the PHI and the load that - // could store. - for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) - if (BBI->mayWriteToMemory()) - return false; - - MaxAlign = std::max(MaxAlign, LI->getAlignment()); - Loads.push_back(LI); + if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { + // If the type is larger than the partition, skip it. 
We only encounter + // this for split integer operations where we want to use the type of the + // entity causing the split. Also skip if the type is not a byte width + // multiple. + if (ITy->getBitWidth() % 8 != 0 || + ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) + continue; + + // If we have found an integer type use covering the alloca, use that + // regardless of the other types, as integers are often used for + // a "bucket of bits" type. + // + // NB: This *must* be the only return from inside the loop so that the + // order of slices doesn't impact the computed type. + return ITy; + } else if (IgnoreNonIntegralTypes) { + continue; } - // We can only transform this if it is safe to push the loads into the - // predecessor blocks. The only thing to watch out for is that we can't put - // a possibly trapping load in the predecessor if it is a critical edge. - for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { - TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); - Value *InVal = PN.getIncomingValue(Idx); - - // If the value is produced by the terminator of the predecessor (an - // invoke) or it has side-effects, there is no valid place to put a load - // in the predecessor. - if (TI == InVal || TI->mayHaveSideEffects()) - return false; + if (Ty && Ty != UserTy) + IgnoreNonIntegralTypes = true; // Give up on anything but an iN type. - // If the predecessor has a single successor, then the edge isn't - // critical. - if (TI->getNumSuccessors() == 1) - continue; + Ty = UserTy; + } + return Ty; +} - // If this pointer is always safe to load, or if we can prove that there - // is already a load in the block, then we can move the load to the pred - // block. - if (InVal->isDereferenceablePointer() || - isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) - continue; +/// PHI instructions that use an alloca and are subsequently loaded can be +/// rewritten to load both input pointers in the pred blocks and then PHI the +/// results, allowing the load of the alloca to be promoted. +/// From this: +/// %P2 = phi [i32* %Alloca, i32* %Other] +/// %V = load i32* %P2 +/// to: +/// %V1 = load i32* %Alloca -> will be mem2reg'd +/// ... +/// %V2 = load i32* %Other +/// ... +/// %V = phi [i32 %V1, i32 %V2] +/// +/// We can do this to a select if its only uses are loads and if the operands +/// to the select can be loaded unconditionally. +/// +/// FIXME: This should be hoisted into a generic utility, likely in +/// Transforms/Util/Local.h +static bool isSafePHIToSpeculate(PHINode &PN, + const DataLayout *DL = 0) { + // For now, we can only do this promotion if the load is in the same block + // as the PHI, and if there are no stores between the phi and load. + // TODO: Allow recursive phi users. + // TODO: Allow stores. + BasicBlock *BB = PN.getParent(); + unsigned MaxAlign = 0; + bool HaveLoad = false; + for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); UI != UE; + ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) + return false; + // For now we only allow loads in the same block as the PHI. This is + // a common case that happens when instcombine merges two loads through + // a PHI. + if (LI->getParent() != BB) return false; - } - return true; + // Ensure that there are no instructions between the PHI and the load that + // could store. 
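      // Speculation will hoist this load above everything that currently sits
      // between the PHI and the load; if any of those instructions may write
      // to memory, the hoisted load could observe a different value, so we
      // refuse to speculate.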
+ for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) + if (BBI->mayWriteToMemory()) + return false; + + MaxAlign = std::max(MaxAlign, LI->getAlignment()); + HaveLoad = true; } - void visitPHINode(PHINode &PN) { - DEBUG(dbgs() << " original: " << PN << "\n"); + if (!HaveLoad) + return false; - SmallVector<LoadInst *, 4> Loads; - if (!isSafePHIToSpeculate(PN, Loads)) - return; + // We can only transform this if it is safe to push the loads into the + // predecessor blocks. The only thing to watch out for is that we can't put + // a possibly trapping load in the predecessor if it is a critical edge. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { + TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); + Value *InVal = PN.getIncomingValue(Idx); + + // If the value is produced by the terminator of the predecessor (an + // invoke) or it has side-effects, there is no valid place to put a load + // in the predecessor. + if (TI == InVal || TI->mayHaveSideEffects()) + return false; - assert(!Loads.empty()); + // If the predecessor has a single successor, then the edge isn't + // critical. + if (TI->getNumSuccessors() == 1) + continue; - Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); - IRBuilderTy PHIBuilder(&PN); - PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), - PN.getName() + ".sroa.speculated"); + // If this pointer is always safe to load, or if we can prove that there + // is already a load in the block, then we can move the load to the pred + // block. + if (InVal->isDereferenceablePointer() || + isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL)) + continue; - // Get the TBAA tag and alignment to use from one of the loads. It doesn't - // matter which one we get and if any differ. - LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); - MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); - unsigned Align = SomeLoad->getAlignment(); + return false; + } - // Rewrite all loads of the PN to use the new PHI. - do { - LoadInst *LI = Loads.pop_back_val(); - LI->replaceAllUsesWith(NewPN); - Pass.DeadInsts.insert(LI); - } while (!Loads.empty()); - - // Inject loads into all of the pred blocks. - for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { - BasicBlock *Pred = PN.getIncomingBlock(Idx); - TerminatorInst *TI = Pred->getTerminator(); - Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); - Value *InVal = PN.getIncomingValue(Idx); - IRBuilderTy PredBuilder(TI); - - LoadInst *Load - = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + - Pred->getName())); - ++NumLoadsSpeculated; - Load->setAlignment(Align); - if (TBAATag) - Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); - NewPN->addIncoming(Load, Pred); - - Instruction *Ptr = dyn_cast<Instruction>(InVal); - if (!Ptr) - // No uses to rewrite. - continue; + return true; +} - // Try to lookup and rewrite any partition uses corresponding to this phi - // input. - AllocaPartitioning::iterator PI - = P.findPartitionForPHIOrSelectOperand(InUse); - if (PI == P.end()) - continue; +static void speculatePHINodeLoads(PHINode &PN) { + DEBUG(dbgs() << " original: " << PN << "\n"); + + Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); + IRBuilderTy PHIBuilder(&PN); + PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), + PN.getName() + ".sroa.speculated"); + + // Get the TBAA tag and alignment to use from one of the loads. 
It doesn't + // matter which one we get and if any differ. + LoadInst *SomeLoad = cast<LoadInst>(*PN.use_begin()); + MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); + unsigned Align = SomeLoad->getAlignment(); + + // Rewrite all loads of the PN to use the new PHI. + while (!PN.use_empty()) { + LoadInst *LI = cast<LoadInst>(*PN.use_begin()); + LI->replaceAllUsesWith(NewPN); + LI->eraseFromParent(); + } + + // Inject loads into all of the pred blocks. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { + BasicBlock *Pred = PN.getIncomingBlock(Idx); + TerminatorInst *TI = Pred->getTerminator(); + Value *InVal = PN.getIncomingValue(Idx); + IRBuilderTy PredBuilder(TI); + + LoadInst *Load = PredBuilder.CreateLoad( + InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName())); + ++NumLoadsSpeculated; + Load->setAlignment(Align); + if (TBAATag) + Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); + NewPN->addIncoming(Load, Pred); + } + + DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); + PN.eraseFromParent(); +} - // Replace the Use in the PartitionUse for this operand with the Use - // inside the load. - AllocaPartitioning::use_iterator UI - = P.findPartitionUseForPHIOrSelectOperand(InUse); - assert(isa<PHINode>(*UI->getUse()->getUser())); - UI->setUse(&Load->getOperandUse(Load->getPointerOperandIndex())); - } - DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); - } - - /// Select instructions that use an alloca and are subsequently loaded can be - /// rewritten to load both input pointers and then select between the result, - /// allowing the load of the alloca to be promoted. - /// From this: - /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other - /// %V = load i32* %P2 - /// to: - /// %V1 = load i32* %Alloca -> will be mem2reg'd - /// %V2 = load i32* %Other - /// %V = select i1 %cond, i32 %V1, i32 %V2 - /// - /// We can do this to a select if its only uses are loads and if the operand - /// to the select can be loaded unconditionally. - bool isSafeSelectToSpeculate(SelectInst &SI, - SmallVectorImpl<LoadInst *> &Loads) { - Value *TValue = SI.getTrueValue(); - Value *FValue = SI.getFalseValue(); - bool TDerefable = TValue->isDereferenceablePointer(); - bool FDerefable = FValue->isDereferenceablePointer(); - - for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); - UI != UE; ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) return false; - - // Both operands to the select need to be dereferencable, either - // absolutely (e.g. allocas) or at this point because we can see other - // accesses to it. - if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, - LI->getAlignment(), &TD)) - return false; - if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, - LI->getAlignment(), &TD)) - return false; - Loads.push_back(LI); - } +/// Select instructions that use an alloca and are subsequently loaded can be +/// rewritten to load both input pointers and then select between the result, +/// allowing the load of the alloca to be promoted. +/// From this: +/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other +/// %V = load i32* %P2 +/// to: +/// %V1 = load i32* %Alloca -> will be mem2reg'd +/// %V2 = load i32* %Other +/// %V = select i1 %cond, i32 %V1, i32 %V2 +/// +/// We can do this to a select if its only uses are loads and if the operand +/// to the select can be loaded unconditionally. 
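///
/// Note that after the rewrite both loads execute unconditionally, so each
/// select operand must be known dereferenceable (an alloca, or a pointer that
/// is otherwise provably safe to load) for the speculation to be legal; that
/// is exactly what the checks below establish.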
+static bool isSafeSelectToSpeculate(SelectInst &SI, const DataLayout *DL = 0) { + Value *TValue = SI.getTrueValue(); + Value *FValue = SI.getFalseValue(); + bool TDerefable = TValue->isDereferenceablePointer(); + bool FDerefable = FValue->isDereferenceablePointer(); + + for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); UI != UE; + ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) + return false; - return true; + // Both operands to the select need to be dereferencable, either + // absolutely (e.g. allocas) or at this point because we can see other + // accesses to it. + if (!TDerefable && + !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL)) + return false; + if (!FDerefable && + !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL)) + return false; } - void visitSelectInst(SelectInst &SI) { - DEBUG(dbgs() << " original: " << SI << "\n"); - - // If the select isn't safe to speculate, just use simple logic to emit it. - SmallVector<LoadInst *, 4> Loads; - if (!isSafeSelectToSpeculate(SI, Loads)) - return; + return true; +} - IRBuilderTy IRB(&SI); - Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; - AllocaPartitioning::iterator PIs[2]; - PartitionUse PUs[2]; - for (unsigned i = 0, e = 2; i != e; ++i) { - PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); - if (PIs[i] != P.end()) { - // If the pointer is within the partitioning, remove the select from - // its uses. We'll add in the new loads below. - AllocaPartitioning::use_iterator UI - = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); - PUs[i] = *UI; - // Clear out the use here so that the offsets into the use list remain - // stable but this use is ignored when rewriting. - UI->setUse(0); - } - } +static void speculateSelectInstLoads(SelectInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); - Value *TV = SI.getTrueValue(); - Value *FV = SI.getFalseValue(); - // Replace the loads of the select with a select of two loads. - while (!Loads.empty()) { - LoadInst *LI = Loads.pop_back_val(); + IRBuilderTy IRB(&SI); + Value *TV = SI.getTrueValue(); + Value *FV = SI.getFalseValue(); + // Replace the loads of the select with a select of two loads. + while (!SI.use_empty()) { + LoadInst *LI = cast<LoadInst>(*SI.use_begin()); + assert(LI->isSimple() && "We only speculate simple loads"); - IRB.SetInsertPoint(LI); - LoadInst *TL = + IRB.SetInsertPoint(LI); + LoadInst *TL = IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); - LoadInst *FL = + LoadInst *FL = IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); - NumLoadsSpeculated += 2; - - // Transfer alignment and TBAA info if present. - TL->setAlignment(LI->getAlignment()); - FL->setAlignment(LI->getAlignment()); - if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { - TL->setMetadata(LLVMContext::MD_tbaa, Tag); - FL->setMetadata(LLVMContext::MD_tbaa, Tag); - } + NumLoadsSpeculated += 2; - Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, - LI->getName() + ".sroa.speculated"); + // Transfer alignment and TBAA info if present. 
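    // Propagate the original load's alignment and TBAA tag to both of the new
    // loads so no metadata is lost in the rewrite.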
+ TL->setAlignment(LI->getAlignment()); + FL->setAlignment(LI->getAlignment()); + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { + TL->setMetadata(LLVMContext::MD_tbaa, Tag); + FL->setMetadata(LLVMContext::MD_tbaa, Tag); + } - LoadInst *Loads[2] = { TL, FL }; - for (unsigned i = 0, e = 2; i != e; ++i) { - if (PIs[i] != P.end()) { - Use *LoadUse = &Loads[i]->getOperandUse(0); - assert(PUs[i].getUse()->get() == LoadUse->get()); - PUs[i].setUse(LoadUse); - P.use_push_back(PIs[i], PUs[i]); - } - } + Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, + LI->getName() + ".sroa.speculated"); - DEBUG(dbgs() << " speculated to: " << *V << "\n"); - LI->replaceAllUsesWith(V); - Pass.DeadInsts.insert(LI); - } + DEBUG(dbgs() << " speculated to: " << *V << "\n"); + LI->replaceAllUsesWith(V); + LI->eraseFromParent(); } -}; + SI.eraseFromParent(); } /// \brief Build a GEP out of a base pointer and indices. @@ -1737,7 +1209,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, /// TargetTy. If we can't find one with the same type, we at least try to use /// one with the same size. If none of that works, we just produce the GEP as /// indicated by Indices to have the correct offset. -static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, Value *BasePtr, Type *Ty, Type *TargetTy, SmallVectorImpl<Value *> &Indices) { if (Ty == TargetTy) @@ -1754,7 +1226,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD, ElementTy = SeqTy->getElementType(); // Note that we use the default address space as this index is over an // array or a vector, not a pointer. - Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); + Indices.push_back(IRB.getInt(APInt(DL.getPointerSizeInBits(0), 0))); } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { if (STy->element_begin() == STy->element_end()) break; // Nothing left to descend into. @@ -1775,12 +1247,12 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD, /// /// This is the recursive step for getNaturalGEPWithOffset that walks down the /// element types adding appropriate indices for the GEP. -static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, Type *Ty, APInt &Offset, Type *TargetTy, SmallVectorImpl<Value *> &Indices) { if (Offset == 0) - return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices); + return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices); // We can't recurse through pointer types. if (Ty->isPointerTy()) @@ -1790,7 +1262,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, // extremely poorly defined currently. The long-term goal is to remove GEPing // over a vector from the IR completely. if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { - unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType()); + unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); if (ElementSizeInBits % 8) return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 
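    // Work in bytes: dividing the remaining offset by the element size tells
    // us how many whole vector elements to step over; the recursion below then
    // resolves whatever offset is left within a single element.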
APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); @@ -1799,20 +1271,20 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, return 0; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), + return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(), Offset, TargetTy, Indices); } if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { Type *ElementTy = ArrTy->getElementType(); - APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(ArrTy->getNumElements())) return 0; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices); } @@ -1820,18 +1292,18 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, if (!STy) return 0; - const StructLayout *SL = TD.getStructLayout(STy); + const StructLayout *SL = DL.getStructLayout(STy); uint64_t StructOffset = Offset.getZExtValue(); if (StructOffset >= SL->getSizeInBytes()) return 0; unsigned Index = SL->getElementContainingOffset(StructOffset); Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); Type *ElementTy = STy->getElementType(Index); - if (Offset.uge(TD.getTypeAllocSize(ElementTy))) + if (Offset.uge(DL.getTypeAllocSize(ElementTy))) return 0; // The offset points into alignment padding. Indices.push_back(IRB.getInt32(Index)); - return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices); } @@ -1845,7 +1317,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, /// Indices, and setting Ty to the result subtype. /// /// If no natural GEP can be constructed, this function returns null. -static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *TargetTy, SmallVectorImpl<Value *> &Indices) { PointerType *Ty = cast<PointerType>(Ptr->getType()); @@ -1858,14 +1330,14 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD, Type *ElementTy = Ty->getElementType(); if (!ElementTy->isSized()) return 0; // We can't GEP through an unsized element. - APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); if (ElementSize == 0) return 0; // Zero-length arrays can't help us build a natural GEP. APInt NumSkippedElements = Offset.sdiv(ElementSize); Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices); } @@ -1884,7 +1356,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD, /// properties. The algorithm tries to fold as many constant indices into /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. 
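///
/// As an illustrative (hypothetical) example with 64-bit pointers: adjusting a
/// pointer to a [4 x i32] alloca by a constant offset of 8 bytes to type i32*
/// can be expressed as the natural GEP
///   %gep = getelementptr [4 x i32]* %alloca, i64 0, i64 2
/// rather than falling back to raw byte arithmetic over an i8*.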
-static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *PointerTy) { // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. @@ -1908,7 +1380,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD, // First fold any existing GEPs into the offset. while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { APInt GEPOffset(Offset.getBitWidth(), 0); - if (!GEP->accumulateConstantOffset(TD, GEPOffset)) + if (!GEP->accumulateConstantOffset(DL, GEPOffset)) break; Offset += GEPOffset; Ptr = GEP->getPointerOperand(); @@ -1918,7 +1390,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD, // See if we can perform a natural GEP here. Indices.clear(); - if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, + if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, Indices)) { if (P->getType() == PointerTy) { // Zap any offset pointer that we ended up computing in previous rounds. @@ -1989,6 +1461,10 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) return false; + // We can convert pointers to integers and vice-versa. Same for vectors + // of pointers and integers. + OldTy = OldTy->getScalarType(); + NewTy = NewTy->getScalarType(); if (NewTy->isPointerTy() || OldTy->isPointerTy()) { if (NewTy->isPointerTy() && OldTy->isPointerTy()) return true; @@ -2007,24 +1483,126 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test /// two types for viability with this routine. static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, - Type *Ty) { - assert(canConvertValue(DL, V->getType(), Ty) && - "Value not convertable to type"); - if (V->getType() == Ty) + Type *NewTy) { + Type *OldTy = V->getType(); + assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); + + if (OldTy == NewTy) return V; - if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType())) - if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty)) + + if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) + if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) if (NewITy->getBitWidth() > OldITy->getBitWidth()) return IRB.CreateZExt(V, NewITy); - if (V->getType()->isIntegerTy() && Ty->isPointerTy()) - return IRB.CreateIntToPtr(V, Ty); - if (V->getType()->isPointerTy() && Ty->isIntegerTy()) - return IRB.CreatePtrToInt(V, Ty); - return IRB.CreateBitCast(V, Ty); + // See if we need inttoptr for this type pair. A cast involving both scalars + // and vectors requires and additional bitcast. + if (OldTy->getScalarType()->isIntegerTy() && + NewTy->getScalarType()->isPointerTy()) { + // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* + if (OldTy->isVectorTy() && !NewTy->isVectorTy()) + return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), + NewTy); + + // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> + if (!OldTy->isVectorTy() && NewTy->isVectorTy()) + return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), + NewTy); + + return IRB.CreateIntToPtr(V, NewTy); + } + + // See if we need ptrtoint for this type pair. A cast involving both scalars + // and vectors requires and additional bitcast. 
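  // (ptrtoint on a vector of pointers yields a vector of integers, and a
  // bitcast cannot change the total bit width, so collapsing that to or from a
  // single scalar integer needs the extra bitcast used in the expansions
  // below.)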
+ if (OldTy->getScalarType()->isPointerTy() && + NewTy->getScalarType()->isIntegerTy()) { + // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128 + if (OldTy->isVectorTy() && !NewTy->isVectorTy()) + return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + NewTy); + + // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> + if (!OldTy->isVectorTy() && NewTy->isVectorTy()) + return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + NewTy); + + return IRB.CreatePtrToInt(V, NewTy); + } + + return IRB.CreateBitCast(V, NewTy); } -/// \brief Test whether the given alloca partition can be promoted to a vector. +/// \brief Test whether the given slice use can be promoted to a vector. +/// +/// This function is called to test each entry in a partioning which is slated +/// for a single slice. +static bool isVectorPromotionViableForSlice( + const DataLayout &DL, AllocaSlices &S, uint64_t SliceBeginOffset, + uint64_t SliceEndOffset, VectorType *Ty, uint64_t ElementSize, + AllocaSlices::const_iterator I) { + // First validate the slice offsets. + uint64_t BeginOffset = + std::max(I->beginOffset(), SliceBeginOffset) - SliceBeginOffset; + uint64_t BeginIndex = BeginOffset / ElementSize; + if (BeginIndex * ElementSize != BeginOffset || + BeginIndex >= Ty->getNumElements()) + return false; + uint64_t EndOffset = + std::min(I->endOffset(), SliceEndOffset) - SliceBeginOffset; + uint64_t EndIndex = EndOffset / ElementSize; + if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements()) + return false; + + assert(EndIndex > BeginIndex && "Empty vector!"); + uint64_t NumElements = EndIndex - BeginIndex; + Type *SliceTy = + (NumElements == 1) ? Ty->getElementType() + : VectorType::get(Ty->getElementType(), NumElements); + + Type *SplitIntTy = + Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8); + + Use *U = I->getUse(); + + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { + if (MI->isVolatile()) + return false; + if (!I->isSplittable()) + return false; // Skip any unsplittable intrinsics. + } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { + // Disable vector promotion when there are loads or stores of an FCA. + return false; + } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { + if (LI->isVolatile()) + return false; + Type *LTy = LI->getType(); + if (SliceBeginOffset > I->beginOffset() || + SliceEndOffset < I->endOffset()) { + assert(LTy->isIntegerTy()); + LTy = SplitIntTy; + } + if (!canConvertValue(DL, SliceTy, LTy)) + return false; + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { + if (SI->isVolatile()) + return false; + Type *STy = SI->getValueOperand()->getType(); + if (SliceBeginOffset > I->beginOffset() || + SliceEndOffset < I->endOffset()) { + assert(STy->isIntegerTy()); + STy = SplitIntTy; + } + if (!canConvertValue(DL, STy, SliceTy)) + return false; + } else { + return false; + } + + return true; +} + +/// \brief Test whether the given alloca partitioning and range of slices can be +/// promoted to a vector. /// /// This is a quick test to check whether we can rewrite a particular alloca /// partition (and its newly formed alloca) into a vector alloca with only @@ -2032,75 +1610,103 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. 
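///
/// A minimal standalone sketch (illustrative names, not the code below) of the
/// per-slice geometry check performed by isVectorPromotionViableForSlice: it
/// assumes the slice has already been clamped and rebased to the partition and
/// deliberately ignores the splittability, volatility, and type-conversion
/// requirements.
///
///   #include <cstdint>  // only needed if the sketch is compiled on its own
///
///   static bool sliceCoversWholeElements(uint64_t SliceBegin,
///                                        uint64_t SliceEnd,
///                                        uint64_t ElementSize,
///                                        uint64_t NumElements) {
///     uint64_t BeginIndex = SliceBegin / ElementSize;
///     uint64_t EndIndex = SliceEnd / ElementSize;
///     return BeginIndex * ElementSize == SliceBegin && // starts on an element
///            EndIndex * ElementSize == SliceEnd &&     // ends on an element
///            EndIndex > BeginIndex &&                  // and is non-empty
///            BeginIndex < NumElements && EndIndex <= NumElements;
///   }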
-static bool isVectorPromotionViable(const DataLayout &TD, - Type *AllocaTy, - AllocaPartitioning &P, - uint64_t PartitionBeginOffset, - uint64_t PartitionEndOffset, - AllocaPartitioning::const_use_iterator I, - AllocaPartitioning::const_use_iterator E) { +static bool +isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, AllocaSlices &S, + uint64_t SliceBeginOffset, uint64_t SliceEndOffset, + AllocaSlices::const_iterator I, + AllocaSlices::const_iterator E, + ArrayRef<AllocaSlices::iterator> SplitUses) { VectorType *Ty = dyn_cast<VectorType>(AllocaTy); if (!Ty) return false; - uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType()); + uint64_t ElementSize = DL.getTypeSizeInBits(Ty->getScalarType()); // While the definition of LLVM vectors is bitpacked, we don't support sizes // that aren't byte sized. if (ElementSize % 8) return false; - assert((TD.getTypeSizeInBits(Ty) % 8) == 0 && + assert((DL.getTypeSizeInBits(Ty) % 8) == 0 && "vector size not a multiple of element size?"); ElementSize /= 8; - for (; I != E; ++I) { - Use *U = I->getUse(); - if (!U) - continue; // Skip dead use. - - uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; - uint64_t BeginIndex = BeginOffset / ElementSize; - if (BeginIndex * ElementSize != BeginOffset || - BeginIndex >= Ty->getNumElements()) + for (; I != E; ++I) + if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset, + SliceEndOffset, Ty, ElementSize, I)) return false; - uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; - uint64_t EndIndex = EndOffset / ElementSize; - if (EndIndex * ElementSize != EndOffset || - EndIndex > Ty->getNumElements()) + + for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), + SUE = SplitUses.end(); + SUI != SUE; ++SUI) + if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset, + SliceEndOffset, Ty, ElementSize, *SUI)) return false; - assert(EndIndex > BeginIndex && "Empty vector!"); - uint64_t NumElements = EndIndex - BeginIndex; - Type *PartitionTy - = (NumElements == 1) ? Ty->getElementType() - : VectorType::get(Ty->getElementType(), NumElements); + return true; +} - if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { - if (MI->isVolatile()) - return false; - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) { - const AllocaPartitioning::MemTransferOffsets &MTO - = P.getMemTransferOffsets(*MTI); - if (!MTO.IsSplittable) - return false; - } - } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { - // Disable vector promotion when there are loads or stores of an FCA. +/// \brief Test whether a slice of an alloca is valid for integer widening. +/// +/// This implements the necessary checking for the \c isIntegerWideningViable +/// test below on a single slice of the alloca. +static bool isIntegerWideningViableForSlice(const DataLayout &DL, + Type *AllocaTy, + uint64_t AllocBeginOffset, + uint64_t Size, AllocaSlices &S, + AllocaSlices::const_iterator I, + bool &WholeAllocaOp) { + uint64_t RelBegin = I->beginOffset() - AllocBeginOffset; + uint64_t RelEnd = I->endOffset() - AllocBeginOffset; + + // We can't reasonably handle cases where the load or store extends past + // the end of the aloca's type and into its padding. 
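    // RelBegin and RelEnd rebase the slice to the start of this partition so
    // they can be compared directly against Size, the store size of the
    // candidate type whose accesses we would widen.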
+ if (RelEnd > Size) + return false; + + Use *U = I->getUse(); + + if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { + if (LI->isVolatile()) return false; - } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { - if (LI->isVolatile()) - return false; - if (!canConvertValue(TD, PartitionTy, LI->getType())) - return false; - } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { - if (SI->isVolatile()) + if (RelBegin == 0 && RelEnd == Size) + WholeAllocaOp = true; + if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { + if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) return false; - if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy)) + } else if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(DL, AllocaTy, LI->getType())) { + // Non-integer loads need to be convertible from the alloca type so that + // they are promotable. + return false; + } + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { + Type *ValueTy = SI->getValueOperand()->getType(); + if (SI->isVolatile()) + return false; + if (RelBegin == 0 && RelEnd == Size) + WholeAllocaOp = true; + if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { + if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) return false; - } else { + } else if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(DL, ValueTy, AllocaTy)) { + // Non-integer stores need to be convertible to the alloca type so that + // they are promotable. return false; } + } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { + if (MI->isVolatile() || !isa<Constant>(MI->getLength())) + return false; + if (!I->isSplittable()) + return false; // Skip any unsplittable intrinsics. + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return false; + } else { + return false; } + return true; } @@ -2110,97 +1716,50 @@ static bool isVectorPromotionViable(const DataLayout &TD, /// This is a quick test to check whether we can rewrite the integer loads and /// stores to a particular alloca into wider loads and stores and be able to /// promote the resulting alloca. -static bool isIntegerWideningViable(const DataLayout &TD, - Type *AllocaTy, - uint64_t AllocBeginOffset, - AllocaPartitioning &P, - AllocaPartitioning::const_use_iterator I, - AllocaPartitioning::const_use_iterator E) { - uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy); +static bool +isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy, + uint64_t AllocBeginOffset, AllocaSlices &S, + AllocaSlices::const_iterator I, + AllocaSlices::const_iterator E, + ArrayRef<AllocaSlices::iterator> SplitUses) { + uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); // Don't create integer types larger than the maximum bitwidth. if (SizeInBits > IntegerType::MAX_INT_BITS) return false; // Don't try to handle allocas with bit-padding. - if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy)) + if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy)) return false; // We need to ensure that an integer type with the appropriate bitwidth can // be converted to the alloca type, whatever that is. We don't want to force // the alloca itself to have an integer type if there is a more suitable one. 
Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); - if (!canConvertValue(TD, AllocaTy, IntTy) || - !canConvertValue(TD, IntTy, AllocaTy)) + if (!canConvertValue(DL, AllocaTy, IntTy) || + !canConvertValue(DL, IntTy, AllocaTy)) return false; - uint64_t Size = TD.getTypeStoreSize(AllocaTy); - - // Check the uses to ensure the uses are (likely) promotable integer uses. - // Also ensure that the alloca has a covering load or store. We don't want - // to widen the integer operations only to fail to promote due to some other - // unsplittable entry (which we may make splittable later). - bool WholeAllocaOp = false; - for (; I != E; ++I) { - Use *U = I->getUse(); - if (!U) - continue; // Skip dead use. + uint64_t Size = DL.getTypeStoreSize(AllocaTy); - uint64_t RelBegin = I->BeginOffset - AllocBeginOffset; - uint64_t RelEnd = I->EndOffset - AllocBeginOffset; + // While examining uses, we ensure that the alloca has a covering load or + // store. We don't want to widen the integer operations only to fail to + // promote due to some other unsplittable entry (which we may make splittable + // later). However, if there are only splittable uses, go ahead and assume + // that we cover the alloca. + bool WholeAllocaOp = (I != E) ? false : DL.isLegalInteger(SizeInBits); - // We can't reasonably handle cases where the load or store extends past - // the end of the aloca's type and into its padding. - if (RelEnd > Size) + for (; I != E; ++I) + if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, + S, I, WholeAllocaOp)) return false; - if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { - if (LI->isVolatile()) - return false; - if (RelBegin == 0 && RelEnd == Size) - WholeAllocaOp = true; - if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { - if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) - return false; - continue; - } - // Non-integer loads need to be convertible from the alloca type so that - // they are promotable. - if (RelBegin != 0 || RelEnd != Size || - !canConvertValue(TD, AllocaTy, LI->getType())) - return false; - } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { - Type *ValueTy = SI->getValueOperand()->getType(); - if (SI->isVolatile()) - return false; - if (RelBegin == 0 && RelEnd == Size) - WholeAllocaOp = true; - if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { - if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) - return false; - continue; - } - // Non-integer stores need to be convertible to the alloca type so that - // they are promotable. 
- // they are promotable. - if (RelBegin != 0 || RelEnd != Size || - !canConvertValue(TD, ValueTy, AllocaTy)) - return false; - } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { - if (MI->isVolatile() || !isa<Constant>(MI->getLength())) - return false; - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) { - const AllocaPartitioning::MemTransferOffsets &MTO - = P.getMemTransferOffsets(*MTI); - if (!MTO.IsSplittable) - return false; - } - } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { - if (II->getIntrinsicID() != Intrinsic::lifetime_start && - II->getIntrinsicID() != Intrinsic::lifetime_end) - return false; - } else { + for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), + SUE = SplitUses.end(); + SUI != SUE; ++SUI) + if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, + S, *SUI, WholeAllocaOp)) return false; - } - } + return WholeAllocaOp; } @@ -2335,19 +1894,19 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, } namespace { -/// \brief Visitor to rewrite instructions using a partition of an alloca to -/// use a new alloca. +/// \brief Visitor to rewrite instructions using a particular slice of an alloca +/// to use a new alloca. /// /// Also implements the rewriting to vector-based accesses when the partition /// passes the isVectorPromotionViable predicate. Most of the rewriting logic /// lives here. -class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, - bool> { +class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. - friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; + friend class llvm::InstVisitor<AllocaSliceRewriter, bool>; + typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; - const DataLayout &TD; - AllocaPartitioning &P; + const DataLayout &DL; + AllocaSlices &S; SROA &Pass; AllocaInst &OldAI, &NewAI; const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; @@ -2372,106 +1931,112 @@ class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, // integer type will be stored here for easy access during rewriting. IntegerType *IntTy; - // The offset of the partition user currently being rewritten. + // The offset of the slice currently being rewritten. uint64_t BeginOffset, EndOffset; + bool IsSplittable; bool IsSplit; Use *OldUse; Instruction *OldPtr; + // Output members carrying state about the result of visiting and rewriting + // the slice of the alloca. + bool IsUsedByRewrittenSpeculatableInstructions; + // Utility IR builder, whose name prefix is set up for each visited use, and // the insertion point is set to point to the user. IRBuilderTy IRB; public: - AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, - AllocaPartitioning::iterator PI, - SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, - uint64_t NewBeginOffset, uint64_t NewEndOffset) - : TD(TD), P(P), Pass(Pass), - OldAI(OldAI), NewAI(NewAI), - NewAllocaBeginOffset(NewBeginOffset), - NewAllocaEndOffset(NewEndOffset), - NewAllocaTy(NewAI.getAllocatedType()), - VecTy(), ElementTy(), ElementSize(), IntTy(), - BeginOffset(), EndOffset(), IsSplit(), OldUse(), OldPtr(), - IRB(NewAI.getContext(), ConstantFolder()) { - } - - /// \brief Visit the users of the alloca partition and rewrite them.
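For contrast with the vector check, here is a rough standalone sketch of the covering-access rule enforced by the integer-widening test that concludes above: every use must stay within the alloca's store size, and some load or store must span the whole alloca for widening to pay off. The struct and helper names are illustrative, not the pass's real data structures.

#include <cstdint>
#include <iostream>
#include <vector>

struct UseRange {
  uint64_t Begin, End; // Offsets relative to the alloca.
  bool IsLoadOrStore;  // Only loads/stores can act as the covering access.
};

// Simplified model: reject any use that runs past the alloca's store size and
// require at least one load or store spanning the whole alloca. (The real
// test also handles memset/memcpy, lifetime intrinsics and split uses.)
static bool wideningViable(const std::vector<UseRange> &Uses,
                           uint64_t AllocaSize) {
  bool WholeAllocaOp = false;
  for (size_t i = 0, e = Uses.size(); i != e; ++i) {
    if (Uses[i].End > AllocaSize)
      return false; // Extends into padding past the alloca's type.
    if (Uses[i].IsLoadOrStore && Uses[i].Begin == 0 &&
        Uses[i].End == AllocaSize)
      WholeAllocaOp = true;
  }
  return WholeAllocaOp;
}

int main() {
  std::vector<UseRange> Uses;
  UseRange A = {0, 8, true}, B = {4, 8, true};
  Uses.push_back(A);
  Uses.push_back(B);
  std::cout << wideningViable(Uses, 8) << '\n';  // 1: the first use covers it
  std::cout << wideningViable(Uses, 16) << '\n'; // 0: nothing spans all 16 bytes
}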
- bool visitUsers(AllocaPartitioning::const_use_iterator I, - AllocaPartitioning::const_use_iterator E) { - if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, - NewAllocaBeginOffset, NewAllocaEndOffset, - I, E)) { - ++NumVectorized; - VecTy = cast<VectorType>(NewAI.getAllocatedType()); - ElementTy = VecTy->getElementType(); - assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 && + AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass, + AllocaInst &OldAI, AllocaInst &NewAI, + uint64_t NewBeginOffset, uint64_t NewEndOffset, + bool IsVectorPromotable = false, + bool IsIntegerPromotable = false) + : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI), + NewAllocaBeginOffset(NewBeginOffset), NewAllocaEndOffset(NewEndOffset), + NewAllocaTy(NewAI.getAllocatedType()), + VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : 0), + ElementTy(VecTy ? VecTy->getElementType() : 0), + ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), + IntTy(IsIntegerPromotable + ? Type::getIntNTy( + NewAI.getContext(), + DL.getTypeSizeInBits(NewAI.getAllocatedType())) + : 0), + BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), + OldPtr(), IsUsedByRewrittenSpeculatableInstructions(false), + IRB(NewAI.getContext(), ConstantFolder()) { + if (VecTy) { + assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && "Only multiple-of-8 sized vector elements are viable"); - ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8; - } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), - NewAllocaBeginOffset, P, I, E)) { - IntTy = Type::getIntNTy(NewAI.getContext(), - TD.getTypeSizeInBits(NewAI.getAllocatedType())); + ++NumVectorized; } + assert((!IsVectorPromotable && !IsIntegerPromotable) || + IsVectorPromotable != IsIntegerPromotable); + } + + bool visit(AllocaSlices::const_iterator I) { bool CanSROA = true; - for (; I != E; ++I) { - if (!I->getUse()) - continue; // Skip dead uses. - BeginOffset = I->BeginOffset; - EndOffset = I->EndOffset; - IsSplit = I->isSplit(); - OldUse = I->getUse(); - OldPtr = cast<Instruction>(OldUse->get()); - - Instruction *OldUserI = cast<Instruction>(OldUse->getUser()); - IRB.SetInsertPoint(OldUserI); - IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); - IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + - "."); - - CanSROA &= visit(cast<Instruction>(OldUse->getUser())); - } - if (VecTy) { + BeginOffset = I->beginOffset(); + EndOffset = I->endOffset(); + IsSplittable = I->isSplittable(); + IsSplit = + BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; + + OldUse = I->getUse(); + OldPtr = cast<Instruction>(OldUse->get()); + + Instruction *OldUserI = cast<Instruction>(OldUse->getUser()); + IRB.SetInsertPoint(OldUserI); + IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); + IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); + + CanSROA &= visit(cast<Instruction>(OldUse->getUser())); + if (VecTy || IntTy) assert(CanSROA); - VecTy = 0; - ElementTy = 0; - ElementSize = 0; - } - if (IntTy) { - assert(CanSROA); - IntTy = 0; - } return CanSROA; } + /// \brief Query whether this slice is used by speculatable instructions after + /// rewriting. + /// + /// These instructions (PHIs and Selects currently) require the alloca slice + /// to run back through the rewriter. Thus, they are promotable, but not on + /// this iteration. 
This is distinct from a slice which is unpromotable for + /// some other reason, in which case we don't even want to perform the + /// speculation. This can be queried at any time and reflects whether (at + /// that point) a visit call has rewritten a speculatable instruction on the + /// current slice. + bool isUsedByRewrittenSpeculatableInstructions() const { + return IsUsedByRewrittenSpeculatableInstructions; + } + private: + // Make sure the other visit overloads are visible. + using Base::visit; + // Every instruction which can end up as a user must have a rewrite rule. bool visitInstruction(Instruction &I) { DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); llvm_unreachable("No rewrite rule for this instruction!"); } - Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, Type *PointerTy) { - assert(BeginOffset >= NewAllocaBeginOffset); - APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); - return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy); + Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, uint64_t Offset, + Type *PointerTy) { + assert(Offset >= NewAllocaBeginOffset); + return getAdjustedPtr(IRB, DL, &NewAI, APInt(DL.getPointerSizeInBits(), + Offset - NewAllocaBeginOffset), + PointerTy); } /// \brief Compute suitable alignment to access an offset into the new alloca. unsigned getOffsetAlign(uint64_t Offset) { unsigned NewAIAlign = NewAI.getAlignment(); if (!NewAIAlign) - NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); + NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); return MinAlign(NewAIAlign, Offset); } - /// \brief Compute suitable alignment to access this partition of the new - /// alloca. - unsigned getPartitionAlign() { - return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); - } - /// \brief Compute suitable alignment to access a type at an offset of the /// new alloca. /// @@ -2479,15 +2044,7 @@ private: /// otherwise returns the maximal suitable alignment. unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { unsigned Align = getOffsetAlign(Offset); - return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; - } - - /// \brief Compute suitable alignment to access a type at the beginning of - /// this partition of the new alloca. - /// - /// See \c getOffsetTypeAlign for details; this routine delegates to it. - unsigned getPartitionTypeAlign(Type *Ty) { - return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); + return Align == DL.getABITypeAlignment(Ty) ?
0 : Align; } unsigned getIndex(uint64_t Offset) { @@ -2505,9 +2062,10 @@ private: Pass.DeadInsts.insert(I); } - Value *rewriteVectorizedLoadInst() { - unsigned BeginIndex = getIndex(BeginOffset); - unsigned EndIndex = getIndex(EndOffset); + Value *rewriteVectorizedLoadInst(uint64_t NewBeginOffset, + uint64_t NewEndOffset) { + unsigned BeginIndex = getIndex(NewBeginOffset); + unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), @@ -2515,16 +2073,17 @@ private: return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); } - Value *rewriteIntegerLoad(LoadInst &LI) { + Value *rewriteIntegerLoad(LoadInst &LI, uint64_t NewBeginOffset, + uint64_t NewEndOffset) { assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); - V = convertValue(TD, IRB, V, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - if (Offset > 0 || EndOffset < NewAllocaEndOffset) - V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, + V = convertValue(DL, IRB, V, IntTy); + assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) + V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset, "extract"); return V; } @@ -2534,37 +2093,44 @@ private: Value *OldOp = LI.getOperand(0); assert(OldOp == OldPtr); - uint64_t Size = EndOffset - BeginOffset; + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + + uint64_t Size = NewEndOffset - NewBeginOffset; Type *TargetTy = IsSplit ? 
Type::getIntNTy(LI.getContext(), Size * 8) : LI.getType(); bool IsPtrAdjusted = false; Value *V; if (VecTy) { - V = rewriteVectorizedLoadInst(); + V = rewriteVectorizedLoadInst(NewBeginOffset, NewEndOffset); } else if (IntTy && LI.getType()->isIntegerTy()) { - V = rewriteIntegerLoad(LI); - } else if (BeginOffset == NewAllocaBeginOffset && - canConvertValue(TD, NewAllocaTy, LI.getType())) { + V = rewriteIntegerLoad(LI, NewBeginOffset, NewEndOffset); + } else if (NewBeginOffset == NewAllocaBeginOffset && + canConvertValue(DL, NewAllocaTy, LI.getType())) { V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), "load"); } else { Type *LTy = TargetTy->getPointerTo(); - V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy), - getPartitionTypeAlign(TargetTy), - LI.isVolatile(), "load"); + V = IRB.CreateAlignedLoad( + getAdjustedAllocaPtr(IRB, NewBeginOffset, LTy), + getOffsetTypeAlign(TargetTy, NewBeginOffset - NewAllocaBeginOffset), + LI.isVolatile(), "load"); IsPtrAdjusted = true; } - V = convertValue(TD, IRB, V, TargetTy); + V = convertValue(DL, IRB, V, TargetTy); if (IsSplit) { assert(!LI.isVolatile()); assert(LI.getType()->isIntegerTy() && "Only integer type loads and stores are split"); - assert(Size < TD.getTypeStoreSize(LI.getType()) && + assert(Size < DL.getTypeStoreSize(LI.getType()) && "Split load isn't smaller than original load"); assert(LI.getType()->getIntegerBitWidth() == - TD.getTypeStoreSizeInBits(LI.getType()) && + DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); @@ -2574,7 +2140,7 @@ private: // LI only used for this computation. Value *Placeholder = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); - V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, + V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, "insert"); LI.replaceAllUsesWith(V); Placeholder->replaceAllUsesWith(&LI); @@ -2589,24 +2155,26 @@ private: return !LI.isVolatile() && !IsPtrAdjusted; } - bool rewriteVectorizedStoreInst(Value *V, - StoreInst &SI, Value *OldOp) { - unsigned BeginIndex = getIndex(BeginOffset); - unsigned EndIndex = getIndex(EndOffset); - assert(EndIndex > BeginIndex && "Empty vector!"); - unsigned NumElements = EndIndex - BeginIndex; - assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); - Type *PartitionTy - = (NumElements == 1) ? ElementTy - : VectorType::get(ElementTy, NumElements); - if (V->getType() != PartitionTy) - V = convertValue(TD, IRB, V, PartitionTy); - - // Mix in the existing elements. - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); - V = insertVector(IRB, Old, V, BeginIndex, "vec"); + bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp, + uint64_t NewBeginOffset, + uint64_t NewEndOffset) { + if (V->getType() != VecTy) { + unsigned BeginIndex = getIndex(NewBeginOffset); + unsigned EndIndex = getIndex(NewEndOffset); + assert(EndIndex > BeginIndex && "Empty vector!"); + unsigned NumElements = EndIndex - BeginIndex; + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + Type *SliceTy = + (NumElements == 1) ? ElementTy + : VectorType::get(ElementTy, NumElements); + if (V->getType() != SliceTy) + V = convertValue(DL, IRB, V, SliceTy); + // Mix in the existing elements. 
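// Illustrative example (not taken from this patch): storing an i64 that
// covers lanes [1,3) of a <4 x i32> alloca reloads the current <4 x i32>
// value, overwrites lanes 1 and 2 with the converted store value, and writes
// the merged vector back, leaving lanes 0 and 3 untouched.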
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + "load"); + V = insertVector(IRB, Old, V, BeginIndex, "vec"); + } StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.insert(&SI); @@ -2615,19 +2183,20 @@ private: return true; } - bool rewriteIntegerStore(Value *V, StoreInst &SI) { + bool rewriteIntegerStore(Value *V, StoreInst &SI, + uint64_t NewBeginOffset, uint64_t NewEndOffset) { assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); - if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { + if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); - Old = convertValue(TD, IRB, Old, IntTy); + Old = convertValue(DL, IRB, Old, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, + V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert"); } - V = convertValue(TD, IRB, V, NewAllocaTy); + V = convertValue(DL, IRB, V, NewAllocaTy); StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.insert(&SI); (void)Store; @@ -2648,37 +2217,45 @@ private: if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) Pass.PostPromotionWorklist.insert(AI); - uint64_t Size = EndOffset - BeginOffset; - if (Size < TD.getTypeStoreSize(V->getType())) { + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + + uint64_t Size = NewEndOffset - NewBeginOffset; + if (Size < DL.getTypeStoreSize(V->getType())) { assert(!SI.isVolatile()); - assert(IsSplit && "A seemingly split store isn't splittable"); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); assert(V->getType()->getIntegerBitWidth() == - TD.getTypeStoreSizeInBits(V->getType()) && + DL.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); - V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset, + V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, "extract"); } if (VecTy) - return rewriteVectorizedStoreInst(V, SI, OldOp); + return rewriteVectorizedStoreInst(V, SI, OldOp, NewBeginOffset, + NewEndOffset); if (IntTy && V->getType()->isIntegerTy()) - return rewriteIntegerStore(V, SI); + return rewriteIntegerStore(V, SI, NewBeginOffset, NewEndOffset); StoreInst *NewSI; - if (BeginOffset == NewAllocaBeginOffset && - EndOffset == NewAllocaEndOffset && - canConvertValue(TD, V->getType(), NewAllocaTy)) { - V = convertValue(TD, IRB, V, NewAllocaTy); + if (NewBeginOffset == NewAllocaBeginOffset && + NewEndOffset == NewAllocaEndOffset && + canConvertValue(DL, V->getType(), NewAllocaTy)) { + V = convertValue(DL, IRB, V, NewAllocaTy); NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), SI.isVolatile()); } else { - Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo()); - NewSI = IRB.CreateAlignedStore(V, NewPtr, - getPartitionTypeAlign(V->getType()), - SI.isVolatile()); + Value *NewPtr = getAdjustedAllocaPtr(IRB, NewBeginOffset, + V->getType()->getPointerTo()); + NewSI = IRB.CreateAlignedStore( + V, NewPtr, 
getOffsetTypeAlign( + V->getType(), NewBeginOffset - NewAllocaBeginOffset), + SI.isVolatile()); } (void)NewSI; Pass.DeadInsts.insert(&SI); @@ -2729,9 +2306,12 @@ private: // If the memset has a variable size, it cannot be split, just adjust the // pointer to the new alloca. if (!isa<Constant>(II.getLength())) { - II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + assert(!IsSplit); + assert(BeginOffset >= NewAllocaBeginOffset); + II.setDest( + getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType())); Type *CstTy = II.getAlignmentCst()->getType(); - II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); + II.setAlignment(ConstantInt::get(CstTy, getOffsetAlign(BeginOffset))); deleteIfTriviallyDead(OldPtr); return false; @@ -2743,21 +2323,26 @@ private: Type *AllocaTy = NewAI.getAllocatedType(); Type *ScalarTy = AllocaTy->getScalarType(); + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset; + // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memset. if (!VecTy && !IntTy && - (BeginOffset != NewAllocaBeginOffset || - EndOffset != NewAllocaEndOffset || + (BeginOffset > NewAllocaBeginOffset || + EndOffset < NewAllocaEndOffset || !AllocaTy->isSingleValueType() || - !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) || - TD.getTypeSizeInBits(ScalarTy)%8 != 0)) { + !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) || + DL.getTypeSizeInBits(ScalarTy)%8 != 0)) { Type *SizeTy = II.getLength()->getType(); - Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); - CallInst *New - = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, - II.getRawDest()->getType()), - II.getValue(), Size, getPartitionAlign(), - II.isVolatile()); + Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); + CallInst *New = IRB.CreateMemSet( + getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getRawDest()->getType()), + II.getValue(), Size, getOffsetAlign(SliceOffset), II.isVolatile()); (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return false; @@ -2774,15 +2359,15 @@ private: // If this is a memset of a vectorized alloca, insert it. assert(ElementTy == ScalarTy); - unsigned BeginIndex = getIndex(BeginOffset); - unsigned EndIndex = getIndex(EndOffset); + unsigned BeginIndex = getIndex(NewBeginOffset); + unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); Value *Splat = - getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ElementTy) / 8); - Splat = convertValue(TD, IRB, Splat, ElementTy); + getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8); + Splat = convertValue(DL, IRB, Splat, ElementTy); if (NumElements > 1) Splat = getVectorSplat(Splat, NumElements); @@ -2794,32 +2379,31 @@ private: // set integer. 
assert(!II.isVolatile()); - uint64_t Size = EndOffset - BeginOffset; + uint64_t Size = NewEndOffset - NewBeginOffset; V = getIntegerSplat(II.getValue(), Size); if (IntTy && (BeginOffset != NewAllocaBeginOffset || EndOffset != NewAllocaBeginOffset)) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); - Old = convertValue(TD, IRB, Old, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - V = insertInteger(TD, IRB, Old, V, Offset, "insert"); + Old = convertValue(DL, IRB, Old, IntTy); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + V = insertInteger(DL, IRB, Old, V, Offset, "insert"); } else { assert(V->getType() == IntTy && "Wrong type for an alloca wide integer!"); } - V = convertValue(TD, IRB, V, AllocaTy); + V = convertValue(DL, IRB, V, AllocaTy); } else { // Established these invariants above. - assert(BeginOffset == NewAllocaBeginOffset); - assert(EndOffset == NewAllocaEndOffset); + assert(NewBeginOffset == NewAllocaBeginOffset); + assert(NewEndOffset == NewAllocaEndOffset); - V = getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ScalarTy) / 8); + V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8); if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) V = getVectorSplat(V, AllocaVecTy->getNumElements()); - V = convertValue(TD, IRB, V, AllocaTy); + V = convertValue(DL, IRB, V, AllocaTy); } Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), @@ -2835,21 +2419,25 @@ private: DEBUG(dbgs() << " original: " << II << "\n"); + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); bool IsDest = II.getRawDest() == OldPtr; - const AllocaPartitioning::MemTransferOffsets &MTO - = P.getMemTransferOffsets(II); - // Compute the relative offset within the transfer. - unsigned IntPtrWidth = TD.getPointerSizeInBits(); - APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin - : MTO.SourceBegin)); + unsigned IntPtrWidth = DL.getPointerSizeInBits(); + APInt RelOffset(IntPtrWidth, NewBeginOffset - BeginOffset); unsigned Align = II.getAlignment(); + uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset; if (Align > 1) - Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), - MinAlign(II.getAlignment(), getPartitionAlign())); + Align = + MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), + MinAlign(II.getAlignment(), getOffsetAlign(SliceOffset))); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of @@ -2858,12 +2446,14 @@ private: // a variable length. We may also be dealing with memmove instead of // memcpy, and so simply updating the pointers is the necessary for us to // update both source and dest of a single call. - if (!MTO.IsSplittable) { + if (!IsSplittable) { Value *OldOp = IsDest ? 
II.getRawDest() : II.getRawSource(); if (IsDest) - II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + II.setDest( + getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType())); else - II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType())); + II.setSource(getAdjustedAllocaPtr(IRB, BeginOffset, + II.getRawSource()->getType())); Type *CstTy = II.getAlignmentCst()->getType(); II.setAlignment(ConstantInt::get(CstTy, Align)); @@ -2881,24 +2471,21 @@ private: // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memcpy. bool EmitMemCpy - = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset || - EndOffset != NewAllocaEndOffset || + = !VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset || + EndOffset < NewAllocaEndOffset || !NewAI.getAllocatedType()->isSingleValueType()); // If we're just going to emit a memcpy, the alloca hasn't changed, and the // size hasn't been shrunk based on analysis of the viable range, this is // a no-op. if (EmitMemCpy && &OldAI == &NewAI) { - uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; - uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; // Ensure the start lines up. - assert(BeginOffset == OrigBegin); - (void)OrigBegin; + assert(NewBeginOffset == BeginOffset); // Rewrite the size as needed. - if (EndOffset != OrigEnd) + if (NewEndOffset != EndOffset) II.setLength(ConstantInt::get(II.getLength()->getType(), - EndOffset - BeginOffset)); + NewEndOffset - NewBeginOffset)); return false; } // Record this instruction for deletion. @@ -2917,13 +2504,13 @@ private: // Compute the other pointer, folding as much as possible to produce // a single, simple GEP in most cases. - OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy); + OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy); - Value *OurPtr - = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() - : II.getRawSource()->getType()); + Value *OurPtr = getAdjustedAllocaPtr( + IRB, NewBeginOffset, + IsDest ? II.getRawDest()->getType() : II.getRawSource()->getType()); Type *SizeTy = II.getLength()->getType(); - Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); + Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, @@ -2939,11 +2526,11 @@ private: if (!Align) Align = 1; - bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && - EndOffset == NewAllocaEndOffset; - uint64_t Size = EndOffset - BeginOffset; - unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0; - unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0; + bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset && + NewEndOffset == NewAllocaEndOffset; + uint64_t Size = NewEndOffset - NewBeginOffset; + unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0; + unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; unsigned NumElements = EndIndex - BeginIndex; IntegerType *SubIntTy = IntTy ? 
Type::getIntNTy(IntTy->getContext(), Size*8) : 0; @@ -2960,7 +2547,7 @@ private: OtherPtrTy = SubIntTy->getPointerTo(); } - Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy); + Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy); Value *DstPtr = &NewAI; if (!IsDest) std::swap(SrcPtr, DstPtr); @@ -2973,10 +2560,9 @@ private: } else if (IntTy && !IsWholeAlloca && !IsDest) { Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); - Src = convertValue(TD, IRB, Src, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, "extract"); + Src = convertValue(DL, IRB, Src, IntTy); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); } else { Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), "copyload"); @@ -2989,11 +2575,10 @@ private: } else if (IntTy && !IsWholeAlloca && IsDest) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); - Old = convertValue(TD, IRB, Old, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - Src = insertInteger(TD, IRB, Old, Src, Offset, "insert"); - Src = convertValue(TD, IRB, Src, NewAllocaTy); + Old = convertValue(DL, IRB, Old, IntTy); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); + Src = convertValue(DL, IRB, Src, NewAllocaTy); } StoreInst *Store = cast<StoreInst>( @@ -3009,13 +2594,20 @@ private: DEBUG(dbgs() << " original: " << II << "\n"); assert(II.getArgOperand(1) == OldPtr); + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + // Record this instruction for deletion. Pass.DeadInsts.insert(&II); ConstantInt *Size = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), - EndOffset - BeginOffset); - Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); + NewEndOffset - NewBeginOffset); + Value *Ptr = + getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getArgOperand(1)->getType()); Value *New; if (II.getIntrinsicID() == Intrinsic::lifetime_start) New = IRB.CreateLifetimeStart(Ptr, Size); @@ -3029,30 +2621,45 @@ private: bool visitPHINode(PHINode &PN) { DEBUG(dbgs() << " original: " << PN << "\n"); + assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable"); + assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable"); // We would like to compute a new pointer in only one place, but have it be // as local as possible to the PHI. To do that, we re-use the location of // the old pointer, which necessarily must be in the right position to // dominate the PHI. - IRBuilderTy PtrBuilder(cast<Instruction>(OldPtr)); + IRBuilderTy PtrBuilder(OldPtr); PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); - Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); + Value *NewPtr = + getAdjustedAllocaPtr(PtrBuilder, BeginOffset, OldPtr->getType()); // Replace the operands which were using the old pointer. 
std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); DEBUG(dbgs() << " to: " << PN << "\n"); deleteIfTriviallyDead(OldPtr); - return false; + + // Check whether we can speculate this PHI node, and if so remember that + // fact and queue it up for another iteration after the speculation + // occurs. + if (isSafePHIToSpeculate(PN, &DL)) { + Pass.SpeculatablePHIs.insert(&PN); + IsUsedByRewrittenSpeculatableInstructions = true; + return true; + } + + return false; // PHIs can't be promoted on their own. } bool visitSelectInst(SelectInst &SI) { DEBUG(dbgs() << " original: " << SI << "\n"); assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) && "Pointer isn't an operand!"); + assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable"); + assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable"); - Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); + Value *NewPtr = getAdjustedAllocaPtr(IRB, BeginOffset, OldPtr->getType()); // Replace the operands which were using the old pointer. if (SI.getOperand(1) == OldPtr) SI.setOperand(1, NewPtr); @@ -3061,7 +2668,17 @@ private: DEBUG(dbgs() << " to: " << SI << "\n"); deleteIfTriviallyDead(OldPtr); - return false; + + // Check whether we can speculate this select instruction, and if so + // remember that fact and queue it up for another iteration after the + // speculation occurs. + if (isSafeSelectToSpeculate(SI, &DL)) { + Pass.SpeculatableSelects.insert(&SI); + IsUsedByRewrittenSpeculatableInstructions = true; + return true; + } + + return false; // Selects can't be promoted on their own. } }; @@ -3077,7 +2694,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; - const DataLayout &TD; + const DataLayout &DL; /// Queue of pointer uses to analyze and potentially rewrite. SmallVector<Use *, 8> Queue; @@ -3090,7 +2707,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { Use *U; public: - AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {} + AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} /// Rewrite loads and stores through a pointer and all pointers derived from /// it. @@ -3319,12 +2936,12 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { /// when the size or offset cause either end of type-based partition to be off. /// Also, this is a best-effort routine. It is reasonable to give up and not /// return a type if necessary. 
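A couple of worked cases may make this best-effort behaviour clearer; the values assume a typical 64-bit data layout and are illustrative only, not taken from the patch or its tests:

// Given %T = type { i32, i32, i64 } (alloc size 16, the i64 at offset 8):
//   getTypePartition(DL, %T, /*Offset=*/4, /*Size=*/4)  --> i32
//     (the range lands exactly on the second field and covers it)
//   getTypePartition(DL, %T, /*Offset=*/4, /*Size=*/12) --> null
//     (a sub-struct { i32, i64 } would be padded out to 16 bytes, so the
//      routine gives up and the caller falls back to a legal integer type
//      or an i8 array of the right size)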
-static Type *getTypePartition(const DataLayout &TD, Type *Ty, +static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, uint64_t Size) { - if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size) - return stripAggregateTypeWrapping(TD, Ty); - if (Offset > TD.getTypeAllocSize(Ty) || - (TD.getTypeAllocSize(Ty) - Offset) < Size) + if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size) + return stripAggregateTypeWrapping(DL, Ty); + if (Offset > DL.getTypeAllocSize(Ty) || + (DL.getTypeAllocSize(Ty) - Offset) < Size) return 0; if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { @@ -3333,7 +2950,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, return 0; Type *ElementTy = SeqTy->getElementType(); - uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); uint64_t NumSkippedElements = Offset / ElementSize; if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) { if (NumSkippedElements >= ArrTy->getNumElements()) @@ -3350,12 +2967,12 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, if ((Offset + Size) > ElementSize) return 0; // Recurse through the element type trying to peel off offset bytes. - return getTypePartition(TD, ElementTy, Offset, Size); + return getTypePartition(DL, ElementTy, Offset, Size); } assert(Offset == 0); if (Size == ElementSize) - return stripAggregateTypeWrapping(TD, ElementTy); + return stripAggregateTypeWrapping(DL, ElementTy); assert(Size > ElementSize); uint64_t NumElements = Size / ElementSize; if (NumElements * ElementSize != Size) @@ -3367,7 +2984,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, if (!STy) return 0; - const StructLayout *SL = TD.getStructLayout(STy); + const StructLayout *SL = DL.getStructLayout(STy); if (Offset >= SL->getSizeInBytes()) return 0; uint64_t EndOffset = Offset + Size; @@ -3378,7 +2995,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, Offset -= SL->getElementOffset(Index); Type *ElementTy = STy->getElementType(Index); - uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); if (Offset >= ElementSize) return 0; // The offset points into alignment padding. @@ -3386,12 +3003,12 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, if (Offset > 0 || Size < ElementSize) { if ((Offset + Size) > ElementSize) return 0; - return getTypePartition(TD, ElementTy, Offset, Size); + return getTypePartition(DL, ElementTy, Offset, Size); } assert(Offset == 0); if (Size == ElementSize) - return stripAggregateTypeWrapping(TD, ElementTy); + return stripAggregateTypeWrapping(DL, ElementTy); StructType::element_iterator EI = STy->element_begin() + Index, EE = STy->element_end(); @@ -3414,7 +3031,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, // Try to build up a sub-structure. StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked()); - const StructLayout *SubSL = TD.getStructLayout(SubTy); + const StructLayout *SubSL = DL.getStructLayout(SubTy); if (Size != SubSL->getSizeInBytes()) return 0; // The sub-struct doesn't have quite the size needed. @@ -3431,113 +3048,280 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, /// appropriate new offsets. It also evaluates how successful the rewrite was /// at enabling promotion and if it was successful queues the alloca to be /// promoted. 
-bool SROA::rewriteAllocaPartition(AllocaInst &AI, - AllocaPartitioning &P, - AllocaPartitioning::iterator PI) { - uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset; - bool IsLive = false; - for (AllocaPartitioning::use_iterator UI = P.use_begin(PI), - UE = P.use_end(PI); - UI != UE && !IsLive; ++UI) - if (UI->getUse()) - IsLive = true; - if (!IsLive) - return false; // No live uses left of this partition. - - DEBUG(dbgs() << "Speculating PHIs and selects in partition " - << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n"); - - PHIOrSelectSpeculator Speculator(*TD, P, *this); - DEBUG(dbgs() << " speculating "); - DEBUG(P.print(dbgs(), PI, "")); - Speculator.visitUsers(PI); +bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, + AllocaSlices::iterator B, AllocaSlices::iterator E, + int64_t BeginOffset, int64_t EndOffset, + ArrayRef<AllocaSlices::iterator> SplitUses) { + assert(BeginOffset < EndOffset); + uint64_t SliceSize = EndOffset - BeginOffset; // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. - Type *AllocaTy = 0; - if (Type *PartitionTy = P.getCommonType(PI)) - if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize) - AllocaTy = PartitionTy; - if (!AllocaTy) - if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(), - PI->BeginOffset, AllocaSize)) - AllocaTy = PartitionTy; - if ((!AllocaTy || - (AllocaTy->isArrayTy() && - AllocaTy->getArrayElementType()->isIntegerTy())) && - TD->isLegalInteger(AllocaSize * 8)) - AllocaTy = Type::getIntNTy(*C, AllocaSize * 8); - if (!AllocaTy) - AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize); - assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize); + Type *SliceTy = 0; + if (Type *CommonUseTy = findCommonType(B, E, EndOffset)) + if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize) + SliceTy = CommonUseTy; + if (!SliceTy) + if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(), + BeginOffset, SliceSize)) + SliceTy = TypePartitionTy; + if ((!SliceTy || (SliceTy->isArrayTy() && + SliceTy->getArrayElementType()->isIntegerTy())) && + DL->isLegalInteger(SliceSize * 8)) + SliceTy = Type::getIntNTy(*C, SliceSize * 8); + if (!SliceTy) + SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize); + assert(DL->getTypeAllocSize(SliceTy) >= SliceSize); + + bool IsVectorPromotable = isVectorPromotionViable( + *DL, SliceTy, S, BeginOffset, EndOffset, B, E, SplitUses); + + bool IsIntegerPromotable = + !IsVectorPromotable && + isIntegerWideningViable(*DL, SliceTy, BeginOffset, S, B, E, SplitUses); // Check for the case where we're going to rewrite to a new alloca of the // exact same type as the original, and with the same access offsets. In that // case, re-use the existing alloca, but still run through the rewriter to // perform phi and select speculation. AllocaInst *NewAI; - if (AllocaTy == AI.getAllocatedType()) { - assert(PI->BeginOffset == 0 && + if (SliceTy == AI.getAllocatedType()) { + assert(BeginOffset == 0 && "Non-zero begin offset but same alloca type"); - assert(PI == P.begin() && "Begin offset is zero on later partition"); NewAI = &AI; + // FIXME: We should be able to bail at this point with "nothing changed". + // FIXME: We might want to defer PHI speculation until after here. 
} else { unsigned Alignment = AI.getAlignment(); if (!Alignment) { // The minimum alignment which users can rely on when the explicit // alignment is omitted or zero is that required by the ABI for this // type. - Alignment = TD->getABITypeAlignment(AI.getAllocatedType()); + Alignment = DL->getABITypeAlignment(AI.getAllocatedType()); } - Alignment = MinAlign(Alignment, PI->BeginOffset); + Alignment = MinAlign(Alignment, BeginOffset); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. - if (Alignment <= TD->getABITypeAlignment(AllocaTy)) + if (Alignment <= DL->getABITypeAlignment(SliceTy)) Alignment = 0; - NewAI = new AllocaInst(AllocaTy, 0, Alignment, - AI.getName() + ".sroa." + Twine(PI - P.begin()), - &AI); + NewAI = new AllocaInst(SliceTy, 0, Alignment, + AI.getName() + ".sroa." + Twine(B - S.begin()), &AI); ++NumNewAllocas; } DEBUG(dbgs() << "Rewriting alloca partition " - << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: " - << *NewAI << "\n"); + << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI + << "\n"); - // Track the high watermark of the post-promotion worklist. We will reset it - // to this point if the alloca is not in fact scheduled for promotion. + // Track the high watermark on several worklists that are only relevant for + // promoted allocas. We will reset it to this point if the alloca is not in + // fact scheduled for promotion. unsigned PPWOldSize = PostPromotionWorklist.size(); + unsigned SPOldSize = SpeculatablePHIs.size(); + unsigned SSOldSize = SpeculatableSelects.size(); + unsigned NumUses = 0; + + AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset, + EndOffset, IsVectorPromotable, + IsIntegerPromotable); + bool Promotable = true; + for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), + SUE = SplitUses.end(); + SUI != SUE; ++SUI) { + DEBUG(dbgs() << " rewriting split "); + DEBUG(S.printSlice(dbgs(), *SUI, "")); + Promotable &= Rewriter.visit(*SUI); + ++NumUses; + } + for (AllocaSlices::iterator I = B; I != E; ++I) { + DEBUG(dbgs() << " rewriting "); + DEBUG(S.printSlice(dbgs(), I, "")); + Promotable &= Rewriter.visit(I); + ++NumUses; + } + + NumAllocaPartitionUses += NumUses; + MaxUsesPerAllocaPartition = + std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition); - AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI, - PI->BeginOffset, PI->EndOffset); - DEBUG(dbgs() << " rewriting "); - DEBUG(P.print(dbgs(), PI, "")); - bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI)); - if (Promotable) { + if (Promotable && !Rewriter.isUsedByRewrittenSpeculatableInstructions()) { DEBUG(dbgs() << " and queuing for promotion\n"); PromotableAllocas.push_back(NewAI); - } else if (NewAI != &AI) { + } else if (NewAI != &AI || + (Promotable && + Rewriter.isUsedByRewrittenSpeculatableInstructions())) { // If we can't promote the alloca, iterate on it to check for new // refinements exposed by splitting the current alloca. Don't iterate on an // alloca which didn't actually change and didn't get promoted. + // + // Alternatively, if we could promote the alloca but have speculatable + // instructions then we will speculate them after finishing our processing + // of the original alloca. Mark the new one for re-visiting in the next + // iteration so the speculated operations can be rewritten. + // + // FIXME: We should actually track whether the rewriter changed anything. 
Worklist.insert(NewAI); } // Drop any post-promotion work items if promotion didn't happen. - if (!Promotable) + if (!Promotable) { while (PostPromotionWorklist.size() > PPWOldSize) PostPromotionWorklist.pop_back(); + while (SpeculatablePHIs.size() > SPOldSize) + SpeculatablePHIs.pop_back(); + while (SpeculatableSelects.size() > SSOldSize) + SpeculatableSelects.pop_back(); + } return true; } -/// \brief Walks the partitioning of an alloca rewriting uses of each partition. -bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) { +namespace { +struct IsSliceEndLessOrEqualTo { + uint64_t UpperBound; + + IsSliceEndLessOrEqualTo(uint64_t UpperBound) : UpperBound(UpperBound) {} + + bool operator()(const AllocaSlices::iterator &I) { + return I->endOffset() <= UpperBound; + } +}; +} + +static void +removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses, + uint64_t &MaxSplitUseEndOffset, uint64_t Offset) { + if (Offset >= MaxSplitUseEndOffset) { + SplitUses.clear(); + MaxSplitUseEndOffset = 0; + return; + } + + size_t SplitUsesOldSize = SplitUses.size(); + SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(), + IsSliceEndLessOrEqualTo(Offset)), + SplitUses.end()); + if (SplitUsesOldSize == SplitUses.size()) + return; + + // Recompute the max. While this is linear, so is remove_if. + MaxSplitUseEndOffset = 0; + for (SmallVectorImpl<AllocaSlices::iterator>::iterator + SUI = SplitUses.begin(), + SUE = SplitUses.end(); + SUI != SUE; ++SUI) + MaxSplitUseEndOffset = std::max((*SUI)->endOffset(), MaxSplitUseEndOffset); +} + +/// \brief Walks the slices of an alloca and form partitions based on them, +/// rewriting each of their uses. +bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) { + if (S.begin() == S.end()) + return false; + + unsigned NumPartitions = 0; bool Changed = false; - for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE; - ++PI) - Changed |= rewriteAllocaPartition(AI, P, PI); + SmallVector<AllocaSlices::iterator, 4> SplitUses; + uint64_t MaxSplitUseEndOffset = 0; + + uint64_t BeginOffset = S.begin()->beginOffset(); + + for (AllocaSlices::iterator SI = S.begin(), SJ = llvm::next(SI), SE = S.end(); + SI != SE; SI = SJ) { + uint64_t MaxEndOffset = SI->endOffset(); + + if (!SI->isSplittable()) { + // When we're forming an unsplittable region, it must always start at the + // first slice and will extend through its end. + assert(BeginOffset == SI->beginOffset()); + + // Form a partition including all of the overlapping slices with this + // unsplittable slice. + while (SJ != SE && SJ->beginOffset() < MaxEndOffset) { + if (!SJ->isSplittable()) + MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset()); + ++SJ; + } + } else { + assert(SI->isSplittable()); // Established above. + + // Collect all of the overlapping splittable slices. + while (SJ != SE && SJ->beginOffset() < MaxEndOffset && + SJ->isSplittable()) { + MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset()); + ++SJ; + } + + // Back up MaxEndOffset and SJ if we ended the span early when + // encountering an unsplittable slice. + if (SJ != SE && SJ->beginOffset() < MaxEndOffset) { + assert(!SJ->isSplittable()); + MaxEndOffset = SJ->beginOffset(); + } + } + + // Check if we have managed to move the end offset forward yet. If so, + // we'll have to rewrite uses and erase old split uses. + if (BeginOffset < MaxEndOffset) { + // Rewrite a sequence of overlapping slices. 
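// Illustrative trace (not from the patch): for slices [0,8)S, [4,16)S and
// [16,20)U (S = splittable, U = unsplittable), the first pass through this
// loop forms and rewrites the partition [0,16) from the two splittable
// slices, and [16,20) becomes its own unsplittable partition on the next
// iteration. A splittable slice such as [4,24) that extends past the current
// partition is instead carried in SplitUses and re-visited by each later
// partition it overlaps, including a post-split tail if it runs past the
// last slice.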
+ Changed |= + rewritePartition(AI, S, SI, SJ, BeginOffset, MaxEndOffset, SplitUses); + ++NumPartitions; + + removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset); + } + + // Accumulate all the splittable slices from the [SI,SJ) region which + // overlap going forward. + for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK) + if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) { + SplitUses.push_back(SK); + MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset); + } + + // If we're already at the end and we have no split uses, we're done. + if (SJ == SE && SplitUses.empty()) + break; + + // If we have no split uses or no gap in offsets, we're ready to move to + // the next slice. + if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) { + BeginOffset = SJ->beginOffset(); + continue; + } + + // Even if we have split slices, if the next slice is splittable and the + // split slices reach it, we can simply set up the beginning offset of the + // next iteration to bridge between them. + if (SJ != SE && SJ->isSplittable() && + MaxSplitUseEndOffset > SJ->beginOffset()) { + BeginOffset = MaxEndOffset; + continue; + } + + // Otherwise, we have a tail of split slices. Rewrite them with an empty + // range of slices. + uint64_t PostSplitEndOffset = + SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset(); + + Changed |= rewritePartition(AI, S, SJ, SJ, MaxEndOffset, PostSplitEndOffset, + SplitUses); + ++NumPartitions; + + if (SJ == SE) + break; // Skip the rest, we don't need to do any cleanup. + + removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, + PostSplitEndOffset); + + // Now just reset the begin offset for the next iteration. + BeginOffset = SJ->beginOffset(); + } + + NumAllocaPartitions += NumPartitions; + MaxPartitionsPerAlloca = + std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca); return Changed; } @@ -3545,7 +3329,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) { /// \brief Analyze an alloca for SROA. /// /// This analyzes the alloca to ensure we can reason about it, builds -/// a partitioning of the alloca, and then hands it off to be split and +/// the slices of the alloca, and then hands it off to be split and /// rewritten as needed. bool SROA::runOnAlloca(AllocaInst &AI) { DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); @@ -3559,32 +3343,32 @@ bool SROA::runOnAlloca(AllocaInst &AI) { // Skip alloca forms that this analysis can't handle. if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || - TD->getTypeAllocSize(AI.getAllocatedType()) == 0) + DL->getTypeAllocSize(AI.getAllocatedType()) == 0) return false; bool Changed = false; // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter(*TD); + AggLoadStoreRewriter AggRewriter(*DL); Changed |= AggRewriter.rewrite(AI); - // Build the partition set using a recursive instruction-visiting builder. - AllocaPartitioning P(*TD, AI); - DEBUG(P.print(dbgs())); - if (P.isEscaped()) + // Build the slices using a recursive instruction-visiting builder. + AllocaSlices S(*DL, AI); + DEBUG(S.print(dbgs())); + if (S.isEscaped()) return Changed; // Delete all the dead users of this alloca before splitting and rewriting it. 
- for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(), - DE = P.dead_user_end(); + for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(), + DE = S.dead_user_end(); DI != DE; ++DI) { Changed = true; (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); DeadInsts.insert(*DI); } - for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(), - DE = P.dead_op_end(); + for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(), + DE = S.dead_op_end(); DO != DE; ++DO) { Value *OldV = **DO; // Clobber the use with an undef value. @@ -3596,11 +3380,21 @@ bool SROA::runOnAlloca(AllocaInst &AI) { } } - // No partitions to split. Leave the dead alloca for a later pass to clean up. - if (P.begin() == P.end()) + // No slices to split. Leave the dead alloca for a later pass to clean up. + if (S.begin() == S.end()) return Changed; - return splitAlloca(AI, P) || Changed; + Changed |= splitAlloca(AI, S); + + DEBUG(dbgs() << " Speculating PHIs\n"); + while (!SpeculatablePHIs.empty()) + speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val()); + + DEBUG(dbgs() << " Speculating Selects\n"); + while (!SpeculatableSelects.empty()) + speculateSelectInstLoads(*SpeculatableSelects.pop_back_val()); + + return Changed; } /// \brief Delete the dead instructions accumulated in this run. @@ -3635,6 +3429,15 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { } } +static void enqueueUsersInWorklist(Instruction &I, + SmallVectorImpl<Instruction *> &Worklist, + SmallPtrSet<Instruction *, 8> &Visited) { + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; + ++UI) + if (Visited.insert(cast<Instruction>(*UI))) + Worklist.push_back(cast<Instruction>(*UI)); +} + /// \brief Promote the allocas, using the best available technique. /// /// This attempts to promote whatever allocas have been identified as viable in @@ -3659,25 +3462,28 @@ bool SROA::promoteAllocas(Function &F) { DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); SSAUpdater SSA; DIBuilder DIB(*F.getParent()); - SmallVector<Instruction*, 64> Insts; + SmallVector<Instruction *, 64> Insts; + + // We need a worklist to walk the uses of each alloca. + SmallVector<Instruction *, 8> Worklist; + SmallPtrSet<Instruction *, 8> Visited; + SmallVector<Instruction *, 32> DeadInsts; for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { AllocaInst *AI = PromotableAllocas[Idx]; - for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); - UI != UE;) { - Instruction *I = cast<Instruction>(*UI++); + Insts.clear(); + Worklist.clear(); + Visited.clear(); + + enqueueUsersInWorklist(*AI, Worklist, Visited); + + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + // FIXME: Currently the SSAUpdater infrastructure doesn't reason about // lifetime intrinsics and so we strip them (and the bitcasts+GEPs // leading to them) here. Eventually it should use them to optimize the // scalar values produced. 
- if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { - assert(onlyUsedByLifetimeMarkers(I) && - "Found a bitcast used outside of a lifetime marker."); - while (!I->use_empty()) - cast<Instruction>(*I->use_begin())->eraseFromParent(); - I->eraseFromParent(); - continue; - } if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { assert(II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end); @@ -3685,10 +3491,30 @@ bool SROA::promoteAllocas(Function &F) { continue; } - Insts.push_back(I); + // Push the loads and stores we find onto the list. SROA will already + // have validated that all loads and stores are viable candidates for + // promotion. + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + assert(LI->getType() == AI->getAllocatedType()); + Insts.push_back(LI); + continue; + } + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); + Insts.push_back(SI); + continue; + } + + // For everything else, we know that only no-op bitcasts and GEPs will + // make it this far, just recurse through them and recall them for later + // removal. + DeadInsts.push_back(I); + enqueueUsersInWorklist(*I, Worklist, Visited); } AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); - Insts.clear(); + while (!DeadInsts.empty()) + DeadInsts.pop_back_val()->eraseFromParent(); + AI->eraseFromParent(); } PromotableAllocas.clear(); @@ -3712,8 +3538,8 @@ namespace { bool SROA::runOnFunction(Function &F) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - TD = getAnalysisIfAvailable<DataLayout>(); - if (!TD) { + DL = getAnalysisIfAvailable<DataLayout>(); + if (!DL) { DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); return false; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp new file mode 100644 index 0000000..9bcd702 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp @@ -0,0 +1,479 @@ +//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the SampleProfileLoader transformation. This pass +// reads a profile file generated by a sampling profiler (e.g. Linux Perf - +// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the +// profile information in the given profile. +// +// This pass generates branch weight annotations on the IR: +// +// - prof: Represents branch weights. This annotation is added to branches +// to indicate the weights of each edge coming out of the branch. +// The weight of each edge is the weight of the target block for +// that edge. The weight of a block B is computed as the maximum +// number of samples found in B. 
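As a hedged illustration of the annotation this pass produces (the weights and block names are invented, not taken from a real profile), attaching branch weights to a conditional branch looks roughly like this:

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/MDBuilder.h"
    using namespace llvm;

    // Tag BI so its first successor is treated as ~16x hotter than the
    // second. In textual IR this shows up approximately as:
    //   br i1 %cmp, label %hot, label %cold, !prof !0
    //   !0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
    static void annotateBranchWeights(BranchInst *BI) {
      MDBuilder MDB(BI->getContext());
      BI->setMetadata(LLVMContext::MD_prof,
                      MDB.createBranchWeights(/*TrueWeight=*/64,
                                              /*FalseWeight=*/4));
    }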
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sample-profile" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/DIContext.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +// Command line option to specify the file to read samples from. This is +// mainly used for debugging. +static cl::opt<std::string> SampleProfileFile( + "sample-profile-file", cl::init(""), cl::value_desc("filename"), + cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); + +namespace { +/// \brief Sample-based profile reader. +/// +/// Each profile contains sample counts for all the functions +/// executed. Inside each function, statements are annotated with the +/// collected samples on all the instructions associated with that +/// statement. +/// +/// For this to produce meaningful data, the program needs to be +/// compiled with some debug information (at minimum, line numbers: +/// -gline-tables-only). Otherwise, it will be impossible to match IR +/// instructions to the line numbers collected by the profiler. +/// +/// From the profile file, we are interested in collecting the +/// following information: +/// +/// * A list of functions included in the profile (mangled names). +/// +/// * For each function F: +/// 1. The total number of samples collected in F. +/// +/// 2. The samples collected at each line in F. To provide some +/// protection against source code shuffling, line numbers should +/// be relative to the start of the function. +class SampleProfile { +public: + SampleProfile(StringRef F) : Profiles(0), Filename(F) {} + + void dump(); + void loadText(); + void loadNative() { llvm_unreachable("not implemented"); } + bool emitAnnotations(Function &F); + void printFunctionProfile(raw_ostream &OS, StringRef FName); + void dumpFunctionProfile(StringRef FName); + +protected: + typedef DenseMap<uint32_t, uint32_t> BodySampleMap; + typedef DenseMap<BasicBlock *, uint32_t> BlockWeightMap; + + /// \brief Representation of the runtime profile for a function. + /// + /// This data structure contains the runtime profile for a given + /// function. It contains the total number of samples collected + /// in the function and a map of samples collected in every statement. + struct FunctionProfile { + /// \brief Total number of samples collected inside this function. + /// + /// Samples are cumulative, they include all the samples collected + /// inside this function and all its inlined callees. + unsigned TotalSamples; + + // \brief Total number of samples collected at the head of the function. + unsigned TotalHeadSamples; + + /// \brief Map line offsets to collected samples. + /// + /// Each entry in this map contains the number of samples + /// collected at the corresponding line offset. All line locations + /// are an offset from the start of the function. + BodySampleMap BodySamples; + + /// \brief Map basic blocks to their computed weights. 
+ /// + /// The weight of a basic block is defined to be the maximum + /// of all the instruction weights in that block. + BlockWeightMap BlockWeights; + }; + + uint32_t getInstWeight(Instruction &I, unsigned FirstLineno, + BodySampleMap &BodySamples); + uint32_t computeBlockWeight(BasicBlock *B, unsigned FirstLineno, + BodySampleMap &BodySamples); + + /// \brief Map every function to its associated profile. + /// + /// The profile of every function executed at runtime is collected + /// in the structure FunctionProfile. This maps function objects + /// to their corresponding profiles. + StringMap<FunctionProfile> Profiles; + + /// \brief Path name to the file holding the profile data. + /// + /// The format of this file is defined by each profiler + /// independently. If possible, the profiler should have a text + /// version of the profile format to be used in constructing test + /// cases and debugging. + StringRef Filename; +}; + +/// \brief Loader class for text-based profiles. +/// +/// This class defines a simple interface to read text files containing +/// profiles. It keeps track of line number information and location of +/// the file pointer. Users of this class are responsible for actually +/// parsing the lines returned by the readLine function. +/// +/// TODO - This does not really belong here. It is a generic text file +/// reader. It should be moved to the Support library and made more general. +class ExternalProfileTextLoader { +public: + ExternalProfileTextLoader(StringRef F) : Filename(F) { + error_code EC; + EC = MemoryBuffer::getFile(Filename, Buffer); + if (EC) + report_fatal_error("Could not open profile file " + Filename + ": " + + EC.message()); + FP = Buffer->getBufferStart(); + Lineno = 0; + } + + /// \brief Read a line from the mapped file. + StringRef readLine() { + size_t Length = 0; + const char *start = FP; + while (FP != Buffer->getBufferEnd() && *FP != '\n') { + Length++; + FP++; + } + if (FP != Buffer->getBufferEnd()) + FP++; + Lineno++; + return StringRef(start, Length); + } + + /// \brief Return true, if we've reached EOF. + bool atEOF() const { return FP == Buffer->getBufferEnd(); } + + /// \brief Report a parse error message and stop compilation. + void reportParseError(Twine Msg) const { + report_fatal_error(Filename + ":" + Twine(Lineno) + ": " + Msg + "\n"); + } + +private: + /// \brief Memory buffer holding the text file. + OwningPtr<MemoryBuffer> Buffer; + + /// \brief Current position into the memory buffer. + const char *FP; + + /// \brief Current line number. + int64_t Lineno; + + /// \brief Path name where to the profile file. + StringRef Filename; +}; + +/// \brief Sample profile pass. +/// +/// This pass reads profile data from the file specified by +/// -sample-profile-file and annotates every affected function with the +/// profile information found in that file. +class SampleProfileLoader : public FunctionPass { +public: + // Class identification, replacement for typeinfo + static char ID; + + SampleProfileLoader(StringRef Name = SampleProfileFile) + : FunctionPass(ID), Profiler(0), Filename(Name) { + initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry()); + } + + virtual bool doInitialization(Module &M); + + void dump() { Profiler->dump(); } + + virtual const char *getPassName() const { return "Sample profile pass"; } + + virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + +protected: + /// \brief Profile reader object. 
+ OwningPtr<SampleProfile> Profiler; + + /// \brief Name of the profile file to load. + StringRef Filename; +}; +} + +/// \brief Print the function profile for \p FName on stream \p OS. +/// +/// \param OS Stream to emit the output to. +/// \param FName Name of the function to print. +void SampleProfile::printFunctionProfile(raw_ostream &OS, StringRef FName) { + FunctionProfile FProfile = Profiles[FName]; + OS << "Function: " << FName << ", " << FProfile.TotalSamples << ", " + << FProfile.TotalHeadSamples << ", " << FProfile.BodySamples.size() + << " sampled lines\n"; + for (BodySampleMap::const_iterator SI = FProfile.BodySamples.begin(), + SE = FProfile.BodySamples.end(); + SI != SE; ++SI) + OS << "\tline offset: " << SI->first + << ", number of samples: " << SI->second << "\n"; + OS << "\n"; +} + +/// \brief Dump the function profile for \p FName. +/// +/// \param FName Name of the function to print. +void SampleProfile::dumpFunctionProfile(StringRef FName) { + printFunctionProfile(dbgs(), FName); +} + +/// \brief Dump all the function profiles found. +void SampleProfile::dump() { + for (StringMap<FunctionProfile>::const_iterator I = Profiles.begin(), + E = Profiles.end(); + I != E; ++I) + dumpFunctionProfile(I->getKey()); +} + +/// \brief Load samples from a text file. +/// +/// The file is divided in two segments: +/// +/// Symbol table (represented with the string "symbol table") +/// Number of symbols in the table +/// symbol 1 +/// symbol 2 +/// ... +/// symbol N +/// +/// Function body profiles +/// function1:total_samples:total_head_samples:number_of_locations +/// location_offset_1: number_of_samples +/// location_offset_2: number_of_samples +/// ... +/// location_offset_N: number_of_samples +/// +/// Function names must be mangled in order for the profile loader to +/// match them in the current translation unit. +/// +/// Since this is a flat profile, a function that shows up more than +/// once gets all its samples aggregated across all its instances. +/// TODO - flat profiles are too imprecise to provide good optimization +/// opportunities. Convert them to context-sensitive profile. +/// +/// This textual representation is useful to generate unit tests and +/// for debugging purposes, but it should not be used to generate +/// profiles for large programs, as the representation is extremely +/// inefficient. +void SampleProfile::loadText() { + ExternalProfileTextLoader Loader(Filename); + + // Read the symbol table. + StringRef Line = Loader.readLine(); + if (Line != "symbol table") + Loader.reportParseError("Expected 'symbol table', found " + Line); + int NumSymbols; + Line = Loader.readLine(); + if (Line.getAsInteger(10, NumSymbols)) + Loader.reportParseError("Expected a number, found " + Line); + for (int I = 0; I < NumSymbols; I++) { + StringRef FName = Loader.readLine(); + FunctionProfile &FProfile = Profiles[FName]; + FProfile.BodySamples.clear(); + FProfile.TotalSamples = 0; + FProfile.TotalHeadSamples = 0; + } + + // Read the profile of each function. Since each function may be + // mentioned more than once, and we are collecting flat profiles, + // accumulate samples as we parse them. 
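For concreteness, a tiny hand-written profile in the text format documented above could look like the following (the mangled name and every count are invented); the function header line is matched by HeadRE below and each body line by LineSample:

    symbol table
    1
    _Z3fooi
    _Z3fooi:1200:10:2
    1: 700
    3: 500

Here _Z3fooi reports 1200 total samples, 10 head samples, and two sampled line offsets, with 700 samples at offset 1 and 500 at offset 3.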
+ Regex HeadRE("^([^:]+):([0-9]+):([0-9]+):([0-9]+)$"); + Regex LineSample("^([0-9]+): ([0-9]+)$"); + while (!Loader.atEOF()) { + SmallVector<StringRef, 4> Matches; + Line = Loader.readLine(); + if (!HeadRE.match(Line, &Matches)) + Loader.reportParseError("Expected 'mangled_name:NUM:NUM:NUM', found " + + Line); + assert(Matches.size() == 5); + StringRef FName = Matches[1]; + unsigned NumSamples, NumHeadSamples, NumSampledLines; + Matches[2].getAsInteger(10, NumSamples); + Matches[3].getAsInteger(10, NumHeadSamples); + Matches[4].getAsInteger(10, NumSampledLines); + FunctionProfile &FProfile = Profiles[FName]; + FProfile.TotalSamples += NumSamples; + FProfile.TotalHeadSamples += NumHeadSamples; + BodySampleMap &SampleMap = FProfile.BodySamples; + unsigned I; + for (I = 0; I < NumSampledLines && !Loader.atEOF(); I++) { + Line = Loader.readLine(); + if (!LineSample.match(Line, &Matches)) + Loader.reportParseError("Expected 'NUM: NUM', found " + Line); + assert(Matches.size() == 3); + unsigned LineOffset, NumSamples; + Matches[1].getAsInteger(10, LineOffset); + Matches[2].getAsInteger(10, NumSamples); + SampleMap[LineOffset] += NumSamples; + } + + if (I < NumSampledLines) + Loader.reportParseError("Unexpected end of file"); + } +} + +/// \brief Get the weight for an instruction. +/// +/// The "weight" of an instruction \p Inst is the number of samples +/// collected on that instruction at runtime. To retrieve it, we +/// need to compute the line number of \p Inst relative to the start of its +/// function. We use \p FirstLineno to compute the offset. We then +/// look up the samples collected for \p Inst using \p BodySamples. +/// +/// \param Inst Instruction to query. +/// \param FirstLineno Line number of the first instruction in the function. +/// \param BodySamples Map of relative source line locations to samples. +/// +/// \returns The profiled weight of I. +uint32_t SampleProfile::getInstWeight(Instruction &Inst, unsigned FirstLineno, + BodySampleMap &BodySamples) { + unsigned LOffset = Inst.getDebugLoc().getLine() - FirstLineno + 1; + return BodySamples.lookup(LOffset); +} + +/// \brief Compute the weight of a basic block. +/// +/// The weight of basic block \p B is the maximum weight of all the +/// instructions in B. +/// +/// \param B The basic block to query. +/// \param FirstLineno The line number for the first line in the +/// function holding B. +/// \param BodySamples The map containing all the samples collected in that +/// function. +/// +/// \returns The computed weight of B. +uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno, + BodySampleMap &BodySamples) { + // If we've computed B's weight before, return it. + Function *F = B->getParent(); + FunctionProfile &FProfile = Profiles[F->getName()]; + std::pair<BlockWeightMap::iterator, bool> Entry = + FProfile.BlockWeights.insert(std::make_pair(B, 0)); + if (!Entry.second) + return Entry.first->second; + + // Otherwise, compute and cache B's weight. + uint32_t Weight = 0; + for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) { + uint32_t InstWeight = getInstWeight(*I, FirstLineno, BodySamples); + if (InstWeight > Weight) + Weight = InstWeight; + } + Entry.first->second = Weight; + return Weight; +} + +/// \brief Generate branch weight metadata for all branches in \p F. +/// +/// For every branch instruction B in \p F, we compute the weight of the +/// target block for each of the edges out of B. This is the weight +/// that we associate with that branch. 
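As a hedged concrete example of getInstWeight above (line numbers invented): if the first instruction of the function sits on source line 10 (FirstLineno = 10) and Inst carries debug line 12, the lookup key is LOffset = 12 - 10 + 1 = 3, so the instruction's weight is whatever BodySamples holds for offset 3, or 0 if that offset was never sampled.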
+/// +/// TODO - This weight assignment will most likely be wrong if the +/// target branch has more than two predecessors. This needs to be done +/// using some form of flow propagation. +/// +/// Once all the branch weights are computed, we emit the MD_prof +/// metadata on B using the computed values. +/// +/// \param F The function to query. +bool SampleProfile::emitAnnotations(Function &F) { + bool Changed = false; + FunctionProfile &FProfile = Profiles[F.getName()]; + unsigned FirstLineno = inst_begin(F)->getDebugLoc().getLine(); + MDBuilder MDB(F.getContext()); + + // Clear the block weights cache. + FProfile.BlockWeights.clear(); + + // When we find a branch instruction: For each edge E out of the branch, + // the weight of E is the weight of the target block. + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *B = I; + TerminatorInst *TI = B->getTerminator(); + if (TI->getNumSuccessors() == 1) + continue; + if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) + continue; + + SmallVector<uint32_t, 4> Weights; + unsigned NSuccs = TI->getNumSuccessors(); + for (unsigned I = 0; I < NSuccs; ++I) { + BasicBlock *Succ = TI->getSuccessor(I); + uint32_t Weight = + computeBlockWeight(Succ, FirstLineno, FProfile.BodySamples); + Weights.push_back(Weight); + } + + TI->setMetadata(llvm::LLVMContext::MD_prof, + MDB.createBranchWeights(Weights)); + Changed = true; + } + + return Changed; +} + +char SampleProfileLoader::ID = 0; +INITIALIZE_PASS(SampleProfileLoader, "sample-profile", "Sample Profile loader", + false, false) + +bool SampleProfileLoader::runOnFunction(Function &F) { + return Profiler->emitAnnotations(F); +} + +bool SampleProfileLoader::doInitialization(Module &M) { + Profiler.reset(new SampleProfile(Filename)); + Profiler->loadText(); + return true; +} + +FunctionPass *llvm::createSampleProfileLoaderPass() { + return new SampleProfileLoader(SampleProfileFile); +} + +FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { + return new SampleProfileLoader(Name); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index 8a9c7da..857597e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -28,7 +28,7 @@ using namespace llvm; /// ScalarOpts library. 
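A hedged sketch of how a client might schedule the loader defined above, using its factory function (the profile path is made up, and the factory is assumed to be declared in llvm/Transforms/Scalar.h alongside the other pass factories):

    #include "llvm/PassManager.h"
    #include "llvm/Transforms/Scalar.h"
    using namespace llvm;

    static void addSampleProfile(PassManager &PM) {
      // Roughly what `opt -sample-profile -sample-profile-file=perf.prof`
      // sets up from the command line.
      PM.add(createSampleProfileLoaderPass("perf.prof"));
    }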
void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); - initializeBlockPlacementPass(Registry); + initializeSampleProfileLoaderPass(Registry); initializeCodeGenPreparePass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); @@ -44,12 +44,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopInstSimplifyPass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); + initializeLoopRerollPass(Registry); initializeLoopUnrollPass(Registry); initializeLoopUnswitchPass(Registry); initializeLoopIdiomRecognizePass(Registry); initializeLowerAtomicPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeMemCpyOptPass(Registry); + initializePartiallyInlineLibCallsPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); initializeSCCPPass(Registry); @@ -58,7 +60,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSROA_DTPass(Registry); initializeSROA_SSAUpPass(Registry); initializeCFGSimplifyPassPass(Registry); - initializeSimplifyLibCallsPass(Registry); + initializeStructurizeCFGPass(Registry); initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); } @@ -111,6 +113,10 @@ void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopRotatePass()); } +void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopRerollPass()); +} + void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopUnrollPass()); } @@ -123,6 +129,10 @@ void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createMemCpyOptPass()); } +void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createPartiallyInlineLibCallsPass()); +} + void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createPromoteMemoryToRegisterPass()); } @@ -149,7 +159,7 @@ void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM, } void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createSimplifyLibCallsPass()); + // NOTE: The simplify-libcalls pass has been removed. 
} void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index bfde334..57b290e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -166,21 +166,21 @@ namespace { void DeleteDeadInstructions(); void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); bool ShouldAttemptScalarRepl(AllocaInst *AI); }; @@ -963,7 +963,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext())); + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getType())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { @@ -1066,12 +1066,12 @@ public: LoadAndStorePromoter::run(Insts); AI->eraseFromParent(); - for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(), + for (SmallVectorImpl<DbgDeclareInst *>::iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; DDI->eraseFromParent(); } - for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(), + for (SmallVectorImpl<DbgValueInst *>::iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; DVI->eraseFromParent(); @@ -1086,7 +1086,7 @@ public: } virtual void updateDebugInfo(Instruction *Inst) const { - for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) @@ -1094,7 +1094,7 @@ public: else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, *DIB); } - for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; Value *Arg = NULL; @@ -1865,7 +1865,7 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { /// Offset indicates the position within AI that is referenced by this /// instruction. 
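The recurring change above from SmallVector<AllocaInst*, 32>& to SmallVectorImpl<AllocaInst*>& parameters decouples each callee from the caller's inline capacity. A minimal sketch of the idiom (names and element type are illustrative):

    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    // The callee no longer hard-codes an inline size...
    static void fillElts(SmallVectorImpl<int> &Elts) { Elts.push_back(42); }

    static void callers() {
      SmallVector<int, 4> A;   // ...so callers may pick any inline capacity
      SmallVector<int, 128> B; // without a copy or conversion at the call site.
      fillElts(A);
      fillElts(B);
    }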
void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { Use &TheUse = UI.getUse(); Instruction *User = cast<Instruction>(*UI++); @@ -1979,7 +1979,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, /// RewriteBitCast - Update a bitcast reference to the alloca being replaced /// and recursively continue updating all of its uses. void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { RewriteForScalarRepl(BC, AI, Offset, NewElts); if (BC->getOperand(0) != AI) return; @@ -2037,7 +2037,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, /// elements of the alloca that are being split apart, and if so, rewrite /// the GEP to be relative to the new element. void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { uint64_t OldOffset = Offset; SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); // If the GEP was dynamic then it must have been a dynamic vector lookup. @@ -2099,7 +2099,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, /// to mark the lifetime of the scalarized memory. void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0)); // Put matching lifetime markers on everything from Offset up to // Offset+OldSize. @@ -2153,9 +2153,10 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. /// Rewrite it to copy or set the elements of the scalarized memory. -void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, - AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts) { +void +SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, + AllocaInst *AI, + SmallVectorImpl<AllocaInst *> &NewElts) { // If this is a memcpy/memmove, construct the other pointer as the // appropriate type. The "Other" pointer is the pointer that goes to memory // that doesn't have anything to do with the alloca that we are promoting. For @@ -2189,7 +2190,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, if (OtherPtr == AI || OtherPtr == NewElts[0]) { // This code will run twice for a no-op memcpy -- once for each operand. // Put only one reference to MI on the DeadInsts list. - for (SmallVector<Value*, 32>::const_iterator I = DeadInsts.begin(), + for (SmallVectorImpl<Value *>::const_iterator I = DeadInsts.begin(), E = DeadInsts.end(); I != E; ++I) if (*I == MI) return; DeadInsts.push_back(MI); @@ -2326,8 +2327,9 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, /// RewriteStoreUserOfWholeAlloca - We found a store of an integer that /// overwrites the entire allocation. Extract out the pieces of the stored /// integer and store them individually. 
-void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts){ +void +SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, + SmallVectorImpl<AllocaInst *> &NewElts) { // Extract each element out of the integer according to its structure offset // and store the element value to the individual alloca. Value *SrcVal = SI->getOperand(0); @@ -2440,8 +2442,9 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, /// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to /// an integer. Load the individual pieces to form the aggregate value. -void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts) { +void +SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, + SmallVectorImpl<AllocaInst *> &NewElts) { // Extract each element out of the NewElts according to its structure offset // and form the result value. Type *AllocaEltTy = AI->getAllocatedType(); diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index c243d34..8371f6d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -41,187 +41,31 @@ using namespace llvm; STATISTIC(NumSimpl, "Number of blocks simplified"); namespace { - struct CFGSimplifyPass : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - CFGSimplifyPass() : FunctionPass(ID) { - initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); - } - - virtual bool runOnFunction(Function &F); +struct CFGSimplifyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + CFGSimplifyPass() : FunctionPass(ID) { + initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnFunction(Function &F); - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetTransformInfo>(); - } - }; + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetTransformInfo>(); + } +}; } char CFGSimplifyPass::ID = 0; -INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", - false, false) +INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, + false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", - false, false) +INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, + false) // Public interface to the CFGSimplification pass FunctionPass *llvm::createCFGSimplificationPass() { return new CFGSimplifyPass(); } -/// changeToUnreachable - Insert an unreachable instruction before the specified -/// instruction, making it and the rest of the code in the block dead. -static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { - BasicBlock *BB = I->getParent(); - // Loop over all of the successors, removing BB's entry from any PHI - // nodes. - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - (*SI)->removePredecessor(BB); - - // Insert a call to llvm.trap right before this. This turns the undefined - // behavior into a hard fail instead of falling through into random code. 
- if (UseLLVMTrap) { - Function *TrapFn = - Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap); - CallInst *CallTrap = CallInst::Create(TrapFn, "", I); - CallTrap->setDebugLoc(I->getDebugLoc()); - } - new UnreachableInst(I->getContext(), I); - - // All instructions after this are dead. - BasicBlock::iterator BBI = I, BBE = BB->end(); - while (BBI != BBE) { - if (!BBI->use_empty()) - BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); - BB->getInstList().erase(BBI++); - } -} - -/// changeToCall - Convert the specified invoke into a normal call. -static void changeToCall(InvokeInst *II) { - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); - NewCall->takeName(II); - NewCall->setCallingConv(II->getCallingConv()); - NewCall->setAttributes(II->getAttributes()); - NewCall->setDebugLoc(II->getDebugLoc()); - II->replaceAllUsesWith(NewCall); - - // Follow the call by a branch to the normal destination. - BranchInst::Create(II->getNormalDest(), II); - - // Update PHI nodes in the unwind destination - II->getUnwindDest()->removePredecessor(II->getParent()); - II->eraseFromParent(); -} - -static bool markAliveBlocks(BasicBlock *BB, - SmallPtrSet<BasicBlock*, 128> &Reachable) { - - SmallVector<BasicBlock*, 128> Worklist; - Worklist.push_back(BB); - Reachable.insert(BB); - bool Changed = false; - do { - BB = Worklist.pop_back_val(); - - // Do a quick scan of the basic block, turning any obviously unreachable - // instructions into LLVM unreachable insts. The instruction combining pass - // canonicalizes unreachable insts into stores to null or undef. - for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){ - if (CallInst *CI = dyn_cast<CallInst>(BBI)) { - if (CI->doesNotReturn()) { - // If we found a call to a no-return function, insert an unreachable - // instruction after it. Make sure there isn't *already* one there - // though. - ++BBI; - if (!isa<UnreachableInst>(BBI)) { - // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(BBI, false); - Changed = true; - } - break; - } - } - - // Store to undef and store to null are undefined and used to signal that - // they should be changed to unreachable by passes that can't modify the - // CFG. - if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { - // Don't touch volatile stores. - if (SI->isVolatile()) continue; - - Value *Ptr = SI->getOperand(1); - - if (isa<UndefValue>(Ptr) || - (isa<ConstantPointerNull>(Ptr) && - SI->getPointerAddressSpace() == 0)) { - changeToUnreachable(SI, true); - Changed = true; - break; - } - } - } - - // Turn invokes that call 'nounwind' functions into ordinary calls. - if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { - Value *Callee = II->getCalledValue(); - if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { - changeToUnreachable(II, true); - Changed = true; - } else if (II->doesNotThrow()) { - if (II->use_empty() && II->onlyReadsMemory()) { - // jump to the normal destination branch. 
- BranchInst::Create(II->getNormalDest(), II); - II->getUnwindDest()->removePredecessor(II->getParent()); - II->eraseFromParent(); - } else - changeToCall(II); - Changed = true; - } - } - - Changed |= ConstantFoldTerminator(BB, true); - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - if (Reachable.insert(*SI)) - Worklist.push_back(*SI); - } while (!Worklist.empty()); - return Changed; -} - -/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even -/// if they are in a dead cycle. Return true if a change was made, false -/// otherwise. -static bool removeUnreachableBlocksFromFn(Function &F) { - SmallPtrSet<BasicBlock*, 128> Reachable; - bool Changed = markAliveBlocks(F.begin(), Reachable); - - // If there are unreachable blocks in the CFG... - if (Reachable.size() == F.size()) - return Changed; - - assert(Reachable.size() < F.size()); - NumSimpl += F.size()-Reachable.size(); - - // Loop over all of the basic blocks that are not reachable, dropping all of - // their internal references... - for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { - if (Reachable.count(BB)) - continue; - - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - if (Reachable.count(*SI)) - (*SI)->removePredecessor(BB); - BB->dropAllReferences(); - } - - for (Function::iterator I = ++F.begin(); I != F.end();) - if (!Reachable.count(I)) - I = F.getBasicBlockList().erase(I); - else - ++I; - - return true; -} - /// mergeEmptyReturnBlocks - If we have more than one empty (other than phi /// node) return blocks, merge them together to promote recursive block merging. static bool mergeEmptyReturnBlocks(Function &F) { @@ -326,7 +170,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, bool CFGSimplifyPass::runOnFunction(Function &F) { const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); - bool EverChanged = removeUnreachableBlocksFromFn(F); + bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); EverChanged |= iterativelySimplifyCFG(F, TTI, TD); @@ -334,16 +178,16 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { if (!EverChanged) return false; // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens, - // removeUnreachableBlocksFromFn is needed to nuke them, which means we should + // removeUnreachableBlocks is needed to nuke them, which means we should // iterate between the two optimizations. We structure the code like this to // avoid reruning iterativelySimplifyCFG if the second pass of - // removeUnreachableBlocksFromFn doesn't do anything. - if (!removeUnreachableBlocksFromFn(F)) + // removeUnreachableBlocks doesn't do anything. + if (!removeUnreachableBlocks(F)) return true; do { EverChanged = iterativelySimplifyCFG(F, TTI, TD); - EverChanged |= removeUnreachableBlocksFromFn(F); + EverChanged |= removeUnreachableBlocks(F); } while (EverChanged); return true; diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp deleted file mode 100644 index 3514e6c..0000000 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ /dev/null @@ -1,247 +0,0 @@ -//===- SimplifyLibCalls.cpp - Optimize specific well-known library calls --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file implements a simple pass that applies a variety of small -// optimizations for calls to specific well-known function calls (e.g. runtime -// library functions). Any optimization that takes the very simple form -// "replace call to library function with simpler code that provides the same -// result" belongs in this file. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "simplify-libcalls" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/BuildLibCalls.h" -using namespace llvm; - - -//===----------------------------------------------------------------------===// -// Optimizer Base Class -//===----------------------------------------------------------------------===// - -/// This class is the abstract base class for the set of optimizations that -/// corresponds to one library call. -namespace { -class LibCallOptimization { -protected: - Function *Caller; - const DataLayout *TD; - const TargetLibraryInfo *TLI; - LLVMContext* Context; -public: - LibCallOptimization() { } - virtual ~LibCallOptimization() {} - - /// CallOptimizer - This pure virtual method is implemented by base classes to - /// do various optimizations. If this returns null then no transformation was - /// performed. If it returns CI, then it transformed the call and CI is to be - /// deleted. If it returns something else, replace CI with the new value and - /// delete CI. - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) - =0; - - Value *OptimizeCall(CallInst *CI, const DataLayout *TD, - const TargetLibraryInfo *TLI, IRBuilder<> &B) { - Caller = CI->getParent()->getParent(); - this->TD = TD; - this->TLI = TLI; - if (CI->getCalledFunction()) - Context = &CI->getCalledFunction()->getContext(); - - // We never change the calling convention. - if (CI->getCallingConv() != llvm::CallingConv::C) - return NULL; - - return CallOptimizer(CI->getCalledFunction(), CI, B); - } -}; -} // End anonymous namespace. - - -//===----------------------------------------------------------------------===// -// SimplifyLibCalls Pass Implementation -//===----------------------------------------------------------------------===// - -namespace { - /// This pass optimizes well known library functions from libc and libm. 
- /// - class SimplifyLibCalls : public FunctionPass { - TargetLibraryInfo *TLI; - - StringMap<LibCallOptimization*> Optimizations; - public: - static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(ID) { - initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); - } - void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); - void AddOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt); - - void InitOptimizations(); - bool runOnFunction(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetLibraryInfo>(); - } - }; -} // end anonymous namespace. - -char SimplifyLibCalls::ID = 0; - -INITIALIZE_PASS_BEGIN(SimplifyLibCalls, "simplify-libcalls", - "Simplify well-known library calls", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) -INITIALIZE_PASS_END(SimplifyLibCalls, "simplify-libcalls", - "Simplify well-known library calls", false, false) - -// Public interface to the Simplify LibCalls pass. -FunctionPass *llvm::createSimplifyLibCallsPass() { - return new SimplifyLibCalls(); -} - -void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) { - if (TLI->has(F)) - Optimizations[TLI->getName(F)] = Opt; -} - -void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2, - LibCallOptimization* Opt) { - if (TLI->has(F1) && TLI->has(F2)) - Optimizations[TLI->getName(F1)] = Opt; -} - -/// Optimizations - Populate the Optimizations map with all the optimizations -/// we know. -void SimplifyLibCalls::InitOptimizations() { -} - - -/// runOnFunction - Top level algorithm. -/// -bool SimplifyLibCalls::runOnFunction(Function &F) { - TLI = &getAnalysis<TargetLibraryInfo>(); - - if (Optimizations.empty()) - InitOptimizations(); - - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); - - IRBuilder<> Builder(F.getContext()); - - bool Changed = false; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - // Ignore non-calls. - CallInst *CI = dyn_cast<CallInst>(I++); - if (!CI || CI->hasFnAttr(Attribute::NoBuiltin)) continue; - - // Ignore indirect calls and calls to non-external functions. - Function *Callee = CI->getCalledFunction(); - if (Callee == 0 || !Callee->isDeclaration() || - !(Callee->hasExternalLinkage() || Callee->hasDLLImportLinkage())) - continue; - - // Ignore unknown calls. - LibCallOptimization *LCO = Optimizations.lookup(Callee->getName()); - if (!LCO) continue; - - // Set the builder to the instruction after the call. - Builder.SetInsertPoint(BB, I); - - // Use debug location of CI for all new instructions. - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - // Try to optimize this call. - Value *Result = LCO->OptimizeCall(CI, TD, TLI, Builder); - if (Result == 0) continue; - - DEBUG(dbgs() << "SimplifyLibCalls simplified: " << *CI; - dbgs() << " into: " << *Result << "\n"); - - // Something changed! - Changed = true; - - // Inspect the instruction after the call (which was potentially just - // added) next. 
- I = CI; ++I; - - if (CI != Result && !CI->use_empty()) { - CI->replaceAllUsesWith(Result); - if (!Result->hasName()) - Result->takeName(CI); - } - CI->eraseFromParent(); - } - } - return Changed; -} - -// TODO: -// Additional cases that we need to add to this file: -// -// cbrt: -// * cbrt(expN(X)) -> expN(x/3) -// * cbrt(sqrt(x)) -> pow(x,1/6) -// * cbrt(sqrt(x)) -> pow(x,1/9) -// -// exp, expf, expl: -// * exp(log(x)) -> x -// -// log, logf, logl: -// * log(exp(x)) -> x -// * log(x**y) -> y*log(x) -// * log(exp(y)) -> y*log(e) -// * log(exp2(y)) -> y*log(2) -// * log(exp10(y)) -> y*log(10) -// * log(sqrt(x)) -> 0.5*log(x) -// * log(pow(x,y)) -> y*log(x) -// -// lround, lroundf, lroundl: -// * lround(cnst) -> cnst' -// -// pow, powf, powl: -// * pow(exp(x),y) -> exp(x*y) -// * pow(sqrt(x),y) -> pow(x,y*0.5) -// * pow(pow(x,y),z)-> pow(x,y*z) -// -// round, roundf, roundl: -// * round(cnst) -> cnst' -// -// signbit: -// * signbit(cnst) -> cnst' -// * signbit(nncst) -> 0 (if pstv is a non-negative constant) -// -// sqrt, sqrtf, sqrtl: -// * sqrt(expN(x)) -> expN(x*0.5) -// * sqrt(Nroot(x)) -> pow(x,1/(2*N)) -// * sqrt(pow(x,y)) -> pow(|x|,y*0.5) -// -// strchr: -// * strchr(p, 0) -> strlen(p) -// tan, tanf, tanl: -// * tan(atan(x)) -> x -// -// trunc, truncf, truncl: -// * trunc(cnst) -> cnst' -// -// diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp new file mode 100644 index 0000000..5045ff8f --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -0,0 +1,906 @@ +//===-- StructurizeCFG.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "structurizecfg" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionIterator.h" +#include "llvm/Analysis/RegionPass.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/PatternMatch.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +namespace { + +// Definition of the complex types used in this pass. + +typedef std::pair<BasicBlock *, Value *> BBValuePair; + +typedef SmallVector<RegionNode*, 8> RNVector; +typedef SmallVector<BasicBlock*, 8> BBVector; +typedef SmallVector<BranchInst*, 8> BranchVector; +typedef SmallVector<BBValuePair, 2> BBValueVector; + +typedef SmallPtrSet<BasicBlock *, 8> BBSet; + +typedef MapVector<PHINode *, BBValueVector> PhiMap; +typedef MapVector<BasicBlock *, BBVector> BB2BBVecMap; + +typedef DenseMap<DomTreeNode *, unsigned> DTN2UnsignedMap; +typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap; +typedef DenseMap<BasicBlock *, Value *> BBPredicates; +typedef DenseMap<BasicBlock *, BBPredicates> PredMap; +typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap; + +// The name for newly created blocks. 
+ +static const char *const FlowBlockName = "Flow"; + +/// @brief Find the nearest common dominator for multiple BasicBlocks +/// +/// Helper class for StructurizeCFG +/// TODO: Maybe move into common code +class NearestCommonDominator { + DominatorTree *DT; + + DTN2UnsignedMap IndexMap; + + BasicBlock *Result; + unsigned ResultIndex; + bool ExplicitMentioned; + +public: + /// \brief Start a new query + NearestCommonDominator(DominatorTree *DomTree) { + DT = DomTree; + Result = 0; + } + + /// \brief Add BB to the resulting dominator + void addBlock(BasicBlock *BB, bool Remember = true) { + DomTreeNode *Node = DT->getNode(BB); + + if (Result == 0) { + unsigned Numbering = 0; + for (;Node;Node = Node->getIDom()) + IndexMap[Node] = ++Numbering; + Result = BB; + ResultIndex = 1; + ExplicitMentioned = Remember; + return; + } + + for (;Node;Node = Node->getIDom()) + if (IndexMap.count(Node)) + break; + else + IndexMap[Node] = 0; + + assert(Node && "Dominator tree invalid!"); + + unsigned Numbering = IndexMap[Node]; + if (Numbering > ResultIndex) { + Result = Node->getBlock(); + ResultIndex = Numbering; + ExplicitMentioned = Remember && (Result == BB); + } else if (Numbering == ResultIndex) { + ExplicitMentioned |= Remember; + } + } + + /// \brief Is "Result" one of the BBs added with "Remember" = True? + bool wasResultExplicitMentioned() { + return ExplicitMentioned; + } + + /// \brief Get the query result + BasicBlock *getResult() { + return Result; + } +}; + +/// @brief Transforms the control flow graph on one single entry/exit region +/// at a time. +/// +/// After the transform all "If"/"Then"/"Else" style control flow looks like +/// this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 | +/// | / +/// |/ +/// 3 +/// || Where: +/// | | 1 = "If" block, calculates the condition +/// 4 | 2 = "Then" subregion, runs if the condition is true +/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow +/// |/ 4 = "Else" optional subregion, runs if the condition is false +/// 5 5 = "End" block, also rejoins the control flow +/// \endverbatim +/// +/// Control flow is expressed as a branch where the true exit goes into the +/// "Then"/"Else" region, while the false exit skips the region +/// The condition for the optional "Else" region is expressed as a PHI node. +/// The incomming values of the PHI node are true for the "If" edge and false +/// for the "Then" edge. +/// +/// Additionally to that even complicated loops look like this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 ^ Where: +/// | / 1 = "Entry" block +/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block +/// 3 3 = "Flow" block, with back edge to entry block +/// | +/// \endverbatim +/// +/// The back edge of the "Flow" block is always on the false side of the branch +/// while the true side continues the general flow. So the loop condition +/// consist of a network of PHI nodes where the true incoming values expresses +/// breaks and the false values expresses continue states. 
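A hedged usage sketch of the NearestCommonDominator helper defined above (DT is assumed to be an up-to-date DominatorTree and the block pointers valid; the function name is made up):

    static BasicBlock *commonDom(DominatorTree *DT, BasicBlock *SeedBB,
                                 BasicBlock *B1, BasicBlock *B2) {
      NearestCommonDominator Dominator(DT);
      Dominator.addBlock(SeedBB, /*Remember=*/false); // seed, not remembered
      Dominator.addBlock(B1);
      Dominator.addBlock(B2);
      // Roughly: true iff the resulting dominator is itself one of the
      // remembered blocks (B1 or B2).
      (void)Dominator.wasResultExplicitMentioned();
      return Dominator.getResult();
    }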
+class StructurizeCFG : public RegionPass { + Type *Boolean; + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + + Function *Func; + Region *ParentRegion; + + DominatorTree *DT; + + RNVector Order; + BBSet Visited; + + BBPhiMap DeletedPhis; + BB2BBVecMap AddedPhis; + + PredMap Predicates; + BranchVector Conditions; + + BB2BBMap Loops; + PredMap LoopPreds; + BranchVector LoopConds; + + RegionNode *PrevNode; + + void orderNodes(); + + void analyzeLoops(RegionNode *N); + + Value *invert(Value *Condition); + + Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert); + + void gatherPredicates(RegionNode *N); + + void collectInfos(); + + void insertConditions(bool Loops); + + void delPhiValues(BasicBlock *From, BasicBlock *To); + + void addPhiValues(BasicBlock *From, BasicBlock *To); + + void setPhiValues(); + + void killTerminator(BasicBlock *BB); + + void changeExit(RegionNode *Node, BasicBlock *NewExit, + bool IncludeDominator); + + BasicBlock *getNextFlow(BasicBlock *Dominator); + + BasicBlock *needPrefix(bool NeedEmpty); + + BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed); + + void setPrevNode(BasicBlock *BB); + + bool dominatesPredicates(BasicBlock *BB, RegionNode *Node); + + bool isPredictableTrue(RegionNode *Node); + + void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd); + + void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd); + + void createFlow(); + + void rebuildSSA(); + +public: + static char ID; + + StructurizeCFG() : + RegionPass(ID) { + initializeStructurizeCFGPass(*PassRegistry::getPassRegistry()); + } + + using Pass::doInitialization; + virtual bool doInitialization(Region *R, RGPassManager &RGM); + + virtual bool runOnRegion(Region *R, RGPassManager &RGM); + + virtual const char *getPassName() const { + return "Structurize control flow"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredID(LowerSwitchID); + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + RegionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char StructurizeCFG::ID = 0; + +INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG", + false, false) +INITIALIZE_PASS_DEPENDENCY(LowerSwitch) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(RegionInfo) +INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG", + false, false) + +/// \brief Initialize the types and constants used in the pass +bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { + LLVMContext &Context = R->getEntry()->getContext(); + + Boolean = Type::getInt1Ty(Context); + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + + return false; +} + +/// \brief Build up the general order of nodes +void StructurizeCFG::orderNodes() { + scc_iterator<Region *> I = scc_begin(ParentRegion), + E = scc_end(ParentRegion); + for (Order.clear(); I != E; ++I) { + std::vector<RegionNode *> &Nodes = *I; + Order.append(Nodes.begin(), Nodes.end()); + } +} + +/// \brief Determine the end of the loops +void StructurizeCFG::analyzeLoops(RegionNode *N) { + if (N->isSubRegion()) { + // Test for exit as back edge + BasicBlock *Exit = N->getNodeAs<Region>()->getExit(); + if (Visited.count(Exit)) + Loops[Exit] = N->getEntry(); + + } else { + // Test for sucessors as back edge + BasicBlock *BB = N->getNodeAs<BasicBlock>(); + BranchInst *Term = cast<BranchInst>(BB->getTerminator()); + + for 
(unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + + if (Visited.count(Succ)) + Loops[Succ] = BB; + } + } +} + +/// \brief Invert the given condition +Value *StructurizeCFG::invert(Value *Condition) { + // First: Check if it's a constant + if (Condition == BoolTrue) + return BoolFalse; + + if (Condition == BoolFalse) + return BoolTrue; + + if (Condition == BoolUndef) + return BoolUndef; + + // Second: If the condition is already inverted, return the original value + if (match(Condition, m_Not(m_Value(Condition)))) + return Condition; + + if (Instruction *Inst = dyn_cast<Instruction>(Condition)) { + // Third: Check all the users for an invert + BasicBlock *Parent = Inst->getParent(); + for (Value::use_iterator I = Condition->use_begin(), + E = Condition->use_end(); I != E; ++I) { + + Instruction *User = dyn_cast<Instruction>(*I); + if (!User || User->getParent() != Parent) + continue; + + if (match(*I, m_Not(m_Specific(Condition)))) + return *I; + } + + // Last option: Create a new instruction + return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); + } + + if (Argument *Arg = dyn_cast<Argument>(Condition)) { + BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock(); + return BinaryOperator::CreateNot(Condition, + Arg->getName() + ".inv", + EntryBlock.getTerminator()); + } + + llvm_unreachable("Unhandled condition to invert"); +} + +/// \brief Build the condition for one edge +Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx, + bool Invert) { + Value *Cond = Invert ? BoolFalse : BoolTrue; + if (Term->isConditional()) { + Cond = Term->getCondition(); + + if (Idx != (unsigned)Invert) + Cond = invert(Cond); + } + return Cond; +} + +/// \brief Analyze the predecessors of each block and build up predicates +void StructurizeCFG::gatherPredicates(RegionNode *N) { + RegionInfo *RI = ParentRegion->getRegionInfo(); + BasicBlock *BB = N->getEntry(); + BBPredicates &Pred = Predicates[BB]; + BBPredicates &LPred = LoopPreds[BB]; + + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + PI != PE; ++PI) { + + // Ignore it if it's a branch from outside into our region entry + if (!ParentRegion->contains(*PI)) + continue; + + Region *R = RI->getRegionFor(*PI); + if (R == ParentRegion) { + + // It's a top level block in our region + BranchInst *Term = cast<BranchInst>((*PI)->getTerminator()); + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + if (Succ != BB) + continue; + + if (Visited.count(*PI)) { + // Normal forward edge + if (Term->isConditional()) { + // Try to treat it like an ELSE block + BasicBlock *Other = Term->getSuccessor(!i); + if (Visited.count(Other) && !Loops.count(Other) && + !Pred.count(Other) && !Pred.count(*PI)) { + + Pred[Other] = BoolFalse; + Pred[*PI] = BoolTrue; + continue; + } + } + Pred[*PI] = buildCondition(Term, i, false); + + } else { + // Back edge + LPred[*PI] = buildCondition(Term, i, true); + } + } + + } else { + + // It's an exit from a sub region + while(R->getParent() != ParentRegion) + R = R->getParent(); + + // Edge from inside a subregion to its entry, ignore it + if (R == N) + continue; + + BasicBlock *Entry = R->getEntry(); + if (Visited.count(Entry)) + Pred[Entry] = BoolTrue; + else + LPred[Entry] = BoolFalse; + } + } +} + +/// \brief Collect various loop and predicate infos +void StructurizeCFG::collectInfos() { + // Reset predicate + Predicates.clear(); + + // and loop infos + Loops.clear(); + 
LoopPreds.clear(); + + // Reset the visited nodes + Visited.clear(); + + for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); + OI != OE; ++OI) { + + // Analyze all the conditions leading to a node + gatherPredicates(*OI); + + // Remember that we've seen this node + Visited.insert((*OI)->getEntry()); + + // Find the last back edges + analyzeLoops(*OI); + } +} + +/// \brief Insert the missing branch conditions +void StructurizeCFG::insertConditions(bool Loops) { + BranchVector &Conds = Loops ? LoopConds : Conditions; + Value *Default = Loops ? BoolTrue : BoolFalse; + SSAUpdater PhiInserter; + + for (BranchVector::iterator I = Conds.begin(), + E = Conds.end(); I != E; ++I) { + + BranchInst *Term = *I; + assert(Term->isConditional()); + + BasicBlock *Parent = Term->getParent(); + BasicBlock *SuccTrue = Term->getSuccessor(0); + BasicBlock *SuccFalse = Term->getSuccessor(1); + + PhiInserter.Initialize(Boolean, ""); + PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default); + PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default); + + BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue]; + + NearestCommonDominator Dominator(DT); + Dominator.addBlock(Parent, false); + + Value *ParentValue = 0; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + if (PI->first == Parent) { + ParentValue = PI->second; + break; + } + PhiInserter.AddAvailableValue(PI->first, PI->second); + Dominator.addBlock(PI->first); + } + + if (ParentValue) { + Term->setCondition(ParentValue); + } else { + if (!Dominator.wasResultExplicitMentioned()) + PhiInserter.AddAvailableValue(Dominator.getResult(), Default); + + Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent)); + } + } +} + +/// \brief Remove all PHI values coming from "From" into "To" and remember +/// them in DeletedPhis +void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { + PhiMap &Map = DeletedPhis[To]; + for (BasicBlock::iterator I = To->begin(), E = To->end(); + I != E && isa<PHINode>(*I);) { + + PHINode &Phi = cast<PHINode>(*I++); + while (Phi.getBasicBlockIndex(From) != -1) { + Value *Deleted = Phi.removeIncomingValue(From, false); + Map[&Phi].push_back(std::make_pair(From, Deleted)); + } + } +} + +/// \brief Add a dummy PHI value as soon as we knew the new predecessor +void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { + for (BasicBlock::iterator I = To->begin(), E = To->end(); + I != E && isa<PHINode>(*I);) { + + PHINode &Phi = cast<PHINode>(*I++); + Value *Undef = UndefValue::get(Phi.getType()); + Phi.addIncoming(Undef, From); + } + AddedPhis[To].push_back(From); +} + +/// \brief Add the real PHI value as soon as everything is set up +void StructurizeCFG::setPhiValues() { + SSAUpdater Updater; + for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end(); + AI != AE; ++AI) { + + BasicBlock *To = AI->first; + BBVector &From = AI->second; + + if (!DeletedPhis.count(To)) + continue; + + PhiMap &Map = DeletedPhis[To]; + for (PhiMap::iterator PI = Map.begin(), PE = Map.end(); + PI != PE; ++PI) { + + PHINode *Phi = PI->first; + Value *Undef = UndefValue::get(Phi->getType()); + Updater.Initialize(Phi->getType(), ""); + Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); + Updater.AddAvailableValue(To, Undef); + + NearestCommonDominator Dominator(DT); + Dominator.addBlock(To, false); + for (BBValueVector::iterator VI = PI->second.begin(), + VE = PI->second.end(); VI != VE; ++VI) { + + 
Updater.AddAvailableValue(VI->first, VI->second); + Dominator.addBlock(VI->first); + } + + if (!Dominator.wasResultExplicitMentioned()) + Updater.AddAvailableValue(Dominator.getResult(), Undef); + + for (BBVector::iterator FI = From.begin(), FE = From.end(); + FI != FE; ++FI) { + + int Idx = Phi->getBasicBlockIndex(*FI); + assert(Idx != -1); + Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI)); + } + } + + DeletedPhis.erase(To); + } + assert(DeletedPhis.empty()); +} + +/// \brief Remove phi values from all successors and then remove the terminator. +void StructurizeCFG::killTerminator(BasicBlock *BB) { + TerminatorInst *Term = BB->getTerminator(); + if (!Term) + return; + + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); + SI != SE; ++SI) { + + delPhiValues(BB, *SI); + } + + Term->eraseFromParent(); +} + +/// \brief Let node exit(s) point to NewExit +void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, + bool IncludeDominator) { + if (Node->isSubRegion()) { + Region *SubRegion = Node->getNodeAs<Region>(); + BasicBlock *OldExit = SubRegion->getExit(); + BasicBlock *Dominator = 0; + + // Find all the edges from the sub region to the exit + for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit); + I != E;) { + + BasicBlock *BB = *I++; + if (!SubRegion->contains(BB)) + continue; + + // Modify the edges to point to the new exit + delPhiValues(BB, OldExit); + BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit); + addPhiValues(BB, NewExit); + + // Find the new dominator (if requested) + if (IncludeDominator) { + if (!Dominator) + Dominator = BB; + else + Dominator = DT->findNearestCommonDominator(Dominator, BB); + } + } + + // Change the dominator (if requested) + if (Dominator) + DT->changeImmediateDominator(NewExit, Dominator); + + // Update the region info + SubRegion->replaceExit(NewExit); + + } else { + BasicBlock *BB = Node->getNodeAs<BasicBlock>(); + killTerminator(BB); + BranchInst::Create(NewExit, BB); + addPhiValues(BB, NewExit); + if (IncludeDominator) + DT->changeImmediateDominator(NewExit, BB); + } +} + +/// \brief Create a new flow node and update dominator tree and region info +BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) { + LLVMContext &Context = Func->getContext(); + BasicBlock *Insert = Order.empty() ? 
ParentRegion->getExit() : + Order.back()->getEntry(); + BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, + Func, Insert); + DT->addNewBlock(Flow, Dominator); + ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); + return Flow; +} + +/// \brief Create a new or reuse the previous node as flow node +BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) { + BasicBlock *Entry = PrevNode->getEntry(); + + if (!PrevNode->isSubRegion()) { + killTerminator(Entry); + if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end()) + return Entry; + + } + + // create a new flow node + BasicBlock *Flow = getNextFlow(Entry); + + // and wire it up + changeExit(PrevNode, Flow, true); + PrevNode = ParentRegion->getBBNode(Flow); + return Flow; +} + +/// \brief Returns the region exit if possible, otherwise just a new flow node +BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow, + bool ExitUseAllowed) { + if (Order.empty() && ExitUseAllowed) { + BasicBlock *Exit = ParentRegion->getExit(); + DT->changeImmediateDominator(Exit, Flow); + addPhiValues(Flow, Exit); + return Exit; + } + return getNextFlow(Flow); +} + +/// \brief Set the previous node +void StructurizeCFG::setPrevNode(BasicBlock *BB) { + PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0; +} + +/// \brief Does BB dominate all the predicates of Node ? +bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) { + BBPredicates &Preds = Predicates[Node->getEntry()]; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + if (!DT->dominates(BB, PI->first)) + return false; + } + return true; +} + +/// \brief Can we predict that this node will always be called? +bool StructurizeCFG::isPredictableTrue(RegionNode *Node) { + BBPredicates &Preds = Predicates[Node->getEntry()]; + bool Dominated = false; + + // Regionentry is always true + if (PrevNode == 0) + return true; + + for (BBPredicates::iterator I = Preds.begin(), E = Preds.end(); + I != E; ++I) { + + if (I->second != BoolTrue) + return false; + + if (!Dominated && DT->dominates(I->first, PrevNode->getEntry())) + Dominated = true; + } + + // TODO: The dominator check is too strict + return Dominated; +} + +/// Take one node from the order vector and wire it up +void StructurizeCFG::wireFlow(bool ExitUseAllowed, + BasicBlock *LoopEnd) { + RegionNode *Node = Order.pop_back_val(); + Visited.insert(Node->getEntry()); + + if (isPredictableTrue(Node)) { + // Just a linear flow + if (PrevNode) { + changeExit(PrevNode, Node->getEntry(), true); + } + PrevNode = Node; + + } else { + // Insert extra prefix node (or reuse last one) + BasicBlock *Flow = needPrefix(false); + + // Insert extra postfix node (or use exit instead) + BasicBlock *Entry = Node->getEntry(); + BasicBlock *Next = needPostfix(Flow, ExitUseAllowed); + + // let it point to entry and next block + Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow)); + addPhiValues(Flow, Entry); + DT->changeImmediateDominator(Entry, Flow); + + PrevNode = Node; + while (!Order.empty() && !Visited.count(LoopEnd) && + dominatesPredicates(Entry, Order.back())) { + handleLoops(false, LoopEnd); + } + + changeExit(PrevNode, Next, false); + setPrevNode(Next); + } +} + +void StructurizeCFG::handleLoops(bool ExitUseAllowed, + BasicBlock *LoopEnd) { + RegionNode *Node = Order.back(); + BasicBlock *LoopStart = Node->getEntry(); + + if (!Loops.count(LoopStart)) { + wireFlow(ExitUseAllowed, LoopEnd); + return; + } + + if (!isPredictableTrue(Node)) + 
LoopStart = needPrefix(true); + + LoopEnd = Loops[Node->getEntry()]; + wireFlow(false, LoopEnd); + while (!Visited.count(LoopEnd)) { + handleLoops(false, LoopEnd); + } + + // If the start of the loop is the entry block, we can't branch to it so + // insert a new dummy entry block. + Function *LoopFunc = LoopStart->getParent(); + if (LoopStart == &LoopFunc->getEntryBlock()) { + LoopStart->setName("entry.orig"); + + BasicBlock *NewEntry = + BasicBlock::Create(LoopStart->getContext(), + "entry", + LoopFunc, + LoopStart); + BranchInst::Create(LoopStart, NewEntry); + } + + // Create an extra loop end node + LoopEnd = needPrefix(false); + BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed); + LoopConds.push_back(BranchInst::Create(Next, LoopStart, + BoolUndef, LoopEnd)); + addPhiValues(LoopEnd, LoopStart); + setPrevNode(Next); +} + +/// After this function control flow looks like it should be, but +/// branches and PHI nodes only have undefined conditions. +void StructurizeCFG::createFlow() { + BasicBlock *Exit = ParentRegion->getExit(); + bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit); + + DeletedPhis.clear(); + AddedPhis.clear(); + Conditions.clear(); + LoopConds.clear(); + + PrevNode = 0; + Visited.clear(); + + while (!Order.empty()) { + handleLoops(EntryDominatesExit, 0); + } + + if (PrevNode) + changeExit(PrevNode, Exit, EntryDominatesExit); + else + assert(EntryDominatesExit); +} + +/// Handle a rare case where the disintegrated nodes instructions +/// no longer dominate all their uses. Not sure if this is really nessasary +void StructurizeCFG::rebuildSSA() { + SSAUpdater Updater; + for (Region::block_iterator I = ParentRegion->block_begin(), + E = ParentRegion->block_end(); + I != E; ++I) { + + BasicBlock *BB = *I; + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); + II != IE; ++II) { + + bool Initialized = false; + for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) { + + Next = I->getNext(); + + Instruction *User = cast<Instruction>(I->getUser()); + if (User->getParent() == BB) { + continue; + + } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (UserPN->getIncomingBlock(*I) == BB) + continue; + } + + if (DT->dominates(II, User)) + continue; + + if (!Initialized) { + Value *Undef = UndefValue::get(II->getType()); + Updater.Initialize(II->getType(), ""); + Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); + Updater.AddAvailableValue(BB, II); + Initialized = true; + } + Updater.RewriteUseAfterInsertions(*I); + } + } + } +} + +/// \brief Run the transformation for each region found +bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { + if (R->isTopLevelRegion()) + return false; + + Func = R->getEntry()->getParent(); + ParentRegion = R; + + DT = &getAnalysis<DominatorTree>(); + + orderNodes(); + collectInfos(); + createFlow(); + insertConditions(false); + insertConditions(true); + setPhiValues(); + rebuildSSA(); + + // Cleanup + Order.clear(); + Visited.clear(); + DeletedPhis.clear(); + AddedPhis.clear(); + Predicates.clear(); + Conditions.clear(); + Loops.clear(); + LoopPreds.clear(); + LoopConds.clear(); + + return true; +} + +/// \brief Create the pass +Pass *llvm::createStructurizeCFGPass() { + return new StructurizeCFG(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 2002e68..9fb8ddc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ 
b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -53,6 +53,7 @@ #define DEBUG_TYPE "tailcallelim" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" @@ -69,6 +70,7 @@ #include "llvm/Support/CFG.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -97,16 +99,16 @@ namespace { bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail); bool FoldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail); bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail); bool CanMoveAboveCall(Instruction *I, CallInst *CI); Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI); @@ -129,34 +131,44 @@ void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetTransformInfo>(); } -/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by -/// callees of this function. We only do very simple analysis right now, this -/// could be expanded in the future to use mod/ref information for particular -/// call sites if desired. -static bool AllocaMightEscapeToCalls(AllocaInst *AI) { - // FIXME: do simple 'address taken' analysis. - return true; +/// CanTRE - Scan the specified basic block for alloca instructions. +/// If it contains any that are variable-sized or not in the entry block, +/// returns false. +static bool CanTRE(AllocaInst *AI) { + // Because of PR962, we don't TRE allocas outside the entry block. + + // If this alloca is in the body of the function, or if it is a variable + // sized allocation, we cannot tail call eliminate calls marked 'tail' + // with this mechanism. + BasicBlock *BB = AI->getParent(); + return BB == &BB->getParent()->getEntryBlock() && + isa<ConstantInt>(AI->getArraySize()); } -/// CheckForEscapingAllocas - Scan the specified basic block for alloca -/// instructions. If it contains any that might be accessed by calls, return -/// true. -static bool CheckForEscapingAllocas(BasicBlock *BB, - bool &CannotTCETailMarkedCall) { - bool RetVal = false; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - RetVal |= AllocaMightEscapeToCalls(AI); - - // If this alloca is in the body of the function, or if it is a variable - // sized allocation, we cannot tail call eliminate calls marked 'tail' - // with this mechanism. 
- if (BB != &BB->getParent()->getEntryBlock() || - !isa<ConstantInt>(AI->getArraySize())) - CannotTCETailMarkedCall = true; - } - return RetVal; -} +namespace { +struct AllocaCaptureTracker : public CaptureTracker { + AllocaCaptureTracker() : Captured(false) {} + + void tooManyUses() LLVM_OVERRIDE { Captured = true; } + + bool shouldExplore(Use *U) LLVM_OVERRIDE { + Value *V = U->getUser(); + if (isa<CallInst>(V) || isa<InvokeInst>(V)) + UsesAlloca.insert(V); + return true; + } + + bool captured(Use *U) LLVM_OVERRIDE { + if (isa<ReturnInst>(U->getUser())) + return false; + Captured = true; + return true; + } + + bool Captured; + SmallPtrSet<const Value *, 16> UsesAlloca; +}; +} // end anonymous namespace bool TailCallElim::runOnFunction(Function &F) { // If this function is a varargs function, we won't be able to PHI the args @@ -168,41 +180,44 @@ bool TailCallElim::runOnFunction(Function &F) { bool TailCallsAreMarkedTail = false; SmallVector<PHINode*, 8> ArgumentPHIs; bool MadeChange = false; - bool FunctionContainsEscapingAllocas = false; - // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls + // CanTRETailMarkedCall - If false, we cannot perform TRE on tail calls // marked with the 'tail' attribute, because doing so would cause the stack - // size to increase (real TCE would deallocate variable sized allocas, TCE + // size to increase (real TRE would deallocate variable sized allocas, TRE // doesn't). - bool CannotTCETailMarkedCall = false; - - // Loop over the function, looking for any returning blocks, and keeping track - // of whether this function has any non-trivially used allocas. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall) - break; - - FunctionContainsEscapingAllocas |= - CheckForEscapingAllocas(BB, CannotTCETailMarkedCall); + bool CanTRETailMarkedCall = true; + + // Find calls that can be marked tail. + AllocaCaptureTracker ACT; + for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { + CanTRETailMarkedCall &= CanTRE(AI); + PointerMayBeCaptured(AI, &ACT); + // If any allocas are captured, exit. + if (ACT.Captured) + return false; + } + } } - /// FIXME: The code generator produces really bad code when an 'escaping - /// alloca' is changed from being a static alloca to being a dynamic alloca. - /// Until this is resolved, disable this transformation if that would ever - /// happen. This bug is PR962. - if (FunctionContainsEscapingAllocas) - return false; - - // Second pass, change any tail calls to loops. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { - bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs,CannotTCETailMarkedCall); - if (!Change && BB->getFirstNonPHIOrDbg() == Ret) - Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, - TailCallsAreMarkedTail, ArgumentPHIs, - CannotTCETailMarkedCall); - MadeChange |= Change; + // Second pass, change any tail recursive calls to loops. + // + // FIXME: The code generator produces really bad code when an 'escaping + // alloca' is changed from being a static alloca to being a dynamic alloca. + // Until this is resolved, disable this transformation if that would ever + // happen. This bug is PR962. 
+ if (ACT.UsesAlloca.empty()) { + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { + bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, !CanTRETailMarkedCall); + if (!Change && BB->getFirstNonPHIOrDbg() == Ret) + Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, + TailCallsAreMarkedTail, ArgumentPHIs, + !CanTRETailMarkedCall); + MadeChange |= Change; + } } } @@ -223,16 +238,24 @@ bool TailCallElim::runOnFunction(Function &F) { } } - // Finally, if this function contains no non-escaping allocas, or calls - // setjmp, mark all calls in the function as eligible for tail calls - //(there is no stack memory for them to access). - if (!FunctionContainsEscapingAllocas && !F.callsFunctionThatReturnsTwice()) - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + // At this point, we know that the function does not have any captured + // allocas. If additionally the function does not call setjmp, mark all calls + // in the function that do not access stack memory with the tail keyword. This + // implies ensuring that there does not exist any path from a call that takes + // in an alloca but does not capture it and the call which we wish to mark + // with "tail". + if (!F.callsFunctionThatReturnsTwice()) { + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { if (CallInst *CI = dyn_cast<CallInst>(I)) { - CI->setTailCall(); - MadeChange = true; + if (!ACT.UsesAlloca.count(CI)) { + CI->setTailCall(); + MadeChange = true; + } } + } + } + } return MadeChange; } @@ -424,7 +447,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail) { // If we are introducing accumulator recursion to eliminate operations after // the call instruction that are both associative and commutative, the initial @@ -600,7 +623,7 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail) { bool Change = false; @@ -634,10 +657,11 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, return Change; } -bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail) { +bool +TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVectorImpl<PHINode *> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { CallInst *CI = FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail); if (!CI) return false; |
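
The StructurizeCFG pass added above registers itself under the name "structurizecfg" and exposes the factory createStructurizeCFGPass(), so once this patch is in the tree it should be selectable directly as: opt -structurizecfg -S in.ll. Below is a minimal sketch of driving it from C++ with the legacy pass manager of this LLVM branch; the header paths and the assumption that the factory is declared in llvm/Transforms/Scalar.h follow how the other scalar passes of this era are exposed and are not shown by the patch itself.

// Sketch only: schedule the new pass over a module with the legacy
// PassManager. The pass declares its LowerSwitch / DominatorTree (and, via
// RegionPass, RegionInfo) requirements in getAnalysisUsage(), so the pass
// manager is expected to pull those in for us.
#include "llvm/IR/Module.h"
#include "llvm/PassManager.h"            // legacy pass manager (pre-3.5 path)
#include "llvm/Transforms/Scalar.h"      // assumed home of the factory

void structurizeModule(llvm::Module &M) {
  llvm::PassManager PM;
  PM.add(llvm::createStructurizeCFGPass());
  PM.run(M);                             // runs on every non-top-level region
}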
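
At the source level, the TailRecursionElimination changes in the second half of this diff keep doing what the pass has always done: a self-recursive call in tail position is replaced by a branch back to (a split copy of) the old entry block, with the formal arguments turned into the PHI nodes collected in ArgumentPHIs. A small illustrative sketch of the before/after shape, not taken from the patch:

// Tail-recursive form: nothing happens after the recursive call, so
// ProcessReturningBlock() can turn it into a loop.
static unsigned sumTo(unsigned N, unsigned Acc) {
  if (N == 0)
    return Acc;
  return sumTo(N - 1, Acc + N);
}

// Roughly what the rewritten function computes: the arguments become
// loop-carried values (argument PHIs in the IR) and the call disappears.
static unsigned sumToLoop(unsigned N, unsigned Acc) {
  for (;;) {
    if (N == 0)
      return Acc;
    unsigned NextN = N - 1;    // new incoming values for the argument PHIs
    unsigned NextAcc = Acc + N;
    N = NextN;
    Acc = NextAcc;
  }
}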
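
The AllocaCaptureTracker in that same hunk separates two facts that the old CheckForEscapingAllocas logic folded into one: Captured is set when an alloca's address genuinely escapes (any capturing use other than a return), which makes runOnFunction() give up immediately, while UsesAlloca merely records every call or invoke the capture walk passes through, and only those calls are later denied the 'tail' marker. A rough source-level illustration of the two outcomes; leak() and finish() are hypothetical names, and the comments describe the patched behaviour as written above:

int finish(void);
void leak(int *p);          // assume it may stash 'p' somewhere: a capture

int withEscapingAlloca(void) {
  int local = 0;
  leak(&local);             // address escapes: ACT.Captured becomes true and
                            // the patched runOnFunction() returns false
                            // before changing anything
  return finish();
}

int withoutAllocas(void) {
  // No allocas, so ACT.UsesAlloca stays empty; provided the function does not
  // call setjmp, every call here (finish) is eligible for the 'tail' marker.
  return finish();
}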