Diffstat (limited to 'lib/Transforms')
97 files changed, 10269 insertions, 4275 deletions
diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt
index 10e0cc6..de1353e 100644
--- a/lib/Transforms/CMakeLists.txt
+++ b/lib/Transforms/CMakeLists.txt
@@ -3,4 +3,5 @@ add_subdirectory(Instrumentation)
 add_subdirectory(InstCombine)
 add_subdirectory(Scalar)
 add_subdirectory(IPO)
+add_subdirectory(Vectorize)
 add_subdirectory(Hello)
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 4d8dbc2..58b3551 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -20,13 +20,3 @@ add_llvm_library(LLVMipo
   StripDeadPrototypes.cpp
   StripSymbols.cpp
   )
-
-add_llvm_library_dependencies(LLVMipo
-  LLVMAnalysis
-  LLVMCore
-  LLVMScalarOpts
-  LLVMSupport
-  LLVMTarget
-  LLVMTransformUtils
-  LLVMipa
-  )
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index c3ecb7a..d8fae8a 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -140,18 +140,24 @@ bool ConstantMerge::runOnModule(Module &M) {
         UsedGlobals.count(GV))
       continue;
 
+    // This transformation is legal for weak ODR globals in the sense it
+    // doesn't change semantics, but we really don't want to perform it
+    // anyway; it's likely to pessimize code generation, and some tools
+    // (like the Darwin linker in cases involving CFString) don't expect it.
+    if (GV->isWeakForLinker())
+      continue;
+
     Constant *Init = GV->getInitializer();
 
     // Check to see if the initializer is already known.
     PointerIntPair<Constant*, 1, bool> Pair(Init, hasKnownAlignment(GV));
     GlobalVariable *&Slot = CMap[Pair];
 
-    // If this is the first constant we find or if the old on is local,
-    // replace with the current one. It the current is externally visible
+    // If this is the first constant we find or if the old one is local,
+    // replace with the current one. If the current is externally visible
     // it cannot be replace, but can be the canonical constant we merge with.
-    if (Slot == 0 || IsBetterCannonical(*GV, *Slot)) {
+    if (Slot == 0 || IsBetterCannonical(*GV, *Slot))
       Slot = GV;
-    }
   }
 
   // Second: identify all globals that can be merged together, filling in
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 4bb6f7a..95aef27 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -74,7 +74,7 @@ namespace {
     std::string getDescription() const {
       return std::string((IsArg ? "Argument #" : "Return value #"))
-             + utostr(Idx) + " of function " + F->getNameStr();
+             + utostr(Idx) + " of function " + F->getName().str();
     }
   };
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 0edf342..f3f6228 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/UniqueVector.h"
@@ -225,31 +226,247 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
   return MadeChange;
 }
 
+namespace {
+  // For a given pointer Argument, this retains a list of Arguments of
+  // functions in the same SCC that the pointer data flows into. We use this
+  // to build an SCC of the arguments.
+  struct ArgumentGraphNode {
+    Argument *Definition;
+    SmallVector<ArgumentGraphNode*, 4> Uses;
+  };
+
+  class ArgumentGraph {
+    // We store pointers to ArgumentGraphNode objects, so it's important that
+    // that they not move around upon insert.
+    typedef std::map<Argument*, ArgumentGraphNode> ArgumentMapTy;
+
+    ArgumentMapTy ArgumentMap;
+
+    // There is no root node for the argument graph, in fact:
+    //   void f(int *x, int *y) { if (...) f(x, y); }
+    // is an example where the graph is disconnected. The SCCIterator requires
+    // a single entry point, so we maintain a fake ("synthetic") root node that
+    // uses every node. Because the graph is directed and nothing points into
+    // the root, it will not participate in any SCCs (except for its own).
+    ArgumentGraphNode SyntheticRoot;
+
+  public:
+    ArgumentGraph() { SyntheticRoot.Definition = 0; }
+
+    typedef SmallVectorImpl<ArgumentGraphNode*>::iterator iterator;
+
+    iterator begin() { return SyntheticRoot.Uses.begin(); }
+    iterator end() { return SyntheticRoot.Uses.end(); }
+    ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; }
+
+    ArgumentGraphNode *operator[](Argument *A) {
+      ArgumentGraphNode &Node = ArgumentMap[A];
+      Node.Definition = A;
+      SyntheticRoot.Uses.push_back(&Node);
+      return &Node;
+    }
+  };
+
+  // This tracker checks whether callees are in the SCC, and if so it does not
+  // consider that a capture, instead adding it to the "Uses" list and
+  // continuing with the analysis.
+  struct ArgumentUsesTracker : public CaptureTracker {
+    ArgumentUsesTracker(const SmallPtrSet<Function*, 8> &SCCNodes)
+      : Captured(false), SCCNodes(SCCNodes) {}
+
+    void tooManyUses() { Captured = true; }
+
+    bool shouldExplore(Use *U) { return true; }
+
+    bool captured(Use *U) {
+      CallSite CS(U->getUser());
+      if (!CS.getInstruction()) { Captured = true; return true; }
+
+      Function *F = CS.getCalledFunction();
+      if (!F || !SCCNodes.count(F)) { Captured = true; return true; }
+
+      Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+      for (CallSite::arg_iterator PI = CS.arg_begin(), PE = CS.arg_end();
+           PI != PE; ++PI, ++AI) {
+        if (AI == AE) {
+          assert(F->isVarArg() && "More params than args in non-varargs call");
+          Captured = true;
+          return true;
+        }
+        if (PI == U) {
+          Uses.push_back(AI);
+          break;
+        }
+      }
+      assert(!Uses.empty() && "Capturing call-site captured nothing?");
+      return false;
+    }
+
+    bool Captured;  // True only if certainly captured (used outside our SCC).
+    SmallVector<Argument*, 4> Uses;  // Uses within our SCC.
+
+    const SmallPtrSet<Function*, 8> &SCCNodes;
+  };
+}
+
+namespace llvm {
+  template<> struct GraphTraits<ArgumentGraphNode*> {
+    typedef ArgumentGraphNode NodeType;
+    typedef SmallVectorImpl<ArgumentGraphNode*>::iterator ChildIteratorType;
+
+    static inline NodeType *getEntryNode(NodeType *A) { return A; }
+    static inline ChildIteratorType child_begin(NodeType *N) {
+      return N->Uses.begin();
+    }
+    static inline ChildIteratorType child_end(NodeType *N) {
+      return N->Uses.end();
+    }
+  };
+  template<> struct GraphTraits<ArgumentGraph*>
+    : public GraphTraits<ArgumentGraphNode*> {
+    static NodeType *getEntryNode(ArgumentGraph *AG) {
+      return AG->getEntryNode();
+    }
+    static ChildIteratorType nodes_begin(ArgumentGraph *AG) {
+      return AG->begin();
+    }
+    static ChildIteratorType nodes_end(ArgumentGraph *AG) {
+      return AG->end();
+    }
+  };
+}
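The synthetic-root trick above is worth seeing in isolation. The following is a self-contained C++14 sketch (plain Tarjan SCC, not LLVM's scc_iterator or ArgumentGraph): a fake root with an edge to every real node gives the walk a single entry point even when the graph is disconnected, and since no edge enters the root it can only ever form a trivial SCC of its own. All names here are invented for illustration.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Node {
  int Id;                    // -1 marks the synthetic root.
  std::vector<Node*> Uses;   // outgoing edges
  int Index = -1, Low = -1;  // Tarjan bookkeeping
  bool OnStack = false;
};

static void strongConnect(Node *N, int &Counter, std::vector<Node*> &Stack,
                          std::vector<std::vector<Node*> > &SCCs) {
  N->Index = N->Low = Counter++;
  Stack.push_back(N);
  N->OnStack = true;
  for (Node *M : N->Uses) {
    if (M->Index == -1) {               // unvisited child: recurse
      strongConnect(M, Counter, Stack, SCCs);
      N->Low = std::min(N->Low, M->Low);
    } else if (M->OnStack) {            // back edge within current SCC
      N->Low = std::min(N->Low, M->Index);
    }
  }
  if (N->Low == N->Index) {             // N roots an SCC: pop it off
    SCCs.push_back({});
    Node *M;
    do {
      M = Stack.back(); Stack.pop_back();
      M->OnStack = false;
      SCCs.back().push_back(M);
    } while (M != N);
  }
}

int main() {
  // Two disconnected components: {0 <-> 1} and {2}.
  Node A{0}, B{1}, C{2}, Root{-1};
  A.Uses = {&B}; B.Uses = {&A};
  Root.Uses = {&A, &B, &C};   // synthetic root reaches everything
  int Counter = 0;
  std::vector<Node*> Stack;
  std::vector<std::vector<Node*> > SCCs;
  strongConnect(&Root, Counter, Stack, SCCs);
  for (auto &SCC : SCCs) {    // prints: "1 0", "2", "-1" (root alone)
    for (Node *N : SCC) std::printf("%d ", N->Id);
    std::printf("\n");
  }
}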
 /// AddNoCaptureAttrs - Deduce nocapture attributes for the SCC.
 bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
   bool Changed = false;
 
+  SmallPtrSet<Function*, 8> SCCNodes;
+
+  // Fill SCCNodes with the elements of the SCC. Used for quickly
+  // looking up whether a given CallGraphNode is in this SCC.
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+    if (F && !F->isDeclaration() && !F->mayBeOverridden())
+      SCCNodes.insert(F);
+  }
+
+  ArgumentGraph AG;
+
   // Check each function in turn, determining which pointer arguments are not
   // captured.
   for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
     Function *F = (*I)->getFunction();
     if (F == 0)
-      // External node - skip it;
+      // External node - only a problem for arguments that we pass to it.
       continue;
 
     // Definitions with weak linkage may be overridden at linktime with
-    // something that writes memory, so treat them like declarations.
+    // something that captures pointers, so treat them like declarations.
     if (F->isDeclaration() || F->mayBeOverridden())
       continue;
 
+    // Functions that are readonly (or readnone) and nounwind and don't return
+    // a value can't capture arguments. Don't analyze them.
+    if (F->onlyReadsMemory() && F->doesNotThrow() &&
+        F->getReturnType()->isVoidTy()) {
+      for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end();
+           A != E; ++A) {
+        if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
+          A->addAttr(Attribute::NoCapture);
+          ++NumNoCapture;
+          Changed = true;
+        }
+      }
+      continue;
+    }
+
     for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A!=E; ++A)
-      if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr() &&
-          !PointerMayBeCaptured(A, true, /*StoreCaptures=*/false)) {
-        A->addAttr(Attribute::NoCapture);
+      if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
+        ArgumentUsesTracker Tracker(SCCNodes);
+        PointerMayBeCaptured(A, &Tracker);
+        if (!Tracker.Captured) {
+          if (Tracker.Uses.empty()) {
+            // If it's trivially not captured, mark it nocapture now.
+            A->addAttr(Attribute::NoCapture);
+            ++NumNoCapture;
+            Changed = true;
+          } else {
+            // If it's not trivially captured and not trivially not captured,
+            // then it must be calling into another function in our SCC. Save
+            // its particulars for Argument-SCC analysis later.
+            ArgumentGraphNode *Node = AG[A];
+            for (SmallVectorImpl<Argument*>::iterator
+                   UI = Tracker.Uses.begin(), UE = Tracker.Uses.end();
+                 UI != UE; ++UI)
+              Node->Uses.push_back(AG[*UI]);
+          }
+        }
+        // Otherwise, it's captured. Don't bother doing SCC analysis on it.
+      }
+  }
+
+  // The graph we've collected is partial because we stopped scanning for
+  // argument uses once we solved the argument trivially. These partial nodes
+  // show up as ArgumentGraphNode objects with an empty Uses list, and for
+  // these nodes the final decision about whether they capture has already
+  // been made. If the definition doesn't have a 'nocapture' attribute by
+  // now, it captures.
+
+  for (scc_iterator<ArgumentGraph*> I = scc_begin(&AG), E = scc_end(&AG);
+       I != E; ++I) {
+    std::vector<ArgumentGraphNode*> &ArgumentSCC = *I;
+    if (ArgumentSCC.size() == 1) {
+      if (!ArgumentSCC[0]->Definition) continue;  // synthetic root node
+
f(x); }" + if (ArgumentSCC[0]->Uses.size() == 1 && + ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) { + ArgumentSCC[0]->Definition->addAttr(Attribute::NoCapture); ++NumNoCapture; Changed = true; } + continue; + } + + bool SCCCaptured = false; + for (std::vector<ArgumentGraphNode*>::iterator I = ArgumentSCC.begin(), + E = ArgumentSCC.end(); I != E && !SCCCaptured; ++I) { + ArgumentGraphNode *Node = *I; + if (Node->Uses.empty()) { + if (!Node->Definition->hasNoCaptureAttr()) + SCCCaptured = true; + } + } + if (SCCCaptured) continue; + + SmallPtrSet<Argument*, 8> ArgumentSCCNodes; + // Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for + // quickly looking up whether a given Argument is in this ArgumentSCC. + for (std::vector<ArgumentGraphNode*>::iterator I = ArgumentSCC.begin(), + E = ArgumentSCC.end(); I != E; ++I) { + ArgumentSCCNodes.insert((*I)->Definition); + } + + for (std::vector<ArgumentGraphNode*>::iterator I = ArgumentSCC.begin(), + E = ArgumentSCC.end(); I != E && !SCCCaptured; ++I) { + ArgumentGraphNode *N = *I; + for (SmallVectorImpl<ArgumentGraphNode*>::iterator UI = N->Uses.begin(), + UE = N->Uses.end(); UI != UE; ++UI) { + Argument *A = (*UI)->Definition; + if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A)) + continue; + SCCCaptured = true; + break; + } + } + if (SCCCaptured) continue; + + for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { + Argument *A = ArgumentSCC[i]->Definition; + A->addAttr(Attribute::NoCapture); + ++NumNoCapture; + Changed = true; + } } return Changed; diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 3552d03..1522aa4 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -61,6 +62,7 @@ namespace { struct GlobalStatus; struct GlobalOpt : public ModulePass { virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetLibraryInfo>(); } static char ID; // Pass identification, replacement for typeid GlobalOpt() : ModulePass(ID) { @@ -80,11 +82,17 @@ namespace { const SmallPtrSet<const PHINode*, 16> &PHIUsers, const GlobalStatus &GS); bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn); + + TargetData *TD; + TargetLibraryInfo *TLI; }; } char GlobalOpt::ID = 0; -INITIALIZE_PASS(GlobalOpt, "globalopt", +INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt", + "Global Variable Optimizer", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(GlobalOpt, "globalopt", "Global Variable Optimizer", false, false) ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); } @@ -143,18 +151,31 @@ struct GlobalStatus { /// HasPHIUser - Set to true if this global has a user that is a PHI node. bool HasPHIUser; + /// AtomicOrdering - Set to the strongest atomic ordering requirement. 
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 3552d03..1522aa4 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -61,6 +62,7 @@ namespace {
 struct GlobalStatus;
 struct GlobalOpt : public ModulePass {
   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<TargetLibraryInfo>();
   }
   static char ID; // Pass identification, replacement for typeid
   GlobalOpt() : ModulePass(ID) {
@@ -80,11 +82,17 @@ namespace {
                                const SmallPtrSet<const PHINode*, 16> &PHIUsers,
                                const GlobalStatus &GS);
   bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
+
+  TargetData *TD;
+  TargetLibraryInfo *TLI;
 };
 }
 
 char GlobalOpt::ID = 0;
-INITIALIZE_PASS(GlobalOpt, "globalopt",
+INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt",
+                "Global Variable Optimizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(GlobalOpt, "globalopt",
                 "Global Variable Optimizer", false, false)
 
 ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); }
@@ -143,18 +151,31 @@ struct GlobalStatus {
   /// HasPHIUser - Set to true if this global has a user that is a PHI node.
   bool HasPHIUser;
 
+  /// AtomicOrdering - Set to the strongest atomic ordering requirement.
+  AtomicOrdering Ordering;
+
   GlobalStatus() : isCompared(false), isLoaded(false), StoredType(NotStored),
                    StoredOnceValue(0), AccessingFunction(0),
-                   HasMultipleAccessingFunctions(false), HasNonInstructionUser(false),
-                   HasPHIUser(false) {}
+                   HasMultipleAccessingFunctions(false),
+                   HasNonInstructionUser(false), HasPHIUser(false),
+                   Ordering(NotAtomic) {}
 };
 }
 
-// SafeToDestroyConstant - It is safe to destroy a constant iff it is only used
-// by constants itself. Note that constants cannot be cyclic, so this test is
-// pretty easy to implement recursively.
-//
+/// StrongerOrdering - Return the stronger of the two ordering. If the two
+/// orderings are acquire and release, then return AcquireRelease.
+///
+static AtomicOrdering StrongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
+  if (X == Acquire && Y == Release) return AcquireRelease;
+  if (Y == Acquire && X == Release) return AcquireRelease;
+  return (AtomicOrdering)std::max(X, Y);
+}
+
+/// SafeToDestroyConstant - It is safe to destroy a constant iff it is only
+/// used by constants itself. Note that constants cannot be cyclic, so this
+/// test is pretty easy to implement recursively.
+///
 static bool SafeToDestroyConstant(const Constant *C) {
   if (isa<GlobalValue>(C))
     return false;
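A standalone sketch of the StrongerOrdering lattice logic, assuming the enumerators carry LLVM's usual increasing ranks (NotAtomic < Unordered < Monotonic < Acquire < Release < AcquireRelease < SequentiallyConsistent): Acquire and Release are incomparable, so their join must be promoted to AcquireRelease; every other pair is resolved by the numeric maximum.

#include <algorithm>
#include <cassert>

enum AtomicOrdering {
  NotAtomic, Unordered, Monotonic, Acquire = 4,
  Release, AcquireRelease, SequentiallyConsistent
};

static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
  // Acquire and Release are incomparable; their least upper bound is
  // AcquireRelease. All other pairs are totally ordered by rank.
  if (X == Acquire && Y == Release) return AcquireRelease;
  if (Y == Acquire && X == Release) return AcquireRelease;
  return (AtomicOrdering)std::max(X, Y);
}

int main() {
  assert(strongerOrdering(Acquire, Release) == AcquireRelease);
  assert(strongerOrdering(NotAtomic, Monotonic) == Monotonic);
  assert(strongerOrdering(Release, SequentiallyConsistent) ==
         SequentiallyConsistent);
}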
@@ -195,14 +216,16 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
       }
       if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
         GS.isLoaded = true;
-        // Don't hack on volatile/atomic loads.
-        if (!LI->isSimple()) return true;
+        // Don't hack on volatile loads.
+        if (LI->isVolatile()) return true;
+        GS.Ordering = StrongerOrdering(GS.Ordering, LI->getOrdering());
       } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
         // Don't allow a store OF the address, only stores TO the address.
         if (SI->getOperand(0) == V) return true;
 
-        // Don't hack on volatile/atomic stores.
-        if (!SI->isSimple()) return true;
+        // Don't hack on volatile stores.
+        if (SI->isVolatile()) return true;
+        GS.Ordering = StrongerOrdering(GS.Ordering, SI->getOrdering());
 
         // If this is a direct store to the global (i.e., the global is a scalar
         // value, not an aggregate), keep more specific information about
@@ -271,43 +294,12 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
   return false;
 }
 
-static Constant *getAggregateConstantElement(Constant *Agg, Constant *Idx) {
-  ConstantInt *CI = dyn_cast<ConstantInt>(Idx);
-  if (!CI) return 0;
-  unsigned IdxV = CI->getZExtValue();
-
-  if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Agg)) {
-    if (IdxV < CS->getNumOperands()) return CS->getOperand(IdxV);
-  } else if (ConstantArray *CA = dyn_cast<ConstantArray>(Agg)) {
-    if (IdxV < CA->getNumOperands()) return CA->getOperand(IdxV);
-  } else if (ConstantVector *CP = dyn_cast<ConstantVector>(Agg)) {
-    if (IdxV < CP->getNumOperands()) return CP->getOperand(IdxV);
-  } else if (isa<ConstantAggregateZero>(Agg)) {
-    if (StructType *STy = dyn_cast<StructType>(Agg->getType())) {
-      if (IdxV < STy->getNumElements())
-        return Constant::getNullValue(STy->getElementType(IdxV));
-    } else if (SequentialType *STy =
-               dyn_cast<SequentialType>(Agg->getType())) {
-      return Constant::getNullValue(STy->getElementType());
-    }
-  } else if (isa<UndefValue>(Agg)) {
-    if (StructType *STy = dyn_cast<StructType>(Agg->getType())) {
-      if (IdxV < STy->getNumElements())
-        return UndefValue::get(STy->getElementType(IdxV));
-    } else if (SequentialType *STy =
-               dyn_cast<SequentialType>(Agg->getType())) {
-      return UndefValue::get(STy->getElementType());
-    }
-  }
-  return 0;
-}
-
-
 /// CleanupConstantGlobalUsers - We just marked GV constant. Loop over all
 /// users of the global, cleaning up the obvious ones. This is largely just a
 /// quick scan over the use list to clean up the easy and obvious cruft. This
 /// returns true if it made a change.
-static bool CleanupConstantGlobalUsers(Value *V, Constant *Init) {
+static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
+                                       TargetData *TD, TargetLibraryInfo *TLI) {
   bool Changed = false;
   for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;) {
     User *U = *UI++;
 
@@ -328,11 +320,11 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init) {
         Constant *SubInit = 0;
         if (Init)
           SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
-        Changed |= CleanupConstantGlobalUsers(CE, SubInit);
+        Changed |= CleanupConstantGlobalUsers(CE, SubInit, TD, TLI);
       } else if (CE->getOpcode() == Instruction::BitCast &&
                  CE->getType()->isPointerTy()) {
         // Pointer cast, delete any stores and memsets to the global.
-        Changed |= CleanupConstantGlobalUsers(CE, 0);
+        Changed |= CleanupConstantGlobalUsers(CE, 0, TD, TLI);
       }
 
       if (CE->use_empty()) {
@@ -346,11 +338,17 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init) {
       Constant *SubInit = 0;
       if (!isa<ConstantExpr>(GEP->getOperand(0))) {
         ConstantExpr *CE =
-          dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP));
+          dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP, TD, TLI));
         if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
           SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+
+        // If the initializer is an all-null value and we have an inbounds GEP,
+        // we already know what the result of any load from that GEP is.
+        // TODO: Handle splats.
+        if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
+          SubInit = Constant::getNullValue(GEP->getType()->getElementType());
       }
-      Changed |= CleanupConstantGlobalUsers(GEP, SubInit);
+      Changed |= CleanupConstantGlobalUsers(GEP, SubInit, TD, TLI);
 
       if (GEP->use_empty()) {
         GEP->eraseFromParent();
@@ -368,7 +366,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init) {
       if (SafeToDestroyConstant(C)) {
         C->destroyConstant();
         // This could have invalidated UI, start over from scratch.
-        CleanupConstantGlobalUsers(V, Init);
+        CleanupConstantGlobalUsers(V, Init, TD, TLI);
         return true;
       }
     }
@@ -514,8 +512,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
     NewGlobals.reserve(STy->getNumElements());
     const StructLayout &Layout = *TD.getStructLayout(STy);
     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
-      Constant *In = getAggregateConstantElement(Init,
-                    ConstantInt::get(Type::getInt32Ty(STy->getContext()), i));
+      Constant *In = Init->getAggregateElement(i);
       assert(In && "Couldn't get element of initializer?");
       GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false,
                                                GlobalVariable::InternalLinkage,
@@ -547,8 +544,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
     uint64_t EltSize = TD.getTypeAllocSize(STy->getElementType());
     unsigned EltAlign = TD.getABITypeAlignment(STy->getElementType());
     for (unsigned i = 0, e = NumElements; i != e; ++i) {
-      Constant *In = getAggregateConstantElement(Init,
-                    ConstantInt::get(Type::getInt32Ty(Init->getContext()), i));
+      Constant *In = Init->getAggregateElement(i);
       assert(In && "Couldn't get element of initializer?");
 
       GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false,
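The new ConstantAggregateZero fold has a simple source-level reading. In the hypothetical snippet below (Table, T and lookup are invented; no LLVM APIs involved), the table's initializer is all zeros, so a load through any in-bounds element address is known to be zero without even identifying which element is addressed.

#include <cassert>

struct Table { int Counts[8]; double Bias; };
static const Table T = {};   // zeroinitializer: every element is 0

static int lookup(unsigned I) {
  assert(I < 8);             // stays in bounds, like an inbounds GEP
  return T.Counts[I];        // foldable to 0 for any in-bounds index
}

int main() { return lookup(3); }  // returns 0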
@@ -770,7 +766,9 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
 /// value stored into it. If there are uses of the loaded value that would trap
 /// if the loaded value is dynamically null, then we know that they cannot be
 /// reachable with a null optimize away the load.
-static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV) {
+static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
+                                            TargetData *TD,
+                                            TargetLibraryInfo *TLI) {
   bool Changed = false;
 
   // Keep track of whether we are able to remove all the uses of the global
@@ -813,7 +811,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV) {
   // nor is the global.
   if (AllNonStoreUsesGone) {
     DEBUG(dbgs() << "  *** GLOBAL NOW DEAD!\n");
-    CleanupConstantGlobalUsers(GV, 0);
+    CleanupConstantGlobalUsers(GV, 0, TD, TLI);
     if (GV->use_empty()) {
       GV->eraseFromParent();
       ++NumDeleted;
@@ -825,10 +823,11 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV) {
 
 /// ConstantPropUsersOf - Walk the use list of V, constant folding all of the
 /// instructions that are foldable.
-static void ConstantPropUsersOf(Value *V) {
+static void ConstantPropUsersOf(Value *V,
+                                TargetData *TD, TargetLibraryInfo *TLI) {
   for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; )
     if (Instruction *I = dyn_cast<Instruction>(*UI++))
-      if (Constant *NewC = ConstantFoldInstruction(I)) {
+      if (Constant *NewC = ConstantFoldInstruction(I, TD, TLI)) {
         I->replaceAllUsesWith(NewC);
 
         // Advance UI to the next non-I use to avoid invalidating it!
@@ -848,7 +847,8 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
                                                      CallInst *CI,
                                                      Type *AllocTy,
                                                      ConstantInt *NElements,
-                                                     TargetData* TD) {
+                                                     TargetData *TD,
+                                                     TargetLibraryInfo *TLI) {
   DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << "  CALL = " << *CI << '\n');
 
   Type *GlobalType;
@@ -906,7 +906,8 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
   while (!GV->use_empty()) {
     if (StoreInst *SI = dyn_cast<StoreInst>(GV->use_back())) {
       // The global is initialized when the store to it occurs.
-      new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, SI);
+      new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, 0,
+                    SI->getOrdering(), SI->getSynchScope(), SI);
       SI->eraseFromParent();
       continue;
     }
@@ -921,7 +922,10 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
       ICmpInst *ICI = cast<ICmpInst>(LoadUse.getUser());
 
       // Replace the cmp X, 0 with a use of the bool value.
-      Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", ICI);
+      // Sink the load to where the compare was, if atomic rules allow us to.
+      Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", false, 0,
+                               LI->getOrdering(), LI->getSynchScope(),
+                               LI->isUnordered() ? (Instruction*)ICI : LI);
       InitBoolUsed = true;
       switch (ICI->getPredicate()) {
       default: llvm_unreachable("Unknown ICmp Predicate!");
@@ -962,9 +966,9 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
   // To further other optimizations, loop over all users of NewGV and try to
   // constant prop them. This will promote GEP instructions with constant
   // indices into GEP constant-exprs, which will allow global-opt to hack on it.
-  ConstantPropUsersOf(NewGV);
+  ConstantPropUsersOf(NewGV, TD, TLI);
   if (RepValue != NewGV)
-    ConstantPropUsersOf(RepValue);
+    ConstantPropUsersOf(RepValue, TD, TLI);
 
   return NewGV;
 }
@@ -1203,7 +1207,6 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
     PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
   } else {
     llvm_unreachable("Unknown usable value");
-    Result = 0;
   }
 
   return FieldVals[FieldNo] = Result;
@@ -1293,9 +1296,9 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
 /// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break
 /// it up into multiple allocations of arrays of the fields.
 static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
-                                            Value* NElems, TargetData *TD) {
+                                            Value *NElems, TargetData *TD) {
   DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << "  MALLOC = " << *CI << '\n');
-  Type* MAT = getMallocAllocatedType(CI);
+  Type *MAT = getMallocAllocatedType(CI);
   StructType *STy = cast<StructType>(MAT);
 
   // There is guaranteed to be at least one use of the malloc (storing
@@ -1482,8 +1485,10 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
 static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
                                                CallInst *CI,
                                                Type *AllocTy,
+                                               AtomicOrdering Ordering,
                                                Module::global_iterator &GVI,
-                                               TargetData *TD) {
+                                               TargetData *TD,
+                                               TargetLibraryInfo *TLI) {
   if (!TD)
     return false;
 
@@ -1502,7 +1507,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
 
   // We can't optimize this if the malloc itself is used in a complex way,
   // for example, being stored into multiple globals. This allows the
-  // malloc to be stored into the specified global, loaded setcc'd, and
+  // malloc to be stored into the specified global, loaded icmp'd, and
   // GEP'd. These are all things we could transform to using the global
   // for.
   SmallPtrSet<const PHINode*, 8> PHIs;
@@ -1523,7 +1528,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
     // (2048 bytes currently), as we don't want to introduce a 16M global or
     // something.
     if (NElements->getZExtValue() * TD->getTypeAllocSize(AllocTy) < 2048) {
-      GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, TD);
+      GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, TD, TLI);
       return true;
     }
 
@@ -1531,6 +1536,9 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
   // into multiple malloc'd arrays, one for each field. This is basically
   // SRoA for malloc'd memory.
 
+  if (Ordering != NotAtomic)
+    return false;
+
   // If this is an allocation of a fixed size array of structs, analyze as a
   // variable size array. malloc [100 x struct],1 -> malloc struct, 100
   if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
@@ -1563,7 +1571,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
                extractMallocCallFromBitCast(Malloc) : cast<CallInst>(Malloc);
     }
 
-    GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true),TD);
+    GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true), TD);
     return true;
   }
 
@@ -1573,8 +1581,9 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
 // OptimizeOnceStoredGlobal - Try to optimize globals based on the knowledge
 // that only one value (besides its initializer) is ever stored to the global.
 static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
+                                     AtomicOrdering Ordering,
                                      Module::global_iterator &GVI,
-                                     TargetData *TD) {
+                                     TargetData *TD, TargetLibraryInfo *TLI) {
   // Ignore no-op GEPs and bitcasts.
   StoredOnceVal = StoredOnceVal->stripPointerCasts();
 
@@ -1589,12 +1598,13 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
       SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
 
     // Optimize away any trapping uses of the loaded value.
-    if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC))
+    if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, TD, TLI))
       return true;
   } else if (CallInst *CI = extractMallocCall(StoredOnceVal)) {
-    Type* MallocType = getMallocAllocatedType(CI);
-    if (MallocType && TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
-                                                         GVI, TD))
+    Type *MallocType = getMallocAllocatedType(CI);
+    if (MallocType &&
+        TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, GVI,
+                                           TD, TLI))
       return true;
   }
 
@@ -1670,7 +1680,8 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
       if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
         assert(LI->getOperand(0) == GV && "Not a copy!");
         // Insert a new load, to preserve the saved value.
-        StoreVal = new LoadInst(NewGV, LI->getName()+".b", LI);
+        StoreVal = new LoadInst(NewGV, LI->getName()+".b", false, 0,
+                                LI->getOrdering(), LI->getSynchScope(), LI);
       } else {
         assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
                "This is not a form that we understand!");
@@ -1678,11 +1689,13 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
         assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
       }
     }
-    new StoreInst(StoreVal, NewGV, SI);
+    new StoreInst(StoreVal, NewGV, false, 0,
+                  SI->getOrdering(), SI->getSynchScope(), SI);
   } else {
     // Change the load into a load of bool then a select.
     LoadInst *LI = cast<LoadInst>(UI);
-    LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", LI);
+    LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", false, 0,
+                                 LI->getOrdering(), LI->getSynchScope(), LI);
     Value *NSI;
     if (IsOneZero)
       NSI = new ZExtInst(NLI, LI->getType(), "", LI);
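TryToShrinkGlobalToBoolean, which these hunks teach to preserve atomic orderings, can be pictured at the C++ level. The sketch below is a hand-written before/after (State, StateIsOther and the constants 10/20 are invented for illustration): a global that only ever holds its initializer or one other stored value is narrowed to a bool recording which of the two it holds, with stores rewritten to the bool and loads rebuilt as selects.

static int State = 10;             // before: only 10 or 20 are ever stored

static bool StateIsOther = false;  // after: false means 10, true means 20

static void store20() { StateIsOther = true; }            // was: State = 20;
static int loadState() { return StateIsOther ? 20 : 10; } // was: return State;

int main() { store20(); return loadState(); }             // returns 20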
@@ -1699,8 +1712,8 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
 }
 
-/// ProcessInternalGlobal - Analyze the specified global variable and optimize
-/// it if possible. If we make a change, return true.
+/// ProcessGlobal - Analyze the specified global variable and optimize it if
+/// possible. If we make a change, return true.
 bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
                               Module::global_iterator &GVI) {
   if (!GV->hasLocalLinkage())
@@ -1737,7 +1750,7 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
 /// it if possible. If we make a change, return true.
 bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
                                       Module::global_iterator &GVI,
-                                      const SmallPtrSet<const PHINode*, 16> &PHIUsers,
+                                const SmallPtrSet<const PHINode*, 16> &PHIUsers,
                                       const GlobalStatus &GS) {
   // If this is a first class global and has only one accessing function
   // and this function is main (which we know is not recursive we can make
@@ -1755,11 +1768,11 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
       GS.AccessingFunction->hasExternalLinkage() &&
       GV->getType()->getAddressSpace() == 0) {
     DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV);
-    Instruction& FirstI = const_cast<Instruction&>(*GS.AccessingFunction
+    Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
                                                    ->getEntryBlock().begin());
-    Type* ElemTy = GV->getType()->getElementType();
+    Type *ElemTy = GV->getType()->getElementType();
     // FIXME: Pass Global's alignment when globals have alignment
-    AllocaInst* Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), &FirstI);
+    AllocaInst *Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), &FirstI);
     if (!isa<UndefValue>(GV->getInitializer()))
       new StoreInst(GV->getInitializer(), Alloca, &FirstI);
 
@@ -1776,7 +1789,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
 
     // Delete any stores we can find to the global. We may not be able to
     // make it completely dead though.
-    bool Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer());
+    bool Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(),
+                                              TD, TLI);
 
     // If the global is dead now, delete it.
    if (GV->use_empty()) {
@@ -1791,7 +1805,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
     GV->setConstant(true);
 
     // Clean up any obviously simplifiable users now.
-    CleanupConstantGlobalUsers(GV, GV->getInitializer());
+    CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI);
 
     // If the global is dead now, just nuke it.
     if (GV->use_empty()) {
@@ -1820,7 +1834,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
         GV->setInitializer(SOVConstant);
 
         // Clean up any obviously simplifiable users now.
-        CleanupConstantGlobalUsers(GV, GV->getInitializer());
+        CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI);
 
         if (GV->use_empty()) {
           DEBUG(dbgs() << "   *** Substituting initializer allowed us to "
@@ -1836,8 +1850,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
 
   // Try to optimize globals based on the knowledge that only one value
   // (besides its initializer) is ever stored to the global.
-  if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GVI,
-                               getAnalysisIfAvailable<TargetData>()))
+  if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, GVI,
+                               TD, TLI))
     return true;
 
   // Otherwise, if the global was not a boolean, we can shrink it to be a
@@ -1890,7 +1904,7 @@ bool GlobalOpt::OptimizeFunctions(Module &M) {
       if (!F->hasName() && !F->isDeclaration())
         F->setLinkage(GlobalValue::InternalLinkage);
       F->removeDeadConstantUsers();
-      if (F->use_empty() && (F->hasLocalLinkage() || F->hasLinkOnceLinkage())) {
+      if (F->isDefTriviallyDead()) {
         F->eraseFromParent();
         Changed = true;
         ++NumFnDeleted;
@@ -1930,8 +1944,7 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) {
     // Simplify the initializer.
     if (GV->hasInitializer())
       if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GV->getInitializer())) {
-        TargetData *TD = getAnalysisIfAvailable<TargetData>();
-        Constant *New = ConstantFoldConstantExpression(CE, TD);
+        Constant *New = ConstantFoldConstantExpression(CE, TD, TLI);
         if (New && New != CE)
           GV->setInitializer(New);
       }
@@ -2052,16 +2065,10 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
 }
 
-static Constant *getVal(DenseMap<Value*, Constant*> &ComputedValues, Value *V) {
-  if (Constant *CV = dyn_cast<Constant>(V)) return CV;
-  Constant *R = ComputedValues[V];
-  assert(R && "Reference to an uncomputed value!");
-  return R;
-}
-
 static inline bool
 isSimpleEnoughValueToCommit(Constant *C,
-                            SmallPtrSet<Constant*, 8> &SimpleConstants);
+                            SmallPtrSet<Constant*, 8> &SimpleConstants,
+                            const TargetData *TD);
 
 
 /// isSimpleEnoughValueToCommit - Return true if the specified constant can be
@@ -2073,7 +2080,8 @@ isSimpleEnoughValueToCommit(Constant *C,
 /// in SimpleConstants to avoid having to rescan the same constants all the
 /// time.
 static bool isSimpleEnoughValueToCommitHelper(Constant *C,
-                                   SmallPtrSet<Constant*, 8> &SimpleConstants) {
+                                   SmallPtrSet<Constant*, 8> &SimpleConstants,
+                                   const TargetData *TD) {
   // Simple integer, undef, constant aggregate zero, global addresses, etc are
   // all supported.
   if (C->getNumOperands() == 0 || isa<BlockAddress>(C) ||
@@ -2085,7 +2093,7 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
       isa<ConstantVector>(C)) {
     for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) {
       Constant *Op = cast<Constant>(C->getOperand(i));
-      if (!isSimpleEnoughValueToCommit(Op, SimpleConstants))
+      if (!isSimpleEnoughValueToCommit(Op, SimpleConstants, TD))
         return false;
     }
     return true;
@@ -2097,34 +2105,42 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
 
   ConstantExpr *CE = cast<ConstantExpr>(C);
   switch (CE->getOpcode()) {
   case Instruction::BitCast:
+    // Bitcast is fine if the casted value is fine.
+    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
+
   case Instruction::IntToPtr:
   case Instruction::PtrToInt:
-    // These casts are always fine if the casted value is.
-    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants);
+    // int <=> ptr is fine if the int type is the same size as the
+    // pointer type.
+    if (!TD || TD->getTypeSizeInBits(CE->getType()) !=
+               TD->getTypeSizeInBits(CE->getOperand(0)->getType()))
+      return false;
+    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
 
   // GEP is fine if it is simple + constant offset.
   case Instruction::GetElementPtr:
     for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
       if (!isa<ConstantInt>(CE->getOperand(i)))
         return false;
-    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants);
+    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
 
   case Instruction::Add:
     // We allow simple+cst.
     if (!isa<ConstantInt>(CE->getOperand(1)))
       return false;
-    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants);
+    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
   }
 
   return false;
 }
 
 static inline bool
 isSimpleEnoughValueToCommit(Constant *C,
-                            SmallPtrSet<Constant*, 8> &SimpleConstants) {
+                            SmallPtrSet<Constant*, 8> &SimpleConstants,
+                            const TargetData *TD) {
   // If we already checked this constant, we win.
   if (!SimpleConstants.insert(C)) return true;
   // Check the constant.
-  return isSimpleEnoughValueToCommitHelper(C, SimpleConstants);
+  return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, TD);
 }
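The reason the IntToPtr/PtrToInt case now consults TargetData can be shown with ordinary C++: an int <=> pointer round trip is only lossless when the integer type is exactly pointer-sized, which is what makes such a constant safe to commit into an initializer. A minimal sketch, with uintptr_t standing in for a pointer-sized integer type (the helper name is invented):

#include <cassert>
#include <cstdint>

static bool roundTripsThroughInt(std::size_t IntBits) {
  // Mirrors: TD->getTypeSizeInBits(CE->getType()) ==
  //          TD->getTypeSizeInBits(CE->getOperand(0)->getType())
  return IntBits == sizeof(std::uintptr_t) * 8;
}

int main() {
  int X = 0;
  std::uintptr_t Bits = reinterpret_cast<std::uintptr_t>(&X);
  assert(reinterpret_cast<int*>(Bits) == &X);      // same width: lossless
  assert(roundTripsThroughInt(sizeof(void*) * 8)); // e.g. i64 on LP64
  assert(!roundTripsThroughInt(32) || sizeof(void*) == 4); // i32 only on ILP32
}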
@@ -2191,23 +2207,11 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
     return Val;
   }
 
-  std::vector<Constant*> Elts;
+  SmallVector<Constant*, 32> Elts;
   if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
-    // Break up the constant into its elements.
-    if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Init)) {
-      for (User::op_iterator i = CS->op_begin(), e = CS->op_end(); i != e; ++i)
-        Elts.push_back(cast<Constant>(*i));
-    } else if (isa<ConstantAggregateZero>(Init)) {
-      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
-        Elts.push_back(Constant::getNullValue(STy->getElementType(i)));
-    } else if (isa<UndefValue>(Init)) {
-      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
-        Elts.push_back(UndefValue::get(STy->getElementType(i)));
-    } else {
-      llvm_unreachable("This code is out of sync with "
-                       " ConstantFoldLoadThroughGEPConstantExpr");
-    }
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+      Elts.push_back(Init->getAggregateElement(i));
 
     // Replace the element that we are supposed to.
     ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
@@ -2226,22 +2230,11 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
     if (ArrayType *ATy = dyn_cast<ArrayType>(InitTy))
       NumElts = ATy->getNumElements();
     else
-      NumElts = cast<VectorType>(InitTy)->getNumElements();
+      NumElts = InitTy->getVectorNumElements();
 
     // Break up the array into elements.
-    if (ConstantArray *CA = dyn_cast<ConstantArray>(Init)) {
-      for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i)
-        Elts.push_back(cast<Constant>(*i));
-    } else if (ConstantVector *CV = dyn_cast<ConstantVector>(Init)) {
-      for (User::op_iterator i = CV->op_begin(), e = CV->op_end(); i != e; ++i)
-        Elts.push_back(cast<Constant>(*i));
-    } else if (isa<ConstantAggregateZero>(Init)) {
-      Elts.assign(NumElts, Constant::getNullValue(InitTy->getElementType()));
-    } else {
-      assert(isa<UndefValue>(Init) && "This code is out of sync with "
-             " ConstantFoldLoadThroughGEPConstantExpr");
-      Elts.assign(NumElts, UndefValue::get(InitTy->getElementType()));
-    }
+    for (uint64_t i = 0, e = NumElts; i != e; ++i)
+      Elts.push_back(Init->getAggregateElement(i));
 
     assert(CI->getZExtValue() < NumElts);
     Elts[CI->getZExtValue()] =
@@ -2266,15 +2259,109 @@ static void CommitValueTo(Constant *Val, Constant *Addr) {
   GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
 }
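The simplification above leans on Constant::getAggregateElement, which lets one loop serve ConstantStruct, ConstantArray, ConstantVector, zeroinitializer and undef alike instead of a case per constant kind. A self-contained sketch of the resulting store-into-aggregate shape, with a plain vector of ints standing in for the aggregate's elements (storeInto is an invented name):

#include <cstddef>
#include <vector>

static std::vector<int> storeInto(const std::vector<int> &Init,
                                  int Val, std::size_t Idx) {
  std::vector<int> Elts;
  Elts.reserve(Init.size());
  for (std::size_t I = 0, E = Init.size(); I != E; ++I)
    Elts.push_back(Init[I]);   // "getAggregateElement(i)" for every kind
  Elts[Idx] = Val;             // replace the element we are supposed to
  return Elts;                 // "ConstantStruct/Array::get(Elts)"
}

int main() {
  std::vector<int> Init(4, 0);  // plays the role of zeroinitializer
  std::vector<int> New = storeInto(Init, 7, 2);
  return (New[2] == 7 && New[0] == 0) ? 0 : 1;
}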
+namespace {
+
+/// Evaluator - This class evaluates LLVM IR, producing the Constant
+/// representing each SSA instruction. Changes to global variables are stored
+/// in a mapping that can be iterated over after the evaluation is complete.
+/// Once an evaluation call fails, the evaluation object should not be reused.
+class Evaluator {
+public:
+  Evaluator(const TargetData *TD, const TargetLibraryInfo *TLI)
+    : TD(TD), TLI(TLI) {
+    ValueStack.push_back(new DenseMap<Value*, Constant*>);
+  }
+
+  ~Evaluator() {
+    DeleteContainerPointers(ValueStack);
+    while (!AllocaTmps.empty()) {
+      GlobalVariable *Tmp = AllocaTmps.back();
+      AllocaTmps.pop_back();
+
+      // If there are still users of the alloca, the program is doing
+      // something silly, e.g. storing the address of the alloca somewhere and
+      // using it later. Since this is undefined, we'll just make it be null.
+      if (!Tmp->use_empty())
+        Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType()));
+      delete Tmp;
+    }
+  }
+
+  /// EvaluateFunction - Evaluate a call to function F, returning true if
+  /// successful, false if we can't evaluate it. ActualArgs contains the
+  /// formal arguments for the function.
+  bool EvaluateFunction(Function *F, Constant *&RetVal,
+                        const SmallVectorImpl<Constant*> &ActualArgs);
+
+  /// EvaluateBlock - Evaluate all instructions in block BB, returning true if
+  /// successful, false if we can't evaluate it. NewBB returns the next BB
+  /// that control flows into, or null upon return.
+  bool EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB);
+
+  Constant *getVal(Value *V) {
+    if (Constant *CV = dyn_cast<Constant>(V)) return CV;
+    Constant *R = ValueStack.back()->lookup(V);
+    assert(R && "Reference to an uncomputed value!");
+    return R;
+  }
+
+  void setVal(Value *V, Constant *C) {
+    ValueStack.back()->operator[](V) = C;
+  }
+
+  const DenseMap<Constant*, Constant*> &getMutatedMemory() const {
+    return MutatedMemory;
+  }
+
+  const SmallPtrSet<GlobalVariable*, 8> &getInvariants() const {
+    return Invariants;
+  }
+
+private:
+  Constant *ComputeLoadResult(Constant *P);
+
+  /// ValueStack - As we compute SSA register values, we store their contents
+  /// here. The back of the vector contains the current function and the stack
+  /// contains the values in the calling frames.
+  SmallVector<DenseMap<Value*, Constant*>*, 4> ValueStack;
+
+  /// CallStack - This is used to detect recursion. In pathological situations
+  /// we could hit exponential behavior, but at least there is nothing
+  /// unbounded.
+  SmallVector<Function*, 4> CallStack;
+
+  /// MutatedMemory - For each store we execute, we update this map. Loads
+  /// check this to get the most up-to-date value. If evaluation is
+  /// successful, this state is committed to the process.
+  DenseMap<Constant*, Constant*> MutatedMemory;
+
+  /// AllocaTmps - To 'execute' an alloca, we create a temporary global
+  /// variable to represent its body. This vector is needed so we can delete
+  /// the temporary globals when we are done.
+  SmallVector<GlobalVariable*, 32> AllocaTmps;
+
+  /// Invariants - These global variables have been marked invariant by the
+  /// static constructor.
+  SmallPtrSet<GlobalVariable*, 8> Invariants;
+
+  /// SimpleConstants - These are constants we have checked and know to be
+  /// simple enough to live in a static initializer of a global.
+  SmallPtrSet<Constant*, 8> SimpleConstants;
+
+  const TargetData *TD;
+  const TargetLibraryInfo *TLI;
+};
+
+}  // anonymous namespace
+
 /// ComputeLoadResult - Return the value that would be computed by a load from
 /// P after the stores reflected by 'memory' have been performed. If we can't
 /// decide, return null.
-static Constant *ComputeLoadResult(Constant *P,
-                                   const DenseMap<Constant*, Constant*> &Memory) {
+Constant *Evaluator::ComputeLoadResult(Constant *P) {
   // If this memory location has been recently stored, use the stored value: it
   // is the most up-to-date.
-  DenseMap<Constant*, Constant*>::const_iterator I = Memory.find(P);
-  if (I != Memory.end()) return I->second;
+  DenseMap<Constant*, Constant*>::const_iterator I = MutatedMemory.find(P);
+  if (I != MutatedMemory.end()) return I->second;
 
   // Access it.
   if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
@@ -2295,56 +2382,29 @@ static Constant *ComputeLoadResult(Constant *P,
   return 0;  // don't know how to evaluate.
 }
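The Evaluator's bookkeeping can be sketched without any LLVM types. In the toy below, all names are invented to mirror the class above: each call frame gets its own SSA value map (ValueStack), the call stack rejects recursion, and loads consult the pending-store map before falling back to the global's initializer.

#include <map>
#include <string>
#include <vector>

struct MiniEvaluator {
  std::vector<std::map<std::string, int> > ValueStack;  // one map per frame
  std::vector<std::string> CallStack;                   // active functions
  std::map<std::string, int> MutatedMemory;             // pending stores

  bool enter(const std::string &F) {
    for (std::size_t I = 0; I != CallStack.size(); ++I)
      if (CallStack[I] == F) return false;  // recursion: bail out
    CallStack.push_back(F);
    ValueStack.push_back({});               // fresh frame for F's SSA values
    return true;
  }
  void leave() { CallStack.pop_back(); ValueStack.pop_back(); }

  void setVal(const std::string &V, int C) { ValueStack.back()[V] = C; }
  int load(const std::string &G, int Init) {
    std::map<std::string, int>::iterator I = MutatedMemory.find(G);
    return I != MutatedMemory.end() ? I->second : Init;  // else initializer
  }
};

int main() {
  MiniEvaluator E;
  E.enter("ctor");
  E.MutatedMemory["g"] = 42;         // "store i32 42, i32* @g"
  bool Recursed = !E.enter("ctor");  // re-entering the same fn is rejected
  return (E.load("g", 0) == 42 && Recursed) ? 0 : 1;
}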
-/// EvaluateFunction - Evaluate a call to function F, returning true if
-/// successful, false if we can't evaluate it. ActualArgs contains the formal
-/// arguments for the function.
-static bool EvaluateFunction(Function *F, Constant *&RetVal,
-                             const SmallVectorImpl<Constant*> &ActualArgs,
-                             std::vector<Function*> &CallStack,
-                             DenseMap<Constant*, Constant*> &MutatedMemory,
-                             std::vector<GlobalVariable*> &AllocaTmps,
-                             SmallPtrSet<Constant*, 8> &SimpleConstants,
-                             const TargetData *TD) {
-  // Check to see if this function is already executing (recursion). If so,
-  // bail out. TODO: we might want to accept limited recursion.
-  if (std::find(CallStack.begin(), CallStack.end(), F) != CallStack.end())
-    return false;
-
-  CallStack.push_back(F);
-
-  /// Values - As we compute SSA register values, we store their contents here.
-  DenseMap<Value*, Constant*> Values;
-
-  // Initialize arguments to the incoming values specified.
-  unsigned ArgNo = 0;
-  for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
-       ++AI, ++ArgNo)
-    Values[AI] = ActualArgs[ArgNo];
-
-  /// ExecutedBlocks - We only handle non-looping, non-recursive code. As such,
-  /// we can only evaluate any one basic block at most once. This set keeps
-  /// track of what we have executed so we can detect recursive cases etc.
-  SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
-
-  // CurInst - The current instruction we're evaluating.
-  BasicBlock::iterator CurInst = F->begin()->begin();
-
+/// EvaluateBlock - Evaluate all instructions in block BB, returning true if
+/// successful, false if we can't evaluate it. NewBB returns the next BB that
+/// control flows into, or null upon return.
+bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
+                              BasicBlock *&NextBB) {
   // This is the main evaluation loop.
   while (1) {
     Constant *InstResult = 0;
 
     if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
       if (!SI->isSimple()) return false;  // no volatile/atomic accesses.
-      Constant *Ptr = getVal(Values, SI->getOperand(1));
+      Constant *Ptr = getVal(SI->getOperand(1));
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+        Ptr = ConstantFoldConstantExpression(CE, TD, TLI);
       if (!isSimpleEnoughPointerToCommit(Ptr))
         // If this is too complex for us to commit, reject it.
         return false;
-      Constant *Val = getVal(Values, SI->getOperand(0));
+
+      Constant *Val = getVal(SI->getOperand(0));
 
       // If this might be too difficult for the backend to handle (e.g. the addr
       // of one global variable divided by another) then we can't commit it.
-      if (!isSimpleEnoughValueToCommit(Val, SimpleConstants))
+      if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, TD))
         return false;
 
       if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
@@ -2354,7 +2414,7 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
           // stored value.
           Ptr = CE->getOperand(0);
 
-          Type *NewTy=cast<PointerType>(Ptr->getType())->getElementType();
+          Type *NewTy = cast<PointerType>(Ptr->getType())->getElementType();
 
           // In order to push the bitcast onto the stored value, a bitcast
           // from NewTy to Val's type must be legal. If it's not, we can try
@@ -2366,16 +2426,18 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
 
           if (StructType *STy = dyn_cast<StructType>(NewTy)) {
             NewTy = STy->getTypeAtIndex(0U);
 
-            IntegerType *IdxTy =IntegerType::get(NewTy->getContext(), 32);
+            IntegerType *IdxTy = IntegerType::get(NewTy->getContext(), 32);
             Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
             Constant * const IdxList[] = {IdxZero, IdxZero};
 
             Ptr = ConstantExpr::getGetElementPtr(Ptr, IdxList);
-
+            if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+              Ptr = ConstantFoldConstantExpression(CE, TD, TLI);
+
             // If we can't improve the situation by introspecting NewTy,
             // we have to give up.
           } else {
-            return 0;
+            return false;
           }
         }
 
@@ -2387,33 +2449,35 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
       MutatedMemory[Ptr] = Val;
     } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
       InstResult = ConstantExpr::get(BO->getOpcode(),
-                                     getVal(Values, BO->getOperand(0)),
-                                     getVal(Values, BO->getOperand(1)));
+                                     getVal(BO->getOperand(0)),
+                                     getVal(BO->getOperand(1)));
     } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
       InstResult = ConstantExpr::getCompare(CI->getPredicate(),
-                                            getVal(Values, CI->getOperand(0)),
-                                            getVal(Values, CI->getOperand(1)));
+                                            getVal(CI->getOperand(0)),
+                                            getVal(CI->getOperand(1)));
     } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
       InstResult = ConstantExpr::getCast(CI->getOpcode(),
-                                         getVal(Values, CI->getOperand(0)),
+                                         getVal(CI->getOperand(0)),
                                          CI->getType());
     } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
-      InstResult = ConstantExpr::getSelect(getVal(Values, SI->getOperand(0)),
-                                           getVal(Values, SI->getOperand(1)),
-                                           getVal(Values, SI->getOperand(2)));
+      InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
+                                           getVal(SI->getOperand(1)),
+                                           getVal(SI->getOperand(2)));
     } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
-      Constant *P = getVal(Values, GEP->getOperand(0));
+      Constant *P = getVal(GEP->getOperand(0));
       SmallVector<Constant*, 8> GEPOps;
       for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
            i != e; ++i)
-        GEPOps.push_back(getVal(Values, *i));
+        GEPOps.push_back(getVal(*i));
       InstResult =
         ConstantExpr::getGetElementPtr(P, GEPOps,
                                        cast<GEPOperator>(GEP)->isInBounds());
     } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
       if (!LI->isSimple()) return false;  // no volatile/atomic accesses.
-      InstResult = ComputeLoadResult(getVal(Values, LI->getOperand(0)),
-                                     MutatedMemory);
+      Constant *Ptr = getVal(LI->getOperand(0));
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+        Ptr = ConstantFoldConstantExpression(CE, TD, TLI);
+      InstResult = ComputeLoadResult(Ptr);
       if (InstResult == 0) return false; // Could not evaluate load.
     } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
       if (AI->isArrayAllocation()) return false;  // Cannot handle array allocs.
@@ -2423,25 +2487,53 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
                                               UndefValue::get(Ty),
                                               AI->getName()));
       InstResult = AllocaTmps.back();
-    } else if (CallInst *CI = dyn_cast<CallInst>(CurInst)) {
+    } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
+      CallSite CS(CurInst);
 
       // Debug info can safely be ignored here.
-      if (isa<DbgInfoIntrinsic>(CI)) {
+      if (isa<DbgInfoIntrinsic>(CS.getInstruction())) {
         ++CurInst;
         continue;
       }
 
       // Cannot handle inline asm.
-      if (isa<InlineAsm>(CI->getCalledValue())) return false;
-
-      if (MemSetInst *MSI = dyn_cast<MemSetInst>(CI)) {
-        if (MSI->isVolatile()) return false;
-        Constant *Ptr = getVal(Values, MSI->getDest());
-        Constant *Val = getVal(Values, MSI->getValue());
-        Constant *DestVal = ComputeLoadResult(getVal(Values, Ptr),
-                                              MutatedMemory);
-        if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
-          // This memset is a no-op.
+      if (isa<InlineAsm>(CS.getCalledValue())) return false;
+
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+        if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
+          if (MSI->isVolatile()) return false;
+          Constant *Ptr = getVal(MSI->getDest());
+          Constant *Val = getVal(MSI->getValue());
+          Constant *DestVal = ComputeLoadResult(getVal(Ptr));
+          if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
+            // This memset is a no-op.
+            ++CurInst;
+            continue;
+          }
+        }
+
+        if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+            II->getIntrinsicID() == Intrinsic::lifetime_end) {
+          ++CurInst;
+          continue;
+        }
+
+        if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+          // We don't insert an entry into Values, as it doesn't have a
+          // meaningful return value.
+          if (!II->use_empty())
+            return false;
+          ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
+          Value *PtrArg = getVal(II->getArgOperand(1));
+          Value *Ptr = PtrArg->stripPointerCasts();
+          if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+            Type *ElemTy = cast<PointerType>(GV->getType())->getElementType();
+            if (!Size->isAllOnesValue() &&
+                Size->getValue().getLimitedValue() >=
+                  TD->getTypeStoreSize(ElemTy))
+              Invariants.insert(GV);
+          }
+          // Continue even if we do nothing.
           ++CurInst;
           continue;
         }
       }
 
       // Resolve function pointers.
-      Function *Callee = dyn_cast<Function>(getVal(Values,
-                                                   CI->getCalledValue()));
-      if (!Callee) return false;  // Cannot resolve.
+      Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue()));
+      if (!Callee || Callee->mayBeOverridden())
+        return false;  // Cannot resolve.
 
       SmallVector<Constant*, 8> Formals;
-      CallSite CS(CI);
-      for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end();
-           i != e; ++i)
-        Formals.push_back(getVal(Values, *i));
+      for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i)
+        Formals.push_back(getVal(*i));
 
       if (Callee->isDeclaration()) {
         // If this is a function we can constant fold, do it.
-        if (Constant *C = ConstantFoldCall(Callee, Formals)) {
+        if (Constant *C = ConstantFoldCall(Callee, Formals, TLI)) {
          InstResult = C;
         } else {
           return false;
@@ -2472,62 +2562,43 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
         Constant *RetVal;
 
         // Execute the call, if successful, use the return value.
-        if (!EvaluateFunction(Callee, RetVal, Formals, CallStack,
-                              MutatedMemory, AllocaTmps, SimpleConstants, TD))
+        ValueStack.push_back(new DenseMap<Value*, Constant*>);
+        if (!EvaluateFunction(Callee, RetVal, Formals))
           return false;
+        delete ValueStack.pop_back_val();
         InstResult = RetVal;
       }
     } else if (isa<TerminatorInst>(CurInst)) {
-      BasicBlock *NewBB = 0;
       if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
         if (BI->isUnconditional()) {
-          NewBB = BI->getSuccessor(0);
+          NextBB = BI->getSuccessor(0);
        } else {
           ConstantInt *Cond =
-            dyn_cast<ConstantInt>(getVal(Values, BI->getCondition()));
+            dyn_cast<ConstantInt>(getVal(BI->getCondition()));
           if (!Cond) return false;  // Cannot determine.
 
-          NewBB = BI->getSuccessor(!Cond->getZExtValue());
+          NextBB = BI->getSuccessor(!Cond->getZExtValue());
         }
       } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
         ConstantInt *Val =
-          dyn_cast<ConstantInt>(getVal(Values, SI->getCondition()));
+          dyn_cast<ConstantInt>(getVal(SI->getCondition()));
         if (!Val) return false;  // Cannot determine.
-        NewBB = SI->getSuccessor(SI->findCaseValue(Val));
+        NextBB = SI->findCaseValue(Val).getCaseSuccessor();
       } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
-        Value *Val = getVal(Values, IBI->getAddress())->stripPointerCasts();
+        Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
         if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
-          NewBB = BA->getBasicBlock();
+          NextBB = BA->getBasicBlock();
         else
           return false;  // Cannot determine.
-      } else if (ReturnInst *RI = dyn_cast<ReturnInst>(CurInst)) {
-        if (RI->getNumOperands())
-          RetVal = getVal(Values, RI->getOperand(0));
-
-        CallStack.pop_back();  // return from fn.
-        return true;  // We succeeded at evaluating this ctor!
+      } else if (isa<ReturnInst>(CurInst)) {
+        NextBB = 0;
       } else {
         // invoke, unwind, resume, unreachable.
         return false;  // Cannot handle this terminator.
       }
 
-      // Okay, we succeeded in evaluating this control flow. See if we have
-      // executed the new block before. If so, we have a looping function,
-      // which we cannot evaluate in reasonable time.
-      if (!ExecutedBlocks.insert(NewBB))
-        return false;  // looped!
-
-      // Okay, we have never been in this block before. Check to see if there
-      // are any PHI nodes. If so, evaluate them with information about where
-      // we came from.
-      BasicBlock *OldBB = CurInst->getParent();
-      CurInst = NewBB->begin();
-      PHINode *PN;
-      for (; (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
-        Values[PN] = getVal(Values, PN->getIncomingValueForBlock(OldBB));
-
-      // Do NOT increment CurInst. We know that the terminator had no value.
-      continue;
+      // We succeeded at evaluating this block!
+      return true;
     } else {
       // Did not know how to evaluate this!
       return false;
@@ -2535,9 +2606,15 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
 
     if (!CurInst->use_empty()) {
       if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult))
-        InstResult = ConstantFoldConstantExpression(CE, TD);
+        InstResult = ConstantFoldConstantExpression(CE, TD, TLI);
 
-      Values[CurInst] = InstResult;
+      setVal(CurInst, InstResult);
+    }
+
+    // If we just processed an invoke, we finished evaluating the block.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
+      NextBB = II->getNormalDest();
+      return true;
     }
 
     // Advance program counter.
@@ -2545,64 +2622,96 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
   }
 }
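The invariant_start handling above only marks a global invariant when the intrinsic's size argument covers the whole object. A minimal sketch of that size test, with plain integers standing in for ConstantInt and TargetData::getTypeStoreSize (coversWholeObject is an invented name):

#include <cstdint>

static const std::uint64_t AllOnes = ~std::uint64_t(0);  // "i64 -1": unknown

static bool coversWholeObject(std::uint64_t SizeArg,
                              std::uint64_t TypeStoreSize) {
  // Mirrors: !Size->isAllOnesValue() &&
  //          Size->getValue().getLimitedValue() >= TD->getTypeStoreSize(Ty)
  return SizeArg != AllOnes && SizeArg >= TypeStoreSize;
}

int main() {
  return (coversWholeObject(16, 16) &&      // whole 16-byte global: invariant
          !coversWholeObject(8, 16) &&      // partial coverage: not invariant
          !coversWholeObject(AllOnes, 16))  // unknown size: not invariant
             ? 0 : 1;
}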
-  std::vector<GlobalVariable*> AllocaTmps;
+  CallStack.push_back(F);
 
-  /// CallStack - This is used to detect recursion. In pathological situations
-  /// we could hit exponential behavior, but at least there is nothing
-  /// unbounded.
-  std::vector<Function*> CallStack;
+  // Initialize arguments to the incoming values specified.
+  unsigned ArgNo = 0;
+  for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+       ++AI, ++ArgNo)
+    setVal(AI, ActualArgs[ArgNo]);
 
-  /// SimpleConstants - These are constants we have checked and know to be
-  /// simple enough to live in a static initializer of a global.
-  SmallPtrSet<Constant*, 8> SimpleConstants;
-  
+  // ExecutedBlocks - We only handle non-looping, non-recursive code. As such,
+  // we can only evaluate any one basic block at most once. This set keeps
+  // track of what we have executed so we can detect recursive cases etc.
+  SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
+  
+  // CurBB - The current basic block we're evaluating.
+  BasicBlock *CurBB = F->begin();
+  
+  BasicBlock::iterator CurInst = CurBB->begin();
+  
+  while (1) {
+    BasicBlock *NextBB = 0; // Initialized to avoid compiler warnings.
+    if (!EvaluateBlock(CurInst, NextBB))
+      return false;
+  
+    if (NextBB == 0) {
+      // Successfully running until there's no next block means that we found
+      // the return. Fill in the return value and pop the call stack.
+      ReturnInst *RI = cast<ReturnInst>(CurBB->getTerminator());
+      if (RI->getNumOperands())
+        RetVal = getVal(RI->getOperand(0));
+      CallStack.pop_back();
+      return true;
+    }
+  
+    // Okay, we succeeded in evaluating this control flow. See if we have
+    // executed the new block before. If so, we have a looping function,
+    // which we cannot evaluate in reasonable time.
+    if (!ExecutedBlocks.insert(NextBB))
+      return false;  // looped!
+  
+    // Okay, we have never been in this block before. Check to see if there
+    // are any PHI nodes. If so, evaluate them with information about where
+    // we came from.
+    PHINode *PN = 0;
+    for (CurInst = NextBB->begin();
+         (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
+      setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB)));
+  
+    // Advance to the next block.
+    CurBB = NextBB;
+  }
+}
+
+/// EvaluateStaticConstructor - Evaluate static constructors in the function, if
+/// we can. Return true if we can, false otherwise.
+static bool EvaluateStaticConstructor(Function *F, const TargetData *TD,
+                                      const TargetLibraryInfo *TLI) {
   // Call the function.
+  Evaluator Eval(TD, TLI);
   Constant *RetValDummy;
-  bool EvalSuccess = EvaluateFunction(F, RetValDummy,
-                                      SmallVector<Constant*, 0>(), CallStack,
-                                      MutatedMemory, AllocaTmps,
-                                      SimpleConstants, TD);
+  bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy,
+                                           SmallVector<Constant*, 0>());
   if (EvalSuccess) {
     // We succeeded at evaluation: commit the result.
     DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
-          << F->getName() << "' to " << MutatedMemory.size()
+          << F->getName() << "' to " << Eval.getMutatedMemory().size()
           << " stores.\n");
-    for (DenseMap<Constant*, Constant*>::iterator I = MutatedMemory.begin(),
-         E = MutatedMemory.end(); I != E; ++I)
+    for (DenseMap<Constant*, Constant*>::const_iterator I =
+           Eval.getMutatedMemory().begin(), E = Eval.getMutatedMemory().end();
+         I != E; ++I)
       CommitValueTo(I->second, I->first);
-  }
-  
-  // At this point, we are done interpreting. If we created any 'alloca'
-  // temporaries, release them now.
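The ExecutedBlocks set above is the whole loop-detection story: each block may execute at most once, so any revisit aborts evaluation. The same control protocol in standalone form (illustrative types, not the LLVM classes):

    #include <set>
    #include <vector>

    struct Block { std::vector<Block*> Succs; };

    // Walks blocks one successor at a time and fails as soon as a block would
    // execute twice, the analogue of the
    // "if (!ExecutedBlocks.insert(NextBB)) return false;" test above.
    static bool runStraightLine(Block *Entry) {
      std::set<Block*> Executed;
      for (Block *BB = Entry; BB;
           BB = BB->Succs.empty() ? nullptr : BB->Succs[0])
        if (!Executed.insert(BB).second)
          return false; // looped!
      return true;
    }

    int main() {
      Block A, B;
      A.Succs = {&B};
      B.Succs = {&A};                     // back edge: B -> A
      return runStraightLine(&A) ? 0 : 1; // exits 1: the walk loops
    }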
- while (!AllocaTmps.empty()) { - GlobalVariable *Tmp = AllocaTmps.back(); - AllocaTmps.pop_back(); - - // If there are still users of the alloca, the program is doing something - // silly, e.g. storing the address of the alloca somewhere and using it - // later. Since this is undefined, we'll just make it be null. - if (!Tmp->use_empty()) - Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType())); - delete Tmp; + for (SmallPtrSet<GlobalVariable*, 8>::const_iterator I = + Eval.getInvariants().begin(), E = Eval.getInvariants().end(); + I != E; ++I) + (*I)->setConstant(true); } return EvalSuccess; } - - /// OptimizeGlobalCtorsList - Simplify and evaluation global ctors if possible. /// Return true if anything changed. bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { @@ -2610,7 +2719,6 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { bool MadeChange = false; if (Ctors.empty()) return false; - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); // Loop over global ctors, optimizing them when we can. for (unsigned i = 0; i != Ctors.size(); ++i) { Function *F = Ctors[i]; @@ -2628,7 +2736,7 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { if (F->empty()) continue; // If we can evaluate the ctor at compile time, do. - if (EvaluateStaticConstructor(F, TD)) { + if (EvaluateStaticConstructor(F, TD, TLI)) { Ctors.erase(Ctors.begin()+i); MadeChange = true; --i; @@ -2700,12 +2808,15 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { return Changed; } -static Function *FindCXAAtExit(Module &M) { - Function *Fn = M.getFunction("__cxa_atexit"); +static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::cxa_atexit)) + return 0; + + Function *Fn = M.getFunction(TLI->getName(LibFunc::cxa_atexit)); if (!Fn) return 0; - + FunctionType *FTy = Fn->getFunctionType(); // Checking that the function has the right return type, the right number of @@ -2724,7 +2835,8 @@ static Function *FindCXAAtExit(Module &M) { /// destructor and can therefore be eliminated. /// Note that we assume that other optimization passes have already simplified /// the code so we only look for a function with a single basic block, where -/// the only allowed instructions are 'ret' or 'call' to empty C++ dtor. +/// the only allowed instructions are 'ret', 'call' to an empty C++ dtor and +/// other side-effect free instructions. static bool cxxDtorIsEmpty(const Function &Fn, SmallPtrSet<const Function *, 8> &CalledFunctions) { // FIXME: We could eliminate C++ destructors if they're readonly/readnone and @@ -2757,9 +2869,9 @@ static bool cxxDtorIsEmpty(const Function &Fn, if (!cxxDtorIsEmpty(*CalledFn, NewCalledFunctions)) return false; } else if (isa<ReturnInst>(*I)) - return true; - else - return false; + return true; // We're done. + else if (I->mayHaveSideEffects()) + return false; // Destructor with side effects, bail. } return false; @@ -2815,10 +2927,13 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { bool GlobalOpt::runOnModule(Module &M) { bool Changed = false; + TD = getAnalysisIfAvailable<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + // Try to find the llvm.globalctors list. 
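For context, this machinery exists so that a dynamic initializer of the following shape can be executed at compile time, its stores committed into the globals' initializers (CommitValueTo above), and the entry erased from llvm.global_ctors. The example is illustrative, not from the patch; it works in favorable cases because every store target and stored value folds to a constant and control flow is straight-line:

    // A global constructor GlobalOpt's evaluator can typically fold away.
    static int Table[4];

    struct InitTable {
      InitTable() {
        // Straight-line stores to constant addresses: the evaluator records
        // each one in its mutated-memory map and commits them afterwards.
        Table[0] = 1; Table[1] = 2; Table[2] = 4; Table[3] = 8;
      }
    } GInit;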
GlobalVariable *GlobalCtors = FindGlobalCtors(M); - Function *CXAAtExitFn = FindCXAAtExit(M); + Function *CXAAtExitFn = FindCXAAtExit(M, TLI); bool LocalChange = true; while (LocalChange) { diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp index c0426da..664ddf6 100644 --- a/lib/Transforms/IPO/InlineAlways.cpp +++ b/lib/Transforms/IPO/InlineAlways.cpp @@ -32,34 +32,21 @@ namespace { // AlwaysInliner only inlines functions that are mark as "always inline". class AlwaysInliner : public Inliner { - // Functions that are never inlined - SmallPtrSet<const Function*, 16> NeverInline; - InlineCostAnalyzer CA; public: // Use extremely low threshold. - AlwaysInliner() : Inliner(ID, -2000000000) { + AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/true) { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } - static char ID; // Pass identification, replacement for typeid - InlineCost getInlineCost(CallSite CS) { - return CA.getInlineCost(CS, NeverInline); - } - float getInlineFudgeFactor(CallSite CS) { - return CA.getInlineFudgeFactor(CS); - } - void resetCachedCostInfo(Function *Caller) { - CA.resetCachedCostInfo(Caller); - } - void growCachedCostInfo(Function* Caller, Function* Callee) { - CA.growCachedCostInfo(Caller, Callee); + AlwaysInliner(bool InsertLifetime) : Inliner(ID, -2000000000, + InsertLifetime) { + initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } + static char ID; // Pass identification, replacement for typeid + virtual InlineCost getInlineCost(CallSite CS); virtual bool doFinalization(CallGraph &CG) { - return removeDeadFunctions(CG, &NeverInline); + return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true); } virtual bool doInitialization(CallGraph &CG); - void releaseMemory() { - CA.clear(); - } }; } @@ -72,17 +59,74 @@ INITIALIZE_PASS_END(AlwaysInliner, "always-inline", Pass *llvm::createAlwaysInlinerPass() { return new AlwaysInliner(); } -// doInitialization - Initializes the vector of functions that have not -// been annotated with the "always inline" attribute. -bool AlwaysInliner::doInitialization(CallGraph &CG) { - CA.setTargetData(getAnalysisIfAvailable<TargetData>()); +Pass *llvm::createAlwaysInlinerPass(bool InsertLifetime) { + return new AlwaysInliner(InsertLifetime); +} + +/// \brief Minimal filter to detect invalid constructs for inlining. +static bool isInlineViable(Function &F) { + bool ReturnsTwice = F.hasFnAttr(Attribute::ReturnsTwice); + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { + // Disallow inlining of functions which contain an indirect branch. + if (isa<IndirectBrInst>(BI->getTerminator())) + return false; - Module &M = CG.getModule(); + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; + ++II) { + CallSite CS(II); + if (!CS) + continue; - for (Module::iterator I = M.begin(), E = M.end(); - I != E; ++I) - if (!I->isDeclaration() && !I->hasFnAttr(Attribute::AlwaysInline)) - NeverInline.insert(I); + // Disallow recursive calls. + if (&F == CS.getCalledFunction()) + return false; + // Disallow calls which expose returns-twice to a function not previously + // attributed as such. + if (!ReturnsTwice && CS.isCall() && + cast<CallInst>(CS.getInstruction())->canReturnTwice()) + return false; + } + } + + return true; +} + +/// \brief Get the inline cost for the always-inliner. +/// +/// The always inliner *only* handles functions which are marked with the +/// attribute to force inlining. 
As such, it is dramatically simpler and avoids
+/// using the powerful (but expensive) inline cost analysis. Instead it uses
+/// a very simple and boring direct walk of the instructions looking for
+/// impossible-to-inline constructs.
+///
+/// Note, it would be possible to go to some lengths to cache the information
+/// computed here, but as we only expect to do this for relatively few and
+/// small functions which have the explicit attribute to force inlining, it is
+/// likely not worth it in practice.
+InlineCost AlwaysInliner::getInlineCost(CallSite CS) {
+  Function *Callee = CS.getCalledFunction();
+  // We assume indirect calls aren't calling an always-inline function.
+  if (!Callee) return InlineCost::getNever();
+
+  // We can't inline calls to external functions.
+  // FIXME: We shouldn't even get here.
+  if (Callee->isDeclaration()) return InlineCost::getNever();
+
+  // Return never for anything not marked as always inline.
+  if (!Callee->hasFnAttr(Attribute::AlwaysInline))
+    return InlineCost::getNever();
+
+  // Do some minimal analysis to preclude non-viable functions.
+  if (!isInlineViable(*Callee))
+    return InlineCost::getNever();
+
+  // Otherwise, force inlining.
+  return InlineCost::getAlways();
+}
+
+// doInitialization - With the cached cost machinery gone there is no longer
+// any per-module state to set up for the always-inliner.
+bool AlwaysInliner::doInitialization(CallGraph &CG) { return false; }
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index 84dd4fd..50038d8 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -23,40 +23,26 @@
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/InlinerPass.h"
 #include "llvm/Target/TargetData.h"
-#include "llvm/ADT/SmallPtrSet.h"
 
 using namespace llvm;
 
 namespace {
 
   class SimpleInliner : public Inliner {
-    // Functions that are never inlined
-    SmallPtrSet<const Function*, 16> NeverInline;
     InlineCostAnalyzer CA;
   public:
     SimpleInliner() : Inliner(ID) {
       initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
     }
-    SimpleInliner(int Threshold) : Inliner(ID, Threshold) {
+    SimpleInliner(int Threshold) : Inliner(ID, Threshold,
+                                           /*InsertLifetime*/true) {
       initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
     }
     static char ID; // Pass identification, replacement for typeid
     InlineCost getInlineCost(CallSite CS) {
-      return CA.getInlineCost(CS, NeverInline);
-    }
-    float getInlineFudgeFactor(CallSite CS) {
-      return CA.getInlineFudgeFactor(CS);
-    }
-    void resetCachedCostInfo(Function *Caller) {
-      CA.resetCachedCostInfo(Caller);
-    }
-    void growCachedCostInfo(Function* Caller, Function* Callee) {
-      CA.growCachedCostInfo(Caller, Callee);
+      return CA.getInlineCost(CS, getInlineThreshold(CS));
     }
     virtual bool doInitialization(CallGraph &CG);
-    void releaseMemory() {
-      CA.clear();
-    }
   };
 }
@@ -77,44 +63,6 @@ Pass *llvm::createFunctionInliningPass(int Threshold) {
 // annotated with the noinline attribute.
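The new cost callback makes the contract easy to state from the source level: a call is force-inlined iff the callee is defined, carries the always-inline attribute, and contains nothing from the short deny-list in isInlineViable above (indirect branches, direct recursion, and calls that would newly expose returns-twice behavior). An illustrative source example, not part of the patch, using the GCC/Clang attribute spelling:

    // Always inlined by the always-inliner: defined, marked, and viable.
    __attribute__((always_inline)) inline int add(int a, int b) {
      return a + b;
    }

    // Never inlined by it despite the attribute: direct recursion is one of
    // the constructs isInlineViable() rejects.
    __attribute__((always_inline)) inline int fact(int n) {
      return n <= 1 ? 1 : n * fact(n - 1);
    }

    int caller(int x) { return add(fact(3), x); }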
bool SimpleInliner::doInitialization(CallGraph &CG) {
   CA.setTargetData(getAnalysisIfAvailable<TargetData>());
-
-  Module &M = CG.getModule();
-
-  for (Module::iterator I = M.begin(), E = M.end();
-       I != E; ++I)
-    if (!I->isDeclaration() && I->hasFnAttr(Attribute::NoInline))
-      NeverInline.insert(I);
-
-  // Get llvm.noinline
-  GlobalVariable *GV = M.getNamedGlobal("llvm.noinline");
-
-  if (GV == 0)
-    return false;
-
-  // Don't crash on invalid code
-  if (!GV->hasDefinitiveInitializer())
-    return false;
-
-  const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
-
-  if (InitList == 0)
-    return false;
-
-  // Iterate over each element and add to the NeverInline set
-  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
-
-    // Get Source
-    const Constant *Elt = InitList->getOperand(i);
-
-    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(Elt))
-      if (CE->getOpcode() == Instruction::BitCast)
-        Elt = CE->getOperand(0);
-
-    // Insert into set of functions to never inline
-    if (const Function *F = dyn_cast<Function>(Elt))
-      NeverInline.insert(F);
-  }
-
   return false;
 }
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index f00935b..dc9cbfb 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -36,6 +36,11 @@ STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined");
 STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
 STATISTIC(NumMergedAllocas, "Number of allocas merged together");
 
+// This weirdly named statistic tracks the number of times that, when attempting
+// to inline a function A into B, we analyze the callers of B in order to see
+// if inlining B into them would be more profitable but blocked by this step.
+STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");
+
 static cl::opt<int>
 InlineLimit("inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
         cl::desc("Control the amount of inlining to perform (default = 225)"));
@@ -48,11 +53,12 @@ HintThreshold("inlinehint-threshold", cl::Hidden, cl::init(325),
 const int OptSizeThreshold = 75;
 
 Inliner::Inliner(char &ID)
-  : CallGraphSCCPass(ID), InlineThreshold(InlineLimit) {}
+  : CallGraphSCCPass(ID), InlineThreshold(InlineLimit), InsertLifetime(true) {}
 
-Inliner::Inliner(char &ID, int Threshold)
+Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime)
   : CallGraphSCCPass(ID), InlineThreshold(InlineLimit.getNumOccurrences() > 0 ?
-                                          InlineLimit : Threshold) {}
+                                          InlineLimit : Threshold),
+    InsertLifetime(InsertLifetime) {}
 
 /// getAnalysisUsage - For this class, we declare that we require and preserve
 /// the call graph.  If the derived class implements this method, it should
@@ -75,13 +81,13 @@ InlinedArrayAllocasTy;
 /// any new allocas to the set if not possible.
 static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
                                  InlinedArrayAllocasTy &InlinedArrayAllocas,
-                                 int InlineHistory) {
+                                 int InlineHistory, bool InsertLifetime) {
   Function *Callee = CS.getCalledFunction();
   Function *Caller = CS.getCaller();
 
   // Try to inline the function. Get the list of static allocas that were
   // inlined.
- if (!InlineFunction(CS, IFI)) + if (!InlineFunction(CS, IFI, InsertLifetime)) return false; // If the inlined function had a higher stack protection level than the @@ -230,29 +236,37 @@ bool Inliner::shouldInline(CallSite CS) { return false; } - int Cost = IC.getValue(); Function *Caller = CS.getCaller(); - int CurrentThreshold = getInlineThreshold(CS); - float FudgeFactor = getInlineFudgeFactor(CS); - int AdjThreshold = (int)(CurrentThreshold * FudgeFactor); - if (Cost >= AdjThreshold) { - DEBUG(dbgs() << " NOT Inlining: cost=" << Cost - << ", thres=" << AdjThreshold + if (!IC) { + DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost() + << ", thres=" << (IC.getCostDelta() + IC.getCost()) << ", Call: " << *CS.getInstruction() << "\n"); return false; } - // Try to detect the case where the current inlining candidate caller - // (call it B) is a static function and is an inlining candidate elsewhere, - // and the current candidate callee (call it C) is large enough that - // inlining it into B would make B too big to inline later. In these - // circumstances it may be best not to inline C into B, but to inline B - // into its callers. - if (Caller->hasLocalLinkage()) { + // Try to detect the case where the current inlining candidate caller (call + // it B) is a static or linkonce-ODR function and is an inlining candidate + // elsewhere, and the current candidate callee (call it C) is large enough + // that inlining it into B would make B too big to inline later. In these + // circumstances it may be best not to inline C into B, but to inline B into + // its callers. + // + // This only applies to static and linkonce-ODR functions because those are + // expected to be available for inlining in the translation units where they + // are used. Thus we will always have the opportunity to make local inlining + // decisions. Importantly the linkonce-ODR linkage covers inline functions + // and templates in C++. + // + // FIXME: All of this logic should be sunk into getInlineCost. It relies on + // the internal implementation of the inline cost metrics rather than + // treating them as truly abstract units etc. + if (Caller->hasLocalLinkage() || + Caller->getLinkage() == GlobalValue::LinkOnceODRLinkage) { int TotalSecondaryCost = 0; - bool outerCallsFound = false; + // The candidate cost to be imposed upon the current function. + int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1); // This bool tracks what happens if we do NOT inline C into B. - bool callerWillBeRemoved = true; + bool callerWillBeRemoved = Caller->hasLocalLinkage(); // This bool tracks what happens if we DO inline C into B. bool inliningPreventsSomeOuterInline = false; for (Value::use_iterator I = Caller->use_begin(), E =Caller->use_end(); @@ -268,26 +282,20 @@ bool Inliner::shouldInline(CallSite CS) { } InlineCost IC2 = getInlineCost(CS2); - if (IC2.isNever()) + ++NumCallerCallersAnalyzed; + if (!IC2) { callerWillBeRemoved = false; - if (IC2.isAlways() || IC2.isNever()) + continue; + } + if (IC2.isAlways()) continue; - outerCallsFound = true; - int Cost2 = IC2.getValue(); - int CurrentThreshold2 = getInlineThreshold(CS2); - float FudgeFactor2 = getInlineFudgeFactor(CS2); - - if (Cost2 >= (int)(CurrentThreshold2 * FudgeFactor2)) - callerWillBeRemoved = false; - - // See if we have this case. We subtract off the penalty - // for the call instruction, which we would be deleting. 
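The replacement logic in the next hunk is easier to check with concrete numbers (invented for illustration; CallPenalty mirrors InlineConstants::CallPenalty, which was 25 at the time of this patch):

    #include <iostream>

    // Walk through the shouldInline() secondary-cost check with made-up costs.
    int main() {
      const int CallPenalty = 25;
      int Cost = 100;                               // IC.getCost(): C into B
      int CandidateCost = Cost - (CallPenalty + 1); // 74

      // One outer call site of B: inlining B there costs 90 and currently has
      // a cost delta (threshold headroom) of 60.
      int OuterCost = 90, OuterCostDelta = 60;

      int TotalSecondaryCost = 0;
      bool PreventsOuterInline = false;
      if (OuterCostDelta <= CandidateCost) { // 60 <= 74: growing B blocks it
        PreventsOuterInline = true;
        TotalSecondaryCost += OuterCost;
      }

      // Refuse to inline C into B when the blocked outer inlines are cheaper.
      std::cout << (PreventsOuterInline && TotalSecondaryCost < Cost) << '\n';
      // prints 1: do not inline C into B here
    }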
-      if (Cost2 < (int)(CurrentThreshold2 * FudgeFactor2) &&
-          Cost2 + Cost - (InlineConstants::CallPenalty + 1) >=
-          (int)(CurrentThreshold2 * FudgeFactor2)) {
+      // See if inlining at the original callsite would erase the cost delta of
+      // this callsite. We subtract off the penalty for the call instruction,
+      // which we would be deleting.
+      if (IC2.getCostDelta() <= CandidateCost) {
         inliningPreventsSomeOuterInline = true;
-        TotalSecondaryCost += Cost2;
+        TotalSecondaryCost += IC2.getCost();
       }
     }
     // If all outer calls to Caller would get inlined, the cost for the last
@@ -297,17 +305,16 @@
     if (callerWillBeRemoved && Caller->use_begin() != Caller->use_end())
       TotalSecondaryCost += InlineConstants::LastCallToStaticBonus;
 
-    if (outerCallsFound && inliningPreventsSomeOuterInline &&
-        TotalSecondaryCost < Cost) {
-      DEBUG(dbgs() << "    NOT Inlining: " << *CS.getInstruction() <<
-           " Cost = " << Cost <<
+    if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost()) {
+      DEBUG(dbgs() << "    NOT Inlining: " << *CS.getInstruction() <<
+           " Cost = " << IC.getCost() <<
            ", outer Cost = " << TotalSecondaryCost << '\n');
       return false;
     }
   }
 
-  DEBUG(dbgs() << "    Inlining: cost=" << Cost
-        << ", thres=" << AdjThreshold
+  DEBUG(dbgs() << "    Inlining: cost=" << IC.getCost()
+        << ", thres=" << (IC.getCostDelta() + IC.getCost())
         << ", Call: " << *CS.getInstruction() << '\n');
   return true;
 }
@@ -326,7 +333,6 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
   return false;
 }
 
-
 bool Inliner::runOnSCC(CallGraphSCC &SCC) {
   CallGraph &CG = getAnalysis<CallGraph>();
   const TargetData *TD = getAnalysisIfAvailable<TargetData>();
@@ -415,8 +421,6 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
         CG[Caller]->removeCallEdgeFor(CS);
         CS.getInstruction()->eraseFromParent();
         ++NumCallsDeleted;
-        // Update the cached cost info with the missing call
-        growCachedCostInfo(Caller, NULL);
       } else {
         // We can only inline direct calls to non-declarations.
         if (Callee == 0 || Callee->isDeclaration()) continue;
@@ -439,7 +443,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
 
         // Attempt to inline the function.
         if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas,
-                                  InlineHistoryID))
+                                  InlineHistoryID, InsertLifetime))
           continue;
         ++NumInlined;
@@ -457,9 +461,6 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
             CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID));
           }
         }
-
-        // Update the cached cost info with the inlined call.
-        growCachedCostInfo(Caller, Callee);
       }
 
       // If we inlined or deleted the last possible call site to the function,
@@ -479,8 +480,6 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
       // Remove any call graph edges from the callee to its callees.
       CalleeNode->removeAllCalledFunctions();
 
-      resetCachedCostInfo(Callee);
-
       // Removing the node for callee from the call graph and delete it.
       delete CG.removeFunctionFromModule(CalleeNode);
       ++NumDeleted;
@@ -514,29 +513,28 @@ bool Inliner::doFinalization(CallGraph &CG) {
 
 /// removeDeadFunctions - Remove dead functions that are not included in
 /// DNR (Do Not Remove) list.
-bool Inliner::removeDeadFunctions(CallGraph &CG,
-                                  SmallPtrSet<const Function *, 16> *DNR) {
-  SmallPtrSet<CallGraphNode*, 16> FunctionsToRemove;
+bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
+  SmallVector<CallGraphNode*, 16> FunctionsToRemove;
 
   // Scan for all of the functions, looking for ones that should now be removed
   // from the program. Insert the dead ones in the FunctionsToRemove set.
for (CallGraph::iterator I = CG.begin(), E = CG.end(); I != E; ++I) { CallGraphNode *CGN = I->second; - if (CGN->getFunction() == 0) - continue; - Function *F = CGN->getFunction(); - + if (!F || F->isDeclaration()) + continue; + + // Handle the case when this function is called and we only want to care + // about always-inline functions. This is a bit of a hack to share code + // between here and the InlineAlways pass. + if (AlwaysInlineOnly && !F->hasFnAttr(Attribute::AlwaysInline)) + continue; + // If the only remaining users of the function are dead constants, remove // them. F->removeDeadConstantUsers(); - if (DNR && DNR->count(F)) - continue; - if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() && - !F->hasAvailableExternallyLinkage()) - continue; - if (!F->use_empty()) + if (!F->isDefTriviallyDead()) continue; // Remove any call graph edges from the function to its callees. @@ -548,24 +546,27 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN); // Removing the node for callee from the call graph and delete it. - FunctionsToRemove.insert(CGN); + FunctionsToRemove.push_back(CGN); } + if (FunctionsToRemove.empty()) + return false; // Now that we know which functions to delete, do so. We didn't want to do // this inline, because that would invalidate our CallGraph::iterator // objects. :( // - // Note that it doesn't matter that we are iterating over a non-stable set + // Note that it doesn't matter that we are iterating over a non-stable order // here to do this, it doesn't matter which order the functions are deleted // in. - bool Changed = false; - for (SmallPtrSet<CallGraphNode*, 16>::iterator I = FunctionsToRemove.begin(), - E = FunctionsToRemove.end(); I != E; ++I) { - resetCachedCostInfo((*I)->getFunction()); + array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end()); + FunctionsToRemove.erase(std::unique(FunctionsToRemove.begin(), + FunctionsToRemove.end()), + FunctionsToRemove.end()); + for (SmallVectorImpl<CallGraphNode *>::iterator I = FunctionsToRemove.begin(), + E = FunctionsToRemove.end(); + I != E; ++I) { delete CG.removeFunctionFromModule(*I); ++NumDeleted; - Changed = true; } - - return Changed; + return true; } diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp index 7cb1d18..cd29e7a 100644 --- a/lib/Transforms/IPO/Internalize.cpp +++ b/lib/Transforms/IPO/Internalize.cpp @@ -122,6 +122,9 @@ bool InternalizePass::runOnModule(Module &M) { bool Changed = false; + // Never internalize functions which code-gen might insert. + ExternalNames.insert("__stack_chk_fail"); + // Mark all functions not in the api as internal. // FIXME: maybe use private linkage? for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) @@ -148,9 +151,11 @@ bool InternalizePass::runOnModule(Module &M) { // won't find them. (see MachineModuleInfo.) ExternalNames.insert("llvm.global_ctors"); ExternalNames.insert("llvm.global_dtors"); - ExternalNames.insert("llvm.noinline"); ExternalNames.insert("llvm.global.annotations"); + // Never internalize symbols code-gen inserts. + ExternalNames.insert("__stack_chk_guard"); + // Mark all global variables with initializers that are not in the api as // internal as well. // FIXME: maybe use private linkage? 
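Switching FunctionsToRemove from a SmallPtrSet to a SmallVector (earlier in this hunk) buys deterministic iteration order but permits duplicates, hence the sort-then-unique pass before the deletion loop. The same idiom in standalone form, with ints standing in for CallGraphNode pointers:

    #include <algorithm>
    #include <iostream>
    #include <vector>

    int main() {
      // Stand-ins for CallGraphNode* entries that may have been pushed twice.
      std::vector<int> FunctionsToRemove = {3, 1, 3, 2, 1};

      // Sort, then erase the tail that std::unique compacts away.
      std::sort(FunctionsToRemove.begin(), FunctionsToRemove.end());
      FunctionsToRemove.erase(std::unique(FunctionsToRemove.begin(),
                                          FunctionsToRemove.end()),
                              FunctionsToRemove.end());

      for (std::size_t i = 0; i != FunctionsToRemove.size(); ++i)
        std::cout << FunctionsToRemove[i] << ' '; // 1 2 3: each deleted once
    }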
diff --git a/lib/Transforms/IPO/LLVMBuild.txt b/lib/Transforms/IPO/LLVMBuild.txt new file mode 100644 index 0000000..b18c915 --- /dev/null +++ b/lib/Transforms/IPO/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Transforms/IPO/LLVMBuild.txt -----------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = IPO +parent = Transforms +library_name = ipo +required_libraries = Analysis Core IPA InstCombine Scalar Vectorize Support Target TransformUtils diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 8fdfd72..a1b0a45 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -22,14 +22,19 @@ #include "llvm/PassManager.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/Verifier.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Vectorize.h" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/ManagedStatic.h" using namespace llvm; +static cl::opt<bool> +RunVectorization("vectorize", cl::desc("Run vectorization passes")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -38,6 +43,7 @@ PassManagerBuilder::PassManagerBuilder() { DisableSimplifyLibCalls = false; DisableUnitAtATime = false; DisableUnrollLoops = false; + Vectorize = RunVectorization; } PassManagerBuilder::~PassManagerBuilder() { @@ -101,6 +107,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(Inliner); Inliner = 0; } + addExtensionsToPM(EP_EnabledOnOptLevel0, MPM); return; } @@ -110,6 +117,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { addInitialAliasAnalysisPasses(MPM); if (!DisableUnitAtATime) { + addExtensionsToPM(EP_ModuleOptimizerEarly, MPM); + MPM.add(createGlobalOptimizerPass()); // Optimize out global vars MPM.add(createIPSCCPPass()); // IP SCCP @@ -170,6 +179,13 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { addExtensionsToPM(EP_ScalarOptimizerLate, MPM); + if (Vectorize) { + MPM.add(createBBVectorizePass()); + MPM.add(createInstructionCombiningPass()); + if (OptLevel > 1) + MPM.add(createGVNPass()); // Remove redundancies + } + MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Clean up after everything. @@ -186,11 +202,13 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { if (OptLevel > 1) MPM.add(createConstantMergePass()); // Merge dup global constants } + addExtensionsToPM(EP_OptimizerLast, MPM); } void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, bool Internalize, - bool RunInliner) { + bool RunInliner, + bool DisableGVNLoadPRE) { // Provide AliasAnalysis services for optimizations. 
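With the new cl::opt wired into the builder, the BBVectorize + InstCombine + GVN sequence can be requested either from the opt command line (-vectorize) or programmatically. A sketch of the latter, assuming the Vectorize member this patch initializes is publicly settable like the builder's other toggles:

    #include "llvm/PassManager.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"

    using namespace llvm;

    // Build an -O2-style module pipeline with the newly added vectorization
    // passes (BBVectorize plus cleanup) turned on.
    static void buildPipeline(PassManager &PM) {
      PassManagerBuilder Builder;
      Builder.OptLevel = 2;
      Builder.Vectorize = true; // field this patch seeds from -vectorize
      Builder.populateModulePassManager(PM);
    }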
addInitialAliasAnalysisPasses(PM); @@ -246,9 +264,9 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createFunctionAttrsPass()); // Add nocapture. PM.add(createGlobalsModRefPass()); // IP alias analysis. - PM.add(createLICMPass()); // Hoist loop invariants. - PM.add(createGVNPass()); // Remove redundancies. - PM.add(createMemCpyOptPass()); // Remove dead memcpys. + PM.add(createLICMPass()); // Hoist loop invariants. + PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. + PM.add(createMemCpyOptPass()); // Remove dead memcpys. // Nuke dead stores. PM.add(createDeadStoreEliminationPass()); @@ -340,4 +358,3 @@ void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB, PassManagerBase *LPM = unwrap(PM); Builder->populateLTOPassManager(*LPM, Internalize, RunInliner); } - diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index cbb80f0..c8cc8fd 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -101,8 +101,7 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) { // Check to see if this function performs an unwind or calls an // unwinding function. for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - if (CheckUnwind && (isa<UnwindInst>(BB->getTerminator()) || - isa<ResumeInst>(BB->getTerminator()))) { + if (CheckUnwind && isa<ResumeInst>(BB->getTerminator())) { // Uses unwind / resume! SCCMightUnwind = true; } else if (CheckReturn && isa<ReturnInst>(BB->getTerminator())) { diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt index a46d5ad..d070ccc 100644 --- a/lib/Transforms/InstCombine/CMakeLists.txt +++ b/lib/Transforms/InstCombine/CMakeLists.txt @@ -13,11 +13,3 @@ add_llvm_library(LLVMInstCombine InstCombineSimplifyDemanded.cpp InstCombineVectorOps.cpp ) - -add_llvm_library_dependencies(LLVMInstCombine - LLVMAnalysis - LLVMCore - LLVMSupport - LLVMTarget - LLVMTransformUtils - ) diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 3808278..199df51 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -22,6 +22,7 @@ namespace llvm { class CallSite; class TargetData; + class TargetLibraryInfo; class DbgDeclareInst; class MemIntrinsic; class MemSetInst; @@ -71,6 +72,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner : public FunctionPass, public InstVisitor<InstCombiner, Instruction*> { TargetData *TD; + TargetLibraryInfo *TLI; bool MadeIRChange; public: /// Worklist - All of the instructions that need to be simplified. @@ -92,9 +94,11 @@ public: bool DoOneIteration(Function &F, unsigned ItNum); virtual void getAnalysisUsage(AnalysisUsage &AU) const; - + TargetData *getTargetData() const { return TD; } + TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; } + // Visitation implementation - Implement instruction combining for different // instruction types. 
The semantics are as follows: // Return Value: @@ -287,9 +291,9 @@ public: return 0; // Don't do anything with FI } - void ComputeMaskedBits(Value *V, const APInt &Mask, APInt &KnownZero, + void ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, unsigned Depth = 0) const { - return llvm::ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD, Depth); + return llvm::ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth); } bool MaskedValueIsZero(Value *V, const APInt &Mask, diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index d10046c..05e702f 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -136,6 +136,18 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { Value *NewShl = Builder->CreateShl(XorLHS, ShAmt, "sext"); return BinaryOperator::CreateAShr(NewShl, ShAmt); } + + // If this is a xor that was canonicalized from a sub, turn it back into + // a sub and fuse this add with it. + if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) { + IntegerType *IT = cast<IntegerType>(I.getType()); + APInt LHSKnownOne(IT->getBitWidth(), 0); + APInt LHSKnownZero(IT->getBitWidth(), 0); + ComputeMaskedBits(XorLHS, LHSKnownZero, LHSKnownOne); + if ((XorRHS->getValue() | LHSKnownZero).isAllOnesValue()) + return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), + XorLHS); + } } } @@ -189,14 +201,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // A+B --> A|B iff A and B have no bits set in common. if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { - APInt Mask = APInt::getAllOnesValue(IT->getBitWidth()); APInt LHSKnownOne(IT->getBitWidth(), 0); APInt LHSKnownZero(IT->getBitWidth(), 0); - ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne); + ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); if (LHSKnownZero != 0) { APInt RHSKnownOne(IT->getBitWidth(), 0); APInt RHSKnownZero(IT->getBitWidth(), 0); - ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne); + ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); // No bits in common -> bitwise or. if ((LHSKnownZero|RHSKnownZero).isAllOnesValue()) @@ -466,57 +477,57 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize // this. bool Swapped = false; - GetElementPtrInst *GEP = 0; - ConstantExpr *CstGEP = 0; - - // TODO: Could also optimize &A[i] - &A[j] -> "i-j", and "&A.foo[i] - &A.foo". + GEPOperator *GEP1 = 0, *GEP2 = 0; + // For now we require one side to be the base pointer "A" or a constant - // expression derived from it. - if (GetElementPtrInst *LHSGEP = dyn_cast<GetElementPtrInst>(LHS)) { + // GEP derived from it. + if (GEPOperator *LHSGEP = dyn_cast<GEPOperator>(LHS)) { // (gep X, ...) - X if (LHSGEP->getOperand(0) == RHS) { - GEP = LHSGEP; + GEP1 = LHSGEP; Swapped = false; - } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(RHS)) { - // (gep X, ...) - (ce_gep X, ...) - if (CE->getOpcode() == Instruction::GetElementPtr && - LHSGEP->getOperand(0) == CE->getOperand(0)) { - CstGEP = CE; - GEP = LHSGEP; + } else if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) { + // (gep X, ...) - (gep X, ...) 
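The A+B --> A|B rewrite earlier in this hunk is sound because addition without carries is bitwise or, and the ComputeMaskedBits query proves that no bit column can carry. A quick standalone check of the underlying identity (illustrative, not LLVM code):

    #include <cstdint>
    #include <iostream>

    int main() {
      // If a & b == 0 (no bit set in both operands), then a + b == (a | b):
      // no column ever produces a carry. This is what
      // (LHSKnownZero | RHSKnownZero).isAllOnesValue() establishes above,
      // using the known-zero bits of each operand.
      bool AllMatch = true;
      for (unsigned a = 0; a != 256; ++a)
        for (unsigned b = 0; b != 256; ++b)
          if ((a & b) == 0 && (uint8_t)(a + b) != (uint8_t)(a | b))
            AllMatch = false;
      std::cout << AllMatch << '\n'; // 1
    }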
+ if (LHSGEP->getOperand(0)->stripPointerCasts() == + RHSGEP->getOperand(0)->stripPointerCasts()) { + GEP2 = RHSGEP; + GEP1 = LHSGEP; Swapped = false; } } } - if (GetElementPtrInst *RHSGEP = dyn_cast<GetElementPtrInst>(RHS)) { + if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) { // X - (gep X, ...) if (RHSGEP->getOperand(0) == LHS) { - GEP = RHSGEP; + GEP1 = RHSGEP; Swapped = true; - } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(LHS)) { - // (ce_gep X, ...) - (gep X, ...) - if (CE->getOpcode() == Instruction::GetElementPtr && - RHSGEP->getOperand(0) == CE->getOperand(0)) { - CstGEP = CE; - GEP = RHSGEP; + } else if (GEPOperator *LHSGEP = dyn_cast<GEPOperator>(LHS)) { + // (gep X, ...) - (gep X, ...) + if (RHSGEP->getOperand(0)->stripPointerCasts() == + LHSGEP->getOperand(0)->stripPointerCasts()) { + GEP2 = LHSGEP; + GEP1 = RHSGEP; Swapped = true; } } } - if (GEP == 0) + // Avoid duplicating the arithmetic if GEP2 has non-constant indices and + // multiple users. + if (GEP1 == 0 || + (GEP2 != 0 && !GEP2->hasAllConstantIndices() && !GEP2->hasOneUse())) return 0; // Emit the offset of the GEP and an intptr_t. - Value *Result = EmitGEPOffset(GEP); + Value *Result = EmitGEPOffset(GEP1); // If we had a constant expression GEP on the other side offsetting the // pointer, subtract it from the offset we have. - if (CstGEP) { - Value *CstOffset = EmitGEPOffset(CstGEP); - Result = Builder->CreateSub(Result, CstOffset); + if (GEP2) { + Value *Offset = EmitGEPOffset(GEP2); + Result = Builder->CreateSub(Result, Offset); } - // If we have p - gep(p, ...) then we have to negate the result. if (Swapped) @@ -587,6 +598,9 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { ConstantInt *C2; if (match(Op1, m_Add(m_Value(X), m_ConstantInt(C2)))) return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X); + + if (SimplifyDemandedInstructionBits(I)) + return &I; } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 5e0bfe8..0dbe11d 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -14,6 +14,7 @@ #include "InstCombine.h" #include "llvm/Intrinsics.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Transforms/Utils/CmpInstAnalysis.h" #include "llvm/Support/ConstantRange.h" #include "llvm/Support/PatternMatch.h" using namespace llvm; @@ -62,50 +63,6 @@ static inline Value *dyn_castNotVal(Value *V) { return 0; } - -/// getICmpCode - Encode a icmp predicate into a three bit mask. These bits -/// are carefully arranged to allow folding of expressions such as: -/// -/// (A < B) | (A > B) --> (A != B) -/// -/// Note that this is only valid if the first and second predicates have the -/// same sign. 
Is illegal to do: (A u< B) | (A s> B) -/// -/// Three bits are used to represent the condition, as follows: -/// 0 A > B -/// 1 A == B -/// 2 A < B -/// -/// <=> Value Definition -/// 000 0 Always false -/// 001 1 A > B -/// 010 2 A == B -/// 011 3 A >= B -/// 100 4 A < B -/// 101 5 A != B -/// 110 6 A <= B -/// 111 7 Always true -/// -static unsigned getICmpCode(const ICmpInst *ICI) { - switch (ICI->getPredicate()) { - // False -> 0 - case ICmpInst::ICMP_UGT: return 1; // 001 - case ICmpInst::ICMP_SGT: return 1; // 001 - case ICmpInst::ICMP_EQ: return 2; // 010 - case ICmpInst::ICMP_UGE: return 3; // 011 - case ICmpInst::ICMP_SGE: return 3; // 011 - case ICmpInst::ICMP_ULT: return 4; // 100 - case ICmpInst::ICMP_SLT: return 4; // 100 - case ICmpInst::ICMP_NE: return 5; // 101 - case ICmpInst::ICMP_ULE: return 6; // 110 - case ICmpInst::ICMP_SLE: return 6; // 110 - // True -> 7 - default: - llvm_unreachable("Invalid ICmp predicate!"); - return 0; - } -} - /// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp /// predicate into a three bit mask. It also returns whether it is an ordered /// predicate by reference. @@ -130,31 +87,19 @@ static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { default: // Not expecting FCMP_FALSE and FCMP_TRUE; llvm_unreachable("Unexpected FCmp predicate!"); - return 0; } } -/// getICmpValue - This is the complement of getICmpCode, which turns an +/// getNewICmpValue - This is the complement of getICmpCode, which turns an /// opcode and two operands into either a constant true or false, or a brand /// new ICmp instruction. The sign is passed in to determine which kind /// of predicate to use in the new icmp instruction. -static Value *getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, - InstCombiner::BuilderTy *Builder) { - CmpInst::Predicate Pred; - switch (Code) { - default: assert(0 && "Illegal ICmp code!"); - case 0: // False. - return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); - case 1: Pred = Sign ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break; - case 2: Pred = ICmpInst::ICMP_EQ; break; - case 3: Pred = Sign ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break; - case 4: Pred = Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break; - case 5: Pred = ICmpInst::ICMP_NE; break; - case 6: Pred = Sign ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break; - case 7: // True. - return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1); - } - return Builder->CreateICmp(Pred, LHS, RHS); +static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, + InstCombiner::BuilderTy *Builder) { + ICmpInst::Predicate NewPred; + if (Value *NewConstant = getICmpValue(Sign, Code, LHS, RHS, NewPred)) + return NewConstant; + return Builder->CreateICmp(NewPred, LHS, RHS); } /// getFCmpValue - This is the complement of getFCmpCode, which turns an @@ -165,7 +110,7 @@ static Value *getFCmpValue(bool isordered, unsigned code, InstCombiner::BuilderTy *Builder) { CmpInst::Predicate Pred; switch (code) { - default: assert(0 && "Illegal FCmp code!"); + default: llvm_unreachable("Illegal FCmp code!"); case 0: Pred = isordered ? FCmpInst::FCMP_ORD : FCmpInst::FCMP_UNO; break; case 1: Pred = isordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT; break; case 2: Pred = isordered ? 
FCmpInst::FCMP_OEQ : FCmpInst::FCMP_UEQ; break; @@ -180,14 +125,6 @@ static Value *getFCmpValue(bool isordered, unsigned code, return Builder->CreateFCmp(Pred, LHS, RHS); } -/// PredicatesFoldable - Return true if both predicates match sign or if at -/// least one of them is an equality comparison (which is signless). -static bool PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) { - return (CmpInst::isSigned(p1) == CmpInst::isSigned(p2)) || - (CmpInst::isSigned(p1) && ICmpInst::isEquality(p2)) || - (CmpInst::isSigned(p2) && ICmpInst::isEquality(p1)); -} - // OptAndOp - This handles expressions of the form ((val OP C1) & C2). Where // the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is // guaranteed to be a binary operator. @@ -558,6 +495,38 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, return result; } +/// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z) +/// if possible. The returned predicate is either == or !=. Returns false if +/// decomposition fails. +static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, + Value *&X, Value *&Y, Value *&Z) { + // X < 0 is equivalent to (X & SignBit) != 0. + if (I->getPredicate() == ICmpInst::ICMP_SLT) + if (ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1))) + if (C->isZero()) { + X = I->getOperand(0); + Y = ConstantInt::get(I->getContext(), + APInt::getSignBit(C->getBitWidth())); + Pred = ICmpInst::ICMP_NE; + Z = C; + return true; + } + + // X > -1 is equivalent to (X & SignBit) == 0. + if (I->getPredicate() == ICmpInst::ICMP_SGT) + if (ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1))) + if (C->isAllOnesValue()) { + X = I->getOperand(0); + Y = ConstantInt::get(I->getContext(), + APInt::getSignBit(C->getBitWidth())); + Pred = ICmpInst::ICMP_EQ; + Z = ConstantInt::getNullValue(C->getType()); + return true; + } + + return false; +} + /// foldLogOpOfMaskedICmpsHelper: /// handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) /// return the set of pattern classes (from MaskedICmpType) @@ -565,10 +534,9 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, Value*& B, Value*& C, Value*& D, Value*& E, - ICmpInst *LHS, ICmpInst *RHS) { - ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); - if (LHSCC != ICmpInst::ICMP_EQ && LHSCC != ICmpInst::ICMP_NE) return 0; - if (RHSCC != ICmpInst::ICMP_EQ && RHSCC != ICmpInst::ICMP_NE) return 0; + ICmpInst *LHS, ICmpInst *RHS, + ICmpInst::Predicate &LHSCC, + ICmpInst::Predicate &RHSCC) { if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType()) return 0; // vectors are not (yet?) supported if (LHS->getOperand(0)->getType()->isVectorTy()) return 0; @@ -582,40 +550,60 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, Value *L1 = LHS->getOperand(0); Value *L2 = LHS->getOperand(1); Value *L11,*L12,*L21,*L22; - if (match(L1, m_And(m_Value(L11), m_Value(L12)))) { - if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) + // Check whether the icmp can be decomposed into a bit test. + if (decomposeBitTestICmp(LHS, LHSCC, L11, L12, L2)) { + L21 = L22 = L1 = 0; + } else { + // Look for ANDs in the LHS icmp. 
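decomposeBitTestICmp above rests on two sign-bit identities: X < 0 iff (X & SignBit) != 0, and X > -1 iff (X & SignBit) == 0. They are cheap to verify exhaustively at a narrow width (standalone check, not LLVM code):

    #include <cstdint>
    #include <iostream>

    int main() {
      const uint8_t SignBit = 0x80;
      bool OK = true;
      for (int v = -128; v != 128; ++v) {
        int8_t X = (int8_t)v;
        OK &= (X < 0)  == ((X & SignBit) != 0); // icmp slt X, 0
        OK &= (X > -1) == ((X & SignBit) == 0); // icmp sgt X, -1
      }
      std::cout << OK << '\n'; // 1: both identities hold for every value
    }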
+    if (match(L1, m_And(m_Value(L11), m_Value(L12)))) {
+      if (!match(L2, m_And(m_Value(L21), m_Value(L22))))
+        L21 = L22 = 0;
+    } else {
+      if (!match(L2, m_And(m_Value(L11), m_Value(L12))))
+        return 0;
+      std::swap(L1, L2);
       L21 = L22 = 0;
-  }
-  else {
-    if (!match(L2, m_And(m_Value(L11), m_Value(L12))))
-      return 0;
-    std::swap(L1, L2);
-    L21 = L22 = 0;
+    }
   }
 
+  // Bail if LHS was an icmp that can't be decomposed into an equality.
+  if (!ICmpInst::isEquality(LHSCC))
+    return 0;
+
   Value *R1 = RHS->getOperand(0);
   Value *R2 = RHS->getOperand(1);
   Value *R11,*R12;
   bool ok = false;
-  if (match(R1, m_And(m_Value(R11), m_Value(R12)))) {
-    if (R11 != 0 && (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22)) {
-      A = R11; D = R12; E = R2; ok = true;
+  if (decomposeBitTestICmp(RHS, RHSCC, R11, R12, R2)) {
+    if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+      A = R11; D = R12;
+    } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+      A = R12; D = R11;
+    } else {
+      return 0;
     }
-    else
-    if (R12 != 0 && (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22)) {
+    E = R2; R1 = 0; ok = true;
+  } else if (match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+    if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+      A = R11; D = R12; E = R2; ok = true;
+    } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
       A = R12; D = R11; E = R2; ok = true;
     }
   }
+
+  // Bail if RHS was an icmp that can't be decomposed into an equality.
+  if (!ICmpInst::isEquality(RHSCC))
+    return 0;
+
+  // Look for ANDs on the right side of the RHS icmp.
   if (!ok && match(R2, m_And(m_Value(R11), m_Value(R12)))) {
-    if (R11 != 0 && (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22)) {
-      A = R11; D = R12; E = R1; ok = true;
-    }
-    else
-    if (R12 != 0 && (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22)) {
+    if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+      A = R11; D = R12; E = R1; ok = true;
+    } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
       A = R12; D = R11; E = R1; ok = true;
-    }
-    else
+    } else {
       return 0;
+    }
   }
   if (!ok)
     return 0;
@@ -644,8 +632,12 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
                                      ICmpInst::Predicate NEWCC,
                                      llvm::InstCombiner::BuilderTy* Builder) {
   Value *A = 0, *B = 0, *C = 0, *D = 0, *E = 0;
-  unsigned mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS);
+  ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+  unsigned mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS,
+                                               LHSCC, RHSCC);
   if (mask == 0) return 0;
+  assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) &&
+         "foldLogOpOfMaskedICmpsHelper must return an equality predicate.");
 
   if (NEWCC == ICmpInst::ICMP_NE)
     mask >>= 1; // treat "Not"-states as normal states
@@ -693,11 +685,11 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
   ConstantInt *CCst = dyn_cast<ConstantInt>(C);
   if (CCst == 0) return 0;
-  if (LHS->getPredicate() != NEWCC)
+  if (LHSCC != NEWCC)
     CCst = dyn_cast<ConstantInt>( ConstantExpr::getXor(BCst, CCst) );
   ConstantInt *ECst = dyn_cast<ConstantInt>(E);
   if (ECst == 0) return 0;
-  if (RHS->getPredicate() != NEWCC)
+  if (RHSCC != NEWCC)
     ECst = dyn_cast<ConstantInt>( ConstantExpr::getXor(DCst, ECst) );
   ConstantInt* MCst = dyn_cast<ConstantInt>(
     ConstantExpr::getAnd(ConstantExpr::getAnd(BCst, DCst),
@@ -728,7 +720,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
     Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
     unsigned Code = getICmpCode(LHS) &
getICmpCode(RHS); bool isSigned = LHS->isSigned() || RHS->isSigned(); - return getICmpValue(isSigned, Code, Op0, Op1, Builder); + return getNewICmpValue(isSigned, Code, Op0, Op1, Builder); } } @@ -756,24 +748,12 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { Value *NewOr = Builder->CreateOr(Val, Val2); return Builder->CreateICmp(LHSCC, NewOr, LHSCst); } - - // (icmp slt A, 0) & (icmp slt B, 0) --> (icmp slt (A&B), 0) - if (LHSCC == ICmpInst::ICMP_SLT && LHSCst->isZero()) { - Value *NewAnd = Builder->CreateAnd(Val, Val2); - return Builder->CreateICmp(LHSCC, NewAnd, LHSCst); - } - - // (icmp sgt A, -1) & (icmp sgt B, -1) --> (icmp sgt (A|B), -1) - if (LHSCC == ICmpInst::ICMP_SGT && LHSCst->isAllOnesValue()) { - Value *NewOr = Builder->CreateOr(Val, Val2); - return Builder->CreateICmp(LHSCC, NewOr, LHSCst); - } } // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2 // where CMAX is the all ones value for the truncated type, // iff the lower bits of C2 and CA are zero. - if (LHSCC == RHSCC && ICmpInst::isEquality(LHSCC) && + if (LHSCC == ICmpInst::ICMP_EQ && LHSCC == RHSCC && LHS->hasOneUse() && RHS->hasOneUse()) { Value *V; ConstantInt *AndCst, *SmallCst = 0, *BigCst = 0; @@ -805,7 +785,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { } } } - + // From here on, we only handle: // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler. if (Val != Val2) return 0; @@ -1382,13 +1362,8 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, // part of the value (e.g. byte 3) then it must be shifted right. If from the // low part, it must be shifted left. unsigned DestByteNo = InputByteNo + OverallLeftShift; - if (InputByteNo < ByteValues.size()/2) { - if (ByteValues.size()-1-DestByteNo != InputByteNo) - return true; - } else { - if (ByteValues.size()-1-DestByteNo != InputByteNo) - return true; - } + if (ByteValues.size()-1-DestByteNo != InputByteNo) + return true; // If the destination byte value is already defined, the values are or'd // together, which isn't a bswap (unless it's an or of the same bits). 
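The Code = getICmpCode(LHS) & getICmpCode(RHS) trick (and the | and ^ variants used nearby) works because the three bits encode exactly the sets {A>B}, {A==B}, {A<B}; combining codes intersects, unions, or symmetric-differences those sets. A standalone miniature whose encoding matches the table this patch moves into CmpInstAnalysis:

    #include <iostream>

    // 3-bit predicate codes: bit0 = "A > B", bit1 = "A == B", bit2 = "A < B".
    enum { GT = 1, EQ = 2, GE = 3, LT = 4, NE = 5, LE = 6 };

    int main() {
      // (A < B) | (A > B)  --> A != B
      std::cout << ((LT | GT) == NE) << '\n'; // 1
      // (A <= B) & (A >= B) --> A == B
      std::cout << ((LE & GE) == EQ) << '\n'; // 1
      // (A < B) ^ (A <= B)  --> A == B
      std::cout << ((LT ^ LE) == EQ) << '\n'; // 1
    }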
@@ -1469,7 +1444,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); unsigned Code = getICmpCode(LHS) | getICmpCode(RHS); bool isSigned = LHS->isSigned() || RHS->isSigned(); - return getICmpValue(isSigned, Code, Op0, Op1, Builder); + return getNewICmpValue(isSigned, Code, Op0, Op1, Builder); } } @@ -1490,18 +1465,6 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { Value *NewOr = Builder->CreateOr(Val, Val2); return Builder->CreateICmp(LHSCC, NewOr, LHSCst); } - - // (icmp slt A, 0) | (icmp slt B, 0) --> (icmp slt (A|B), 0) - if (LHSCC == ICmpInst::ICMP_SLT && LHSCst->isZero()) { - Value *NewOr = Builder->CreateOr(Val, Val2); - return Builder->CreateICmp(LHSCC, NewOr, LHSCst); - } - - // (icmp sgt A, -1) | (icmp sgt B, -1) --> (icmp sgt (A&B), -1) - if (LHSCC == ICmpInst::ICMP_SGT && LHSCst->isAllOnesValue()) { - Value *NewAnd = Builder->CreateAnd(Val, Val2); - return Builder->CreateICmp(LHSCC, NewAnd, LHSCst); - } } // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1) @@ -1586,7 +1549,6 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true return ConstantInt::getTrue(LHS->getContext()); } - break; case ICmpInst::ICMP_ULT: switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); @@ -1962,8 +1924,11 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } // Canonicalize xor to the RHS. - if (match(Op0, m_Xor(m_Value(), m_Value()))) + bool SwappedForXor = false; + if (match(Op0, m_Xor(m_Value(), m_Value()))) { std::swap(Op0, Op1); + SwappedForXor = true; + } // A | ( A ^ B) -> A | B // A | (~A ^ B) -> A | ~B @@ -1994,6 +1959,9 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return BinaryOperator::CreateOr(Not, Op0); } + if (SwappedForXor) + std::swap(Op0, Op1); + if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0))) if (Value *Res = FoldOrOfICmps(LHS, RHS)) @@ -2281,7 +2249,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS); bool isSigned = LHS->isSigned() || RHS->isSigned(); return ReplaceInstUsesWith(I, - getICmpValue(isSigned, Code, Op0, Op1, Builder)); + getNewICmpValue(isSigned, Code, Op0, Op1, + Builder)); } } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index c7b3ff8..77e4727 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -37,26 +37,26 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { unsigned CopyAlign = MI->getAlignment(); if (CopyAlign < MinAlign) { - MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), + MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false)); return MI; } - + // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with // load/store. ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2)); if (MemOpLength == 0) return 0; - + // Source and destination pointer types are always "i8*" for intrinsic. See // if the size is something we can handle with a single primitive load/store. // A single load+store correctly handles overlapping memory in the memmove // case. unsigned Size = MemOpLength->getZExtValue(); if (Size == 0) return MI; // Delete this mem transfer. 
- + if (Size > 8 || (Size&(Size-1))) return 0; // If not 1/2/4/8 bytes, exit. - + // Use an integer load+store unless we can find something better. unsigned SrcAddrSp = cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace(); @@ -66,7 +66,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3); Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp); Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp); - + // Memcpy forces the use of i8* for the source and destination. That means // that if you're using memcpy to move one double around, you'll get a cast // from double* to i8*. We'd much rather use a double load+store rather than @@ -94,20 +94,20 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { } else break; } - + if (SrcETy->isSingleValueType()) { NewSrcPtrTy = PointerType::get(SrcETy, SrcAddrSp); NewDstPtrTy = PointerType::get(SrcETy, DstAddrSp); } } } - - + + // If the memcpy/memmove provides better alignment info than we can // infer, use it. SrcAlign = std::max(SrcAlign, CopyAlign); DstAlign = std::max(DstAlign, CopyAlign); - + Value *Src = Builder->CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy); Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); LoadInst *L = Builder->CreateLoad(Src, MI->isVolatile()); @@ -127,7 +127,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { Alignment, false)); return MI; } - + // Extract the length and alignment and fill if they are constant. ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength()); ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue()); @@ -135,14 +135,14 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { return 0; uint64_t Len = LenC->getZExtValue(); Alignment = MI->getAlignment(); - + // If the length is zero, this is a no-op if (Len == 0) return MI; // memset(d,c,0,a) -> noop - + // memset(s,c,n) -> store s, c (for n=1,2,4,8) if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) { Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8. - + Value *Dest = MI->getDest(); unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace(); Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp); @@ -150,13 +150,13 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { // Alignment 0 is identity for alignment 1 for memset, but not store. if (Alignment == 0) Alignment = 1; - + // Extract the fill value and store. uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL; StoreInst *S = Builder->CreateStore(ConstantInt::get(ITy, Fill), Dest, MI->isVolatile()); S->setAlignment(Alignment); - + // Set the size of the copy to 0, it will be deleted on the next iteration. MI->setLength(Constant::getNullValue(LenC->getType())); return MI; @@ -165,7 +165,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { return 0; } -/// visitCallInst - CallInst simplification. This mostly only handles folding +/// visitCallInst - CallInst simplification. This mostly only handles folding /// of intrinsic instructions. For normal calls, it allows visitCallSite to do /// the heavy lifting. /// @@ -182,7 +182,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { CI.setDoesNotThrow(); return &CI; } - + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI); if (!II) return visitCallSite(&CI); @@ -203,7 +203,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // alignment is sufficient. } } - + // No other transformations apply to volatile transfers. 
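The 0x0101010101010101ULL multiply in SimplifyMemSet above splats the fill byte into all eight byte lanes; the low Len bytes of the product are exactly the value the single replacement store must write. A standalone illustration:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // memset(p, 0xAB, 4) -> a single i32 store of 0xABABABAB, obtained by
      // splatting the fill byte and truncating to the store width.
      uint64_t FillByte = 0xAB;
      uint64_t Fill = FillByte * 0x0101010101010101ULL;
      uint32_t Store32 = (uint32_t)Fill;
      std::printf("0x%08X\n", (unsigned)Store32); // 0xABABABAB
    }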
if (MI->isVolatile()) return 0; @@ -242,13 +242,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Changed) return II; } - + switch (II->getIntrinsicID()) { default: break; case Intrinsic::objectsize: { // We need target data for just about everything so depend on it. if (!TD) break; - + Type *ReturnTy = CI.getType(); uint64_t DontKnow = II->getArgOperand(1) == Builder->getTrue() ? 0 : -1ULL; @@ -265,6 +265,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Get the current byte offset into the thing. Use the original // operand in case we're looking through a bitcast. SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end()); + if (!GEP->getPointerOperandType()->isPointerTy()) + return 0; Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops); Op1 = GEP->getPointerOperand()->stripPointerCasts(); @@ -322,7 +324,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) if (Operand->getIntrinsicID() == Intrinsic::bswap) return ReplaceInstUsesWith(CI, Operand->getArgOperand(0)); - + // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) if (TruncInst *TI = dyn_cast<TruncInst>(II->getArgOperand(0))) { if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(TI->getOperand(0))) @@ -334,7 +336,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return new TruncInst(V, TI->getType()); } } - + break; case Intrinsic::powi: if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { @@ -359,14 +361,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getArgOperand(0), APInt::getAllOnesValue(BitWidth), - KnownZero, KnownOne); + ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne); unsigned TrailingZeros = KnownOne.countTrailingZeros(); APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros)); if ((Mask & KnownZero) == Mask) return ReplaceInstUsesWith(CI, ConstantInt::get(IT, APInt(BitWidth, TrailingZeros))); - + } break; case Intrinsic::ctlz: { @@ -378,31 +379,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getArgOperand(0), APInt::getAllOnesValue(BitWidth), - KnownZero, KnownOne); + ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne); unsigned LeadingZeros = KnownOne.countLeadingZeros(); APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros)); if ((Mask & KnownZero) == Mask) return ReplaceInstUsesWith(CI, ConstantInt::get(IT, APInt(BitWidth, LeadingZeros))); - + } break; case Intrinsic::uadd_with_overflow: { Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); uint32_t BitWidth = IT->getBitWidth(); - APInt Mask = APInt::getSignBit(BitWidth); APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne); + ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); bool LHSKnownNegative = LHSKnownOne[BitWidth - 1]; bool LHSKnownPositive = LHSKnownZero[BitWidth - 1]; if (LHSKnownNegative || LHSKnownPositive) { APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne); + ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); bool RHSKnownNegative = RHSKnownOne[BitWidth - 1]; bool RHSKnownPositive = 
RHSKnownZero[BitWidth - 1]; if (LHSKnownNegative && RHSKnownNegative) { @@ -448,7 +447,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // X + undef -> undef if (isa<UndefValue>(II->getArgOperand(1))) return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - + if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // X + 0 -> {X, false} if (RHS->isZero()) { @@ -469,7 +468,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (isa<UndefValue>(II->getArgOperand(0)) || isa<UndefValue>(II->getArgOperand(1))) return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - + if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // X - 0 -> {X, false} if (RHS->isZero()) { @@ -477,7 +476,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { UndefValue::get(II->getArgOperand(0)->getType()), ConstantInt::getFalse(II->getContext()) }; - Constant *Struct = + Constant *Struct = ConstantStruct::get(cast<StructType>(II->getType()), V); return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); } @@ -486,14 +485,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::umul_with_overflow: { Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); unsigned BitWidth = cast<IntegerType>(LHS->getType())->getBitWidth(); - APInt Mask = APInt::getAllOnesValue(BitWidth); APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne); + ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne); + ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); // Get the largest possible values for each operand. APInt LHSMax = ~LHSKnownZero; @@ -526,19 +524,19 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // X * undef -> undef if (isa<UndefValue>(II->getArgOperand(1))) return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - + if (ConstantInt *RHSI = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // X*0 -> {0, false} if (RHSI->isZero()) return ReplaceInstUsesWith(CI, Constant::getNullValue(II->getType())); - + // X * 1 -> {X, false} if (RHSI->equalsInt(1)) { Constant *V[] = { UndefValue::get(II->getArgOperand(0)->getType()), ConstantInt::getFalse(II->getContext()) }; - Constant *Struct = + Constant *Struct = ConstantStruct::get(cast<StructType>(II->getType()), V); return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); } @@ -557,7 +555,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_stvxl: // Turn stvx -> store if the pointer is known aligned. if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, TD) >= 16) { - Type *OpPtrTy = + Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); return new StoreInst(II->getArgOperand(0), Ptr); @@ -568,7 +566,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_storeu_dq: // Turn X86 storeu -> store if the pointer is known aligned. 
if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) { - Type *OpPtrTy = + Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(1)->getType()); Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy); return new StoreInst(II->getArgOperand(1), Ptr); @@ -621,19 +619,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. - if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getArgOperand(2))) { - assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!"); - + if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) { + assert(Mask->getType()->getVectorNumElements() == 16 && + "Bad type for intrinsic!"); + // Check that all of the elements are integer constants or undefs. bool AllEltsOk = true; for (unsigned i = 0; i != 16; ++i) { - if (!isa<ConstantInt>(Mask->getOperand(i)) && - !isa<UndefValue>(Mask->getOperand(i))) { + Constant *Elt = Mask->getAggregateElement(i); + if (Elt == 0 || + !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { AllEltsOk = false; break; } } - + if (AllEltsOk) { // Cast the input vectors to byte vectors. Value *Op0 = Builder->CreateBitCast(II->getArgOperand(0), @@ -641,23 +641,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *Op1 = Builder->CreateBitCast(II->getArgOperand(1), Mask->getType()); Value *Result = UndefValue::get(Op0->getType()); - + // Only extract each element once. Value *ExtractedElts[32]; memset(ExtractedElts, 0, sizeof(ExtractedElts)); - + for (unsigned i = 0; i != 16; ++i) { - if (isa<UndefValue>(Mask->getOperand(i))) + if (isa<UndefValue>(Mask->getAggregateElement(i))) continue; - unsigned Idx=cast<ConstantInt>(Mask->getOperand(i))->getZExtValue(); + unsigned Idx = + cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); Idx &= 31; // Match the hardware behavior. - + if (ExtractedElts[Idx] == 0) { - ExtractedElts[Idx] = + ExtractedElts[Idx] = Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1, Builder->getInt32(Idx&15)); } - + // Insert this value into the result vector. Result = Builder->CreateInsertElement(Result, ExtractedElts[Idx], Builder->getInt32(i)); @@ -703,7 +704,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return EraseInstFromFunction(CI); } } - + // Scan down this block to see if there is another stack restore in the // same block without an intervening call/alloca. BasicBlock::iterator BI = II; @@ -728,12 +729,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } } } - + // If the stack restore is in a return, resume, or unwind block and if there // are no allocas or calls between the restore and the return, nuke the // restore. - if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI) || - isa<UnwindInst>(TI))) + if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI))) return EraseInstFromFunction(CI); break; } @@ -748,7 +748,7 @@ Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) { return visitCallSite(&II); } -/// isSafeToEliminateVarargsCast - If this cast does not affect the value +/// isSafeToEliminateVarargsCast - If this cast does not affect the value /// passed through the varargs area, we can eliminate the use of the cast. 
static bool isSafeToEliminateVarargsCast(const CallSite CS,
                                         const CastInst * const CI,
@@ -760,10 +760,10 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
   // The size of ByVal arguments is derived from the type, so we
   // can't change to a type with a different size. If the size were
   // passed explicitly we could avoid this check.
-  if (!CS.paramHasAttr(ix, Attribute::ByVal))
+  if (!CS.isByValArgument(ix))
     return true;
 
-  Type* SrcTy = 
+  Type* SrcTy =
     cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
   Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
   if (!SrcTy->isSized() || !DstTy->isSized())
@@ -807,7 +807,7 @@ public:
 } // end anonymous namespace
 
 // Try to fold some different types of calls here.
-// Currently we're only working with the checking functions, memcpy_chk, 
+// Currently we're only working with the checking functions, memcpy_chk,
 // mempcpy_chk, memmove_chk, memset_chk, strcpy_chk, stpcpy_chk, strncpy_chk,
 // strcat_chk and strncat_chk.
 Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const TargetData *TD) {
@@ -916,7 +916,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
         !CalleeF->isDeclaration()) {
       Instruction *OldCall = CS.getInstruction();
       new StoreInst(ConstantInt::getTrue(Callee->getContext()),
-                UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), 
+                UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
                                   OldCall);
       // If OldCall does not return void then replaceAllUsesWith undef.
       // This allows ValueHandlers and custom metadata to adjust themselves.
       if (!OldCall->getType()->isVoidTy())
         ReplaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
       if (isa<CallInst>(OldCall))
         return EraseInstFromFunction(*OldCall);
-      
+
       // We cannot remove an invoke, because it would change the CFG, just
       // change the callee to a null pointer.
       cast<InvokeInst>(OldCall)->setCalledFunction(
@@ -960,7 +960,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
   PointerType *PTy = cast<PointerType>(Callee->getType());
   FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
   if (FTy->isVarArg()) {
-    int ix = FTy->getNumParams() + (isa<InvokeInst>(Callee) ? 3 : 1);
+    int ix = FTy->getNumParams();
     // See if we can optimize any arguments passed through the varargs area of
     // the call.
     for (CallSite::arg_iterator I = CS.arg_begin()+FTy->getNumParams(),
@@ -1061,17 +1061,17 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     if (!CastInst::isCastable(ActTy, ParamTy))
       return false;   // Cannot transform this parameter value.
 
-    unsigned Attrs = CallerPAL.getParamAttributes(i + 1);
+    Attributes Attrs = CallerPAL.getParamAttributes(i + 1);
     if (Attrs & Attribute::typeIncompatible(ParamTy))
       return false;   // Attribute not compatible with transformed value.
-    
+
     // If the parameter is passed as a byval argument, then we have to have a
     // sized type and the sized type has to have the same size as the old type.
     if (ParamTy != ActTy && (Attrs & Attribute::ByVal)) {
       PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
       if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0)
         return false;
-      
+
       Type *CurElTy = cast<PointerType>(ActTy)->getElementType();
       if (TD->getTypeAllocSize(CurElTy) !=
           TD->getTypeAllocSize(ParamPTy->getElementType()))
         return false;
     }
@@ -1099,8 +1099,17 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     PointerType *APTy = cast<PointerType>(CS.getCalledValue()->getType());
     if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
       return false;
+
+    // If both the callee and the cast type are varargs, we still have to make
+    // sure the number of fixed parameters is the same or we have the same
+    // ABI issues as if we introduce a varargs call.
+    if (FT->isVarArg() &&
+        cast<FunctionType>(APTy->getElementType())->isVarArg() &&
+        FT->getNumParams() !=
+        cast<FunctionType>(APTy->getElementType())->getNumParams())
+      return false;
   }
-  
+
   if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
      !CallerPAL.isEmpty())
    // In this case we have more arguments than the new function type, but we
@@ -1114,7 +1123,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
        return false;
  }
-  
+
  // Okay, we decided that this is a safe thing to do: go ahead and start
  // inserting cast instructions as necessary.
  std::vector<Value*> Args;
@@ -1352,11 +1361,11 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
      // Replace the trampoline call with a direct call.  Let the generic
      // code sort out any function type mismatches.
-      FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes, 
+      FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
                                                FTy->isVarArg());
      Constant *NewCallee =
        NestF->getType() == PointerType::getUnqual(NewFTy) ?
-        NestF : ConstantExpr::getBitCast(NestF, 
+        NestF : ConstantExpr::getBitCast(NestF,
                                         PointerType::getUnqual(NewFTy));
      const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs.begin(),
                                                   NewAttrs.end());
@@ -1385,9 +1394,8 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
  // parameter, there is no need to adjust the argument list.  Let the generic
  // code sort out any function type mismatches.
  Constant *NewCallee =
-    NestF->getType() == PTy ? NestF : 
+    NestF->getType() == PTy ? NestF :
                              ConstantExpr::getBitCast(NestF, PTy);
  CS.setCalledFunction(NewCallee);
  return CS.getInstruction();
}
-
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index f10e48a..39279f4 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -14,6 +14,7 @@
 #include "InstCombine.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Support/PatternMatch.h"
 using namespace llvm;
 using namespace PatternMatch;
@@ -147,8 +148,6 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
   return ReplaceInstUsesWith(CI, New);
 }
 
-
-
 /// EvaluateInDifferentType - Given an expression that
 /// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually
 /// insert the code to evaluate the expression.
@@ -158,7 +157,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
     C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
     // If we got a constantexpr back, try to simplify it with TD info.
     if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
-      C = ConstantFoldConstantExpression(CE, TD);
+      C = ConstantFoldConstantExpression(CE, TD, TLI);
     return C;
   }
@@ -216,7 +215,6 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
   default:
     // TODO: Can handle more cases here.
     llvm_unreachable("Unreachable!");
-    break;
   }
 
   Res->takeName(I);
@@ -528,9 +526,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
       return ReplaceInstUsesWith(CI, In);
     }
-
-
-
+
   // zext (X == 0) to i32 --> X^1 iff X has only the low bit set.
   // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
   // zext (X == 1) to i32 --> X iff X has only the low bit set.
@@ -545,8 +541,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
       // If Op1C is some other power of two, convert:
       uint32_t BitWidth = Op1C->getType()->getBitWidth();
       APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-      APInt TypeMask(APInt::getAllOnesValue(BitWidth));
-      ComputeMaskedBits(ICI->getOperand(0), TypeMask, KnownZero, KnownOne);
+      ComputeMaskedBits(ICI->getOperand(0), KnownZero, KnownOne);
 
       APInt KnownZeroMask(~KnownZero);
       if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
@@ -594,9 +589,8 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
       APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0);
       APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0);
-      APInt TypeMask(APInt::getAllOnesValue(BitWidth));
-      ComputeMaskedBits(LHS, TypeMask, KnownZeroLHS, KnownOneLHS);
-      ComputeMaskedBits(RHS, TypeMask, KnownZeroRHS, KnownOneRHS);
+      ComputeMaskedBits(LHS, KnownZeroLHS, KnownOneLHS);
+      ComputeMaskedBits(RHS, KnownZeroRHS, KnownOneRHS);
 
       if (KnownZeroLHS == KnownZeroRHS && KnownOneLHS == KnownOneRHS) {
         APInt KnownBits = KnownZeroLHS | KnownOneLHS;
@@ -915,8 +909,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
       ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){
       unsigned BitWidth = Op1C->getType()->getBitWidth();
       APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-      APInt TypeMask(APInt::getAllOnesValue(BitWidth));
-      ComputeMaskedBits(Op0, TypeMask, KnownZero, KnownOne);
+      ComputeMaskedBits(Op0, KnownZero, KnownOne);
 
       APInt KnownZeroMask(~KnownZero);
       if (KnownZeroMask.isPowerOf2()) {
@@ -1163,6 +1156,9 @@ static Value *LookThroughFPExtensions(Value *V) {
   if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
     if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext()))
       return V;  // No constant folding of this.
+    // See if the value can be truncated to half and then reextended.
+    if (Value *V = FitsInFPType(CFP, APFloat::IEEEhalf))
+      return V;
     // See if the value can be truncated to float and then reextended.
     if (Value *V = FitsInFPType(CFP, APFloat::IEEEsingle))
       return V;
@@ -1213,10 +1209,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
   }
 
   // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x)
-  // NOTE: This should be disabled by -fno-builtin-sqrt if we ever support it.
CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0)); - if (Call && Call->getCalledFunction() && - Call->getCalledFunction()->getName() == "sqrt" && + if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) && + Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) && Call->getNumArgOperands() == 1 && Call->hasOneUse()) { CastInst *Arg = dyn_cast<CastInst>(Call->getArgOperand(0)); @@ -1423,16 +1418,15 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, // Now that the element types match, get the shuffle mask and RHS of the // shuffle to use, which depends on whether we're increasing or decreasing the // size of the input. - SmallVector<Constant*, 16> ShuffleMask; + SmallVector<uint32_t, 16> ShuffleMask; Value *V2; - IntegerType *Int32Ty = Type::getInt32Ty(SrcTy->getContext()); if (SrcTy->getNumElements() > DestTy->getNumElements()) { // If we're shrinking the number of elements, just shuffle in the low // elements from the input and use undef as the second shuffle input. V2 = UndefValue::get(SrcTy); for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i) - ShuffleMask.push_back(ConstantInt::get(Int32Ty, i)); + ShuffleMask.push_back(i); } else { // If we're increasing the number of elements, shuffle in all of the @@ -1441,14 +1435,16 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, V2 = Constant::getNullValue(SrcTy); unsigned SrcElts = SrcTy->getNumElements(); for (unsigned i = 0, e = SrcElts; i != e; ++i) - ShuffleMask.push_back(ConstantInt::get(Int32Ty, i)); + ShuffleMask.push_back(i); // The excess elements reference the first element of the zero input. - ShuffleMask.append(DestTy->getNumElements()-SrcElts, - ConstantInt::get(Int32Ty, SrcElts)); + for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i) + ShuffleMask.push_back(SrcElts); } - return new ShuffleVectorInst(InVal, V2, ConstantVector::get(ShuffleMask)); + return new ShuffleVectorInst(InVal, V2, + ConstantDataVector::get(V2->getContext(), + ShuffleMask)); } static bool isMultipleOfTypeSize(unsigned Value, Type *Ty) { diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index bb1cbfa..ab2987f 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -203,8 +203,12 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // We need TD information to know the pointer size unless this is inbounds. if (!GEP->isInBounds() && TD == 0) return 0; - ConstantArray *Init = dyn_cast<ConstantArray>(GV->getInitializer()); - if (Init == 0 || Init->getNumOperands() > 1024) return 0; + Constant *Init = GV->getInitializer(); + if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init)) + return 0; + + uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); + if (ArrayElementCount > 1024) return 0; // Don't blow up on huge arrays. // There are many forms of this optimization we can handle, for now, just do // the simple index into a single-dimensional array. @@ -221,7 +225,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // structs. 
SmallVector<unsigned, 4> LaterIndices; - Type *EltTy = cast<ArrayType>(Init->getType())->getElementType(); + Type *EltTy = Init->getType()->getArrayElementType(); for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) { ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i)); if (Idx == 0) return 0; // Variable index. @@ -272,8 +276,9 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // Scan the array and see if one of our patterns matches. Constant *CompareRHS = cast<Constant>(ICI.getOperand(1)); - for (unsigned i = 0, e = Init->getNumOperands(); i != e; ++i) { - Constant *Elt = Init->getOperand(i); + for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) { + Constant *Elt = Init->getAggregateElement(i); + if (Elt == 0) return 0; // If this is indexing an array of structures, get the structure element. if (!LaterIndices.empty()) @@ -284,7 +289,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // Find out if the comparison would be true or false for the i'th element. Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt, - CompareRHS, TD); + CompareRHS, TD, TLI); // If the result is undef for this element, ignore it. if (isa<UndefValue>(C)) { // Extend range state machines to cover this element in case there is an @@ -440,10 +445,10 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // If a 32-bit or 64-bit magic bitvector captures the entire comparison state // of this load, replace it with computation that does: // ((magic_cst >> i) & 1) != 0 - if (Init->getNumOperands() <= 32 || - (TD && Init->getNumOperands() <= 64 && TD->isLegalInteger(64))) { + if (ArrayElementCount <= 32 || + (TD && ArrayElementCount <= 64 && TD->isLegalInteger(64))) { Type *Ty; - if (Init->getNumOperands() <= 32) + if (ArrayElementCount <= 32) Ty = Type::getInt32Ty(Init->getContext()); else Ty = Type::getInt64Ty(Init->getContext()); @@ -566,6 +571,14 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I) { + // Don't transform signed compares of GEPs into index compares. Even if the + // GEP is inbounds, the final add of the base pointer can have signed overflow + // and would change the result of the icmp. + // e.g. "&foo[0] <s &foo[1]" can't be folded to "true" because "foo" could be + // the maximum signed value for the pointer type. + if (ICmpInst::isSigned(Cond)) + return 0; + // Look through bitcasts. if (BitCastInst *BCI = dyn_cast<BitCastInst>(RHS)) RHS = BCI->getOperand(0); @@ -602,6 +615,20 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, return new ICmpInst(ICmpInst::getSignedPredicate(Cond), GEPLHS->getOperand(0), GEPRHS->getOperand(0)); + // If we're comparing GEPs with two base pointers that only differ in type + // and both GEPs have only constant indices or just one use, then fold + // the compare with the adjusted indices. + if (TD && GEPLHS->isInBounds() && GEPRHS->isInBounds() && + (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) && + (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) && + PtrBase->stripPointerCasts() == + GEPRHS->getOperand(0)->stripPointerCasts()) { + Value *Cmp = Builder->CreateICmp(ICmpInst::getSignedPredicate(Cond), + EmitGEPOffset(GEPLHS), + EmitGEPOffset(GEPRHS)); + return ReplaceInstUsesWith(I, Cmp); + } + // Otherwise, the base pointers are different and the indices are // different, bail out. 
return 0; @@ -1001,9 +1028,8 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // of the high bits truncated out of x are known. unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(), SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits(); - APInt Mask(APInt::getHighBitsSet(SrcBits, SrcBits-DstBits)); APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0); - ComputeMaskedBits(LHSI->getOperand(0), Mask, KnownZero, KnownOne); + ComputeMaskedBits(LHSI->getOperand(0), KnownZero, KnownOne); // If all the high bits are known, we can do this xform. if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) { @@ -1657,6 +1683,14 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth)) return 0; + // This is only really a signed overflow check if the inputs have been + // sign-extended; check for that condition. For example, if CI2 is 2^31 and + // the operands of the add are 64 bits wide, we need at least 33 sign bits. + unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1; + if (IC.ComputeNumSignBits(A) < NeededSignBits || + IC.ComputeNumSignBits(B) < NeededSignBits) + return 0; + // In order to replace the original add with a narrower // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant // and truncates that discard the high bits of the add. Verify that this is @@ -1787,6 +1821,24 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, TD)) return ReplaceInstUsesWith(I, V); + // comparing -val or val with non-zero is the same as just comparing val + // ie, abs(val) != 0 -> val != 0 + if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero())) + { + Value *Cond, *SelectTrue, *SelectFalse; + if (match(Op0, m_Select(m_Value(Cond), m_Value(SelectTrue), + m_Value(SelectFalse)))) { + if (Value *V = dyn_castNegVal(SelectTrue)) { + if (V == SelectFalse) + return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1); + } + else if (Value *V = dyn_castNegVal(SelectFalse)) { + if (V == SelectTrue) + return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1); + } + } + } + Type *Ty = Op0->getType(); // icmp's with boolean values can always be turned into bitwise operations @@ -2683,6 +2735,17 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); } + } else { + // See if the RHS value is < UnsignedMin. + APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false); + SMin.convertFromAPInt(APInt::getMinValue(IntWidth), true, + APFloat::rmNearestTiesToEven); + if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0 + if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT || + Pred == ICmpInst::ICMP_UGE) + return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + } } // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or @@ -2822,7 +2885,9 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { const fltSemantics *Sem; // FIXME: This shouldn't be here. 
-    if (LHSExt->getSrcTy()->isFloatTy())
+    if (LHSExt->getSrcTy()->isHalfTy())
+      Sem = &APFloat::IEEEhalf;
+    else if (LHSExt->getSrcTy()->isFloatTy())
       Sem = &APFloat::IEEEsingle;
     else if (LHSExt->getSrcTy()->isDoubleTy())
       Sem = &APFloat::IEEEdouble;
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 7446a51..b2f2e24 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -22,6 +22,72 @@ using namespace llvm;
 
 STATISTIC(NumDeadStore, "Number of dead stores eliminated");
 
+// Try to kill dead allocas by walking through their uses until we see some use
+// that could escape. This is a conservative analysis which tries to handle
+// GEPs, bitcasts, stores, and no-op intrinsics. These tend to be the things
+// left after inlining and SROA finish chewing on an alloca.
+static Instruction *removeDeadAlloca(InstCombiner &IC, AllocaInst &AI) {
+  SmallVector<Instruction *, 4> Worklist, DeadStores;
+  Worklist.push_back(&AI);
+  do {
+    Instruction *PI = Worklist.pop_back_val();
+    for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end();
+         UI != UE; ++UI) {
+      Instruction *I = cast<Instruction>(*UI);
+      switch (I->getOpcode()) {
+      default:
+        // Give up the moment we see something we can't handle.
+        return 0;
+
+      case Instruction::GetElementPtr:
+      case Instruction::BitCast:
+        Worklist.push_back(I);
+        continue;
+
+      case Instruction::Call:
+        // We can handle a limited subset of calls to no-op intrinsics.
+        if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+          switch (II->getIntrinsicID()) {
+          case Intrinsic::dbg_declare:
+          case Intrinsic::dbg_value:
+          case Intrinsic::invariant_start:
+          case Intrinsic::invariant_end:
+          case Intrinsic::lifetime_start:
+          case Intrinsic::lifetime_end:
+            continue;
+          default:
+            return 0;
+          }
+        }
+        // Reject everything else.
+        return 0;
+
+      case Instruction::Store: {
+        // Stores into the alloca are only live if the alloca is live.
+        StoreInst *SI = cast<StoreInst>(I);
+        // We can eliminate atomic stores, but not volatile.
+        if (SI->isVolatile())
+          return 0;
+        // The store is only trivially safe if the pointer is the destination
+        // as opposed to the value. We're conservative here and don't check for
+        // the case where we store the address of a dead alloca into a dead
+        // alloca.
+        if (SI->getPointerOperand() != PI)
+          return 0;
+        DeadStores.push_back(I);
+        continue;
+      }
+      }
+    }
+  } while (!Worklist.empty());
+
+  // The alloca is dead. Kill off all the stores to it, and then replace it
+  // with undef.
+  while (!DeadStores.empty())
+    IC.EraseInstFromFunction(*DeadStores.pop_back_val());
+  return IC.ReplaceInstUsesWith(AI, UndefValue::get(AI.getType()));
+}
+
 Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
   // Ensure that the alloca array size argument has type intptr_t, so that
   // any casting is exposed early.
@@ -81,7 +147,10 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
       AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType()));
   }
 
-  return 0;
+  // Try to aggressively remove allocas which are only used for GEPs, lifetime
+  // markers, and stores. This happens when SROA iteratively promotes stores
+  // out of the alloca, and we need to clean up after it.
+ return removeDeadAlloca(*this, AI); } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 7f48125..5168e2a 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -256,22 +256,18 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - // Simplify mul instructions with a constant RHS... + // Simplify mul instructions with a constant RHS. if (Constant *Op1C = dyn_cast<Constant>(Op1)) { if (ConstantFP *Op1F = dyn_cast<ConstantFP>(Op1C)) { // "In IEEE floating point, x*1 is not equivalent to x for nans. However, // ANSI says we can drop signals, so we can do this anyway." (from GCC) if (Op1F->isExactlyValue(1.0)) return ReplaceInstUsesWith(I, Op0); // Eliminate 'fmul double %X, 1.0' - } else if (Op1C->getType()->isVectorTy()) { - if (ConstantVector *Op1V = dyn_cast<ConstantVector>(Op1C)) { - // As above, vector X*splat(1.0) -> X in all defined cases. - if (Constant *Splat = Op1V->getSplatValue()) { - if (ConstantFP *F = dyn_cast<ConstantFP>(Splat)) - if (F->isExactlyValue(1.0)) - return ReplaceInstUsesWith(I, Op0); - } - } + } else if (ConstantDataVector *Op1V = dyn_cast<ConstantDataVector>(Op1C)) { + // As above, vector X*splat(1.0) -> X in all defined cases. + if (ConstantFP *F = dyn_cast_or_null<ConstantFP>(Op1V->getSplatValue())) + if (F->isExactlyValue(1.0)) + return ReplaceInstUsesWith(I, Op0); } // Try to fold constant mul into select arguments. @@ -441,19 +437,23 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { // Handle the integer div common cases if (Instruction *Common = commonIDivTransforms(I)) return Common; - - if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) { + + { // X udiv 2^C -> X >> C // Check to see if this is an unsigned division with an exact power of 2, // if so, convert to a right shift. - if (C->getValue().isPowerOf2()) { // 0 not included in isPowerOf2 + const APInt *C; + if (match(Op1, m_Power2(C))) { BinaryOperator *LShr = - BinaryOperator::CreateLShr(Op0, - ConstantInt::get(Op0->getType(), C->getValue().logBase2())); + BinaryOperator::CreateLShr(Op0, + ConstantInt::get(Op0->getType(), + C->logBase2())); if (I.isExact()) LShr->setIsExact(); return LShr; } + } + if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) { // X udiv C, where C >= signbit if (C->getValue().isNegative()) { Value *IC = Builder->CreateICmpULT(Op0, C); @@ -684,28 +684,36 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) { } // If it's a constant vector, flip any negative values positive. 
- if (ConstantVector *RHSV = dyn_cast<ConstantVector>(Op1)) { - unsigned VWidth = RHSV->getNumOperands(); + if (isa<ConstantVector>(Op1) || isa<ConstantDataVector>(Op1)) { + Constant *C = cast<Constant>(Op1); + unsigned VWidth = C->getType()->getVectorNumElements(); bool hasNegative = false; - for (unsigned i = 0; !hasNegative && i != VWidth; ++i) - if (ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV->getOperand(i))) + bool hasMissing = false; + for (unsigned i = 0; i != VWidth; ++i) { + Constant *Elt = C->getAggregateElement(i); + if (Elt == 0) { + hasMissing = true; + break; + } + + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elt)) if (RHS->isNegative()) hasNegative = true; + } - if (hasNegative) { - std::vector<Constant *> Elts(VWidth); + if (hasNegative && !hasMissing) { + SmallVector<Constant *, 16> Elts(VWidth); for (unsigned i = 0; i != VWidth; ++i) { - if (ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV->getOperand(i))) { + Elts[i] = C->getAggregateElement(i); // Handle undef, etc. + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elts[i])) { if (RHS->isNegative()) Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS)); - else - Elts[i] = RHS; } } Constant *NewRHSV = ConstantVector::get(Elts); - if (NewRHSV != RHSV) { + if (NewRHSV != C) { // Don't loop on -MININT Worklist.AddValue(I.getOperand(1)); I.setOperand(1, NewRHSV); return &I; diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 91e60a4..e727b2c 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -184,7 +184,6 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, return BinaryOperator::Create(BO->getOpcode(), NewSI, MatchOp); } llvm_unreachable("Shouldn't get here"); - return 0; } static bool isSelect01(Constant *C1, Constant *C2) { @@ -282,7 +281,8 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, /// SimplifyWithOpReplaced - See if V simplifies when its operand Op is /// replaced with RepOp. static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, - const TargetData *TD) { + const TargetData *TD, + const TargetLibraryInfo *TLI) { // Trivial replacement. if (V == Op) return RepOp; @@ -294,17 +294,19 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, // If this is a binary operator, try to simplify it with the replaced op. if (BinaryOperator *B = dyn_cast<BinaryOperator>(I)) { if (B->getOperand(0) == Op) - return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), TD); + return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), TD, TLI); if (B->getOperand(1) == Op) - return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, TD); + return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, TD, TLI); } // Same for CmpInsts. if (CmpInst *C = dyn_cast<CmpInst>(I)) { if (C->getOperand(0) == Op) - return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), TD); + return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), TD, + TLI); if (C->getOperand(1) == Op) - return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, TD); + return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, TD, + TLI); } // TODO: We could hand off more cases to instsimplify here. 
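[Editor's note: SimplifyWithOpReplaced, which the hunk above teaches to pass TargetLibraryInfo through to instsimplify, underpins the select-on-equality folds that follow. A standalone toy illustration of the idea (not part of this patch; all names are invented): inside the arm of a select guarded by x == c, the arm may be simplified as if x were the constant c, since that arm is only evaluated when the equality holds.

#include <cassert>

int main() {
  const int C = 5;
  for (int X = 0; X <= 10; ++X) {
    int Sel = (X == C) ? X + 1 : 0;    // select (icmp eq X, C), X+1, 0
    int Folded = (X == C) ? C + 1 : 0; // true arm rewritten with X -> C
    assert(Sel == Folded);             // the rewrite never changes the result
  }
  return 0;
}

End of editor's note.]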
@@ -330,7 +332,7 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, return ConstantFoldLoadFromConstPtr(ConstOps[0], TD); return ConstantFoldInstOperands(I->getOpcode(), I->getType(), - ConstOps, TD); + ConstOps, TD, TLI); } } @@ -479,18 +481,18 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, // arms of the select. See if substituting this value into the arm and // simplifying the result yields the same value as the other arm. if (Pred == ICmpInst::ICMP_EQ) { - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD) == TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD) == TrueVal) + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD, TLI) == TrueVal || + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD, TLI) == TrueVal) return ReplaceInstUsesWith(SI, FalseVal); - if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD) == FalseVal || - SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD) == FalseVal) + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD, TLI) == FalseVal || + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD, TLI) == FalseVal) return ReplaceInstUsesWith(SI, FalseVal); } else if (Pred == ICmpInst::ICMP_NE) { - if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD) == FalseVal || - SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD) == FalseVal) + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD, TLI) == FalseVal || + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD, TLI) == FalseVal) return ReplaceInstUsesWith(SI, TrueVal); - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD) == TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD) == TrueVal) + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD, TLI) == TrueVal || + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD, TLI) == TrueVal) return ReplaceInstUsesWith(SI, TrueVal); } @@ -679,6 +681,13 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return BinaryOperator::CreateOr(CondVal, FalseVal); else if (CondVal == FalseVal) return BinaryOperator::CreateAnd(CondVal, TrueVal); + + // select a, ~a, b -> (~a)&b + // select a, b, ~a -> (~a)|b + if (match(TrueVal, m_Not(m_Specific(CondVal)))) + return BinaryOperator::CreateAnd(TrueVal, FalseVal); + else if (match(FalseVal, m_Not(m_Specific(CondVal)))) + return BinaryOperator::CreateOr(TrueVal, FalseVal); } // Selecting between two integer constants? diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 6d85add..b31049e 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -190,7 +190,8 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, V = IC.Builder->CreateLShr(C, NumBits); // If we got a constantexpr back, try to simplify it with TD info. 
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) - V = ConstantFoldConstantExpression(CE, IC.getTargetData()); + V = ConstantFoldConstantExpression(CE, IC.getTargetData(), + IC.getTargetLibraryInfo()); return V; } @@ -198,7 +199,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, IC.Worklist.Add(I); switch (I->getOpcode()) { - default: assert(0 && "Inconsistency with CanEvaluateShifted"); + default: llvm_unreachable("Inconsistency with CanEvaluateShifted"); case Instruction::And: case Instruction::Or: case Instruction::Xor: @@ -535,12 +536,11 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, if (ShiftAmt1 == 0) return 0; // Will be simplified in the future. Value *X = ShiftOp->getOperand(0); - uint32_t AmtSum = ShiftAmt1+ShiftAmt2; // Fold into one big shift. - IntegerType *Ty = cast<IntegerType>(I.getType()); // Check for (X << c1) << c2 and (X >> c1) >> c2 if (I.getOpcode() == ShiftOp->getOpcode()) { + uint32_t AmtSum = ShiftAmt1+ShiftAmt2; // Fold into one big shift. // If this is oversized composite shift, then unsigned shifts get 0, ashr // saturates. if (AmtSum >= TypeBits) { @@ -576,7 +576,16 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, ShiftOp->getOpcode() != Instruction::Shl) { assert(ShiftOp->getOpcode() == Instruction::LShr || ShiftOp->getOpcode() == Instruction::AShr); - Value *Shift = Builder->CreateShl(X, ConstantInt::get(Ty, ShiftDiff)); + ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); + if (ShiftOp->isExact()) { + // (X >>?,exact C1) << C2 --> X << (C2-C1) + BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl, + X, ShiftDiffCst); + NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); + NewShl->setHasNoSignedWrap(I.hasNoSignedWrap()); + return NewShl; + } + Value *Shift = Builder->CreateShl(X, ShiftDiffCst); APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2)); return BinaryOperator::CreateAnd(Shift, @@ -586,15 +595,34 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2) if (I.getOpcode() == Instruction::LShr && ShiftOp->getOpcode() == Instruction::Shl) { - assert(ShiftOp->getOpcode() == Instruction::Shl); - Value *Shift = Builder->CreateLShr(X, ConstantInt::get(Ty, ShiftDiff)); + ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); + // (X <<nuw C1) >>u C2 --> X >>u (C2-C1) + if (ShiftOp->hasNoUnsignedWrap()) { + BinaryOperator *NewLShr = BinaryOperator::Create(Instruction::LShr, + X, ShiftDiffCst); + NewLShr->setIsExact(I.isExact()); + return NewLShr; + } + Value *Shift = Builder->CreateLShr(X, ShiftDiffCst); APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2)); return BinaryOperator::CreateAnd(Shift, ConstantInt::get(I.getContext(),Mask)); } - - // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. + + // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However, + // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits. 
+ if (I.getOpcode() == Instruction::AShr && + ShiftOp->getOpcode() == Instruction::Shl) { + if (ShiftOp->hasNoSignedWrap()) { + // (X <<nsw C1) >>s C2 --> X >>s (C2-C1) + ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); + BinaryOperator *NewAShr = BinaryOperator::Create(Instruction::AShr, + X, ShiftDiffCst); + NewAShr->setIsExact(I.isExact()); + return NewAShr; + } + } } else { assert(ShiftAmt2 < ShiftAmt1); uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2; @@ -602,9 +630,16 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // (X >>? C1) << C2 --> X >>? (C1-C2) & (-1 << C2) if (I.getOpcode() == Instruction::Shl && ShiftOp->getOpcode() != Instruction::Shl) { - Value *Shift = Builder->CreateBinOp(ShiftOp->getOpcode(), X, - ConstantInt::get(Ty, ShiftDiff)); - + ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); + if (ShiftOp->isExact()) { + // (X >>?exact C1) << C2 --> X >>?exact (C1-C2) + BinaryOperator *NewShr = BinaryOperator::Create(ShiftOp->getOpcode(), + X, ShiftDiffCst); + NewShr->setIsExact(true); + return NewShr; + } + Value *Shift = Builder->CreateBinOp(ShiftOp->getOpcode(), + X, ShiftDiffCst); APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2)); return BinaryOperator::CreateAnd(Shift, ConstantInt::get(I.getContext(),Mask)); @@ -613,14 +648,34 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // (X << C1) >>u C2 --> X << (C1-C2) & (-1 >> C2) if (I.getOpcode() == Instruction::LShr && ShiftOp->getOpcode() == Instruction::Shl) { - Value *Shift = Builder->CreateShl(X, ConstantInt::get(Ty, ShiftDiff)); + ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); + if (ShiftOp->hasNoUnsignedWrap()) { + // (X <<nuw C1) >>u C2 --> X <<nuw (C1-C2) + BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl, + X, ShiftDiffCst); + NewShl->setHasNoUnsignedWrap(true); + return NewShl; + } + Value *Shift = Builder->CreateShl(X, ShiftDiffCst); APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2)); return BinaryOperator::CreateAnd(Shift, ConstantInt::get(I.getContext(),Mask)); } - // We can't handle (X << C1) >>a C2, it shifts arbitrary bits in. + // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However, + // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits. + if (I.getOpcode() == Instruction::AShr && + ShiftOp->getOpcode() == Instruction::Shl) { + if (ShiftOp->hasNoSignedWrap()) { + // (X <<nsw C1) >>s C2 --> X <<nsw (C1-C2) + ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); + BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl, + X, ShiftDiffCst); + NewShl->setHasNoSignedWrap(true); + return NewShl; + } + } } } return 0; diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 5cd9a4b..125c74a 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -142,7 +142,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Instruction *I = dyn_cast<Instruction>(V); if (!I) { - ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(V, KnownZero, KnownOne, Depth); return 0; // Only analyze instructions. } @@ -156,10 +156,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // this instruction has a simpler value in that context. 
if (I->getOpcode() == Instruction::And) { // If either the LHS or the RHS are Zero, the result is zero. - ComputeMaskedBits(I->getOperand(1), DemandedMask, - RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownZero, - LHSKnownZero, LHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known 1 on one side, return the other. // These bits cannot contribute to the result of the 'and' in this @@ -180,10 +178,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // only bits from X or Y are demanded. // If either the LHS or the RHS are One, the result is One. - ComputeMaskedBits(I->getOperand(1), DemandedMask, - RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownOne, - LHSKnownZero, LHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known zero on one side, return the // other. These bits cannot contribute to the result of the 'or' in this @@ -206,7 +202,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } // Compute the KnownZero/KnownOne bits to simplify things downstream. - ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(I, KnownZero, KnownOne, Depth); return 0; } @@ -219,7 +215,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, switch (I->getOpcode()) { default: - ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(I, KnownZero, KnownOne, Depth); break; case Instruction::And: // If either the LHS or the RHS are Zero, the result is zero. @@ -567,9 +563,20 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, LHSKnownZero, LHSKnownOne, Depth+1)) return I; } + // Otherwise just hand the sub off to ComputeMaskedBits to fill in // the known zeros and ones. - ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(V, KnownZero, KnownOne, Depth); + + // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known + // zero. + if (ConstantInt *C0 = dyn_cast<ConstantInt>(I->getOperand(0))) { + APInt I0 = C0->getValue(); + if ((I0 + 1).isPowerOf2() && (I0 | KnownZero).isAllOnesValue()) { + Instruction *Xor = BinaryOperator::CreateXor(I->getOperand(1), C0); + return InsertNewInstWith(Xor, *I); + } + } break; case Instruction::Shl: if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { @@ -671,8 +678,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (BitWidth <= ShiftAmt || KnownZero[BitWidth-ShiftAmt-1] || (HighBits & ~DemandedMask) == HighBits) { // Perform the logical shift right. - Instruction *NewVal = BinaryOperator::CreateLShr( - I->getOperand(0), SA, I->getName()); + BinaryOperator *NewVal = BinaryOperator::CreateLShr(I->getOperand(0), + SA, I->getName()); + NewVal->setIsExact(cast<BinaryOperator>(I)->isExact()); return InsertNewInstWith(NewVal, *I); } else if ((KnownOne & SignBit) != 0) { // New bits are known one. KnownOne |= HighBits; @@ -717,10 +725,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // The sign bit is the LHS's sign bit, except when the result of the // remainder is zero. 
if (DemandedMask.isNegative() && KnownZero.isNonNegative()) { - APInt Mask2 = APInt::getSignBit(BitWidth); APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(I->getOperand(0), Mask2, LHSKnownZero, LHSKnownOne, - Depth+1); + ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If it's known zero, our sign bit is also zero. if (LHSKnownZero.isNegative()) KnownZero |= LHSKnownZero; @@ -783,7 +789,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return 0; } } - ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(V, KnownZero, KnownOne, Depth); break; } @@ -822,46 +828,39 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } UndefElts = 0; - if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) { + + // Handle ConstantAggregateZero, ConstantVector, ConstantDataSequential. + if (Constant *C = dyn_cast<Constant>(V)) { + // Check if this is identity. If so, return 0 since we are not simplifying + // anything. + if (DemandedElts.isAllOnesValue()) + return 0; + Type *EltTy = cast<VectorType>(V->getType())->getElementType(); Constant *Undef = UndefValue::get(EltTy); - - std::vector<Constant*> Elts; - for (unsigned i = 0; i != VWidth; ++i) + + SmallVector<Constant*, 16> Elts; + for (unsigned i = 0; i != VWidth; ++i) { if (!DemandedElts[i]) { // If not demanded, set to undef. Elts.push_back(Undef); UndefElts.setBit(i); - } else if (isa<UndefValue>(CV->getOperand(i))) { // Already undef. + continue; + } + + Constant *Elt = C->getAggregateElement(i); + if (Elt == 0) return 0; + + if (isa<UndefValue>(Elt)) { // Already undef. Elts.push_back(Undef); UndefElts.setBit(i); } else { // Otherwise, defined. - Elts.push_back(CV->getOperand(i)); + Elts.push_back(Elt); } - - // If we changed the constant, return it. - Constant *NewCP = ConstantVector::get(Elts); - return NewCP != CV ? NewCP : 0; - } - - if (isa<ConstantAggregateZero>(V)) { - // Simplify the CAZ to a ConstantVector where the non-demanded elements are - // set to undef. - - // Check if this is identity. If so, return 0 since we are not simplifying - // anything. - if (DemandedElts.isAllOnesValue()) - return 0; - - Type *EltTy = cast<VectorType>(V->getType())->getElementType(); - Constant *Zero = Constant::getNullValue(EltTy); - Constant *Undef = UndefValue::get(EltTy); - std::vector<Constant*> Elts; - for (unsigned i = 0; i != VWidth; ++i) { - Constant *Elt = DemandedElts[i] ? Zero : Undef; - Elts.push_back(Elt); } - UndefElts = DemandedElts ^ EltMask; - return ConstantVector::get(Elts); + + // If we changed the constant, return it. + Constant *NewCV = ConstantVector::get(Elts); + return NewCV != C ? NewCV : 0; } // Limit search depth. @@ -977,7 +976,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (NewUndefElts) { // Add additional discovered undefs. - std::vector<Constant*> Elts; + SmallVector<Constant*, 16> Elts; for (unsigned i = 0; i < VWidth; ++i) { if (UndefElts[i]) Elts.push_back(UndefValue::get(Type::getInt32Ty(I->getContext()))); diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 154267c..cf60f0f 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -16,16 +16,16 @@ using namespace llvm; /// CheapToScalarize - Return true if the value is cheaper to scalarize than it -/// is to leave as a vector operation. 
+/// is to leave as a vector operation. isConstant indicates whether we're +/// extracting one known element. If false we're extracting a variable index. static bool CheapToScalarize(Value *V, bool isConstant) { - if (isa<ConstantAggregateZero>(V)) - return true; - if (ConstantVector *C = dyn_cast<ConstantVector>(V)) { + if (Constant *C = dyn_cast<Constant>(V)) { if (isConstant) return true; - // If all elts are the same, we can extract. - Constant *Op0 = C->getOperand(0); - for (unsigned i = 1; i < C->getNumOperands(); ++i) - if (C->getOperand(i) != Op0) + + // If all elts are the same, we can extract it and use any of the values. + Constant *Op0 = C->getAggregateElement(0U); + for (unsigned i = 1, e = V->getType()->getVectorNumElements(); i != e; ++i) + if (C->getAggregateElement(i) != Op0) return false; return true; } @@ -53,41 +53,18 @@ static bool CheapToScalarize(Value *V, bool isConstant) { return false; } -/// getShuffleMask - Read and decode a shufflevector mask. -/// Turn undef elements into negative values. -static std::vector<int> getShuffleMask(const ShuffleVectorInst *SVI) { - unsigned NElts = SVI->getType()->getNumElements(); - if (isa<ConstantAggregateZero>(SVI->getOperand(2))) - return std::vector<int>(NElts, 0); - if (isa<UndefValue>(SVI->getOperand(2))) - return std::vector<int>(NElts, -1); - - std::vector<int> Result; - const ConstantVector *CP = cast<ConstantVector>(SVI->getOperand(2)); - for (User::const_op_iterator i = CP->op_begin(), e = CP->op_end(); i!=e; ++i) - if (isa<UndefValue>(*i)) - Result.push_back(-1); // undef - else - Result.push_back(cast<ConstantInt>(*i)->getZExtValue()); - return Result; -} - /// FindScalarElement - Given a vector and an element number, see if the scalar /// value is already around as a register, for example if it were inserted then /// extracted from the vector. static Value *FindScalarElement(Value *V, unsigned EltNo) { assert(V->getType()->isVectorTy() && "Not looking at a vector?"); - VectorType *PTy = cast<VectorType>(V->getType()); - unsigned Width = PTy->getNumElements(); + VectorType *VTy = cast<VectorType>(V->getType()); + unsigned Width = VTy->getNumElements(); if (EltNo >= Width) // Out of range access. - return UndefValue::get(PTy->getElementType()); + return UndefValue::get(VTy->getElementType()); - if (isa<UndefValue>(V)) - return UndefValue::get(PTy->getElementType()); - if (isa<ConstantAggregateZero>(V)) - return Constant::getNullValue(PTy->getElementType()); - if (ConstantVector *CP = dyn_cast<ConstantVector>(V)) - return CP->getOperand(EltNo); + if (Constant *C = dyn_cast<Constant>(V)) + return C->getAggregateElement(EltNo); if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) { // If this is an insert to a variable element, we don't know what it is. 
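[Editor's note: CheapToScalarize above now queries any Constant through getAggregateElement and treats a vector whose elements all equal element 0 as a splat, so an extract even at a variable index can be folded to that single value. A standalone sketch of the splat test (illustrative only, not part of this patch; the helper is invented):

#include <cassert>
#include <cstddef>
#include <vector>

// Mirrors the loop in CheapToScalarize: every element must equal element 0.
static bool isSplat(const std::vector<int> &Elts) {
  for (std::size_t i = 1; i != Elts.size(); ++i)
    if (Elts[i] != Elts[0])
      return false;
  return true;
}

int main() {
  const std::vector<int> Splat = {7, 7, 7, 7};
  const std::vector<int> Mixed = {7, 7, 1, 7};
  assert(isSplat(Splat));  // extract at any index folds to 7
  assert(!isSplat(Mixed)); // extract must stay unless the index is known
  return 0;
}

End of editor's note.]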
@@ -106,11 +83,10 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { } if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) { - unsigned LHSWidth = - cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements(); - int InEl = getShuffleMask(SVI)[EltNo]; + unsigned LHSWidth = SVI->getOperand(0)->getType()->getVectorNumElements(); + int InEl = SVI->getMaskValue(EltNo); if (InEl < 0) - return UndefValue::get(PTy->getElementType()); + return UndefValue::get(VTy->getElementType()); if (InEl < (int)LHSWidth) return FindScalarElement(SVI->getOperand(0), InEl); return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth); @@ -121,27 +97,11 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { } Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { - // If vector val is undef, replace extract with scalar undef. - if (isa<UndefValue>(EI.getOperand(0))) - return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType())); - - // If vector val is constant 0, replace extract with scalar 0. - if (isa<ConstantAggregateZero>(EI.getOperand(0))) - return ReplaceInstUsesWith(EI, Constant::getNullValue(EI.getType())); - - if (ConstantVector *C = dyn_cast<ConstantVector>(EI.getOperand(0))) { - // If vector val is constant with all elements the same, replace EI with - // that element. When the elements are not identical, we cannot replace yet - // (we do that below, but only when the index is constant). - Constant *op0 = C->getOperand(0); - for (unsigned i = 1; i != C->getNumOperands(); ++i) - if (C->getOperand(i) != op0) { - op0 = 0; - break; - } - if (op0) - return ReplaceInstUsesWith(EI, op0); - } + // If vector val is constant with all elements the same, replace EI with + // that element. We handle a known element # below. + if (Constant *C = dyn_cast<Constant>(EI.getOperand(0))) + if (CheapToScalarize(C, false)) + return ReplaceInstUsesWith(EI, C->getAggregateElement(0U)); // If extracting a specified index from the vector, see if we can recursively // find a previously computed scalar that was inserted into the vector. @@ -175,8 +135,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // the same number of elements, see if we can find the source element from // it. In this case, we will end up needing to bitcast the scalars. if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) { - if (VectorType *VT = - dyn_cast<VectorType>(BCI->getOperand(0)->getType())) + if (VectorType *VT = dyn_cast<VectorType>(BCI->getOperand(0)->getType())) if (VT->getNumElements() == VectorWidth) if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal)) return new BitCastInst(Elt, EI.getType()); @@ -212,10 +171,10 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // If this is extracting an element from a shufflevector, figure out where // it came from and extract from the appropriate input element instead. if (ConstantInt *Elt = dyn_cast<ConstantInt>(EI.getOperand(1))) { - int SrcIdx = getShuffleMask(SVI)[Elt->getZExtValue()]; + int SrcIdx = SVI->getMaskValue(Elt->getZExtValue()); Value *Src; unsigned LHSWidth = - cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements(); + SVI->getOperand(0)->getType()->getVectorNumElements(); if (SrcIdx < 0) return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType())); @@ -248,7 +207,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { /// elements from either LHS or RHS, return the shuffle mask and true. 
/// Otherwise, return false. static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, - std::vector<Constant*> &Mask) { + SmallVectorImpl<Constant*> &Mask) { assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() && "Invalid CollectSingleShuffleElements"); unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); @@ -325,7 +284,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, /// CollectShuffleElements - We are building a shuffle of V, using RHS as the /// RHS of the shuffle instruction, if it is not null. Return a shuffle mask /// that computes V and the LHS value of the shuffle. -static Value *CollectShuffleElements(Value *V, std::vector<Constant*> &Mask, +static Value *CollectShuffleElements(Value *V, SmallVectorImpl<Constant*> &Mask, Value *&RHS) { assert(V->getType()->isVectorTy() && (RHS == 0 || V->getType() == RHS->getType()) && @@ -335,10 +294,14 @@ static Value *CollectShuffleElements(Value *V, std::vector<Constant*> &Mask, if (isa<UndefValue>(V)) { Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); return V; - } else if (isa<ConstantAggregateZero>(V)) { + } + + if (isa<ConstantAggregateZero>(V)) { Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0)); return V; - } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) { + } + + if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) { // If this is an insert of an extract from some other vector, include it. Value *VecOp = IEI->getOperand(0); Value *ScalarOp = IEI->getOperand(1); @@ -421,7 +384,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { // If this insertelement isn't used by some other insertelement, turn it // (and any insertelements it points to), into one big shuffle. if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.use_back())) { - std::vector<Constant*> Mask; + SmallVector<Constant*, 16> Mask; Value *RHS = 0; Value *LHS = CollectShuffleElements(&IE, Mask, RHS); if (RHS == 0) RHS = UndefValue::get(LHS->getType()); @@ -447,7 +410,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); - std::vector<int> Mask = getShuffleMask(&SVI); + SmallVector<int, 16> Mask = SVI.getShuffleMask(); bool MadeChange = false; @@ -457,9 +420,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { unsigned VWidth = cast<VectorType>(SVI.getType())->getNumElements(); - if (VWidth != cast<VectorType>(LHS->getType())->getNumElements()) - return 0; - APInt UndefElts(VWidth, 0); APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) { @@ -470,29 +430,34 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { MadeChange = true; } + unsigned LHSWidth = cast<VectorType>(LHS->getType())->getNumElements(); + // Canonicalize shuffle(x ,x,mask) -> shuffle(x, undef,mask') // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask'). if (LHS == RHS || isa<UndefValue>(LHS)) { if (isa<UndefValue>(LHS) && LHS == RHS) { // shuffle(undef,undef,mask) -> undef. - return ReplaceInstUsesWith(SVI, LHS); + Value* result = (VWidth == LHSWidth) + ? LHS : UndefValue::get(SVI.getType()); + return ReplaceInstUsesWith(SVI, result); } // Remap any references to RHS to use LHS. 
- std::vector<Constant*> Elts; - for (unsigned i = 0, e = Mask.size(); i != e; ++i) { - if (Mask[i] < 0) + SmallVector<Constant*, 16> Elts; + for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) { + if (Mask[i] < 0) { Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); - else { - if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) || - (Mask[i] < (int)e && isa<UndefValue>(LHS))) { - Mask[i] = -1; // Turn into undef. - Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); - } else { - Mask[i] = Mask[i] % e; // Force to LHS. - Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()), - Mask[i])); - } + continue; + } + + if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) || + (Mask[i] < (int)e && isa<UndefValue>(LHS))) { + Mask[i] = -1; // Turn into undef. + Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); + } else { + Mask[i] = Mask[i] % e; // Force to LHS. + Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()), + Mask[i])); } } SVI.setOperand(0, SVI.getOperand(1)); @@ -503,72 +468,204 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { MadeChange = true; } - // Analyze the shuffle, are the LHS or RHS and identity shuffles? - bool isLHSID = true, isRHSID = true; + if (VWidth == LHSWidth) { + // Analyze the shuffle: is the LHS or the RHS an identity shuffle? + bool isLHSID = true, isRHSID = true; - for (unsigned i = 0, e = Mask.size(); i != e; ++i) { - if (Mask[i] < 0) continue; // Ignore undef values. - // Is this an identity shuffle of the LHS value? - isLHSID &= (Mask[i] == (int)i); + for (unsigned i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] < 0) continue; // Ignore undef values. + // Is this an identity shuffle of the LHS value? + isLHSID &= (Mask[i] == (int)i); - // Is this an identity shuffle of the RHS value? - isRHSID &= (Mask[i]-e == i); - } + // Is this an identity shuffle of the RHS value? + isRHSID &= (Mask[i]-e == i); + } - // Eliminate identity shuffles. - if (isLHSID) return ReplaceInstUsesWith(SVI, LHS); - if (isRHSID) return ReplaceInstUsesWith(SVI, RHS); + // Eliminate identity shuffles. + if (isLHSID) return ReplaceInstUsesWith(SVI, LHS); + if (isRHSID) return ReplaceInstUsesWith(SVI, RHS); + } // If the LHS is a shufflevector itself, see if we can combine it with this - // one without producing an unusual shuffle. Here we are really conservative: + // one without producing an unusual shuffle. + // Cases that might be simplified: + // 1. + // x1=shuffle(v1,v2,mask1) + // x=shuffle(x1,undef,mask) + // ==> + // x=shuffle(v1,undef,newMask) + // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -1 + // 2. + // x1=shuffle(v1,undef,mask1) + // x=shuffle(x1,x2,mask) + // where v1.size() == mask1.size() + // ==> + // x=shuffle(v1,x2,newMask) + // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i] + // 3. + // x2=shuffle(v2,undef,mask2) + // x=shuffle(x1,x2,mask) + // where v2.size() == mask2.size() + // ==> + // x=shuffle(x1,v2,newMask) + // newMask[i] = (mask[i] < x1.size()) + // ? mask[i] : mask2[mask[i]-x1.size()]+x1.size() + // 4. + // x1=shuffle(v1,undef,mask1) + // x2=shuffle(v2,undef,mask2) + // x=shuffle(x1,x2,mask) + // where v1.size() == v2.size() + // ==> + // x=shuffle(v1,v2,newMask) + // newMask[i] = (mask[i] < x1.size()) + // ?
mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size() + // + // Here we are really conservative: // we are absolutely afraid of producing a shuffle mask not in the input // program, because the code gen may not be smart enough to turn a merged // shuffle into two specific shuffles: it may produce worse code. As such, // we only merge two shuffles if the result is either a splat or one of the - // two input shuffle masks. In this case, merging the shuffles just removes + // input shuffle masks. In this case, merging the shuffles just removes // one instruction, which we know is safe. This is good for things like - // turning: (splat(splat)) -> splat. - if (ShuffleVectorInst *LHSSVI = dyn_cast<ShuffleVectorInst>(LHS)) { + // turning: (splat(splat)) -> splat, or + // merge(V[0..n], V[n+1..2n]) -> V[0..2n] + ShuffleVectorInst* LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS); + ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS); + if (LHSShuffle) + if (!isa<UndefValue>(LHSShuffle->getOperand(1)) && !isa<UndefValue>(RHS)) + LHSShuffle = NULL; + if (RHSShuffle) + if (!isa<UndefValue>(RHSShuffle->getOperand(1))) + RHSShuffle = NULL; + if (!LHSShuffle && !RHSShuffle) + return MadeChange ? &SVI : 0; + + Value* LHSOp0 = NULL; + Value* LHSOp1 = NULL; + Value* RHSOp0 = NULL; + unsigned LHSOp0Width = 0; + unsigned RHSOp0Width = 0; + if (LHSShuffle) { + LHSOp0 = LHSShuffle->getOperand(0); + LHSOp1 = LHSShuffle->getOperand(1); + LHSOp0Width = cast<VectorType>(LHSOp0->getType())->getNumElements(); + } + if (RHSShuffle) { + RHSOp0 = RHSShuffle->getOperand(0); + RHSOp0Width = cast<VectorType>(RHSOp0->getType())->getNumElements(); + } + Value* newLHS = LHS; + Value* newRHS = RHS; + if (LHSShuffle) { + // case 1 if (isa<UndefValue>(RHS)) { - std::vector<int> LHSMask = getShuffleMask(LHSSVI); - - if (LHSMask.size() == Mask.size()) { - std::vector<int> NewMask; - bool isSplat = true; - int SplatElt = -1; // undef - for (unsigned i = 0, e = Mask.size(); i != e; ++i) { - int MaskElt; - if (Mask[i] < 0 || Mask[i] >= (int)e) - MaskElt = -1; // undef - else - MaskElt = LHSMask[Mask[i]]; - // Check if this could still be a splat. - if (MaskElt >= 0) { - if (SplatElt >=0 && SplatElt != MaskElt) - isSplat = false; - SplatElt = MaskElt; - } - NewMask.push_back(MaskElt); - } + newLHS = LHSOp0; + newRHS = LHSOp1; + } + // case 2 or 4 + else if (LHSOp0Width == LHSWidth) { + newLHS = LHSOp0; + } + } + // case 3 or 4 + if (RHSShuffle && RHSOp0Width == LHSWidth) { + newRHS = RHSOp0; + } + // case 4 + if (LHSOp0 == RHSOp0) { + newLHS = LHSOp0; + newRHS = NULL; + } - // If the result mask is equal to the src shuffle or this - // shuffle mask, do the replacement. - if (isSplat || NewMask == LHSMask || NewMask == Mask) { - std::vector<Constant*> Elts; - Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); - for (unsigned i = 0, e = NewMask.size(); i != e; ++i) { - if (NewMask[i] < 0) { - Elts.push_back(UndefValue::get(Int32Ty)); - } else { - Elts.push_back(ConstantInt::get(Int32Ty, NewMask[i])); - } - } - return new ShuffleVectorInst(LHSSVI->getOperand(0), - LHSSVI->getOperand(1), - ConstantVector::get(Elts)); + if (newLHS == LHS && newRHS == RHS) + return MadeChange ? &SVI : 0; + + SmallVector<int, 16> LHSMask; + SmallVector<int, 16> RHSMask; + if (newLHS != LHS) + LHSMask = LHSShuffle->getShuffleMask(); + if (RHSShuffle && newRHS != RHS) + RHSMask = RHSShuffle->getShuffleMask(); + + unsigned newLHSWidth = (newLHS != LHS) ? 
LHSOp0Width : LHSWidth; + SmallVector<int, 16> newMask; + bool isSplat = true; + int SplatElt = -1; + // Create a new mask for the new ShuffleVectorInst so that the new + // ShuffleVectorInst is equivalent to the original one. + for (unsigned i = 0; i < VWidth; ++i) { + int eltMask; + if (Mask[i] == -1) { + // This element is an undef value. + eltMask = -1; + } else if (Mask[i] < (int)LHSWidth) { + // This element is from the left-hand side vector operand. + // + // If LHS is going to be replaced (case 1, 2, or 4), calculate the + // new mask value for the element. + if (newLHS != LHS) { + eltMask = LHSMask[Mask[i]]; + // If the value selected is an undef value, explicitly specify it + // with a -1 mask value. + if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1)) + eltMask = -1; + } + else + eltMask = Mask[i]; + } else { + // This element is from the right-hand side vector operand. + // + // If the value selected is an undef value, explicitly specify it + // with a -1 mask value. (case 1) + if (isa<UndefValue>(RHS)) + eltMask = -1; + // If RHS is going to be replaced (case 3 or 4), calculate the + // new mask value for the element. + else if (newRHS != RHS) { + eltMask = RHSMask[Mask[i]-LHSWidth]; + // If the value selected is an undef value, explicitly specify it + // with a -1 mask value. + if (eltMask >= (int)RHSOp0Width) { + assert(isa<UndefValue>(RHSShuffle->getOperand(1)) + && "should have been checked above"); + eltMask = -1; } } + else + eltMask = Mask[i]-LHSWidth; + + // If LHS's width is changed, shift the mask value accordingly. + // If newRHS == NULL, i.e. LHSOp0 == RHSOp0, we want to remap any + // references to RHSOp0 to LHSOp0, so we don't need to shift the mask. + if (eltMask >= 0 && newRHS != NULL) + eltMask += newLHSWidth; + } + + // Check if this could still be a splat. + if (eltMask >= 0) { + if (SplatElt >= 0 && SplatElt != eltMask) + isSplat = false; + SplatElt = eltMask; + } + + newMask.push_back(eltMask); + } + + // If the result mask is equal to one of the original shuffle masks, + // or is a splat, do the replacement. + if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) { + SmallVector<Constant*, 16> Elts; + Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); + for (unsigned i = 0, e = newMask.size(); i != e; ++i) { + if (newMask[i] < 0) { + Elts.push_back(UndefValue::get(Int32Ty)); + } else { + Elts.push_back(ConstantInt::get(Int32Ty, newMask[i])); + } } + if (newRHS == NULL) + newRHS = UndefValue::get(newLHS->getType()); + return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts)); } return MadeChange ?
&SVI : 0; diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h index 32009c3..99a02fc 100644 --- a/lib/Transforms/InstCombine/InstCombineWorklist.h +++ b/lib/Transforms/InstCombine/InstCombineWorklist.h @@ -55,9 +55,9 @@ public: Worklist.reserve(NumEntries+16); WorklistMap.resize(NumEntries); DEBUG(errs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n"); - for (; NumEntries; --NumEntries) { + for (unsigned Idx = 0; NumEntries; --NumEntries) { Instruction *I = List[NumEntries-1]; - WorklistMap.insert(std::make_pair(I, Worklist.size())); + WorklistMap.insert(std::make_pair(I, Idx++)); Worklist.push_back(I); } } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index c15b805..066b2ec 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -41,6 +41,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" @@ -74,11 +75,15 @@ void LLVMInitializeInstCombine(LLVMPassRegistryRef R) { } char InstCombiner::ID = 0; -INITIALIZE_PASS(InstCombiner, "instcombine", +INITIALIZE_PASS_BEGIN(InstCombiner, "instcombine", + "Combine redundant instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(InstCombiner, "instcombine", "Combine redundant instructions", false, false) void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfo>(); } @@ -490,7 +495,7 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const { if (ConstantInt *C = dyn_cast<ConstantInt>(V)) return ConstantExpr::getNeg(C); - if (ConstantVector *C = dyn_cast<ConstantVector>(V)) + if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V)) if (C->getType()->getElementType()->isIntegerTy()) return ConstantExpr::getNeg(C); @@ -509,7 +514,7 @@ Value *InstCombiner::dyn_castFNegVal(Value *V) const { if (ConstantFP *C = dyn_cast<ConstantFP>(V)) return ConstantExpr::getFNeg(C); - if (ConstantVector *C = dyn_cast<ConstantVector>(V)) + if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V)) if (C->getType()->getElementType()->isFloatingPointTy()) return ConstantExpr::getFNeg(C); @@ -826,7 +831,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { MadeChange = true; } - if ((*I)->getType() != IntPtrTy) { + Type *IndexTy = (*I)->getType(); + if (IndexTy != IntPtrTy && !IndexTy->isVectorTy()) { // If we are using a wider index than needed for this platform, shrink // it to what we need. If narrower, sign-extend it to what we need. // This explicit cast can make subsequent optimizations more obvious. @@ -909,7 +915,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Handle gep(bitcast x) and gep(gep x, 0, 0, 0). Value *StrippedPtr = PtrOp->stripPointerCasts(); - PointerType *StrippedPtrTy =cast<PointerType>(StrippedPtr->getType()); + PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType()); + + // We do not handle pointer-vector geps here. 
+ if (!StrippedPtrTy) + return 0; + if (StrippedPtr != PtrOp && StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) { @@ -1235,15 +1246,15 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { if (I->getOpcode() == Instruction::Add) if (ConstantInt *AddRHS = dyn_cast<ConstantInt>(I->getOperand(1))) { // change 'switch (X+4) case 1:' into 'switch (X) case -3' - unsigned NumCases = SI.getNumCases(); // Skip the first item since that's the default case. - for (unsigned i = 1; i < NumCases; ++i) { - ConstantInt* CaseVal = SI.getCaseValue(i); + for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); + i != e; ++i) { + ConstantInt* CaseVal = i.getCaseValue(); Constant* NewCaseVal = ConstantExpr::getSub(cast<Constant>(CaseVal), AddRHS); assert(isa<ConstantInt>(NewCaseVal) && "Result of expression should be constant"); - SI.setSuccessorValue(i, cast<ConstantInt>(NewCaseVal)); + i.setValue(cast<ConstantInt>(NewCaseVal)); } SI.setCondition(I->getOperand(0)); Worklist.Add(I); @@ -1260,24 +1271,16 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { return ReplaceInstUsesWith(EV, Agg); if (Constant *C = dyn_cast<Constant>(Agg)) { - if (isa<UndefValue>(C)) - return ReplaceInstUsesWith(EV, UndefValue::get(EV.getType())); - - if (isa<ConstantAggregateZero>(C)) - return ReplaceInstUsesWith(EV, Constant::getNullValue(EV.getType())); - - if (isa<ConstantArray>(C) || isa<ConstantStruct>(C)) { - // Extract the element indexed by the first index out of the constant - Value *V = C->getOperand(*EV.idx_begin()); - if (EV.getNumIndices() > 1) - // Extract the remaining indices out of the constant indexed by the - // first index - return ExtractValueInst::Create(V, EV.getIndices().slice(1)); - else - return ReplaceInstUsesWith(EV, V); + if (Constant *C2 = C->getAggregateElement(*EV.idx_begin())) { + if (EV.getNumIndices() == 1) + return ReplaceInstUsesWith(EV, C2); + // Extract the remaining indices out of the constant indexed by the + // first index + return ExtractValueInst::Create(C2, EV.getIndices().slice(1)); } return 0; // Can't handle other constants - } + } + if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) { // We're extracting from an insertvalue instruction, compare the indices const unsigned *exti, *exte, *insi, *inse; @@ -1414,7 +1417,8 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { enum Personality_Type { Unknown_Personality, GNU_Ada_Personality, - GNU_CXX_Personality + GNU_CXX_Personality, + GNU_ObjC_Personality }; /// RecognizePersonality - See if the given exception handling personality @@ -1426,7 +1430,8 @@ static Personality_Type RecognizePersonality(Value *Pers) { return Unknown_Personality; return StringSwitch<Personality_Type>(F->getName()) .Case("__gnat_eh_personality", GNU_Ada_Personality) - .Case("__gxx_personality_v0", GNU_CXX_Personality) + .Case("__gxx_personality_v0", GNU_CXX_Personality) + .Case("__objc_personality_v0", GNU_ObjC_Personality) .Default(Unknown_Personality); } @@ -1440,6 +1445,7 @@ static bool isCatchAll(Personality_Type Personality, Constant *TypeInfo) { // match foreign exceptions (or didn't, before gcc-4.7).
return false; case GNU_CXX_Personality: + case GNU_ObjC_Personality: return TypeInfo->isNullValue(); } llvm_unreachable("Unknown personality!"); @@ -1795,7 +1801,8 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { static bool AddReachableCodeToWorklist(BasicBlock *BB, SmallPtrSet<BasicBlock*, 64> &Visited, InstCombiner &IC, - const TargetData *TD) { + const TargetData *TD, + const TargetLibraryInfo *TLI) { bool MadeIRChange = false; SmallVector<BasicBlock*, 256> Worklist; Worklist.push_back(BB); @@ -1822,7 +1829,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, // ConstantProp instruction if trivially constant. if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0))) - if (Constant *C = ConstantFoldInstruction(Inst, TD)) { + if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) { DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " << *Inst << '\n'); Inst->replaceAllUsesWith(C); @@ -1840,7 +1847,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Constant*& FoldRes = FoldedConstants[CE]; if (!FoldRes) - FoldRes = ConstantFoldConstantExpression(CE, TD); + FoldRes = ConstantFoldConstantExpression(CE, TD, TLI); if (!FoldRes) FoldRes = CE; @@ -1867,15 +1874,16 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) { // See if this is an explicit destination. - for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i) - if (SI->getCaseValue(i) == Cond) { - BasicBlock *ReachableBB = SI->getSuccessor(i); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) + if (i.getCaseValue() == Cond) { + BasicBlock *ReachableBB = i.getCaseSuccessor(); Worklist.push_back(ReachableBB); continue; } // Otherwise it is the default destination. - Worklist.push_back(SI->getSuccessor(0)); + Worklist.push_back(SI->getDefaultDest()); continue; } } @@ -1899,14 +1907,15 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { MadeIRChange = false; DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " - << F.getNameStr() << "\n"); + << F.getName() << "\n"); { // Do a depth-first traversal of the function, populate the worklist with // the reachable instructions. Ignore blocks that are not reachable. Keep // track of which blocks we visit. SmallPtrSet<BasicBlock*, 64> Visited; - MadeIRChange |= AddReachableCodeToWorklist(F.begin(), Visited, *this, TD); + MadeIRChange |= AddReachableCodeToWorklist(F.begin(), Visited, *this, TD, + TLI); // Do a quick scan over the function. If we find any blocks that are // unreachable, remove any instructions inside of them. This prevents @@ -1951,7 +1960,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // Instruction isn't dead, see if we can constant propagate it. if (!I->use_empty() && isa<Constant>(I->getOperand(0))) - if (Constant *C = ConstantFoldInstruction(I, TD)) { + if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) { DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n'); // Add operands to the worklist. @@ -2059,7 +2068,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { bool InstCombiner::runOnFunction(Function &F) { TD = getAnalysisIfAvailable<TargetData>(); - + TLI = &getAnalysis<TargetLibraryInfo>(); /// Builder - This is an IRBuilder that automatically inserts new /// instructions into the worklist when they are created. 
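Editor's aside, not part of the patch series: the InstructionCombining.cpp hunks above migrate switch handling from the old integer-indexed accessors (getNumCases/getCaseValue/setSuccessorValue, with successor 0 doubling as the default) to the SwitchInst::CaseIt iterator. A minimal sketch of the new pattern, using only the accessors the hunks themselves rely on (case_begin, case_end, getCaseValue, getCaseSuccessor, getDefaultDest); the helper name countCasesFor is hypothetical:

#include "llvm/Instructions.h"
using namespace llvm;

// Count how many explicit (non-default) cases of SI jump to the given block.
// Illustrates CaseIt iteration only; not code from the patch itself.
static unsigned countCasesFor(SwitchInst &SI, BasicBlock *BB) {
  unsigned N = 0;
  for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
    ConstantInt *CaseVal = i.getCaseValue();  // Value this case tests for.
    (void)CaseVal;
    if (i.getCaseSuccessor() == BB)           // Destination block of this case.
      ++N;
  }
  if (SI.getDefaultDest() == BB)              // Default is queried directly now.
    ++N;
  return N;
}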
diff --git a/lib/Transforms/InstCombine/LLVMBuild.txt b/lib/Transforms/InstCombine/LLVMBuild.txt new file mode 100644 index 0000000..62c61616 --- /dev/null +++ b/lib/Transforms/InstCombine/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Transforms/InstCombine/LLVMBuild.txt ---------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = InstCombine +parent = Transforms +required_libraries = Analysis Core Support Target TransformUtils diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp new file mode 100644 index 0000000..b43b9e5 --- /dev/null +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -0,0 +1,937 @@ +//===-- AddressSanitizer.cpp - memory error detector ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of AddressSanitizer, an address sanity checker. +// Details of the algorithm: +// http://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asan" + +#include "FunctionBlackList.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Function.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/system_error.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Type.h" + +#include <string> +#include <algorithm> + +using namespace llvm; + +static const uint64_t kDefaultShadowScale = 3; +static const uint64_t kDefaultShadowOffset32 = 1ULL << 29; +static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; + +static const size_t kMaxStackMallocSize = 1 << 16; // 64K +static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; +static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E; + +static const char *kAsanModuleCtorName = "asan.module_ctor"; +static const char *kAsanModuleDtorName = "asan.module_dtor"; +static const int kAsanCtorAndCtorPriority = 1; +static const char *kAsanReportErrorTemplate = "__asan_report_"; +static const char *kAsanRegisterGlobalsName = "__asan_register_globals"; +static const char *kAsanUnregisterGlobalsName = "__asan_unregister_globals"; +static const char *kAsanInitName = "__asan_init"; +static const char 
*kAsanHandleNoReturnName = "__asan_handle_no_return"; +static const char *kAsanMappingOffsetName = "__asan_mapping_offset"; +static const char *kAsanMappingScaleName = "__asan_mapping_scale"; +static const char *kAsanStackMallocName = "__asan_stack_malloc"; +static const char *kAsanStackFreeName = "__asan_stack_free"; + +static const int kAsanStackLeftRedzoneMagic = 0xf1; +static const int kAsanStackMidRedzoneMagic = 0xf2; +static const int kAsanStackRightRedzoneMagic = 0xf3; +static const int kAsanStackPartialRedzoneMagic = 0xf4; + +// Command-line flags. + +// This flag may need to be replaced with -f[no-]asan-reads. +static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", + cl::desc("instrument read instructions"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClInstrumentWrites("asan-instrument-writes", + cl::desc("instrument write instructions"), cl::Hidden, cl::init(true)); +// This flag may need to be replaced with -f[no]asan-stack. +static cl::opt<bool> ClStack("asan-stack", + cl::desc("Handle stack memory"), cl::Hidden, cl::init(true)); +// This flag may need to be replaced with -f[no]asan-use-after-return. +static cl::opt<bool> ClUseAfterReturn("asan-use-after-return", + cl::desc("Check return-after-free"), cl::Hidden, cl::init(false)); +// This flag may need to be replaced with -f[no]asan-globals. +static cl::opt<bool> ClGlobals("asan-globals", + cl::desc("Handle global objects"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClMemIntrin("asan-memintrin", + cl::desc("Handle memset/memcpy/memmove"), cl::Hidden, cl::init(true)); +// This flag may need to be replaced with -fasan-blacklist. +static cl::opt<std::string> ClBlackListFile("asan-blacklist", + cl::desc("File containing the list of functions to ignore " + "during instrumentation"), cl::Hidden); + +// These flags allow changing the shadow mapping. +// The shadow mapping looks like +// Shadow = (Mem >> scale) + (1 << offset_log) +static cl::opt<int> ClMappingScale("asan-mapping-scale", + cl::desc("scale of asan shadow mapping"), cl::Hidden, cl::init(0)); +static cl::opt<int> ClMappingOffsetLog("asan-mapping-offset-log", + cl::desc("offset of asan shadow mapping"), cl::Hidden, cl::init(-1)); + +// Optimization flags. Not user visible, used mostly for testing +// and benchmarking the tool. +static cl::opt<bool> ClOpt("asan-opt", + cl::desc("Optimize instrumentation"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClOptSameTemp("asan-opt-same-temp", + cl::desc("Instrument the same temp just once"), cl::Hidden, + cl::init(true)); +static cl::opt<bool> ClOptGlobals("asan-opt-globals", + cl::desc("Don't instrument scalar globals"), cl::Hidden, cl::init(true)); + +// Debug flags. +static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden, + cl::init(0)); +static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"), + cl::Hidden, cl::init(0)); +static cl::opt<std::string> ClDebugFunc("asan-debug-func", + cl::Hidden, cl::desc("Debug func")); +static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"), + cl::Hidden, cl::init(-1)); +static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"), + cl::Hidden, cl::init(-1)); + +namespace { + +/// AddressSanitizer: instrument the code in the module to find memory bugs.
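+/// (Editor's note, not part of the original patch: with the defaults above, +/// the mapping works out to Shadow = (Mem >> 3) + (1 << 29) on 32-bit targets +/// and Shadow = (Mem >> 3) + (1 << 44) on 64-bit ones, so each shadow byte +/// describes one 8-byte granule of application memory.)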
+struct AddressSanitizer : public ModulePass { + AddressSanitizer(); + virtual const char *getPassName() const; + void instrumentMop(Instruction *I); + void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB, + Value *Addr, uint32_t TypeSize, bool IsWrite); + Instruction *generateCrashCode(IRBuilder<> &IRB, Value *Addr, + bool IsWrite, uint32_t TypeSize); + bool instrumentMemIntrinsic(MemIntrinsic *MI); + void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr, + Value *Size, + Instruction *InsertBefore, bool IsWrite); + Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); + bool handleFunction(Module &M, Function &F); + bool maybeInsertAsanInitAtFunctionEntry(Function &F); + bool poisonStackInFunction(Module &M, Function &F); + virtual bool runOnModule(Module &M); + bool insertGlobalRedzones(Module &M); + BranchInst *splitBlockAndInsertIfThen(Instruction *SplitBefore, Value *Cmp); + static char ID; // Pass identification, replacement for typeid + + private: + + uint64_t getAllocaSizeInBytes(AllocaInst *AI) { + Type *Ty = AI->getAllocatedType(); + uint64_t SizeInBytes = TD->getTypeAllocSize(Ty); + return SizeInBytes; + } + uint64_t getAlignedSize(uint64_t SizeInBytes) { + return ((SizeInBytes + RedzoneSize - 1) + / RedzoneSize) * RedzoneSize; + } + uint64_t getAlignedAllocaSize(AllocaInst *AI) { + uint64_t SizeInBytes = getAllocaSizeInBytes(AI); + return getAlignedSize(SizeInBytes); + } + + void PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, + Value *ShadowBase, bool DoPoison); + bool LooksLikeCodeInBug11395(Instruction *I); + + Module *CurrentModule; + LLVMContext *C; + TargetData *TD; + uint64_t MappingOffset; + int MappingScale; + size_t RedzoneSize; + int LongSize; + Type *IntptrTy; + Type *IntptrPtrTy; + Function *AsanCtorFunction; + Function *AsanInitFunction; + Instruction *CtorInsertBefore; + OwningPtr<FunctionBlackList> BL; +}; +} // namespace + +char AddressSanitizer::ID = 0; +INITIALIZE_PASS(AddressSanitizer, "asan", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", + false, false) +AddressSanitizer::AddressSanitizer() : ModulePass(ID) { } +ModulePass *llvm::createAddressSanitizerPass() { + return new AddressSanitizer(); +} + +const char *AddressSanitizer::getPassName() const { + return "AddressSanitizer"; +} + +// Create a constant for Str so that we can pass it to the run-time lib. +static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { + Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); + return new GlobalVariable(M, StrConst->getType(), true, + GlobalValue::PrivateLinkage, StrConst, ""); +} + +// Split the basic block and insert if-then code. +// Before: +// Head +// SplitBefore +// Tail +// After: +// Head +// if (Cmp) +// NewBasicBlock +// SplitBefore +// Tail +// +// Returns the NewBasicBlock's terminator.
+BranchInst *AddressSanitizer::splitBlockAndInsertIfThen( + Instruction *SplitBefore, Value *Cmp) { + BasicBlock *Head = SplitBefore->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + TerminatorInst *HeadOldTerm = Head->getTerminator(); + BasicBlock *NewBasicBlock = + BasicBlock::Create(*C, "", Head->getParent()); + BranchInst *HeadNewTerm = BranchInst::Create(/*ifTrue*/NewBasicBlock, + /*ifFalse*/Tail, + Cmp); + ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); + + BranchInst *CheckTerm = BranchInst::Create(Tail, NewBasicBlock); + return CheckTerm; +} + +Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { + // Shadow >> scale + Shadow = IRB.CreateLShr(Shadow, MappingScale); + if (MappingOffset == 0) + return Shadow; + // (Shadow >> scale) | offset + return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, + MappingOffset)); +} + +void AddressSanitizer::instrumentMemIntrinsicParam(Instruction *OrigIns, + Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) { + // Check the first byte. + { + IRBuilder<> IRB(InsertBefore); + instrumentAddress(OrigIns, IRB, Addr, 8, IsWrite); + } + // Check the last byte. + { + IRBuilder<> IRB(InsertBefore); + Value *SizeMinusOne = IRB.CreateSub( + Size, ConstantInt::get(Size->getType(), 1)); + SizeMinusOne = IRB.CreateIntCast(SizeMinusOne, IntptrTy, false); + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + Value *AddrPlusSizeMinusOne = IRB.CreateAdd(AddrLong, SizeMinusOne); + instrumentAddress(OrigIns, IRB, AddrPlusSizeMinusOne, 8, IsWrite); + } +} + +// Instrument memset/memmove/memcpy. +bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { + Value *Dst = MI->getDest(); + MemTransferInst *MemTran = dyn_cast<MemTransferInst>(MI); + Value *Src = MemTran ? MemTran->getSource() : NULL; + Value *Length = MI->getLength(); + + Constant *ConstLength = dyn_cast<Constant>(Length); + Instruction *InsertBefore = MI; + if (ConstLength) { + if (ConstLength->isNullValue()) return false; + } else { + // The size is not a constant so it could be zero -- check at run-time. + IRBuilder<> IRB(InsertBefore); + + Value *Cmp = IRB.CreateICmpNE(Length, + Constant::getNullValue(Length->getType())); + InsertBefore = splitBlockAndInsertIfThen(InsertBefore, Cmp); + } + + instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true); + if (Src) + instrumentMemIntrinsicParam(MI, Src, Length, InsertBefore, false); + return true; +} + +static Value *getLDSTOperand(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + return LI->getPointerOperand(); + } + return cast<StoreInst>(*I).getPointerOperand(); +} + +void AddressSanitizer::instrumentMop(Instruction *I) { + bool IsWrite = isa<StoreInst>(*I); + Value *Addr = getLDSTOperand(I); + if (ClOpt && ClOptGlobals && isa<GlobalVariable>(Addr)) { + // We are accessing a global scalar variable. Nothing to catch here. + return; + } + Type *OrigPtrTy = Addr->getType(); + Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); + + assert(OrigTy->isSized()); + uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); + + if (TypeSize != 8 && TypeSize != 16 && + TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { + // Ignore all unusual sizes. + return; + } + + IRBuilder<> IRB(I); + instrumentAddress(I, IRB, Addr, TypeSize, IsWrite); +} + +Instruction *AddressSanitizer::generateCrashCode( + IRBuilder<> &IRB, Value *Addr, bool IsWrite, uint32_t TypeSize) { + // IsWrite and TypeSize are encoded in the function name.
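+ // (Illustrative, not part of the original patch: a 4-byte store yields + // FunctionName == "__asan_report_store4", so the run-time is expected to + // provide void __asan_report_store4(uptr addr).)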
+ std::string FunctionName = std::string(kAsanReportErrorTemplate) + + (IsWrite ? "store" : "load") + itostr(TypeSize / 8); + Value *ReportWarningFunc = CurrentModule->getOrInsertFunction( + FunctionName, IRB.getVoidTy(), IntptrTy, NULL); + CallInst *Call = IRB.CreateCall(ReportWarningFunc, Addr); + Call->setDoesNotReturn(); + return Call; +} + +void AddressSanitizer::instrumentAddress(Instruction *OrigIns, + IRBuilder<> &IRB, Value *Addr, + uint32_t TypeSize, bool IsWrite) { + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + + Type *ShadowTy = IntegerType::get( + *C, std::max(8U, TypeSize >> MappingScale)); + Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); + Value *ShadowPtr = memToShadow(AddrLong, IRB); + Value *CmpVal = Constant::getNullValue(ShadowTy); + Value *ShadowValue = IRB.CreateLoad( + IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy)); + + Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); + + Instruction *CheckTerm = splitBlockAndInsertIfThen( + cast<Instruction>(Cmp)->getNextNode(), Cmp); + IRBuilder<> IRB2(CheckTerm); + + size_t Granularity = 1 << MappingScale; + if (TypeSize < 8 * Granularity) { + // Addr & (Granularity - 1) + Value *Lower3Bits = IRB2.CreateAnd( + AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); + // (Addr & (Granularity - 1)) + size - 1 + Value *LastAccessedByte = IRB2.CreateAdd( + Lower3Bits, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)); + // (uint8_t) ((Addr & (Granularity-1)) + size - 1) + LastAccessedByte = IRB2.CreateIntCast( + LastAccessedByte, IRB.getInt8Ty(), false); + // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue + Value *Cmp2 = IRB2.CreateICmpSGE(LastAccessedByte, ShadowValue); + + CheckTerm = splitBlockAndInsertIfThen(CheckTerm, Cmp2); + } + + IRBuilder<> IRB1(CheckTerm); + Instruction *Crash = generateCrashCode(IRB1, AddrLong, IsWrite, TypeSize); + Crash->setDebugLoc(OrigIns->getDebugLoc()); + ReplaceInstWithInst(CheckTerm, new UnreachableInst(*C)); +} + +// This function replaces all global variables with new variables that have +// trailing redzones. It also creates a function that poisons +// redzones and inserts this function into llvm.global_ctors. +bool AddressSanitizer::insertGlobalRedzones(Module &M) { + SmallVector<GlobalVariable *, 16> GlobalsToChange; + + for (Module::GlobalListType::iterator G = M.getGlobalList().begin(), + E = M.getGlobalList().end(); G != E; ++G) { + Type *Ty = cast<PointerType>(G->getType())->getElementType(); + DEBUG(dbgs() << "GLOBAL: " << *G); + + if (!Ty->isSized()) continue; + if (!G->hasInitializer()) continue; + // Touch only those globals that will not be defined in other modules. + // Don't handle ODR type linkages since other modules may be built w/o asan. + if (G->getLinkage() != GlobalVariable::ExternalLinkage && + G->getLinkage() != GlobalVariable::PrivateLinkage && + G->getLinkage() != GlobalVariable::InternalLinkage) + continue; + // Two problems with thread-locals: + // - The address of the main thread's copy can't be computed at link-time. + // - Need to poison all copies, not just the main thread's one. + if (G->isThreadLocal()) + continue; + // For now, just ignore this Alloca if the alignment is large. + if (G->getAlignment() > RedzoneSize) continue; + + // Ignore all the globals with the names starting with "\01L_OBJC_". + // Many of those are put into the .cstring section. The linker compresses + // that section by removing the spare \0s after the string terminator, so + // our redzones get broken. 
+ if ((G->getName().find("\01L_OBJC_") == 0) || + (G->getName().find("\01l_OBJC_") == 0)) { + DEBUG(dbgs() << "Ignoring \\01L_OBJC_* global: " << *G); + continue; + } + + if (G->hasSection()) { + StringRef Section(G->getSection()); + // Ignore the globals from the __OBJC section. The ObjC runtime assumes + // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to + // them. + if ((Section.find("__OBJC,") == 0) || + (Section.find("__DATA, __objc_") == 0)) { + DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G); + continue; + } + // See http://code.google.com/p/address-sanitizer/issues/detail?id=32 + // Constant CFString instances are compiled in the following way: + // -- the string buffer is emitted into + // __TEXT,__cstring,cstring_literals + // -- the constant NSConstantString structure referencing that buffer + // is placed into __DATA,__cfstring + // Therefore there's no point in placing redzones into __DATA,__cfstring. + // Moreover, it causes the linker to crash on OS X 10.7 + if (Section.find("__DATA,__cfstring") == 0) { + DEBUG(dbgs() << "Ignoring CFString: " << *G); + continue; + } + } + + GlobalsToChange.push_back(G); + } + + size_t n = GlobalsToChange.size(); + if (n == 0) return false; + + // A global is described by a structure + // size_t beg; + // size_t size; + // size_t size_with_redzone; + // const char *name; + // We initialize an array of such structures and pass it to a run-time call. + StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy, + IntptrTy, IntptrTy, NULL); + SmallVector<Constant *, 16> Initializers(n); + + IRBuilder<> IRB(CtorInsertBefore); + + for (size_t i = 0; i < n; i++) { + GlobalVariable *G = GlobalsToChange[i]; + PointerType *PtrTy = cast<PointerType>(G->getType()); + Type *Ty = PtrTy->getElementType(); + uint64_t SizeInBytes = TD->getTypeAllocSize(Ty); + uint64_t RightRedzoneSize = RedzoneSize + + (RedzoneSize - (SizeInBytes % RedzoneSize)); + Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); + + StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL); + Constant *NewInitializer = ConstantStruct::get( + NewTy, G->getInitializer(), + Constant::getNullValue(RightRedZoneTy), NULL); + + SmallString<2048> DescriptionOfGlobal = G->getName(); + DescriptionOfGlobal += " ("; + DescriptionOfGlobal += M.getModuleIdentifier(); + DescriptionOfGlobal += ")"; + GlobalVariable *Name = createPrivateGlobalForString(M, DescriptionOfGlobal); + + // Create a new global variable with enough space for a redzone. 
+ GlobalVariable *NewGlobal = new GlobalVariable( + M, NewTy, G->isConstant(), G->getLinkage(), + NewInitializer, "", G, G->isThreadLocal()); + NewGlobal->copyAttributesFrom(G); + NewGlobal->setAlignment(RedzoneSize); + + Value *Indices2[2]; + Indices2[0] = IRB.getInt32(0); + Indices2[1] = IRB.getInt32(0); + + G->replaceAllUsesWith( + ConstantExpr::getGetElementPtr(NewGlobal, Indices2, true)); + NewGlobal->takeName(G); + G->eraseFromParent(); + + Initializers[i] = ConstantStruct::get( + GlobalStructTy, + ConstantExpr::getPointerCast(NewGlobal, IntptrTy), + ConstantInt::get(IntptrTy, SizeInBytes), + ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize), + ConstantExpr::getPointerCast(Name, IntptrTy), + NULL); + DEBUG(dbgs() << "NEW GLOBAL:\n" << *NewGlobal); + } + + ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n); + GlobalVariable *AllGlobals = new GlobalVariable( + M, ArrayOfGlobalStructTy, false, GlobalVariable::PrivateLinkage, + ConstantArray::get(ArrayOfGlobalStructTy, Initializers), ""); + + Function *AsanRegisterGlobals = cast<Function>(M.getOrInsertFunction( + kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + AsanRegisterGlobals->setLinkage(Function::ExternalLinkage); + + IRB.CreateCall2(AsanRegisterGlobals, + IRB.CreatePointerCast(AllGlobals, IntptrTy), + ConstantInt::get(IntptrTy, n)); + + // We also need to unregister globals at the end, e.g. when a shared library + // gets closed. + Function *AsanDtorFunction = Function::Create( + FunctionType::get(Type::getVoidTy(*C), false), + GlobalValue::InternalLinkage, kAsanModuleDtorName, &M); + BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction); + IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB)); + Function *AsanUnregisterGlobals = cast<Function>(M.getOrInsertFunction( + kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage); + + IRB_Dtor.CreateCall2(AsanUnregisterGlobals, + IRB.CreatePointerCast(AllGlobals, IntptrTy), + ConstantInt::get(IntptrTy, n)); + appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndCtorPriority); + + DEBUG(dbgs() << M); + return true; +} + +// virtual +bool AddressSanitizer::runOnModule(Module &M) { + // Initialize the private fields. No one has accessed them before. + TD = getAnalysisIfAvailable<TargetData>(); + if (!TD) + return false; + BL.reset(new FunctionBlackList(ClBlackListFile)); + + CurrentModule = &M; + C = &(M.getContext()); + LongSize = TD->getPointerSizeInBits(); + IntptrTy = Type::getIntNTy(*C, LongSize); + IntptrPtrTy = PointerType::get(IntptrTy, 0); + + AsanCtorFunction = Function::Create( + FunctionType::get(Type::getVoidTy(*C), false), + GlobalValue::InternalLinkage, kAsanModuleCtorName, &M); + BasicBlock *AsanCtorBB = BasicBlock::Create(*C, "", AsanCtorFunction); + CtorInsertBefore = ReturnInst::Create(*C, AsanCtorBB); + + // call __asan_init in the module ctor. + IRBuilder<> IRB(CtorInsertBefore); + AsanInitFunction = cast<Function>( + M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL)); + AsanInitFunction->setLinkage(Function::ExternalLinkage); + IRB.CreateCall(AsanInitFunction); + + MappingOffset = LongSize == 32 + ? 
kDefaultShadowOffset32 : kDefaultShadowOffset64; + if (ClMappingOffsetLog >= 0) { + if (ClMappingOffsetLog == 0) { + // special case + MappingOffset = 0; + } else { + MappingOffset = 1ULL << ClMappingOffsetLog; + } + } + MappingScale = kDefaultShadowScale; + if (ClMappingScale) { + MappingScale = ClMappingScale; + } + // Redzone used for stack and globals is at least 32 bytes. + // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively. + RedzoneSize = std::max(32, (int)(1 << MappingScale)); + + bool Res = false; + + if (ClGlobals) + Res |= insertGlobalRedzones(M); + + if (ClMappingOffsetLog >= 0) { + // Tell the run-time the current values of mapping offset and scale. + GlobalValue *asan_mapping_offset = + new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, + ConstantInt::get(IntptrTy, MappingOffset), + kAsanMappingOffsetName); + // Read the global, otherwise it may be optimized away. + IRB.CreateLoad(asan_mapping_offset, true); + } + if (ClMappingScale) { + GlobalValue *asan_mapping_scale = + new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, + ConstantInt::get(IntptrTy, MappingScale), + kAsanMappingScaleName); + // Read the global, otherwise it may be optimized away. + IRB.CreateLoad(asan_mapping_scale, true); + } + + + for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { + if (F->isDeclaration()) continue; + Res |= handleFunction(M, *F); + } + + appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndCtorPriority); + + return Res; +} + +bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { + // For each NSObject descendant having a +load method, this method is invoked + // by the ObjC runtime before any of the static constructors is called. + // Therefore we need to instrument such methods with a call to __asan_init + // at the beginning in order to initialize our runtime before any access to + // the shadow memory. + // We cannot just ignore these methods, because they may call other + // instrumented functions. + if (F.getName().find(" load]") != std::string::npos) { + IRBuilder<> IRB(F.begin()->begin()); + IRB.CreateCall(AsanInitFunction); + return true; + } + return false; +} + +bool AddressSanitizer::handleFunction(Module &M, Function &F) { + if (BL->isIn(F)) return false; + if (&F == AsanCtorFunction) return false; + + // If needed, insert __asan_init before checking for AddressSafety attr. + maybeInsertAsanInitAtFunctionEntry(F); + + if (!F.hasFnAttr(Attribute::AddressSafety)) return false; + + if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) + return false; + // We want to instrument every address only once per basic block + // (unless there are calls between uses). + SmallSet<Value*, 16> TempsToInstrument; + SmallVector<Instruction*, 16> ToInstrument; + SmallVector<Instruction*, 8> NoReturnCalls; + + // Fill the set of memory operations to instrument. + for (Function::iterator FI = F.begin(), FE = F.end(); + FI != FE; ++FI) { + TempsToInstrument.clear(); + for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); + BI != BE; ++BI) { + if (LooksLikeCodeInBug11395(BI)) return false; + if ((isa<LoadInst>(BI) && ClInstrumentReads) || + (isa<StoreInst>(BI) && ClInstrumentWrites)) { + Value *Addr = getLDSTOperand(BI); + if (ClOpt && ClOptSameTemp) { + if (!TempsToInstrument.insert(Addr)) + continue; // We've seen this temp in the current BB. + } + } else if (isa<MemIntrinsic>(BI) && ClMemIntrin) { + // ok, take it. 
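+ // (Editor's note, not part of the original patch: memset/memcpy/memmove + // intrinsics fall through to ToInstrument here and are later checked at + // their first and last byte by instrumentMemIntrinsic above.)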
+ } else { + if (CallInst *CI = dyn_cast<CallInst>(BI)) { + // A call inside BB. + TempsToInstrument.clear(); + if (CI->doesNotReturn()) { + NoReturnCalls.push_back(CI); + } + } + continue; + } + ToInstrument.push_back(BI); + } + } + + // Instrument. + int NumInstrumented = 0; + for (size_t i = 0, n = ToInstrument.size(); i != n; i++) { + Instruction *Inst = ToInstrument[i]; + if (ClDebugMin < 0 || ClDebugMax < 0 || + (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { + if (isa<StoreInst>(Inst) || isa<LoadInst>(Inst)) + instrumentMop(Inst); + else + instrumentMemIntrinsic(cast<MemIntrinsic>(Inst)); + } + NumInstrumented++; + } + + DEBUG(dbgs() << F); + + bool ChangedStack = poisonStackInFunction(M, F); + + // We must unpoison the stack before every NoReturn call (throw, _exit, etc). + // See e.g. http://code.google.com/p/address-sanitizer/issues/detail?id=37 + for (size_t i = 0, n = NoReturnCalls.size(); i != n; i++) { + Instruction *CI = NoReturnCalls[i]; + IRBuilder<> IRB(CI); + IRB.CreateCall(M.getOrInsertFunction(kAsanHandleNoReturnName, + IRB.getVoidTy(), NULL)); + } + + return NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty(); +} + +static uint64_t ValueForPoison(uint64_t PoisonByte, size_t ShadowRedzoneSize) { + if (ShadowRedzoneSize == 1) return PoisonByte; + if (ShadowRedzoneSize == 2) return (PoisonByte << 8) + PoisonByte; + if (ShadowRedzoneSize == 4) + return (PoisonByte << 24) + (PoisonByte << 16) + + (PoisonByte << 8) + (PoisonByte); + llvm_unreachable("ShadowRedzoneSize is either 1, 2 or 4"); +} + +static void PoisonShadowPartialRightRedzone(uint8_t *Shadow, + size_t Size, + size_t RedzoneSize, + size_t ShadowGranularity, + uint8_t Magic) { + for (size_t i = 0; i < RedzoneSize; + i+= ShadowGranularity, Shadow++) { + if (i + ShadowGranularity <= Size) { + *Shadow = 0; // fully addressable + } else if (i >= Size) { + *Shadow = Magic; // unaddressable + } else { + *Shadow = Size - i; // first Size-i bytes are addressable + } + } +} + +void AddressSanitizer::PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, + IRBuilder<> IRB, + Value *ShadowBase, bool DoPoison) { + size_t ShadowRZSize = RedzoneSize >> MappingScale; + assert(ShadowRZSize >= 1 && ShadowRZSize <= 4); + Type *RZTy = Type::getIntNTy(*C, ShadowRZSize * 8); + Type *RZPtrTy = PointerType::get(RZTy, 0); + + Value *PoisonLeft = ConstantInt::get(RZTy, + ValueForPoison(DoPoison ? kAsanStackLeftRedzoneMagic : 0LL, ShadowRZSize)); + Value *PoisonMid = ConstantInt::get(RZTy, + ValueForPoison(DoPoison ? kAsanStackMidRedzoneMagic : 0LL, ShadowRZSize)); + Value *PoisonRight = ConstantInt::get(RZTy, + ValueForPoison(DoPoison ? kAsanStackRightRedzoneMagic : 0LL, ShadowRZSize)); + + // poison the first red zone. + IRB.CreateStore(PoisonLeft, IRB.CreateIntToPtr(ShadowBase, RZPtrTy)); + + // poison all other red zones. 
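+ // (Illustrative, not part of the original patch: two 8-byte allocas with + // RedzoneSize == 32 are each padded to AlignedSize == 32, giving the frame + // [left RZ 32][a 8+pad 24][mid RZ 32][b 8+pad 24][right RZ 32], + // i.e. LocalStackSize == 64 + 3 * 32 == 160 bytes.)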
+ uint64_t Pos = RedzoneSize; + for (size_t i = 0, n = AllocaVec.size(); i < n; i++) { + AllocaInst *AI = AllocaVec[i]; + uint64_t SizeInBytes = getAllocaSizeInBytes(AI); + uint64_t AlignedSize = getAlignedAllocaSize(AI); + assert(AlignedSize - SizeInBytes < RedzoneSize); + Value *Ptr = NULL; + + Pos += AlignedSize; + + assert(ShadowBase->getType() == IntptrTy); + if (SizeInBytes < AlignedSize) { + // Poison the partial redzone at right + Ptr = IRB.CreateAdd( + ShadowBase, ConstantInt::get(IntptrTy, + (Pos >> MappingScale) - ShadowRZSize)); + size_t AddressableBytes = RedzoneSize - (AlignedSize - SizeInBytes); + uint32_t Poison = 0; + if (DoPoison) { + PoisonShadowPartialRightRedzone((uint8_t*)&Poison, AddressableBytes, + RedzoneSize, + 1ULL << MappingScale, + kAsanStackPartialRedzoneMagic); + } + Value *PartialPoison = ConstantInt::get(RZTy, Poison); + IRB.CreateStore(PartialPoison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); + } + + // Poison the full redzone at right. + Ptr = IRB.CreateAdd(ShadowBase, + ConstantInt::get(IntptrTy, Pos >> MappingScale)); + Value *Poison = i == AllocaVec.size() - 1 ? PoisonRight : PoisonMid; + IRB.CreateStore(Poison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); + + Pos += RedzoneSize; + } +} + +// Workaround for bug 11395: we don't want to instrument the stack in +// functions with large assembly blobs (32-bit only), otherwise reg alloc may +// crash. +// FIXME: remove once bug 11395 is fixed. +bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) { + if (LongSize != 32) return false; + CallInst *CI = dyn_cast<CallInst>(I); + if (!CI || !CI->isInlineAsm()) return false; + if (CI->getNumArgOperands() <= 5) return false; + // We have inline assembly with quite a few arguments. + return true; +} + +// Find all static Alloca instructions and put +// poisoned red zones around all of them. +// Then unpoison everything back before the function returns. +// +// Stack poisoning does not play well with exception handling. +// When an exception is thrown, we essentially bypass the code +// that unpoisons the stack. This is why the run-time library has +// to intercept __cxa_throw (as well as longjmp, etc.) and unpoison the entire +// stack in the interceptor. This however does not work inside the +// actual function which catches the exception. Most likely because the +// compiler hoists the load of the shadow value somewhere too high. +// This causes asan to report a non-existing bug on 453.povray. +// It sounds like an LLVM bug. +bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) { + if (!ClStack) return false; + SmallVector<AllocaInst*, 16> AllocaVec; + SmallVector<Instruction*, 8> RetVec; + uint64_t TotalSize = 0; + + // Collect the Alloca instructions we want (and can) handle. + // Collect Ret instructions.
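+ // (Editor's note, not part of the original patch: only sized, static, + // non-array allocas with alignment <= RedzoneSize are taken below; all + // other allocas keep their ordinary stack slots.)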
+ for (Function::iterator FI = F.begin(), FE = F.end(); + FI != FE; ++FI) { + BasicBlock &BB = *FI; + for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); + BI != BE; ++BI) { + if (isa<ReturnInst>(BI)) { + RetVec.push_back(BI); + continue; + } + + AllocaInst *AI = dyn_cast<AllocaInst>(BI); + if (!AI) continue; + if (AI->isArrayAllocation()) continue; + if (!AI->isStaticAlloca()) continue; + if (!AI->getAllocatedType()->isSized()) continue; + if (AI->getAlignment() > RedzoneSize) continue; + AllocaVec.push_back(AI); + uint64_t AlignedSize = getAlignedAllocaSize(AI); + TotalSize += AlignedSize; + } + } + + if (AllocaVec.empty()) return false; + + uint64_t LocalStackSize = TotalSize + (AllocaVec.size() + 1) * RedzoneSize; + + bool DoStackMalloc = ClUseAfterReturn + && LocalStackSize <= kMaxStackMallocSize; + + Instruction *InsBefore = AllocaVec[0]; + IRBuilder<> IRB(InsBefore); + + + Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize); + AllocaInst *MyAlloca = + new AllocaInst(ByteArrayTy, "MyAlloca", InsBefore); + MyAlloca->setAlignment(RedzoneSize); + assert(MyAlloca->isStaticAlloca()); + Value *OrigStackBase = IRB.CreatePointerCast(MyAlloca, IntptrTy); + Value *LocalStackBase = OrigStackBase; + + if (DoStackMalloc) { + Value *AsanStackMallocFunc = M.getOrInsertFunction( + kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL); + LocalStackBase = IRB.CreateCall2(AsanStackMallocFunc, + ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase); + } + + // This string will be parsed by the run-time (DescribeStackAddress). + SmallString<2048> StackDescriptionStorage; + raw_svector_ostream StackDescription(StackDescriptionStorage); + StackDescription << F.getName() << " " << AllocaVec.size() << " "; + + uint64_t Pos = RedzoneSize; + // Replace Alloca instructions with base+offset. + for (size_t i = 0, n = AllocaVec.size(); i < n; i++) { + AllocaInst *AI = AllocaVec[i]; + uint64_t SizeInBytes = getAllocaSizeInBytes(AI); + StringRef Name = AI->getName(); + StackDescription << Pos << " " << SizeInBytes << " " + << Name.size() << " " << Name << " "; + uint64_t AlignedSize = getAlignedAllocaSize(AI); + assert((AlignedSize % RedzoneSize) == 0); + AI->replaceAllUsesWith( + IRB.CreateIntToPtr( + IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Pos)), + AI->getType())); + Pos += AlignedSize + RedzoneSize; + } + assert(Pos == LocalStackSize); + + // Write the Magic value and the frame description constant to the redzone. + Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy); + IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic), + BasePlus0); + Value *BasePlus1 = IRB.CreateAdd(LocalStackBase, + ConstantInt::get(IntptrTy, LongSize/8)); + BasePlus1 = IRB.CreateIntToPtr(BasePlus1, IntptrPtrTy); + Value *Description = IRB.CreatePointerCast( + createPrivateGlobalForString(M, StackDescription.str()), + IntptrTy); + IRB.CreateStore(Description, BasePlus1); + + // Poison the stack redzones at the entry. + Value *ShadowBase = memToShadow(LocalStackBase, IRB); + PoisonStack(ArrayRef<AllocaInst*>(AllocaVec), IRB, ShadowBase, true); + + Value *AsanStackFreeFunc = NULL; + if (DoStackMalloc) { + AsanStackFreeFunc = M.getOrInsertFunction( + kAsanStackFreeName, IRB.getVoidTy(), + IntptrTy, IntptrTy, IntptrTy, NULL); + } + + // Unpoison the stack before all ret instructions. + for (size_t i = 0, n = RetVec.size(); i < n; i++) { + Instruction *Ret = RetVec[i]; + IRBuilder<> IRBRet(Ret); + + // Mark the current frame as retired. 
+ IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic), + BasePlus0); + // Unpoison the stack. + PoisonStack(ArrayRef<AllocaInst*>(AllocaVec), IRBRet, ShadowBase, false); + + if (DoStackMalloc) { + IRBRet.CreateCall3(AsanStackFreeFunc, LocalStackBase, + ConstantInt::get(IntptrTy, LocalStackSize), + OrigStackBase); + } + } + + if (ClDebugStack) { + DEBUG(dbgs() << F); + } + + return true; +} diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 7b3a927a..e4c8cf1 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -1,15 +1,11 @@ add_llvm_library(LLVMInstrumentation + AddressSanitizer.cpp EdgeProfiling.cpp + FunctionBlackList.cpp GCOVProfiling.cpp Instrumentation.cpp OptimalEdgeProfiling.cpp PathProfiling.cpp ProfilingUtils.cpp - ) - -add_llvm_library_dependencies(LLVMInstrumentation - LLVMAnalysis - LLVMCore - LLVMSupport - LLVMTransformUtils + ThreadSanitizer.cpp ) diff --git a/lib/Transforms/Instrumentation/FunctionBlackList.cpp b/lib/Transforms/Instrumentation/FunctionBlackList.cpp new file mode 100644 index 0000000..188ea4d --- /dev/null +++ b/lib/Transforms/Instrumentation/FunctionBlackList.cpp @@ -0,0 +1,79 @@ +//===-- FunctionBlackList.cpp - blacklist of functions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a utility class for instrumentation passes (like AddressSanitizer +// or ThreadSanitizer) to avoid instrumenting some functions based on +// user-supplied blacklist. +// +//===----------------------------------------------------------------------===// + +#include "FunctionBlackList.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Function.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/system_error.h" + +namespace llvm { + +FunctionBlackList::FunctionBlackList(const std::string &Path) { + Functions = NULL; + const char *kFunPrefix = "fun:"; + if (!Path.size()) return; + std::string Fun; + + OwningPtr<MemoryBuffer> File; + if (error_code EC = MemoryBuffer::getFile(Path.c_str(), File)) { + report_fatal_error("Can't open blacklist file " + Path + ": " + + EC.message()); + } + MemoryBuffer *Buff = File.take(); + const char *Data = Buff->getBufferStart(); + size_t DataLen = Buff->getBufferSize(); + SmallVector<StringRef, 16> Lines; + SplitString(StringRef(Data, DataLen), Lines, "\n\r"); + for (size_t i = 0, numLines = Lines.size(); i < numLines; i++) { + if (Lines[i].startswith(kFunPrefix)) { + std::string ThisFunc = Lines[i].substr(strlen(kFunPrefix)); + std::string ThisFuncRE; + // add ThisFunc replacing * with .* + for (size_t j = 0, n = ThisFunc.size(); j < n; j++) { + if (ThisFunc[j] == '*') + ThisFuncRE += '.'; + ThisFuncRE += ThisFunc[j]; + } + // Check that the regexp is valid. + Regex CheckRE(ThisFuncRE); + std::string Error; + if (!CheckRE.isValid(Error)) + report_fatal_error("malformed blacklist regex: " + ThisFunc + + ": " + Error); + // Append to the final regexp. 
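+      // For illustration (not in the original code): the (hypothetical)
+      // entries "fun:*bad_function*" and "fun:MyClass_*" combine into the
+      // single regex ".*bad_function.*|MyClass_.*".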
+ if (Fun.size()) + Fun += "|"; + Fun += ThisFuncRE; + } + } + if (Fun.size()) { + Functions = new Regex(Fun); + } +} + +bool FunctionBlackList::isIn(const Function &F) { + if (Functions) { + bool Res = Functions->match(F.getName()); + return Res; + } + return false; +} + +} // namespace llvm diff --git a/lib/Transforms/Instrumentation/FunctionBlackList.h b/lib/Transforms/Instrumentation/FunctionBlackList.h new file mode 100644 index 0000000..c1239b9 --- /dev/null +++ b/lib/Transforms/Instrumentation/FunctionBlackList.h @@ -0,0 +1,37 @@ +//===-- FunctionBlackList.cpp - blacklist of functions ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +//===----------------------------------------------------------------------===// +// +// This is a utility class for instrumentation passes (like AddressSanitizer +// or ThreadSanitizer) to avoid instrumenting some functions based on +// user-supplied blacklist. +// +//===----------------------------------------------------------------------===// +// + +#include <string> + +namespace llvm { +class Function; +class Regex; + +// Blacklisted functions are not instrumented. +// The blacklist file contains one or more lines like this: +// --- +// fun:FunctionWildCard +// --- +// This is similar to the "ignore" feature of ThreadSanitizer. +// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores +class FunctionBlackList { + public: + FunctionBlackList(const std::string &Path); + bool isIn(const Function &F); + private: + Regex *Functions; +}; + +} // namespace llvm diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index ccf7e11..96e5d5b 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -43,12 +43,14 @@ namespace { public: static char ID; GCOVProfiler() - : ModulePass(ID), EmitNotes(true), EmitData(true), Use402Format(false) { + : ModulePass(ID), EmitNotes(true), EmitData(true), Use402Format(false), + UseExtraChecksum(false) { initializeGCOVProfilerPass(*PassRegistry::getPassRegistry()); } - GCOVProfiler(bool EmitNotes, bool EmitData, bool use402Format = false) + GCOVProfiler(bool EmitNotes, bool EmitData, bool use402Format = false, + bool useExtraChecksum = false) : ModulePass(ID), EmitNotes(EmitNotes), EmitData(EmitData), - Use402Format(use402Format) { + Use402Format(use402Format), UseExtraChecksum(useExtraChecksum) { assert((EmitNotes || EmitData) && "GCOVProfiler asked to do nothing?"); initializeGCOVProfilerPass(*PassRegistry::getPassRegistry()); } @@ -94,6 +96,7 @@ namespace { bool EmitNotes; bool EmitData; bool Use402Format; + bool UseExtraChecksum; Module *M; LLVMContext *Ctx; @@ -105,8 +108,9 @@ INITIALIZE_PASS(GCOVProfiler, "insert-gcov-profiling", "Insert instrumentation for GCOV profiling", false, false) ModulePass *llvm::createGCOVProfilerPass(bool EmitNotes, bool EmitData, - bool Use402Format) { - return new GCOVProfiler(EmitNotes, EmitData, Use402Format); + bool Use402Format, + bool UseExtraChecksum) { + return new GCOVProfiler(EmitNotes, EmitData, Use402Format, UseExtraChecksum); } namespace { @@ -167,7 +171,7 @@ namespace { } uint32_t length() { - // Here 2 = 1 for string lenght + 1 for '0' id#. + // Here 2 = 1 for string length + 1 for '0' id#. 
return lengthOfGCOVString(Filename) + 2 + Lines.size(); } @@ -244,10 +248,12 @@ namespace { // object users can construct, the blocks and lines will be rooted here. class GCOVFunction : public GCOVRecord { public: - GCOVFunction(DISubprogram SP, raw_ostream *os, bool Use402Format) { + GCOVFunction(DISubprogram SP, raw_ostream *os, + bool Use402Format, bool UseExtraChecksum) { this->os = os; Function *F = SP.getFunction(); + DEBUG(dbgs() << "Function: " << F->getName() << "\n"); uint32_t i = 0; for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { Blocks[BB] = new GCOVBlock(i++, os); @@ -257,14 +263,14 @@ namespace { writeBytes(FunctionTag, 4); uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(SP.getName()) + 1 + lengthOfGCOVString(SP.getFilename()) + 1; - if (!Use402Format) - ++BlockLen; // For second checksum. + if (UseExtraChecksum) + ++BlockLen; write(BlockLen); uint32_t Ident = reinterpret_cast<intptr_t>((MDNode*)SP); write(Ident); - write(0); // checksum #1 - if (!Use402Format) - write(0); // checksum #2 + write(0); // lineno checksum + if (UseExtraChecksum) + write(0); // cfg checksum writeGCOVString(SP.getName()); writeGCOVString(SP.getFilename()); write(SP.getLineNumber()); @@ -290,6 +296,7 @@ namespace { for (int i = 0, e = Blocks.size() + 1; i != e; ++i) { write(0); // No flags on our blocks. } + DEBUG(dbgs() << Blocks.size() << " blocks.\n"); // Emit edges between blocks. for (DenseMap<BasicBlock *, GCOVBlock *>::iterator I = Blocks.begin(), @@ -301,6 +308,8 @@ namespace { write(Block.OutEdges.size() * 2 + 1); write(Block.Number); for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) { + DEBUG(dbgs() << Block.Number << " -> " << Block.OutEdges[i]->Number + << "\n"); write(Block.OutEdges[i]->Number); write(0); // no flags } @@ -350,68 +359,60 @@ bool GCOVProfiler::runOnModule(Module &M) { } void GCOVProfiler::emitGCNO() { - DenseMap<const MDNode *, raw_fd_ostream *> GcnoFiles; NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); - if (CU_Nodes) { - for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { - // Each compile unit gets its own .gcno file. This means that whether we run - // this pass over the original .o's as they're produced, or run it after - // LTO, we'll generate the same .gcno files. 
- - DICompileUnit CU(CU_Nodes->getOperand(i)); - raw_fd_ostream *&out = GcnoFiles[CU]; - std::string ErrorInfo; - out = new raw_fd_ostream(mangleName(CU, "gcno").c_str(), ErrorInfo, - raw_fd_ostream::F_Binary); - if (!Use402Format) - out->write("oncg*404MVLL", 12); - else - out->write("oncg*204MVLL", 12); - - DIArray SPs = CU.getSubprograms(); - for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { - DISubprogram SP(SPs.getElement(i)); - if (!SP.Verify()) continue; - raw_fd_ostream *&os = GcnoFiles[CU]; - - Function *F = SP.getFunction(); - if (!F) continue; - GCOVFunction Func(SP, os, Use402Format); - - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - GCOVBlock &Block = Func.getBlock(BB); - TerminatorInst *TI = BB->getTerminator(); - if (int successors = TI->getNumSuccessors()) { - for (int i = 0; i != successors; ++i) { - Block.addEdge(Func.getBlock(TI->getSuccessor(i))); - } - } else if (isa<ReturnInst>(TI)) { - Block.addEdge(Func.getReturnBlock()); - } - - uint32_t Line = 0; - for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) { - const DebugLoc &Loc = I->getDebugLoc(); - if (Loc.isUnknown()) continue; - if (Line == Loc.getLine()) continue; - Line = Loc.getLine(); - if (SP != getDISubprogram(Loc.getScope(*Ctx))) continue; - - GCOVLines &Lines = Block.getFile(SP.getFilename()); - Lines.addLine(Loc.getLine()); + if (!CU_Nodes) return; + + for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { + // Each compile unit gets its own .gcno file. This means that whether we run + // this pass over the original .o's as they're produced, or run it after + // LTO, we'll generate the same .gcno files. + + DICompileUnit CU(CU_Nodes->getOperand(i)); + std::string ErrorInfo; + raw_fd_ostream out(mangleName(CU, "gcno").c_str(), ErrorInfo, + raw_fd_ostream::F_Binary); + if (!Use402Format) + out.write("oncg*404MVLL", 12); + else + out.write("oncg*204MVLL", 12); + + DIArray SPs = CU.getSubprograms(); + for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { + DISubprogram SP(SPs.getElement(i)); + if (!SP.Verify()) continue; + + Function *F = SP.getFunction(); + if (!F) continue; + GCOVFunction Func(SP, &out, Use402Format, UseExtraChecksum); + + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + GCOVBlock &Block = Func.getBlock(BB); + TerminatorInst *TI = BB->getTerminator(); + if (int successors = TI->getNumSuccessors()) { + for (int i = 0; i != successors; ++i) { + Block.addEdge(Func.getBlock(TI->getSuccessor(i))); } + } else if (isa<ReturnInst>(TI)) { + Block.addEdge(Func.getReturnBlock()); + } + + uint32_t Line = 0; + for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); + I != IE; ++I) { + const DebugLoc &Loc = I->getDebugLoc(); + if (Loc.isUnknown()) continue; + if (Line == Loc.getLine()) continue; + Line = Loc.getLine(); + if (SP != getDISubprogram(Loc.getScope(*Ctx))) continue; + + GCOVLines &Lines = Block.getFile(SP.getFilename()); + Lines.addLine(Loc.getLine()); } - Func.writeOut(); } + Func.writeOut(); } - } - - for (DenseMap<const MDNode *, raw_fd_ostream *>::iterator - I = GcnoFiles.begin(), E = GcnoFiles.end(); I != E; ++I) { - raw_fd_ostream *&out = I->second; - out->write("\0\0\0\0\0\0\0\0", 8); // EOF - out->close(); - delete out; + out.write("\0\0\0\0\0\0\0\0", 8); // EOF + out.close(); } } diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index 71adc1e..c7266e2 100644 --- 
a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -24,6 +24,8 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeOptimalEdgeProfilerPass(Registry); initializePathProfilerPass(Registry); initializeGCOVProfilerPass(Registry); + initializeAddressSanitizerPass(Registry); + initializeThreadSanitizerPass(Registry); } /// LLVMInitializeInstrumentation - C binding for diff --git a/lib/Transforms/Instrumentation/LLVMBuild.txt b/lib/Transforms/Instrumentation/LLVMBuild.txt new file mode 100644 index 0000000..d36ad54 --- /dev/null +++ b/lib/Transforms/Instrumentation/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Transforms/Instrumentation/LLVMBuild.txt -----------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = Instrumentation +parent = Transforms +required_libraries = Analysis Core Support TransformUtils diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp index 62c21b8..1fe1254 100644 --- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp @@ -69,7 +69,7 @@ inline static void printEdgeCounter(ProfileInfo::Edge e, BasicBlock* b, unsigned i) { DEBUG(dbgs() << "--Edge Counter for " << (e) << " in " \ - << ((b)?(b)->getNameStr():"0") << " (# " << (i) << ")\n"); + << ((b)?(b)->getName():"0") << " (# " << (i) << ")\n"); } bool OptimalEdgeProfiler::runOnModule(Module &M) { @@ -127,7 +127,7 @@ bool OptimalEdgeProfiler::runOnModule(Module &M) { unsigned i = 0; for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { if (F->isDeclaration()) continue; - DEBUG(dbgs() << "Working on " << F->getNameStr() << "\n"); + DEBUG(dbgs() << "Working on " << F->getName() << "\n"); // Calculate a Maximum Spanning Tree with the edge weights determined by // ProfileEstimator. ProfileEstimator also assign weights to the virtual diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp index 23915d3..b214796 100644 --- a/lib/Transforms/Instrumentation/PathProfiling.cpp +++ b/lib/Transforms/Instrumentation/PathProfiling.cpp @@ -665,7 +665,7 @@ void BLInstrumentationDag::unlinkPhony() { // Generate a .dot graph to represent the DAG and pathNumbers void BLInstrumentationDag::generateDotGraph() { std::string errorInfo; - std::string functionName = getFunction().getNameStr(); + std::string functionName = getFunction().getName().str(); std::string filename = "pathdag." + functionName + ".dot"; DEBUG (dbgs() << "Writing '" << filename << "'...\n"); @@ -750,7 +750,8 @@ Value* BLInstrumentationNode::getStartingPathNumber(){ // Sets the Value of the pathNumber. Used by the instrumentation code. void BLInstrumentationNode::setStartingPathNumber(Value* pathNumber) { DEBUG(dbgs() << " SPN-" << getName() << " <-- " << (pathNumber ? 
- pathNumber->getNameStr() : "unused") << "\n"); + pathNumber->getName() : + "unused") << "\n"); _startingPathNumber = pathNumber; } @@ -760,7 +761,7 @@ Value* BLInstrumentationNode::getEndingPathNumber(){ void BLInstrumentationNode::setEndingPathNumber(Value* pathNumber) { DEBUG(dbgs() << " EPN-" << getName() << " <-- " - << (pathNumber ? pathNumber->getNameStr() : "unused") << "\n"); + << (pathNumber ? pathNumber->getName() : "unused") << "\n"); _endingPathNumber = pathNumber; } @@ -1239,9 +1240,9 @@ void PathProfiler::insertInstrumentation( insertPoint++; DEBUG(dbgs() << "\nInstrumenting method call block '" - << node->getBlock()->getNameStr() << "'\n"); + << node->getBlock()->getName() << "'\n"); DEBUG(dbgs() << " Path number initialized: " - << ((node->getStartingPathNumber()) ? "yes" : "no") << "\n"); + << ((node->getStartingPathNumber()) ? "yes" : "no") << "\n"); Value* newpn; if( node->getStartingPathNumber() ) { @@ -1370,7 +1371,7 @@ bool PathProfiler::runOnModule(Module &M) { if (F->isDeclaration()) continue; - DEBUG(dbgs() << "Function: " << F->getNameStr() << "\n"); + DEBUG(dbgs() << "Function: " << F->getName() << "\n"); functionNumber++; // set function number diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp new file mode 100644 index 0000000..8bb337e --- /dev/null +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -0,0 +1,311 @@ +//===-- ThreadSanitizer.cpp - race detector -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer, a race detector. +// +// The tool is under development, for the details about previous versions see +// http://code.google.com/p/data-race-test +// +// The instrumentation phase is quite simple: +// - Insert calls to run-time library before every memory access. +// - Optimizations may apply to avoid instrumenting some of the accesses. +// - Insert calls at function entry/exit. +// The rest is handled by the run-time library. +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "tsan" + +#include "FunctionBlackList.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Intrinsics.h" +#include "llvm/Function.h" +#include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" +#include "llvm/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Type.h" + +using namespace llvm; + +static cl::opt<std::string> ClBlackListFile("tsan-blacklist", + cl::desc("Blacklist file"), cl::Hidden); + +static cl::opt<bool> ClPrintStats("tsan-print-stats", + cl::desc("Print ThreadSanitizer instrumentation stats"), cl::Hidden); + +namespace { + +// Stats counters for ThreadSanitizer instrumentation. 
+struct ThreadSanitizerStats { + size_t NumInstrumentedReads; + size_t NumInstrumentedWrites; + size_t NumOmittedReadsBeforeWrite; + size_t NumAccessesWithBadSize; + size_t NumInstrumentedVtableWrites; + size_t NumOmittedReadsFromConstantGlobals; + size_t NumOmittedReadsFromVtable; +}; + +/// ThreadSanitizer: instrument the code in module to find races. +struct ThreadSanitizer : public FunctionPass { + ThreadSanitizer(); + bool runOnFunction(Function &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + bool instrumentLoadOrStore(Instruction *I); + static char ID; // Pass identification, replacement for typeid. + + private: + void choseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, + SmallVectorImpl<Instruction*> &All); + bool addrPointsToConstantData(Value *Addr); + + TargetData *TD; + OwningPtr<FunctionBlackList> BL; + // Callbacks to run-time library are computed in doInitialization. + Value *TsanFuncEntry; + Value *TsanFuncExit; + // Accesses sizes are powers of two: 1, 2, 4, 8, 16. + static const size_t kNumberOfAccessSizes = 5; + Value *TsanRead[kNumberOfAccessSizes]; + Value *TsanWrite[kNumberOfAccessSizes]; + Value *TsanVptrUpdate; + + // Stats are modified w/o synchronization. + ThreadSanitizerStats stats; +}; +} // namespace + +char ThreadSanitizer::ID = 0; +INITIALIZE_PASS(ThreadSanitizer, "tsan", + "ThreadSanitizer: detects data races.", + false, false) + +ThreadSanitizer::ThreadSanitizer() + : FunctionPass(ID), + TD(NULL) { +} + +FunctionPass *llvm::createThreadSanitizerPass() { + return new ThreadSanitizer(); +} + +bool ThreadSanitizer::doInitialization(Module &M) { + TD = getAnalysisIfAvailable<TargetData>(); + if (!TD) + return false; + BL.reset(new FunctionBlackList(ClBlackListFile)); + memset(&stats, 0, sizeof(stats)); + + // Always insert a call to __tsan_init into the module's CTORs. + IRBuilder<> IRB(M.getContext()); + Value *TsanInit = M.getOrInsertFunction("__tsan_init", + IRB.getVoidTy(), NULL); + appendToGlobalCtors(M, cast<Function>(TsanInit), 0); + + // Initialize the callbacks. 
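+  // For illustration (not in the original code): with kNumberOfAccessSizes
+  // == 5, the loop below declares __tsan_read1/2/4/8/16 and
+  // __tsan_write1/2/4/8/16, each taking the accessed address as an i8*.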
+ TsanFuncEntry = M.getOrInsertFunction("__tsan_func_entry", IRB.getVoidTy(), + IRB.getInt8PtrTy(), NULL); + TsanFuncExit = M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(), + NULL); + for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { + SmallString<32> ReadName("__tsan_read"); + ReadName += itostr(1 << i); + TsanRead[i] = M.getOrInsertFunction(ReadName, IRB.getVoidTy(), + IRB.getInt8PtrTy(), NULL); + SmallString<32> WriteName("__tsan_write"); + WriteName += itostr(1 << i); + TsanWrite[i] = M.getOrInsertFunction(WriteName, IRB.getVoidTy(), + IRB.getInt8PtrTy(), NULL); + } + TsanVptrUpdate = M.getOrInsertFunction("__tsan_vptr_update", IRB.getVoidTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + NULL); + return true; +} + +bool ThreadSanitizer::doFinalization(Module &M) { + if (ClPrintStats) { + errs() << "ThreadSanitizerStats " << M.getModuleIdentifier() + << ": wr " << stats.NumInstrumentedWrites + << "; rd " << stats.NumInstrumentedReads + << "; vt " << stats.NumInstrumentedVtableWrites + << "; bs " << stats.NumAccessesWithBadSize + << "; rbw " << stats.NumOmittedReadsBeforeWrite + << "; rcg " << stats.NumOmittedReadsFromConstantGlobals + << "; rvt " << stats.NumOmittedReadsFromVtable + << "\n"; + } + return true; +} + +static bool isVtableAccess(Instruction *I) { + if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) { + if (Tag->getNumOperands() < 1) return false; + if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) { + if (Tag1->getString() == "vtable pointer") return true; + } + } + return false; +} + +bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { + // If this is a GEP, just analyze its pointer operand. + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr)) + Addr = GEP->getPointerOperand(); + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { + if (GV->isConstant()) { + // Reads from constant globals can not race with any writes. + stats.NumOmittedReadsFromConstantGlobals++; + return true; + } + } else if(LoadInst *L = dyn_cast<LoadInst>(Addr)) { + if (isVtableAccess(L)) { + // Reads from a vtable pointer can not race with any writes. + stats.NumOmittedReadsFromVtable++; + return true; + } + } + return false; +} + +// Instrumenting some of the accesses may be proven redundant. +// Currently handled: +// - read-before-write (within same BB, no calls between) +// +// We do not handle some of the patterns that should not survive +// after the classic compiler optimizations. +// E.g. two reads from the same temp should be eliminated by CSE, +// two writes should be eliminated by DSE, etc. +// +// 'Local' is a vector of insns within the same BB (no calls between). +// 'All' is a vector of insns that will be instrumented. +void ThreadSanitizer::choseInstructionsToInstrument( + SmallVectorImpl<Instruction*> &Local, + SmallVectorImpl<Instruction*> &All) { + SmallSet<Value*, 8> WriteTargets; + // Iterate from the end. + for (SmallVectorImpl<Instruction*>::reverse_iterator It = Local.rbegin(), + E = Local.rend(); It != E; ++It) { + Instruction *I = *It; + if (StoreInst *Store = dyn_cast<StoreInst>(I)) { + WriteTargets.insert(Store->getPointerOperand()); + } else { + LoadInst *Load = cast<LoadInst>(I); + Value *Addr = Load->getPointerOperand(); + if (WriteTargets.count(Addr)) { + // We will write to this temp, so no reason to analyze the read. + stats.NumOmittedReadsBeforeWrite++; + continue; + } + if (addrPointsToConstantData(Addr)) { + // Addr points to some constant data -- it can not race with any writes. 
+ continue; + } + } + All.push_back(I); + } + Local.clear(); +} + +bool ThreadSanitizer::runOnFunction(Function &F) { + if (!TD) return false; + if (BL->isIn(F)) return false; + SmallVector<Instruction*, 8> RetVec; + SmallVector<Instruction*, 8> AllLoadsAndStores; + SmallVector<Instruction*, 8> LocalLoadsAndStores; + bool Res = false; + bool HasCalls = false; + + // Traverse all instructions, collect loads/stores/returns, check for calls. + for (Function::iterator FI = F.begin(), FE = F.end(); + FI != FE; ++FI) { + BasicBlock &BB = *FI; + for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); + BI != BE; ++BI) { + if (isa<LoadInst>(BI) || isa<StoreInst>(BI)) + LocalLoadsAndStores.push_back(BI); + else if (isa<ReturnInst>(BI)) + RetVec.push_back(BI); + else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { + HasCalls = true; + choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + } + } + choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + } + + // We have collected all loads and stores. + // FIXME: many of these accesses do not need to be checked for races + // (e.g. variables that do not escape, etc). + + // Instrument memory accesses. + for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) { + Res |= instrumentLoadOrStore(AllLoadsAndStores[i]); + } + + // Instrument function entry/exit points if there were instrumented accesses. + if (Res || HasCalls) { + IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); + Value *ReturnAddress = IRB.CreateCall( + Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress), + IRB.getInt32(0)); + IRB.CreateCall(TsanFuncEntry, ReturnAddress); + for (size_t i = 0, n = RetVec.size(); i < n; ++i) { + IRBuilder<> IRBRet(RetVec[i]); + IRBRet.CreateCall(TsanFuncExit); + } + Res = true; + } + return Res; +} + +bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { + IRBuilder<> IRB(I); + bool IsWrite = isa<StoreInst>(*I); + Value *Addr = IsWrite + ? cast<StoreInst>(I)->getPointerOperand() + : cast<LoadInst>(I)->getPointerOperand(); + Type *OrigPtrTy = Addr->getType(); + Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); + assert(OrigTy->isSized()); + uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); + if (TypeSize != 8 && TypeSize != 16 && + TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { + stats.NumAccessesWithBadSize++; + // Ignore all unusual sizes. + return false; + } + if (IsWrite && isVtableAccess(I)) { + Value *StoredValue = cast<StoreInst>(I)->getValueOperand(); + IRB.CreateCall2(TsanVptrUpdate, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())); + stats.NumInstrumentedVtableWrites++; + return true; + } + size_t Idx = CountTrailingZeros_32(TypeSize / 8); + assert(Idx < kNumberOfAccessSizes); + Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; + IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy())); + if (IsWrite) stats.NumInstrumentedWrites++; + else stats.NumInstrumentedReads++; + return true; +} diff --git a/lib/Transforms/LLVMBuild.txt b/lib/Transforms/LLVMBuild.txt new file mode 100644 index 0000000..f7bca06 --- /dev/null +++ b/lib/Transforms/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Transforms/LLVMBuild.txt ---------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. 
+; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize + +[component_0] +type = Group +name = Transforms +parent = Libraries diff --git a/lib/Transforms/Makefile b/lib/Transforms/Makefile index e527be2..8b1df92 100644 --- a/lib/Transforms/Makefile +++ b/lib/Transforms/Makefile @@ -8,7 +8,7 @@ ##===----------------------------------------------------------------------===## LEVEL = ../.. -PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Hello +PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello include $(LEVEL)/Makefile.config diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 79bcae5..d660c72 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -7,6 +7,7 @@ add_llvm_library(LLVMScalarOpts DCE.cpp DeadStoreElimination.cpp EarlyCSE.cpp + GlobalMerge.cpp GVN.cpp IndVarSimplify.cpp JumpThreading.cpp @@ -31,12 +32,3 @@ add_llvm_library(LLVMScalarOpts Sink.cpp TailRecursionElimination.cpp ) - -add_llvm_library_dependencies(LLVMScalarOpts - LLVMAnalysis - LLVMCore - LLVMInstCombine - LLVMSupport - LLVMTarget - LLVMTransformUtils - ) diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index f8f18b2..9a5423f 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Transforms/Utils/AddrModeMatcher.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -64,11 +65,17 @@ static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); +// FIXME: Remove this abomination once all of the tests pass without it! +static cl::opt<bool> DisableDeleteDeadBlocks( + "disable-cgp-delete-dead-blocks", cl::Hidden, cl::init(false), + cl::desc("Disable deleting dead blocks in CodeGenPrepare")); + namespace { class CodeGenPrepare : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining /// transformation profitability. 
const TargetLowering *TLI; + const TargetLibraryInfo *TLInfo; DominatorTree *DT; ProfileInfo *PFI; @@ -97,6 +104,7 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); AU.addPreserved<ProfileInfo>(); + AU.addRequired<TargetLibraryInfo>(); } private: @@ -116,7 +124,10 @@ namespace { } char CodeGenPrepare::ID = 0; -INITIALIZE_PASS(CodeGenPrepare, "codegenprepare", +INITIALIZE_PASS_BEGIN(CodeGenPrepare, "codegenprepare", + "Optimize for code generation", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare", "Optimize for code generation", false, false) FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) { @@ -127,6 +138,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { bool EverMadeChange = false; ModifiedDT = false; + TLInfo = &getAnalysis<TargetLibraryInfo>(); DT = getAnalysisIfAvailable<DominatorTree>(); PFI = getAnalysisIfAvailable<ProfileInfo>(); @@ -153,8 +165,22 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (!DisableBranchOpts) { MadeChange = false; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) + SmallPtrSet<BasicBlock*, 8> WorkList; + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); MadeChange |= ConstantFoldTerminator(BB, true); + if (!MadeChange) continue; + + for (SmallVectorImpl<BasicBlock*>::iterator + II = Successors.begin(), IE = Successors.end(); II != IE; ++II) + if (pred_begin(*II) == pred_end(*II)) + WorkList.insert(*II); + } + + if (!DisableDeleteDeadBlocks) + for (SmallPtrSet<BasicBlock*, 8>::iterator + I = WorkList.begin(), E = WorkList.end(); I != E; ++I) + DeleteDeadBlock(*I); if (MadeChange) ModifiedDT = true; @@ -541,8 +567,8 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // happens. WeakVH IterHandle(CurInstIterator); - ReplaceAndSimplifyAllUses(CI, RetVal, TLI ? TLI->getTargetData() : 0, - ModifiedDT ? 0 : DT); + replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getTargetData() : 0, + TLInfo, ModifiedDT ? 0 : DT); // If the iterator instruction was recursively deleted, start over at the // start of the block. @@ -553,6 +579,15 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { return true; } + if (II && TLI) { + SmallVector<Value*, 2> PtrOps; + Type *AccessTy; + if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy)) + while (!PtrOps.empty()) + if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy)) + return true; + } + // From here on out we're working with named functions. if (CI->getCalledFunction() == 0) return false; @@ -612,7 +647,7 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { // It's not safe to eliminate the sign / zero extension of the return value. // See llvm::isInTailCallPosition(). const Function *F = BB->getParent(); - unsigned CallerRetAttr = F->getAttributes().getRetAttributes(); + Attributes CallerRetAttr = F->getAttributes().getRetAttributes(); if ((CallerRetAttr & Attribute::ZExt) || (CallerRetAttr & Attribute::SExt)) return false; @@ -667,7 +702,7 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { // Conservatively require the attributes of the call to match those of the // return. Ignore noalias because it doesn't affect the call sequence. 
- unsigned CalleeRetAttr = CS.getAttributes().getRetAttributes(); + Attributes CalleeRetAttr = CS.getAttributes().getRetAttributes(); if ((CalleeRetAttr ^ CallerRetAttr) & ~Attribute::NoAlias) continue; diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index 664c3f6..5430f62 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -24,6 +24,8 @@ #include "llvm/Constant.h" #include "llvm/Instruction.h" #include "llvm/Pass.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Support/InstIterator.h" #include "llvm/ADT/Statistic.h" #include <set> @@ -42,19 +44,22 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfo>(); } }; } char ConstantPropagation::ID = 0; -INITIALIZE_PASS(ConstantPropagation, "constprop", +INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop", + "Simple constant propagation", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(ConstantPropagation, "constprop", "Simple constant propagation", false, false) FunctionPass *llvm::createConstantPropagationPass() { return new ConstantPropagation(); } - bool ConstantPropagation::runOnFunction(Function &F) { // Initialize the worklist to all of the instructions ready to process... std::set<Instruction*> WorkList; @@ -62,13 +67,15 @@ bool ConstantPropagation::runOnFunction(Function &F) { WorkList.insert(&*i); } bool Changed = false; + TargetData *TD = getAnalysisIfAvailable<TargetData>(); + TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); while (!WorkList.empty()) { Instruction *I = *WorkList.begin(); WorkList.erase(WorkList.begin()); // Get an element from the worklist... if (!I->use_empty()) // Don't muck with dead instructions... - if (Constant *C = ConstantFoldInstruction(I)) { + if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) { // Add all of the users of this instruction to the worklist, they might // be constant propagatable now... for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index e275268..9b0aadb 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -28,6 +28,7 @@ STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumSelects, "Number of selects propagated"); STATISTIC(NumMemAccess, "Number of memory access targets propagated"); STATISTIC(NumCmps, "Number of comparisons propagated"); +STATISTIC(NumDeadCases, "Number of switch cases removed"); namespace { class CorrelatedValuePropagation : public FunctionPass { @@ -37,6 +38,7 @@ namespace { bool processPHI(PHINode *P); bool processMemAccess(Instruction *I); bool processCmp(CmpInst *C); + bool processSwitch(SwitchInst *SI); public: static char ID; @@ -110,7 +112,8 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { Changed = true; } - ++NumPhis; + if (Changed) + ++NumPhis; return Changed; } @@ -173,6 +176,86 @@ bool CorrelatedValuePropagation::processCmp(CmpInst *C) { return true; } +/// processSwitch - Simplify a switch instruction by removing cases which can +/// never fire. If the uselessness of a case could be determined locally then +/// constant propagation would already have figured it out. 
Instead, walk the +/// predecessors and statically evaluate cases based on information available +/// on that edge. Cases that cannot fire no matter what the incoming edge can +/// safely be removed. If a case fires on every incoming edge then the entire +/// switch can be removed and replaced with a branch to the case destination. +bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { + Value *Cond = SI->getCondition(); + BasicBlock *BB = SI->getParent(); + + // If the condition was defined in same block as the switch then LazyValueInfo + // currently won't say anything useful about it, though in theory it could. + if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB) + return false; + + // If the switch is unreachable then trying to improve it is a waste of time. + pred_iterator PB = pred_begin(BB), PE = pred_end(BB); + if (PB == PE) return false; + + // Analyse each switch case in turn. This is done in reverse order so that + // removing a case doesn't cause trouble for the iteration. + bool Changed = false; + for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE; + ) { + ConstantInt *Case = CI.getCaseValue(); + + // Check to see if the switch condition is equal to/not equal to the case + // value on every incoming edge, equal/not equal being the same each time. + LazyValueInfo::Tristate State = LazyValueInfo::Unknown; + for (pred_iterator PI = PB; PI != PE; ++PI) { + // Is the switch condition equal to the case value? + LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ, + Cond, Case, *PI, BB); + // Give up on this case if nothing is known. + if (Value == LazyValueInfo::Unknown) { + State = LazyValueInfo::Unknown; + break; + } + + // If this was the first edge to be visited, record that all other edges + // need to give the same result. + if (PI == PB) { + State = Value; + continue; + } + + // If this case is known to fire for some edges and known not to fire for + // others then there is nothing we can do - give up. + if (Value != State) { + State = LazyValueInfo::Unknown; + break; + } + } + + if (State == LazyValueInfo::False) { + // This case never fires - remove it. + CI.getCaseSuccessor()->removePredecessor(BB); + SI->removeCase(CI); // Does not invalidate the iterator. + ++NumDeadCases; + Changed = true; + } else if (State == LazyValueInfo::True) { + // This case always fires. Arrange for the switch to be turned into an + // unconditional branch by replacing the switch condition with the case + // value. + SI->setCondition(Case); + NumDeadCases += SI->getNumCases(); + Changed = true; + break; + } + } + + if (Changed) + // If the switch has been simplified to the point where it can be replaced + // by a branch then do so now. 
+ ConstantFoldTerminator(BB); + + return Changed; +} + bool CorrelatedValuePropagation::runOnFunction(Function &F) { LVI = &getAnalysis<LazyValueInfo>(); @@ -200,6 +283,13 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { } } + Instruction *Term = FI->getTerminator(); + switch (Term->getOpcode()) { + case Instruction::Switch: + BBChanged |= processSwitch(cast<SwitchInst>(Term)); + break; + } + FnChanged |= BBChanged; } diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index a593d0f..c8c5360 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -24,6 +24,7 @@ #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" @@ -33,6 +34,7 @@ #include "llvm/Support/Debug.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" using namespace llvm; STATISTIC(NumFastStores, "Number of stores deleted"); @@ -42,25 +44,26 @@ namespace { struct DSE : public FunctionPass { AliasAnalysis *AA; MemoryDependenceAnalysis *MD; + DominatorTree *DT; static char ID; // Pass identification, replacement for typeid - DSE() : FunctionPass(ID), AA(0), MD(0) { + DSE() : FunctionPass(ID), AA(0), MD(0), DT(0) { initializeDSEPass(*PassRegistry::getPassRegistry()); } virtual bool runOnFunction(Function &F) { AA = &getAnalysis<AliasAnalysis>(); MD = &getAnalysis<MemoryDependenceAnalysis>(); - DominatorTree &DT = getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTree>(); bool Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) // Only check non-dead blocks. Dead blocks may have strange pointer // cycles that will confuse alias analysis. - if (DT.isReachableFromEntry(I)) + if (DT->isReachableFromEntry(I)) Changed |= runOnBasicBlock(*I); - AA = 0; MD = 0; + AA = 0; MD = 0; DT = 0; return Changed; } @@ -221,7 +224,7 @@ static bool isRemovable(Instruction *I) { IntrinsicInst *II = cast<IntrinsicInst>(I); switch (II->getIntrinsicID()) { - default: assert(0 && "doesn't pass 'hasMemoryWrite' predicate"); + default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate"); case Intrinsic::lifetime_end: // Never remove dead lifetime_end's, e.g. because it is followed by a // free. @@ -238,6 +241,24 @@ static bool isRemovable(Instruction *I) { } } + +/// isShortenable - Returns true if this instruction can be safely shortened in +/// length. +static bool isShortenable(Instruction *I) { + // Don't shorten stores for now + if (isa<StoreInst>(I)) + return false; + + IntrinsicInst *II = cast<IntrinsicInst>(I); + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::memset: + case Intrinsic::memcpy: + // Do shorten memory intrinsics. + return true; + } +} + /// getStoredPointerOperand - Return the pointer that is being written to. 
static Value *getStoredPointerOperand(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) @@ -247,46 +268,61 @@ static Value *getStoredPointerOperand(Instruction *I) { IntrinsicInst *II = cast<IntrinsicInst>(I); switch (II->getIntrinsicID()) { - default: assert(false && "Unexpected intrinsic!"); + default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::init_trampoline: return II->getArgOperand(0); } } -static uint64_t getPointerSize(Value *V, AliasAnalysis &AA) { +static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { const TargetData *TD = AA.getTargetData(); + + if (const CallInst *CI = extractMallocCall(V)) { + if (const ConstantInt *C = dyn_cast<ConstantInt>(CI->getArgOperand(0))) + return C->getZExtValue(); + } + if (TD == 0) return AliasAnalysis::UnknownSize; - if (AllocaInst *A = dyn_cast<AllocaInst>(V)) { + if (const AllocaInst *A = dyn_cast<AllocaInst>(V)) { // Get size information for the alloca - if (ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) + if (const ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); - return AliasAnalysis::UnknownSize; } - assert(isa<Argument>(V) && "Expected AllocaInst or Argument!"); - PointerType *PT = cast<PointerType>(V->getType()); - return TD->getTypeAllocSize(PT->getElementType()); + if (const Argument *A = dyn_cast<Argument>(V)) { + if (A->hasByValAttr()) + if (PointerType *PT = dyn_cast<PointerType>(A->getType())) + return TD->getTypeAllocSize(PT->getElementType()); + } + + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) { + if (!GV->mayBeOverridden()) + return TD->getTypeAllocSize(GV->getType()->getElementType()); + } + + return AliasAnalysis::UnknownSize; } -/// isObjectPointerWithTrustworthySize - Return true if the specified Value* is -/// pointing to an object with a pointer size we can trust. -static bool isObjectPointerWithTrustworthySize(const Value *V) { - if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) - return !AI->isArrayAllocation(); - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) - return !GV->mayBeOverridden(); - if (const Argument *A = dyn_cast<Argument>(V)) - return A->hasByValAttr(); - return false; +namespace { + enum OverwriteResult + { + OverwriteComplete, + OverwriteEnd, + OverwriteUnknown + }; } -/// isCompleteOverwrite - Return true if a store to the 'Later' location +/// isOverwrite - Return 'OverwriteComplete' if a store to the 'Later' location /// completely overwrites a store to the 'Earlier' location. -static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, - const AliasAnalysis::Location &Earlier, - AliasAnalysis &AA) { +/// 'OverwriteEnd' if the end of the 'Earlier' location is completely +/// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined +static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, + const AliasAnalysis::Location &Earlier, + AliasAnalysis &AA, + int64_t &EarlierOff, + int64_t &LaterOff) { const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); @@ -300,23 +336,24 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, // If we have no TargetData information around, then the size of the store // is inferrable from the pointee type. If they are the same type, then // we know that the store is safe. 
- if (AA.getTargetData() == 0) - return Later.Ptr->getType() == Earlier.Ptr->getType(); - return false; + if (AA.getTargetData() == 0 && + Later.Ptr->getType() == Earlier.Ptr->getType()) + return OverwriteComplete; + + return OverwriteUnknown; } // Make sure that the Later size is >= the Earlier size. - if (Later.Size < Earlier.Size) - return false; - return true; + if (Later.Size >= Earlier.Size) + return OverwriteComplete; } // Otherwise, we have to have size information, and the later store has to be // larger than the earlier one. if (Later.Size == AliasAnalysis::UnknownSize || Earlier.Size == AliasAnalysis::UnknownSize || - Later.Size <= Earlier.Size || AA.getTargetData() == 0) - return false; + AA.getTargetData() == 0) + return OverwriteUnknown; // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval argument). If so, then it clearly overwrites any @@ -329,26 +366,25 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. if (UO1 != UO2) - return false; + return OverwriteUnknown; // If the "Later" store is to a recognizable object, get its size. - if (isObjectPointerWithTrustworthySize(UO2)) { - uint64_t ObjectSize = - TD.getTypeAllocSize(cast<PointerType>(UO2->getType())->getElementType()); - if (ObjectSize == Later.Size) - return true; - } + uint64_t ObjectSize = getPointerSize(UO2, AA); + if (ObjectSize != AliasAnalysis::UnknownSize) + if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size) + return OverwriteComplete; // Okay, we have stores to two completely different pointers. Try to // decompose the pointer into a "base + constant_offset" form. If the base // pointers are equal, then we can reason about the two stores. - int64_t EarlierOff = 0, LaterOff = 0; + EarlierOff = 0; + LaterOff = 0; const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, TD); const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, TD); // If the base pointers still differ, we have two completely different stores. if (BP1 != BP2) - return false; + return OverwriteUnknown; // The later store completely overlaps the earlier store if: // @@ -366,11 +402,25 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, // // We have to be careful here as *Off is signed while *.Size is unsigned. if (EarlierOff >= LaterOff && + Later.Size > Earlier.Size && uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size) - return true; + return OverwriteComplete; + + // The other interesting case is if the later store overwrites the end of + // the earlier store + // + // |--earlier--| + // |-- later --| + // + // In this case we may want to trim the size of earlier to avoid generating + // writes to addresses which will definitely be overwritten later + if (LaterOff > EarlierOff && + LaterOff < int64_t(EarlierOff + Earlier.Size) && + int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)) + return OverwriteEnd; // Otherwise, they don't completely overlap. - return false; + return OverwriteUnknown; } /// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a @@ -494,22 +544,52 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // If we find a write that is a) removable (i.e., non-volatile), b) is // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. 
-    if (isRemovable(DepWrite) && isCompleteOverwrite(Loc, DepLoc, *AA) &&
+    if (isRemovable(DepWrite) &&
         !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) {
-      DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: "
-            << *DepWrite << "\n  KILLER: " << *Inst << '\n');
-
-      // Delete the store and now-dead instructions that feed it.
-      DeleteDeadInstruction(DepWrite, *MD);
-      ++NumFastStores;
-      MadeChange = true;
-
-      // DeleteDeadInstruction can delete the current instruction in loop
-      // cases, reset BBI.
-      BBI = Inst;
-      if (BBI != BB.begin())
-        --BBI;
-      break;
+      int64_t InstWriteOffset, DepWriteOffset;
+      OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA,
+                                       DepWriteOffset, InstWriteOffset);
+      if (OR == OverwriteComplete) {
+        DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: "
+              << *DepWrite << "\n  KILLER: " << *Inst << '\n');
+
+        // Delete the store and now-dead instructions that feed it.
+        DeleteDeadInstruction(DepWrite, *MD);
+        ++NumFastStores;
+        MadeChange = true;
+
+        // DeleteDeadInstruction can delete the current instruction in loop
+        // cases, reset BBI.
+        BBI = Inst;
+        if (BBI != BB.begin())
+          --BBI;
+        break;
+      } else if (OR == OverwriteEnd && isShortenable(DepWrite)) {
+        // TODO: base this on the target vector size so that if the earlier
+        // store was too small to get vector writes anyway then it's likely
+        // a good idea to shorten it.
+        // Power-of-2 vector writes are probably always a bad idea to
+        // optimize, as any store/memset/memcpy is likely using vector
+        // instructions, so shortening it below the vector size is likely
+        // to be slower.
+        MemIntrinsic* DepIntrinsic = cast<MemIntrinsic>(DepWrite);
+        unsigned DepWriteAlign = DepIntrinsic->getAlignment();
+        if (llvm::isPowerOf2_64(InstWriteOffset) ||
+            ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) {
+
+          DEBUG(dbgs() << "DSE: Remove Dead Store:\n  OW END: "
+                << *DepWrite << "\n  KILLER (offset "
+                << InstWriteOffset << ", "
+                << DepLoc.Size << ")"
+                << *Inst << '\n');
+
+          Value* DepWriteLength = DepIntrinsic->getLength();
+          Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(),
+                                                  InstWriteOffset -
+                                                  DepWriteOffset);
+          DepIntrinsic->setLength(TrimmedLength);
+          MadeChange = true;
+        }
+      }
     }

     // If this is a may-aliased store that is clobbering the store value, we
@@ -538,37 +618,67 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
   return MadeChange;
 }

+/// Find all blocks that will unconditionally lead to the block BB and append
+/// them to Blocks.
+static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
+                                   BasicBlock *BB, DominatorTree *DT) {
+  for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+    BasicBlock *Pred = *I;
+    if (Pred == BB) continue;
+    TerminatorInst *PredTI = Pred->getTerminator();
+    if (PredTI->getNumSuccessors() != 1)
+      continue;
+
+    if (DT->isReachableFromEntry(Pred))
+      Blocks.push_back(Pred);
+  }
+}
+
 /// HandleFree - Handle frees of entire structures whose dependency is a store
 /// to a field of that structure.
bool DSE::HandleFree(CallInst *F) {
bool MadeChange = false;
- MemDepResult Dep = MD->getDependency(F);
+ AliasAnalysis::Location Loc = AliasAnalysis::Location(F->getOperand(0));
+ SmallVector<BasicBlock *, 16> Blocks;
+ Blocks.push_back(F->getParent());
- while (Dep.isDef() || Dep.isClobber()) {
- Instruction *Dependency = Dep.getInst();
- if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency))
- return MadeChange;
+ while (!Blocks.empty()) {
+ BasicBlock *BB = Blocks.pop_back_val();
+ Instruction *InstPt = BB->getTerminator();
+ if (BB == F->getParent()) InstPt = F;
- Value *DepPointer =
- GetUnderlyingObject(getStoredPointerOperand(Dependency));
+ MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB);
+ while (Dep.isDef() || Dep.isClobber()) {
+ Instruction *Dependency = Dep.getInst();
+ if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency))
+ break;
- // Check for aliasing.
- if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
- return MadeChange;
+ Value *DepPointer =
+ GetUnderlyingObject(getStoredPointerOperand(Dependency));
- // DCE instructions only used to calculate that store
- DeleteDeadInstruction(Dependency, *MD);
- ++NumFastStores;
- MadeChange = true;
+ // Check for aliasing.
+ if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
+ break;
- // Inst's old Dependency is now deleted. Compute the next dependency,
- // which may also be dead, as in
- // s[0] = 0;
- // s[1] = 0; // This has just been deleted.
- // free(s);
- Dep = MD->getDependency(F);
- };
+ Instruction *Next = llvm::next(BasicBlock::iterator(Dependency));
+
+ // DCE instructions only used to calculate that store
+ DeleteDeadInstruction(Dependency, *MD);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // Inst's old Dependency is now deleted. Compute the next dependency,
+ // which may also be dead, as in
+ // s[0] = 0;
+ // s[1] = 0; // This has just been deleted.
+ // free(s);
+ Dep = MD->getPointerDependencyFrom(Loc, false, Next, BB);
+ }
+
+ if (Dep.isNonLocal())
+ FindUnconditionalPreds(Blocks, BB, DT);
+ }
return MadeChange;
}
@@ -588,10 +698,17 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// Find all of the alloca'd pointers in the entry block.
BasicBlock *Entry = BB.getParent()->begin();
- for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I)
+ for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
DeadStackObjects.insert(AI);
+ // Okay, so these are dead heap objects, but if the pointer never escapes
+ // then it's leaked by this function anyway.
+ if (CallInst *CI = extractMallocCall(I))
+ if (!PointerMayBeCaptured(CI, true, true))
+ DeadStackObjects.insert(CI);
+ }
+
// Treat byval arguments the same, stores to them are dead at the end of the
// function.
for (Function::arg_iterator AI = BB.getParent()->arg_begin(),
@@ -637,6 +754,11 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
continue;
}
+ if (CallInst *CI = extractMallocCall(BBI)) {
+ DeadStackObjects.erase(CI);
+ continue;
+ }
+
if (CallSite CS = cast<Value>(BBI)) {
// If this call does not access memory, it can't be loading any of our
// pointers.
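[Editor's note] The extractMallocCall/PointerMayBeCaptured additions above extend the same end-of-function reasoning from allocas to non-escaping heap objects; a hedged source-level illustration, with names invented for the example:

#include <cstdlib>

int no_escape(void) {
  int *p = (int *)malloc(sizeof(int));
  if (!p)
    return 0;
  *p = 42;   // removable: p never escapes, so no one can observe the store
  return 1;  // the object is simply leaked when the function returns
}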
@@ -732,4 +854,3 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
I != E; ++I)
DeadStackObjects.erase(*I);
}
-
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index c0223d2..f3c92d6 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -19,11 +19,13 @@
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/Statistic.h"
+#include <deque>
using namespace llvm;
STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
@@ -215,6 +217,7 @@ namespace {
class EarlyCSE : public FunctionPass {
public:
const TargetData *TD;
+ const TargetLibraryInfo *TLI;
DominatorTree *DT;
typedef RecyclingAllocator<BumpPtrAllocator,
ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy;
@@ -257,12 +260,77 @@ public:
bool runOnFunction(Function &F);
private:
-
+
+ // NodeScope - almost a POD, but needs to call the constructors for the
+ // scoped hash tables so that a new scope gets pushed on. These are RAII so
+ // that the scope gets popped when the NodeScope is destroyed.
+ class NodeScope {
+ public:
+ NodeScope(ScopedHTType *availableValues,
+ LoadHTType *availableLoads,
+ CallHTType *availableCalls) :
+ Scope(*availableValues),
+ LoadScope(*availableLoads),
+ CallScope(*availableCalls) {}
+
+ private:
+ NodeScope(const NodeScope&); // DO NOT IMPLEMENT
+
+ ScopedHTType::ScopeTy Scope;
+ LoadHTType::ScopeTy LoadScope;
+ CallHTType::ScopeTy CallScope;
+ };
+
+ // StackNode - contains all the needed information to create a stack for
+ // doing a depth-first traversal of the tree. This includes scopes for
+ // values, loads, and calls as well as the generation. There is a child
+ // iterator so that the children do not need to be stored separately.
+ class StackNode {
+ public:
+ StackNode(ScopedHTType *availableValues,
+ LoadHTType *availableLoads,
+ CallHTType *availableCalls,
+ unsigned cg, DomTreeNode *n,
+ DomTreeNode::iterator child, DomTreeNode::iterator end) :
+ CurrentGeneration(cg), ChildGeneration(cg), Node(n),
+ ChildIter(child), EndIter(end),
+ Scopes(availableValues, availableLoads, availableCalls),
+ Processed(false) {}
+
+ // Accessors.
+ unsigned currentGeneration() { return CurrentGeneration; }
+ unsigned childGeneration() { return ChildGeneration; }
+ void childGeneration(unsigned generation) { ChildGeneration = generation; }
+ DomTreeNode *node() { return Node; }
+ DomTreeNode::iterator childIter() { return ChildIter; }
+ DomTreeNode *nextChild() {
+ DomTreeNode *child = *ChildIter;
+ ++ChildIter;
+ return child;
+ }
+ DomTreeNode::iterator end() { return EndIter; }
+ bool isProcessed() { return Processed; }
+ void process() { Processed = true; }
+
+ private:
+ StackNode(const StackNode&); // DO NOT IMPLEMENT
+
+ // Members.
+ unsigned CurrentGeneration; + unsigned ChildGeneration; + DomTreeNode *Node; + DomTreeNode::iterator ChildIter; + DomTreeNode::iterator EndIter; + NodeScope Scopes; + bool Processed; + }; + bool processNode(DomTreeNode *Node); // This transformation requires dominator postdominator info virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<DominatorTree>(); + AU.addRequired<TargetLibraryInfo>(); AU.setPreservesCFG(); } }; @@ -277,22 +345,10 @@ FunctionPass *llvm::createEarlyCSEPass() { INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false) bool EarlyCSE::processNode(DomTreeNode *Node) { - // Define a scope in the scoped hash table. When we are done processing this - // domtree node and recurse back up to our parent domtree node, this will pop - // off all the values we install. - ScopedHTType::ScopeTy Scope(*AvailableValues); - - // Define a scope for the load values so that anything we add will get - // popped when we recurse back up to our parent domtree node. - LoadHTType::ScopeTy LoadScope(*AvailableLoads); - - // Define a scope for the call values so that anything we add will get - // popped when we recurse back up to our parent domtree node. - CallHTType::ScopeTy CallScope(*AvailableCalls); - BasicBlock *BB = Node->getBlock(); // If this block has a single predecessor, then the predecessor is the parent @@ -328,7 +384,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. - if (Value *V = SimplifyInstruction(Inst, TD, DT)) { + if (Value *V = SimplifyInstruction(Inst, TD, TLI, DT)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); @@ -442,19 +498,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { } } } - - unsigned LiveOutGeneration = CurrentGeneration; - for (DomTreeNode::iterator I = Node->begin(), E = Node->end(); I != E; ++I) { - Changed |= processNode(*I); - // Pop any generation changes off the stack from the recursive walk. - CurrentGeneration = LiveOutGeneration; - } + return Changed; } bool EarlyCSE::runOnFunction(Function &F) { + std::deque<StackNode *> nodesToProcess; + TD = getAnalysisIfAvailable<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); DT = &getAnalysis<DominatorTree>(); // Tables that the pass uses when walking the domtree. @@ -466,5 +519,52 @@ bool EarlyCSE::runOnFunction(Function &F) { AvailableCalls = &CallTable; CurrentGeneration = 0; - return processNode(DT->getRootNode()); + bool Changed = false; + + // Process the root node. + nodesToProcess.push_front( + new StackNode(AvailableValues, AvailableLoads, AvailableCalls, + CurrentGeneration, DT->getRootNode(), + DT->getRootNode()->begin(), + DT->getRootNode()->end())); + + // Save the current generation. + unsigned LiveOutGeneration = CurrentGeneration; + + // Process the stack. + while (!nodesToProcess.empty()) { + // Grab the first item off the stack. Set the current generation, remove + // the node from the stack, and process it. + StackNode *NodeToProcess = nodesToProcess.front(); + + // Initialize class members. + CurrentGeneration = NodeToProcess->currentGeneration(); + + // Check if the node needs to be processed. + if (!NodeToProcess->isProcessed()) { + // Process the node. 
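// (Editorial note on this driver loop, summarizing the structures just
// defined: the explicit stack replaces the old recursive domtree walk. Each
// StackNode owns a NodeScope, so the scoped hash-table scopes are pushed
// when a node is first pushed and popped when the fully-processed node is
// deleted below, preserving the LIFO scoping of the recursion without
// risking stack overflow on very deep dominator trees.)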
+ Changed |= processNode(NodeToProcess->node()); + NodeToProcess->childGeneration(CurrentGeneration); + NodeToProcess->process(); + } else if (NodeToProcess->childIter() != NodeToProcess->end()) { + // Push the next child onto the stack. + DomTreeNode *child = NodeToProcess->nextChild(); + nodesToProcess.push_front( + new StackNode(AvailableValues, + AvailableLoads, + AvailableCalls, + NodeToProcess->childGeneration(), child, + child->begin(), child->end())); + } else { + // It has been processed, and there are no more children to process, + // so delete it and pop it off the stack. + delete NodeToProcess; + nodesToProcess.pop_front(); + } + } // while (!nodes...) + + // Reset the current generation. + CurrentGeneration = LiveOutGeneration; + + return Changed; } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index cbfdbcd..fb733ad 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -31,10 +31,12 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/Allocator.h" @@ -83,6 +85,12 @@ namespace { return false; return true; } + + friend hash_code hash_value(const Expression &Value) { + return hash_combine(Value.opcode, Value.type, + hash_combine_range(Value.varargs.begin(), + Value.varargs.end())); + } }; class ValueTable { @@ -95,12 +103,17 @@ namespace { uint32_t nextValueNumber; Expression create_expression(Instruction* I); + Expression create_cmp_expression(unsigned Opcode, + CmpInst::Predicate Predicate, + Value *LHS, Value *RHS); Expression create_extractvalue_expression(ExtractValueInst* EI); uint32_t lookup_or_add_call(CallInst* C); public: ValueTable() : nextValueNumber(1) { } uint32_t lookup_or_add(Value *V); uint32_t lookup(Value *V) const; + uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred, + Value *LHS, Value *RHS); void add(Value *V, uint32_t num); void clear(); void erase(Value *v); @@ -124,16 +137,8 @@ template <> struct DenseMapInfo<Expression> { } static unsigned getHashValue(const Expression e) { - unsigned hash = e.opcode; - - hash = ((unsigned)((uintptr_t)e.type >> 4) ^ - (unsigned)((uintptr_t)e.type >> 9)); - - for (SmallVector<uint32_t, 4>::const_iterator I = e.varargs.begin(), - E = e.varargs.end(); I != E; ++I) - hash = *I + hash * 37; - - return hash; + using llvm::hash_value; + return static_cast<unsigned>(hash_value(e)); } static bool isEqual(const Expression &LHS, const Expression &RHS) { return LHS == RHS; @@ -153,9 +158,24 @@ Expression ValueTable::create_expression(Instruction *I) { for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) e.varargs.push_back(lookup_or_add(*OI)); + if (I->isCommutative()) { + // Ensure that commutative instructions that only differ by a permutation + // of their operands get the same value number by sorting the operand value + // numbers. Since all commutative instructions have two operands it is more + // efficient to sort by hand rather than using, say, std::sort. 
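// (Illustrative note, not part of the patch: if VN(%a) == 3 and VN(%b) == 7,
// both "add %a, %b" and "add %b, %a" end up with varargs {3, 7} after the
// swap below, so the two expressions hash equal and get one value number.)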
+ assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); + if (e.varargs[0] > e.varargs[1]) + std::swap(e.varargs[0], e.varargs[1]); + } if (CmpInst *C = dyn_cast<CmpInst>(I)) { - e.opcode = (C->getOpcode() << 8) | C->getPredicate(); + // Sort the operand value numbers so x<y and y>x get the same value number. + CmpInst::Predicate Predicate = C->getPredicate(); + if (e.varargs[0] > e.varargs[1]) { + std::swap(e.varargs[0], e.varargs[1]); + Predicate = CmpInst::getSwappedPredicate(Predicate); + } + e.opcode = (C->getOpcode() << 8) | Predicate; } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) { for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); II != IE; ++II) @@ -165,6 +185,25 @@ Expression ValueTable::create_expression(Instruction *I) { return e; } +Expression ValueTable::create_cmp_expression(unsigned Opcode, + CmpInst::Predicate Predicate, + Value *LHS, Value *RHS) { + assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && + "Not a comparison!"); + Expression e; + e.type = CmpInst::makeCmpResultType(LHS->getType()); + e.varargs.push_back(lookup_or_add(LHS)); + e.varargs.push_back(lookup_or_add(RHS)); + + // Sort the operand value numbers so x<y and y>x get the same value number. + if (e.varargs[0] > e.varargs[1]) { + std::swap(e.varargs[0], e.varargs[1]); + Predicate = CmpInst::getSwappedPredicate(Predicate); + } + e.opcode = (Opcode << 8) | Predicate; + return e; +} + Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) { assert(EI != 0 && "Not an ExtractValueInst?"); Expression e; @@ -414,6 +453,19 @@ uint32_t ValueTable::lookup(Value *V) const { return VI->second; } +/// lookup_or_add_cmp - Returns the value number of the given comparison, +/// assigning it a new number if it did not have one before. Useful when +/// we deduced the result of a comparison, but don't immediately have an +/// instruction realizing that comparison to hand. +uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode, + CmpInst::Predicate Predicate, + Value *LHS, Value *RHS) { + Expression exp = create_cmp_expression(Opcode, Predicate, LHS, RHS); + uint32_t& e = expressionNumbering[exp]; + if (!e) e = nextValueNumber++; + return e; +} + /// clear - Remove all entries from the ValueTable. 
void ValueTable::clear() { valueNumbering.clear(); @@ -446,7 +498,8 @@ namespace { MemoryDependenceAnalysis *MD; DominatorTree *DT; const TargetData *TD; - + const TargetLibraryInfo *TLI; + ValueTable VN; /// LeaderTable - A mapping from value numbers to lists of Value*'s that @@ -530,6 +583,7 @@ namespace { // This transformation requires dominator postdominator info virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<DominatorTree>(); + AU.addRequired<TargetLibraryInfo>(); if (!NoLoads) AU.addRequired<MemoryDependenceAnalysis>(); AU.addRequired<AliasAnalysis>(); @@ -568,6 +622,7 @@ FunctionPass *llvm::createGVNPass(bool NoLoads) { INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) @@ -776,7 +831,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, Value *WritePtr, uint64_t WriteSizeInBits, const TargetData &TD) { - // If the loaded or stored value is an first class array or struct, don't try + // If the loaded or stored value is a first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. if (LoadTy->isStructTy() || LoadTy->isArrayTy()) return -1; @@ -973,7 +1028,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD); } -/// GetStoreValueForLoad - This function is called when we have a +/// GetLoadValueForLoad - This function is called when we have a /// memdep query of a load that ends up being a clobbering load. This means /// that the load *may* provide bits used by the load but we can't be sure /// because the pointers don't mustalias. Check this case to see if there is @@ -1274,14 +1329,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // If we had to process more than one hundred blocks to find the // dependencies, this load isn't worth worrying about. Optimizing // it will be too expensive. - if (Deps.size() > 100) + unsigned NumDeps = Deps.size(); + if (NumDeps > 100) return false; // If we had a phi translation failure, we'll have a single entry which is a // clobber in the current block. Reject this early. - if (Deps.size() == 1 - && !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) - { + if (NumDeps == 1 && + !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { DEBUG( dbgs() << "GVN: non-local load "; WriteAsOperand(dbgs(), LI); @@ -1294,10 +1349,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // where we have a value available in repl, also keep track of whether we see // dependencies that produce an unknown value for the load (such as a call // that could potentially clobber the load). 
- SmallVector<AvailableValueInBlock, 16> ValuesPerBlock; - SmallVector<BasicBlock*, 16> UnavailableBlocks; + SmallVector<AvailableValueInBlock, 64> ValuesPerBlock; + SmallVector<BasicBlock*, 64> UnavailableBlocks; - for (unsigned i = 0, e = Deps.size(); i != e; ++i) { + for (unsigned i = 0, e = NumDeps; i != e; ++i) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); @@ -1896,12 +1951,19 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, unsigned Count = 0; for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); UI != UE; ) { - Instruction *User = cast<Instruction>(*UI); - unsigned OpNum = UI.getOperandNo(); - ++UI; + Use &U = (UI++).getUse(); + + // If From occurs as a phi node operand then the use implicitly lives in the + // corresponding incoming block. Otherwise it is the block containing the + // user that must be dominated by Root. + BasicBlock *UsingBlock; + if (PHINode *PN = dyn_cast<PHINode>(U.getUser())) + UsingBlock = PN->getIncomingBlock(U); + else + UsingBlock = cast<Instruction>(U.getUser())->getParent(); - if (DT->dominates(Root, User->getParent())) { - User->setOperand(OpNum, To); + if (DT->dominates(Root, UsingBlock)) { + U.set(To); ++Count; } } @@ -1912,69 +1974,119 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { - if (LHS == RHS) return false; - assert(LHS->getType() == RHS->getType() && "Equal but types differ!"); + SmallVector<std::pair<Value*, Value*>, 4> Worklist; + Worklist.push_back(std::make_pair(LHS, RHS)); + bool Changed = false; - // Don't try to propagate equalities between constants. - if (isa<Constant>(LHS) && isa<Constant>(RHS)) - return false; + while (!Worklist.empty()) { + std::pair<Value*, Value*> Item = Worklist.pop_back_val(); + LHS = Item.first; RHS = Item.second; + + if (LHS == RHS) continue; + assert(LHS->getType() == RHS->getType() && "Equality but unequal types!"); + + // Don't try to propagate equalities between constants. + if (isa<Constant>(LHS) && isa<Constant>(RHS)) continue; + + // Prefer a constant on the right-hand side, or an Argument if no constants. + if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) + std::swap(LHS, RHS); + assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!"); + + // If there is no obvious reason to prefer the left-hand side over the right- + // hand side, ensure the longest lived term is on the right-hand side, so the + // shortest lived term will be replaced by the longest lived. This tends to + // expose more simplifications. + uint32_t LVN = VN.lookup_or_add(LHS); + if ((isa<Argument>(LHS) && isa<Argument>(RHS)) || + (isa<Instruction>(LHS) && isa<Instruction>(RHS))) { + // Move the 'oldest' value to the right-hand side, using the value number as + // a proxy for age. + uint32_t RVN = VN.lookup_or_add(RHS); + if (LVN < RVN) { + std::swap(LHS, RHS); + LVN = RVN; + } + } + assert((!isa<Instruction>(RHS) || + DT->properlyDominates(cast<Instruction>(RHS)->getParent(), Root)) && + "Instruction doesn't dominate scope!"); + + // If value numbering later deduces that an instruction in the scope is equal + // to 'LHS' then ensure it will be turned into 'RHS'. + addToLeaderTable(LVN, RHS, Root); + + // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. 
As + // LHS always has at least one use that is not dominated by Root, this will + // never do anything if LHS has only one use. + if (!LHS->hasOneUse()) { + unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; + } - // Make sure that any constants are on the right-hand side. In general the - // best results are obtained by placing the longest lived value on the RHS. - if (isa<Constant>(LHS)) - std::swap(LHS, RHS); + // Now try to deduce additional equalities from this one. For example, if the + // known equality was "(A != B)" == "false" then it follows that A and B are + // equal in the scope. Only boolean equalities with an explicit true or false + // RHS are currently supported. + if (!RHS->getType()->isIntegerTy(1)) + // Not a boolean equality - bail out. + continue; + ConstantInt *CI = dyn_cast<ConstantInt>(RHS); + if (!CI) + // RHS neither 'true' nor 'false' - bail out. + continue; + // Whether RHS equals 'true'. Otherwise it equals 'false'. + bool isKnownTrue = CI->isAllOnesValue(); + bool isKnownFalse = !isKnownTrue; + + // If "A && B" is known true then both A and B are known true. If "A || B" + // is known false then both A and B are known false. + Value *A, *B; + if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) || + (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) { + Worklist.push_back(std::make_pair(A, RHS)); + Worklist.push_back(std::make_pair(B, RHS)); + continue; + } - // If neither term is constant then bail out. This is not for correctness, - // it's just that the non-constant case is much less useful: it occurs just - // as often as the constant case but handling it hardly ever results in an - // improvement. - if (!isa<Constant>(RHS)) - return false; + // If we are propagating an equality like "(A == B)" == "true" then also + // propagate the equality A == B. When propagating a comparison such as + // "(A >= B)" == "true", replace all instances of "A < B" with "false". + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { + Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); - // If value numbering later deduces that an instruction in the scope is equal - // to 'LHS' then ensure it will be turned into 'RHS'. - addToLeaderTable(VN.lookup_or_add(LHS), RHS, Root); - - // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. - unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); - bool Changed = NumReplacements > 0; - NumGVNEqProp += NumReplacements; - - // Now try to deduce additional equalities from this one. For example, if the - // known equality was "(A != B)" == "false" then it follows that A and B are - // equal in the scope. Only boolean equalities with an explicit true or false - // RHS are currently supported. - if (!RHS->getType()->isIntegerTy(1)) - // Not a boolean equality - bail out. - return Changed; - ConstantInt *CI = dyn_cast<ConstantInt>(RHS); - if (!CI) - // RHS neither 'true' nor 'false' - bail out. - return Changed; - // Whether RHS equals 'true'. Otherwise it equals 'false'. - bool isKnownTrue = CI->isAllOnesValue(); - bool isKnownFalse = !isKnownTrue; - - // If "A && B" is known true then both A and B are known true. If "A || B" - // is known false then both A and B are known false. 
- Value *A, *B; - if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) || - (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) { - Changed |= propagateEquality(A, RHS, Root); - Changed |= propagateEquality(B, RHS, Root); - return Changed; - } + // If "A == B" is known true, or "A != B" is known false, then replace + // A with B everywhere in the scope. + if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || + (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) + Worklist.push_back(std::make_pair(Op0, Op1)); + + // If "A >= B" is known true, replace "A < B" with false everywhere. + CmpInst::Predicate NotPred = Cmp->getInversePredicate(); + Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); + // Since we don't have the instruction "A < B" immediately to hand, work out + // the value number that it would have and use that to find an appropriate + // instruction (if any). + uint32_t NextNum = VN.getNextUnusedValueNumber(); + uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1); + // If the number we were assigned was brand new then there is no point in + // looking for an instruction realizing it: there cannot be one! + if (Num < NextNum) { + Value *NotCmp = findLeader(Root, Num); + if (NotCmp && isa<Instruction>(NotCmp)) { + unsigned NumReplacements = + replaceAllDominatedUsesWith(NotCmp, NotVal, Root); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; + } + } + // Ensure that any instruction in scope that gets the "A < B" value number + // is replaced with false. + addToLeaderTable(Num, NotVal, Root); - // If we are propagating an equality like "(A == B)" == "true" then also - // propagate the equality A == B. - if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { - // Only equality comparisons are supported. - if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || - (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) { - Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); - Changed |= propagateEquality(Op0, Op1, Root); + continue; } - return Changed; } return Changed; @@ -1985,35 +2097,15 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { /// particular 'Dst' must not be reachable via another edge from 'Src'. static bool isOnlyReachableViaThisEdge(BasicBlock *Src, BasicBlock *Dst, DominatorTree *DT) { - // First off, there must not be more than one edge from Src to Dst, there - // should be exactly one. So keep track of the number of times Src occurs - // as a predecessor of Dst and fail if it's more than once. Secondly, any - // other predecessors of Dst should be dominated by Dst (see logic below). - bool SawEdgeFromSrc = false; - for (pred_iterator PI = pred_begin(Dst), PE = pred_end(Dst); PI != PE; ++PI) { - BasicBlock *Pred = *PI; - if (Pred == Src) { - // An edge from Src to Dst. - if (SawEdgeFromSrc) - // There are multiple edges from Src to Dst - fail. - return false; - SawEdgeFromSrc = true; - continue; - } - // If the predecessor is not dominated by Dst, then it must be possible to - // reach it either without passing through Src (and thus not via the edge) - // or by passing through Src but taking a different edge out of Src. Either - // way it is possible to reach Dst without passing via the edge, so fail. 
- if (!DT->dominates(Dst, *PI)) - return false; - } - assert(SawEdgeFromSrc && "No edge between these basic blocks!"); - - // Every path from the entry block to Dst must at some point pass to Dst from - // a predecessor that is not dominated by Dst. This predecessor can only be - // Src, since all others are dominated by Dst. As there is only one edge from - // Src to Dst, the path passes by this edge. - return true; + // While in theory it is interesting to consider the case in which Dst has + // more than one predecessor, because Dst might be part of a loop which is + // only reachable from Src, in practice it is pointless since at the time + // GVN runs all such loops have preheaders, which means that Dst will have + // been changed to have only one predecessor, namely Src. + BasicBlock *Pred = Dst->getSinglePredecessor(); + assert((!Pred || Pred == Src) && "No edge between these basic blocks!"); + (void)Src; + return Pred != 0; } /// processInstruction - When calculating availability, handle an instruction @@ -2027,7 +2119,7 @@ bool GVN::processInstruction(Instruction *I) { // to value numbering it. Value numbering often exposes redundancies, for // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. - if (Value *V = SimplifyInstruction(I, TD, DT)) { + if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) { I->replaceAllUsesWith(V); if (MD && V->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); @@ -2076,16 +2168,17 @@ bool GVN::processInstruction(Instruction *I) { Value *SwitchCond = SI->getCondition(); BasicBlock *Parent = SI->getParent(); bool Changed = false; - for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) { - BasicBlock *Dst = SI->getSuccessor(i); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + BasicBlock *Dst = i.getCaseSuccessor(); if (isOnlyReachableViaThisEdge(Parent, Dst, DT)) - Changed |= propagateEquality(SwitchCond, SI->getCaseValue(i), Dst); + Changed |= propagateEquality(SwitchCond, i.getCaseValue(), Dst); } return Changed; } // Instructions with void type don't return a value, so there's - // no point in trying to find redudancies in them. + // no point in trying to find redundancies in them. if (I->getType()->isVoidTy()) return false; uint32_t NextNum = VN.getNextUnusedValueNumber(); @@ -2101,7 +2194,7 @@ bool GVN::processInstruction(Instruction *I) { // If the number we were assigned was a brand new VN, then we don't // need to do a lookup to see if the number already exists // somewhere in the domtree: it can't! - if (Num == NextNum) { + if (Num >= NextNum) { addToLeaderTable(Num, I, I->getParent()); return false; } @@ -2129,6 +2222,7 @@ bool GVN::runOnFunction(Function& F) { MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTree>(); TD = getAnalysisIfAvailable<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); VN.setMemDep(MD); VN.setDomTree(DT); @@ -2241,7 +2335,14 @@ bool GVN::performPRE(Function &F) { CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || isa<DbgInfoIntrinsic>(CurInst)) continue; - + + // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from + // sinking the compare again, and it would force the code generator to + // move the i1 from processor flags or predicate registers into a general + // purpose register. 
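// (Hedged example: PRE of "%c = icmp slt i32 %a, %b" across a diamond would
// introduce a phi of i1 values; on targets that keep i1 in condition flags
// or predicate registers, that phi forces a copy into a general purpose
// register, which re-sinking the compare next to its branch avoids.)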
+ if (isa<CmpInst>(CurInst))
+ continue;
+
// We don't currently value number ANY inline asm calls.
if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
if (CallI->isInlineAsm())
diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp
new file mode 100644
index 0000000..c2bd6e6
--- /dev/null
+++ b/lib/Transforms/Scalar/GlobalMerge.cpp
@@ -0,0 +1,226 @@
+//===-- GlobalMerge.cpp - Internal globals merging -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass merges globals with internal linkage into one. This way all of
+// the globals that were merged can be addressed using offsets from the same
+// base pointer (there is no need for a separate base pointer for each
+// global). Such a transformation can significantly reduce the register
+// pressure when many globals are involved.
+//
+// For example, consider the code which touches several global variables at
+// once:
+//
+// static int foo[N], bar[N], baz[N];
+//
+// for (i = 0; i < N; ++i) {
+// foo[i] = bar[i] * baz[i];
+// }
+//
+// On ARM the addresses of all 3 arrays must be kept in registers, so this
+// code has quite high register pressure (loop body):
+//
+// ldr r1, [r5], #4
+// ldr r2, [r6], #4
+// mul r1, r2, r1
+// str r1, [r0], #4
+//
+// The pass converts the code to something like:
+//
+// static struct {
+// int foo[N];
+// int bar[N];
+// int baz[N];
+// } merged;
+//
+// for (i = 0; i < N; ++i) {
+// merged.foo[i] = merged.bar[i] * merged.baz[i];
+// }
+//
+// and in ARM code this becomes:
+//
+// ldr r0, [r5, #40]
+// ldr r1, [r5, #80]
+// mul r0, r1, r0
+// str r0, [r5], #4
+//
+// Note that we saved 2 registers here almost "for free".
+// ===---------------------------------------------------------------------===//

+#define DEBUG_TYPE "global-merge"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Attributes.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumMerged , "Number of globals merged");
+namespace {
+ class GlobalMerge : public FunctionPass {
+ /// TLI - Keep a pointer to a TargetLowering to consult for determining
+ /// target type sizes.
+ const TargetLowering *TLI;
+
+ bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
+ Module &M, bool isConst) const;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid.
+ explicit GlobalMerge(const TargetLowering *tli = 0) + : FunctionPass(ID), TLI(tli) { + initializeGlobalMergePass(*PassRegistry::getPassRegistry()); + } + + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + const char *getPassName() const { + return "Merge internal globals"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } + + struct GlobalCmp { + const TargetData *TD; + + GlobalCmp(const TargetData *td) : TD(td) { } + + bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) { + Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); + Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType(); + + return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2)); + } + }; + }; +} // end anonymous namespace + +char GlobalMerge::ID = 0; +INITIALIZE_PASS(GlobalMerge, "global-merge", + "Global Merge", false, false) + + +bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, + Module &M, bool isConst) const { + const TargetData *TD = TLI->getTargetData(); + + // FIXME: Infer the maximum possible offset depending on the actual users + // (these max offsets are different for the users inside Thumb or ARM + // functions) + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + + // FIXME: Find better heuristics + std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD)); + + Type *Int32Ty = Type::getInt32Ty(M.getContext()); + + for (size_t i = 0, e = Globals.size(); i != e; ) { + size_t j = 0; + uint64_t MergedSize = 0; + std::vector<Type*> Tys; + std::vector<Constant*> Inits; + for (j = i; j != e; ++j) { + Type *Ty = Globals[j]->getType()->getElementType(); + MergedSize += TD->getTypeAllocSize(Ty); + if (MergedSize > MaxOffset) { + break; + } + Tys.push_back(Ty); + Inits.push_back(Globals[j]->getInitializer()); + } + + StructType *MergedTy = StructType::get(M.getContext(), Tys); + Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); + GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, + GlobalValue::InternalLinkage, + MergedInit, "_MergedGlobals"); + for (size_t k = i; k < j; ++k) { + Constant *Idx[2] = { + ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, k-i) + }; + Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx); + Globals[k]->replaceAllUsesWith(GEP); + Globals[k]->eraseFromParent(); + NumMerged++; + } + i = j; + } + + return true; +} + + +bool GlobalMerge::doInitialization(Module &M) { + SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals; + const TargetData *TD = TLI->getTargetData(); + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + bool Changed = false; + + // Grab all non-const globals. + for (Module::global_iterator I = M.global_begin(), + E = M.global_end(); I != E; ++I) { + // Merge is safe for "normal" internal globals only + if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) + continue; + + // Ignore fancy-aligned globals for now. + unsigned Alignment = TD->getPreferredAlignment(I); + Type *Ty = I->getType()->getElementType(); + if (Alignment > TD->getABITypeAlignment(Ty)) + continue; + + // Ignore all 'special' globals. 
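// (For example, intrinsic globals such as llvm.used or llvm.global_ctors;
// the reserved "llvm." name prefix is what the checks below key on.)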
+ if (I->getName().startswith("llvm.") ||
+ I->getName().startswith(".llvm."))
+ continue;
+
+ if (TD->getTypeAllocSize(Ty) < MaxOffset) {
+ if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine())
+ .isBSSLocal())
+ BSSGlobals.push_back(I);
+ else if (I->isConstant())
+ ConstGlobals.push_back(I);
+ else
+ Globals.push_back(I);
+ }
+ }
+
+ if (Globals.size() > 1)
+ Changed |= doMerge(Globals, M, false);
+ if (BSSGlobals.size() > 1)
+ Changed |= doMerge(BSSGlobals, M, false);
+
+ // FIXME: This currently breaks the EH processing due to the way the
+ // typeinfo detection works. We might want to detect the TIs and ignore
+ // them in the future.
+ // if (ConstGlobals.size() > 1)
+ // Changed |= doMerge(ConstGlobals, M, true);
+
+ return Changed;
+}
+
+bool GlobalMerge::runOnFunction(Function &F) {
+ return false;
+}
+
+Pass *llvm::createGlobalMergePass(const TargetLowering *tli) {
+ return new GlobalMerge(tli);
+}
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 75fa011..a9ba657 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -33,7 +33,6 @@
#include "llvm/LLVMContext.h"
#include "llvm/Type.h"
#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
@@ -50,30 +49,21 @@
#include "llvm/ADT/Statistic.h"
using namespace llvm;
-STATISTIC(NumRemoved , "Number of aux indvars removed");
STATISTIC(NumWidened , "Number of indvars widened");
-STATISTIC(NumInserted , "Number of canonical indvars added");
STATISTIC(NumReplaced , "Number of exit values replaced");
STATISTIC(NumLFTR , "Number of loop exit tests replaced");
STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated");
STATISTIC(NumElimIV , "Number of congruent IVs eliminated");
-namespace llvm {
- cl::opt<bool> EnableIVRewrite(
- "enable-iv-rewrite", cl::Hidden,
- cl::desc("Enable canonical induction variable rewriting"));
-
- // Trip count verification can be enabled by default under NDEBUG if we
- // implement a strong expression equivalence checker in SCEV. Until then, we
- // use the verify-indvars flag, which may assert in some cases.
- cl::opt<bool> VerifyIndvars(
- "verify-indvars", cl::Hidden,
- cl::desc("Verify the ScalarEvolution result after running indvars"));
-}
+// Trip count verification can be enabled by default under NDEBUG if we
+// implement a strong expression equivalence checker in SCEV. Until then, we
+// use the verify-indvars flag, which may assert in some cases.
+static cl::opt<bool> VerifyIndvars( + "verify-indvars", cl::Hidden, + cl::desc("Verify the ScalarEvolution result after running indvars")); namespace { class IndVarSimplify : public LoopPass { - IVUsers *IU; LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; @@ -84,7 +74,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - IndVarSimplify() : LoopPass(ID), IU(0), LI(0), SE(0), DT(0), TD(0), + IndVarSimplify() : LoopPass(ID), LI(0), SE(0), DT(0), TD(0), Changed(false) { initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); } @@ -97,13 +87,9 @@ namespace { AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - if (EnableIVRewrite) - AU.addRequired<IVUsers>(); AU.addPreserved<ScalarEvolution>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - if (EnableIVRewrite) - AU.addPreserved<IVUsers>(); AU.setPreservesCFG(); } @@ -121,8 +107,6 @@ namespace { void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); - void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter); - Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, SCEVExpander &Rewriter); @@ -138,7 +122,6 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(IVUsers) INITIALIZE_PASS_END(IndVarSimplify, "indvars", "Induction Variable Simplification", false, false) @@ -180,6 +163,11 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { // base of a recurrence. This handles the case in which SCEV expansion // converts a pointer type recurrence into a nonrecurrent pointer base // indexed by an integer recurrence. + + // If the GEP base pointer is a vector of pointers, abort. + if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy()) + return false; + const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr)); const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr)); if (FromBase == ToBase) @@ -445,11 +433,6 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN); } - - // Add a new IVUsers entry for the newly-created integer PHI. - if (IU) - IU->AddUsersIfInteresting(NewPHI); - Changed = true; } @@ -595,124 +578,6 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { } //===----------------------------------------------------------------------===// -// Rewrite IV users based on a canonical IV. -// Only for use with -enable-iv-rewrite. -//===----------------------------------------------------------------------===// - -/// FIXME: It is an extremely bad idea to indvar substitute anything more -/// complex than affine induction variables. Doing so will put expensive -/// polynomial evaluations inside of the loop, and the str reduction pass -/// currently can only reduce affine polynomials. For now just disable -/// indvar subst on anything more complex than an affine addrec, unless -/// it can be expanded to a trivial value. -static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { - // Loop-invariant values are safe. - if (SE->isLoopInvariant(S, L)) return true; - - // Affine addrecs are safe. Non-affine are not, because LSR doesn't know how - // to transform them into efficient code. 
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) - return AR->isAffine(); - - // An add is safe it all its operands are safe. - if (const SCEVCommutativeExpr *Commutative - = dyn_cast<SCEVCommutativeExpr>(S)) { - for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(), - E = Commutative->op_end(); I != E; ++I) - if (!isSafe(*I, L, SE)) return false; - return true; - } - - // A cast is safe if its operand is. - if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) - return isSafe(C->getOperand(), L, SE); - - // A udiv is safe if its operands are. - if (const SCEVUDivExpr *UD = dyn_cast<SCEVUDivExpr>(S)) - return isSafe(UD->getLHS(), L, SE) && - isSafe(UD->getRHS(), L, SE); - - // SCEVUnknown is always safe. - if (isa<SCEVUnknown>(S)) - return true; - - // Nothing else is safe. - return false; -} - -void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { - // Rewrite all induction variable expressions in terms of the canonical - // induction variable. - // - // If there were induction variables of other sizes or offsets, manually - // add the offsets to the primary induction variable and cast, avoiding - // the need for the code evaluation methods to insert induction variables - // of different sizes. - for (IVUsers::iterator UI = IU->begin(), E = IU->end(); UI != E; ++UI) { - Value *Op = UI->getOperandValToReplace(); - Type *UseTy = Op->getType(); - Instruction *User = UI->getUser(); - - // Compute the final addrec to expand into code. - const SCEV *AR = IU->getReplacementExpr(*UI); - - // Evaluate the expression out of the loop, if possible. - if (!L->contains(UI->getUser())) { - const SCEV *ExitVal = SE->getSCEVAtScope(AR, L->getParentLoop()); - if (SE->isLoopInvariant(ExitVal, L)) - AR = ExitVal; - } - - // FIXME: It is an extremely bad idea to indvar substitute anything more - // complex than affine induction variables. Doing so will put expensive - // polynomial evaluations inside of the loop, and the str reduction pass - // currently can only reduce affine polynomials. For now just disable - // indvar subst on anything more complex than an affine addrec, unless - // it can be expanded to a trivial value. - if (!isSafe(AR, L, SE)) - continue; - - // Determine the insertion point for this user. By default, insert - // immediately before the user. The SCEVExpander class will automatically - // hoist loop invariants out of the loop. For PHI nodes, there may be - // multiple uses, so compute the nearest common dominator for the - // incoming blocks. - Instruction *InsertPt = getInsertPointForUses(User, Op, DT); - - // Now expand it into actual Instructions and patch it into place. - Value *NewVal = Rewriter.expandCodeFor(AR, UseTy, InsertPt); - - DEBUG(dbgs() << "INDVARS: Rewrote IV '" << *AR << "' " << *Op << '\n' - << " into = " << *NewVal << "\n"); - - if (!isValidRewrite(Op, NewVal)) { - DeadInsts.push_back(NewVal); - continue; - } - // Inform ScalarEvolution that this value is changing. The change doesn't - // affect its value, but it does potentially affect which use lists the - // value will be on after the replacement, which affects ScalarEvolution's - // ability to walk use lists and drop dangling pointers when a value is - // deleted. - SE->forgetValue(User); - - // Patch the new value into place. 
- if (Op->hasName()) - NewVal->takeName(Op); - if (Instruction *NewValI = dyn_cast<Instruction>(NewVal)) - NewValI->setDebugLoc(User->getDebugLoc()); - User->replaceUsesOfWith(Op, NewVal); - UI->setOperandValToReplace(NewVal); - - ++NumRemoved; - Changed = true; - - // The old value may be dead now. - DeadInsts.push_back(Op); - } -} - -//===----------------------------------------------------------------------===// // IV Widening - Extend the width of an IV to cover its widest uses. //===----------------------------------------------------------------------===// @@ -843,7 +708,7 @@ protected: const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); - Instruction *WidenIVUse(NarrowIVDefUse DU); + Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; @@ -917,7 +782,6 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { } return WideBO; } - llvm_unreachable(0); } /// No-wrap operations can transfer sign extension of their result to their @@ -946,9 +810,13 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { else return 0; + // When creating this AddExpr, don't apply the current operations NSW or NUW + // flags. This instruction may be guarded by control flow that the no-wrap + // behavior depends on. Non-control-equivalent instructions can be mapped to + // the same SCEV expression, and it would be incorrect to transfer NSW/NUW + // semantics to those operations. const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>( - SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr, - IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW)); + SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr)); if (!AddRec || AddRec->getLoop() != L) return 0; @@ -983,7 +851,7 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { /// WidenIVUse - Determine whether an individual user of the narrow IV can be /// widened. If so, return the wide clone of the user. -Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU) { +Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Stop traversing the def-use chain at inner-loop phis or post-loop phis. if (isa<PHINode>(DU.NarrowUse) && @@ -1051,7 +919,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU) { // NarrowUse. Instruction *WideUse = 0; if (WideAddRec == WideIncExpr - && SCEVExpander::hoistStep(WideInc, DU.NarrowUse, DT)) + && Rewriter.hoistIVInc(WideInc, DU.NarrowUse)) WideUse = WideInc; else { WideUse = CloneIVUser(DU); @@ -1156,7 +1024,7 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Process a def-use edge. This may replace the use, so don't hold a // use_iterator across it. - Instruction *WideUse = WidenIVUse(DU); + Instruction *WideUse = WidenIVUse(DU, Rewriter); // Follow all def-use edges from the previous narrow use. if (WideUse) @@ -1231,7 +1099,11 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, /// BackedgeTakenInfo. If these expressions have not been reduced, then /// expanding them may incur additional cost (albeit in the loop preheader). static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, + SmallPtrSet<const SCEV*, 8> &Processed, ScalarEvolution *SE) { + if (!Processed.insert(S)) + return false; + // If the backedge-taken count is a UDiv, it's very likely a UDiv that // ScalarEvolution's HowFarToZero or HowManyLessThans produced to compute a // precise expression, rather than a UDiv from the user's code. 
If we can't @@ -1250,16 +1122,13 @@ static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, } } - if (EnableIVRewrite) - return false; - // Recurse past add expressions, which commonly occur in the // BackedgeTakenCount. They may already exist in program code, and if not, // they are not too expensive rematerialize. if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); I != E; ++I) { - if (isHighCostExpansion(*I, BI, SE)) + if (isHighCostExpansion(*I, BI, Processed, SE)) return true; } return false; @@ -1270,14 +1139,24 @@ static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, if (isa<SCEVSMaxExpr>(S) || isa<SCEVUMaxExpr>(S)) return true; - // If we haven't recognized an expensive SCEV patter, assume its an expression - // produced by program code. + // If we haven't recognized an expensive SCEV pattern, assume it's an + // expression produced by program code. return false; } /// canExpandBackedgeTakenCount - Return true if this loop's backedge taken /// count expression can be safely and cheaply expanded into an instruction /// sequence that can be used by LinearFunctionTestReplace. +/// +/// TODO: This fails for pointer-type loop counters with greater than one byte +/// strides, consequently preventing LFTR from running. For the purpose of LFTR +/// we could skip this check in the case that the LFTR loop counter (chosen by +/// FindLoopCounter) is also pointer type. Instead, we could directly convert +/// the loop test to an inequality test by checking the target data's alignment +/// of element types (given that the initial pointer value originates from or is +/// used by ABI constrained operation, as opposed to inttoptr/ptrtoint). +/// However, we don't yet have a strong motivation for converting loop tests +/// into inequality tests. static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) || @@ -1292,42 +1171,13 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { if (!BI) return false; - if (isHighCostExpansion(BackedgeTakenCount, BI, SE)) + SmallPtrSet<const SCEV*, 8> Processed; + if (isHighCostExpansion(BackedgeTakenCount, BI, Processed, SE)) return false; return true; } -/// getBackedgeIVType - Get the widest type used by the loop test after peeking -/// through Truncs. -/// -/// TODO: Unnecessary when ForceLFTR is removed. -static Type *getBackedgeIVType(Loop *L) { - if (!L->getExitingBlock()) - return 0; - - // Can't rewrite non-branch yet. - BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); - if (!BI) - return 0; - - ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); - if (!Cond) - return 0; - - Type *Ty = 0; - for(User::op_iterator OI = Cond->op_begin(), OE = Cond->op_end(); - OI != OE; ++OI) { - assert((!Ty || Ty == (*OI)->getType()) && "bad icmp operand types"); - TruncInst *Trunc = dyn_cast<TruncInst>(*OI); - if (!Trunc) - continue; - - return Trunc->getSrcTy(); - } - return Ty; -} - /// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop /// invariant value to the phi. static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { @@ -1429,6 +1279,10 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { /// FindLoopCounter - Find an affine IV in canonical form. /// +/// BECount may be an i8* pointer type. 
The pointer difference is already
+/// a valid count without scaling the address stride, so it remains a pointer
+/// expression as far as SCEV is concerned.
+///
/// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount
///
/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
@@ -1437,11 +1291,6 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
static PHINode *
FindLoopCounter(Loop *L, const SCEV *BECount,
ScalarEvolution *SE, DominatorTree *DT, const TargetData *TD) {
- // I'm not sure how BECount could be a pointer type, but we definitely don't
- // want to LFTR that.
- if (BECount->getType()->isPointerTy())
- return 0;
-
uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
Value *Cond =
@@ -1458,6 +1307,10 @@ FindLoopCounter(Loop *L, const SCEV *BECount,
if (!SE->isSCEVable(Phi->getType()))
continue;
+ // Avoid comparing an integer IV against a pointer Limit.
+ if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
+ continue;
+
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
if (!AR || AR->getLoop() != L || !AR->isAffine())
continue;
@@ -1503,6 +1356,82 @@ FindLoopCounter(Loop *L, const SCEV *BECount,
return BestPhi;
}
+/// genLoopLimit - Help LinearFunctionTestReplace by generating a value that
+/// holds the RHS of the new loop test.
+static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
+ SCEVExpander &Rewriter, ScalarEvolution *SE) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+ assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter");
+ const SCEV *IVInit = AR->getStart();
+
+ // IVInit may be a pointer while IVCount is an integer when FindLoopCounter
+ // finds a valid pointer IV. Sign extend BECount in order to materialize a
+ // GEP. Avoid running SCEVExpander on a new pointer value, instead reusing
+ // the existing GEPs whenever possible.
+ if (IndVar->getType()->isPointerTy()
+ && !IVCount->getType()->isPointerTy()) {
+
+ Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
+ const SCEV *IVOffset = SE->getTruncateOrSignExtend(IVCount, OfsTy);
+
+ // Expand the code for the iteration count.
+ assert(SE->isLoopInvariant(IVOffset, L) &&
+ "Computed iteration count is not loop invariant!");
+ BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ Value *GEPOffset = Rewriter.expandCodeFor(IVOffset, OfsTy, BI);
+
+ Value *GEPBase = IndVar->getIncomingValueForBlock(L->getLoopPreheader());
+ assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter");
+ // We could handle pointer IVs other than i8*, but we need to compensate for
+ // gep index scaling. See canExpandBackedgeTakenCount comments.
+ assert(SE->getSizeOfExpr(
+ cast<PointerType>(GEPBase->getType())->getElementType())->isOne()
+ && "unit stride pointer IV must be i8*");
+
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ return Builder.CreateGEP(GEPBase, GEPOffset, "lftr.limit");
+ }
+ else {
+ // In any other case, convert both IVInit and IVCount to integers before
+ // comparing. This may result in SCEV expansion of pointers, but in practice
+ // SCEV will fold the pointer arithmetic away as follows:
+ // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
+ // + // Valid Cases: (1) both integers is most common; (2) both may be pointers + // for simple memset-style loops; (3) IVInit is an integer and IVCount is a + // pointer may occur when enable-iv-rewrite generates a canonical IV on top + // of case #2. + + const SCEV *IVLimit = 0; + // For unit stride, IVCount = Start + BECount with 2's complement overflow. + // For non-zero Start, compute IVCount here. + if (AR->getStart()->isZero()) + IVLimit = IVCount; + else { + assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); + const SCEV *IVInit = AR->getStart(); + + // For integer IVs, truncate the IV before computing IVInit + BECount. + if (SE->getTypeSizeInBits(IVInit->getType()) + > SE->getTypeSizeInBits(IVCount->getType())) + IVInit = SE->getTruncateExpr(IVInit, IVCount->getType()); + + IVLimit = SE->getAddExpr(IVInit, IVCount); + } + // Expand the code for the iteration count. + BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); + IRBuilder<> Builder(BI); + assert(SE->isLoopInvariant(IVLimit, L) && + "Computed iteration count is not loop invariant!"); + // Ensure that we generate the same type as IndVar, or a smaller integer + // type. In the presence of null pointer values, we have an integer type + // SCEV expression (IVInit) for a pointer type IV value (IndVar). + Type *LimitTy = IVCount->getType()->isPointerTy() ? + IndVar->getType() : IVCount->getType(); + return Rewriter.expandCodeFor(IVLimit, LimitTy, BI); + } +} + /// LinearFunctionTestReplace - This method rewrites the exit condition of the /// loop to be a canonical != comparison against the incremented loop induction /// variable. This pass is able to rewrite the exit tests of any loop where the @@ -1514,37 +1443,35 @@ LinearFunctionTestReplace(Loop *L, PHINode *IndVar, SCEVExpander &Rewriter) { assert(canExpandBackedgeTakenCount(L, SE) && "precondition"); - BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); // LFTR can ignore IV overflow and truncate to the width of // BECount. This avoids materializing the add(zext(add)) expression. - Type *CntTy = !EnableIVRewrite ? - BackedgeTakenCount->getType() : IndVar->getType(); + Type *CntTy = BackedgeTakenCount->getType(); - const SCEV *IVLimit = BackedgeTakenCount; + const SCEV *IVCount = BackedgeTakenCount; - // If the exiting block is not the same as the backedge block, we must compare - // against the preincremented value, otherwise we prefer to compare against - // the post-incremented value. + // If the exiting block is the same as the backedge block, we prefer to + // compare against the post-incremented value, otherwise we must compare + // against the preincremented value. Value *CmpIndVar; if (L->getExitingBlock() == L->getLoopLatch()) { // Add one to the "backedge-taken" count to get the trip count. // If this addition may overflow, we have to be more pessimistic and // cast the induction variable before doing the add. const SCEV *N = - SE->getAddExpr(IVLimit, SE->getConstant(IVLimit->getType(), 1)); - if (CntTy == IVLimit->getType()) - IVLimit = N; + SE->getAddExpr(IVCount, SE->getConstant(IVCount->getType(), 1)); + if (CntTy == IVCount->getType()) + IVCount = N; else { - const SCEV *Zero = SE->getConstant(IVLimit->getType(), 0); + const SCEV *Zero = SE->getConstant(IVCount->getType(), 0); if ((isa<SCEVConstant>(N) && !N->isZero()) || SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { // No overflow. Cast the sum. 
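          // (Illustrative: if BECount is the all-ones value of its type, the
          // narrow sum N wraps to 0 and extending that 0 would lose the real
          // trip count; extending BECount first and adding 1 in the wide
          // type sidesteps the wrap.)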
- IVLimit = SE->getTruncateOrZeroExtend(N, CntTy); + IVCount = SE->getTruncateOrZeroExtend(N, CntTy); } else { // Potential overflow. Cast before doing the add. - IVLimit = SE->getTruncateOrZeroExtend(IVLimit, CntTy); - IVLimit = SE->getAddExpr(IVLimit, SE->getConstant(CntTy, 1)); + IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy); + IVCount = SE->getAddExpr(IVCount, SE->getConstant(CntTy, 1)); } } // The BackedgeTaken expression contains the number of times that the @@ -1552,62 +1479,17 @@ LinearFunctionTestReplace(Loop *L, // number of times the loop executes, so use the incremented indvar. CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock()); } else { - // We have to use the preincremented value... - IVLimit = SE->getTruncateOrZeroExtend(IVLimit, CntTy); + // We must use the preincremented value... + IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy); CmpIndVar = IndVar; } - // For unit stride, IVLimit = Start + BECount with 2's complement overflow. - // So for, non-zero start compute the IVLimit here. - bool isPtrIV = false; - Type *CmpTy = CntTy; - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); - assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter"); - if (!AR->getStart()->isZero()) { - assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); - const SCEV *IVInit = AR->getStart(); - - // For pointer types, sign extend BECount in order to materialize a GEP. - // Note that for without EnableIVRewrite, we never run SCEVExpander on a - // pointer type, because we must preserve the existing GEPs. Instead we - // directly generate a GEP later. - if (IVInit->getType()->isPointerTy()) { - isPtrIV = true; - CmpTy = SE->getEffectiveSCEVType(IVInit->getType()); - IVLimit = SE->getTruncateOrSignExtend(IVLimit, CmpTy); - } - // For integer types, truncate the IV before computing IVInit + BECount. - else { - if (SE->getTypeSizeInBits(IVInit->getType()) - > SE->getTypeSizeInBits(CmpTy)) - IVInit = SE->getTruncateExpr(IVInit, CmpTy); - - IVLimit = SE->getAddExpr(IVInit, IVLimit); - } - } - // Expand the code for the iteration count. - IRBuilder<> Builder(BI); - - assert(SE->isLoopInvariant(IVLimit, L) && - "Computed iteration count is not loop invariant!"); - Value *ExitCnt = Rewriter.expandCodeFor(IVLimit, CmpTy, BI); - - // Create a gep for IVInit + IVLimit from on an existing pointer base. - assert(isPtrIV == IndVar->getType()->isPointerTy() && - "IndVar type must match IVInit type"); - if (isPtrIV) { - Value *IVStart = IndVar->getIncomingValueForBlock(L->getLoopPreheader()); - assert(AR->getStart() == SE->getSCEV(IVStart) && "bad loop counter"); - assert(SE->getSizeOfExpr( - cast<PointerType>(IVStart->getType())->getElementType())->isOne() - && "unit stride pointer IV must be i8*"); - - Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator()); - ExitCnt = Builder.CreateGEP(IVStart, ExitCnt, "lftr.limit"); - Builder.SetInsertPoint(BI); - } + Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE); + assert(ExitCnt->getType()->isPointerTy() == IndVar->getType()->isPointerTy() + && "genLoopLimit missed a cast"); // Insert a new icmp_ne or icmp_eq instruction before the branch. + BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); ICmpInst::Predicate P; if (L->contains(BI->getSuccessor(0))) P = ICmpInst::ICMP_NE; @@ -1619,11 +1501,13 @@ LinearFunctionTestReplace(Loop *L, << " op:\t" << (P == ICmpInst::ICMP_NE ? 
"!=" : "==") << "\n" << " RHS:\t" << *ExitCnt << "\n" - << " Expr:\t" << *IVLimit << "\n"); + << " IVCount:\t" << *IVCount << "\n"); + IRBuilder<> Builder(BI); if (SE->getTypeSizeInBits(CmpIndVar->getType()) - > SE->getTypeSizeInBits(CmpTy)) { - CmpIndVar = Builder.CreateTrunc(CmpIndVar, CmpTy, "lftr.wideiv"); + > SE->getTypeSizeInBits(ExitCnt->getType())) { + CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(), + "lftr.wideiv"); } Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond"); @@ -1680,11 +1564,12 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { if (isa<LandingPadInst>(I)) continue; - // Don't sink static AllocaInsts out of the entry block, which would - // turn them into dynamic allocas! - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) - if (AI->isStaticAlloca()) - continue; + // Don't sink alloca: we never want to sink static alloca's out of the + // entry block, and correctly sinking dynamic alloca's requires + // checks for stacksave/stackrestore intrinsics. + // FIXME: Refactor this check somehow? + if (isa<AllocaInst>(I)) + continue; // Determine if there is a use in or before the loop (direct or // otherwise). @@ -1746,8 +1631,6 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!L->isLoopSimplifyForm()) return false; - if (EnableIVRewrite) - IU = &getAnalysis<IVUsers>(); LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTree>(); @@ -1774,10 +1657,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // attempt to avoid evaluating SCEVs for sign/zero extend operations until // other expressions involving loop IVs have been evaluated. This helps SCEV // set no-wrap flags before normalizing sign/zero extension. - if (!EnableIVRewrite) { - Rewriter.disableCanonicalMode(); - SimplifyAndExtend(L, Rewriter, LPM); - } + Rewriter.disableCanonicalMode(); + SimplifyAndExtend(L, Rewriter, LPM); // Check to see if this loop has a computable loop-invariant execution count. // If so, this means that we can compute the final value of any expressions @@ -1788,106 +1669,28 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) RewriteLoopExitValues(L, Rewriter); - // Eliminate redundant IV users. - if (EnableIVRewrite) - Changed |= simplifyIVUsers(IU, SE, &LPM, DeadInsts); - // Eliminate redundant IV cycles. - if (!EnableIVRewrite) - NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); - - // Compute the type of the largest recurrence expression, and decide whether - // a canonical induction variable should be inserted. - Type *LargestType = 0; - bool NeedCannIV = false; - bool ExpandBECount = canExpandBackedgeTakenCount(L, SE); - if (EnableIVRewrite && ExpandBECount) { - // If we have a known trip count and a single exit block, we'll be - // rewriting the loop exit test condition below, which requires a - // canonical induction variable. - NeedCannIV = true; - Type *Ty = BackedgeTakenCount->getType(); - if (!EnableIVRewrite) { - // In this mode, SimplifyIVUsers may have already widened the IV used by - // the backedge test and inserted a Trunc on the compare's operand. Get - // the wider type to avoid creating a redundant narrow IV only used by the - // loop test. 
- LargestType = getBackedgeIVType(L); - } - if (!LargestType || - SE->getTypeSizeInBits(Ty) > - SE->getTypeSizeInBits(LargestType)) - LargestType = SE->getEffectiveSCEVType(Ty); - } - if (EnableIVRewrite) { - for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) { - NeedCannIV = true; - Type *Ty = - SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType()); - if (!LargestType || - SE->getTypeSizeInBits(Ty) > - SE->getTypeSizeInBits(LargestType)) - LargestType = Ty; - } - } - - // Now that we know the largest of the induction variable expressions - // in this loop, insert a canonical induction variable of the largest size. - PHINode *IndVar = 0; - if (NeedCannIV) { - // Check to see if the loop already has any canonical-looking induction - // variables. If any are present and wider than the planned canonical - // induction variable, temporarily remove them, so that the Rewriter - // doesn't attempt to reuse them. - SmallVector<PHINode *, 2> OldCannIVs; - while (PHINode *OldCannIV = L->getCanonicalInductionVariable()) { - if (SE->getTypeSizeInBits(OldCannIV->getType()) > - SE->getTypeSizeInBits(LargestType)) - OldCannIV->removeFromParent(); - else - break; - OldCannIVs.push_back(OldCannIV); - } + NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); - IndVar = Rewriter.getOrInsertCanonicalInductionVariable(L, LargestType); - - ++NumInserted; - Changed = true; - DEBUG(dbgs() << "INDVARS: New CanIV: " << *IndVar << '\n'); - - // Now that the official induction variable is established, reinsert - // any old canonical-looking variables after it so that the IR remains - // consistent. They will be deleted as part of the dead-PHI deletion at - // the end of the pass. - while (!OldCannIVs.empty()) { - PHINode *OldCannIV = OldCannIVs.pop_back_val(); - OldCannIV->insertBefore(L->getHeader()->getFirstInsertionPt()); - } - } - else if (!EnableIVRewrite && ExpandBECount && needsLFTR(L, DT)) { - IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); - } // If we have a trip count expression, rewrite the loop's exit condition // using it. We can currently only handle loops with a single exit. - Value *NewICmp = 0; - if (ExpandBECount && IndVar) { - // Check preconditions for proper SCEVExpander operation. SCEV does not - // express SCEVExpander's dependencies, such as LoopSimplify. Instead any - // pass that uses the SCEVExpander must do it. This does not work well for - // loop passes because SCEVExpander makes assumptions about all loops, while - // LoopPassManager only forces the current loop to be simplified. - // - // FIXME: SCEV expansion has no way to bail out, so the caller must - // explicitly check any assumptions made by SCEV. Brittle. - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount); - if (!AR || AR->getLoop()->getLoopPreheader()) - NewICmp = - LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter); + if (canExpandBackedgeTakenCount(L, SE) && needsLFTR(L, DT)) { + PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); + if (IndVar) { + // Check preconditions for proper SCEVExpander operation. SCEV does not + // express SCEVExpander's dependencies, such as LoopSimplify. Instead any + // pass that uses the SCEVExpander must do it. This does not work well for + // loop passes because SCEVExpander makes assumptions about all loops, while + // LoopPassManager only forces the current loop to be simplified. 
+ // + // FIXME: SCEV expansion has no way to bail out, so the caller must + // explicitly check any assumptions made by SCEV. Brittle. + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount); + if (!AR || AR->getLoop()->getLoopPreheader()) + (void)LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, + Rewriter); + } } - // Rewrite IV-derived expressions. - if (EnableIVRewrite) - RewriteIVExpressions(L, Rewriter); - // Clear the rewriter cache, because values that are in the rewriter's cache // can be deleted in the loop below, causing the AssertingVH in the cache to // trigger. @@ -1906,13 +1709,6 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // loop may be sunk below the loop to reduce register pressure. SinkUnusedInvariants(L); - // For completeness, inform IVUsers of the IV use in the newly-created - // loop exit test instruction. - if (IU && NewICmp) { - ICmpInst *NewICmpInst = dyn_cast<ICmpInst>(NewICmp); - if (NewICmpInst) - IU->AddUsersIfInteresting(cast<Instruction>(NewICmpInst->getOperand(0))); - } // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader()); // Check a post-condition. @@ -1922,8 +1718,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Verify that LFTR, and any other change have not interfered with SCEV's // ability to compute trip count. #ifndef NDEBUG - if (!EnableIVRewrite && VerifyIndvars && - !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { SE->forgetLoop(L); const SCEV *NewBECount = SE->getBackedgeTakenCount(L); if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) < diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index f410af3..429b61b 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -24,6 +24,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Statistic.h" @@ -75,6 +76,7 @@ namespace { /// class JumpThreading : public FunctionPass { TargetData *TD; + TargetLibraryInfo *TLI; LazyValueInfo *LVI; #ifdef NDEBUG SmallPtrSet<BasicBlock*, 16> LoopHeaders; @@ -107,6 +109,7 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<LazyValueInfo>(); AU.addPreserved<LazyValueInfo>(); + AU.addRequired<TargetLibraryInfo>(); } void FindLoopHeaders(Function &F); @@ -133,6 +136,7 @@ char JumpThreading::ID = 0; INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", "Jump Threading", false, false) INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) @@ -144,6 +148,7 @@ FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); TD = getAnalysisIfAvailable<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); LVI = &getAnalysis<LazyValueInfo>(); FindLoopHeaders(F); @@ -674,7 +679,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // Run constant folding to see if we can reduce the condition to a simple // constant. 
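  // (ConstantFoldInstruction now also receives the TargetLibraryInfo
  // required above, so a condition computed by a known library call can
  // presumably be folded as well.)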
if (Instruction *I = dyn_cast<Instruction>(Condition)) { - Value *SimpleVal = ConstantFoldInstruction(I, TD); + Value *SimpleVal = ConstantFoldInstruction(I, TD, TLI); if (SimpleVal) { I->replaceAllUsesWith(SimpleVal); I->eraseFromParent(); @@ -852,6 +857,9 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (BBIt != LoadBB->begin()) return false; + // If all of the loads and stores that feed the value have the same TBAA tag, + // then we can propagate it onto any newly inserted loads. + MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa); SmallPtrSet<BasicBlock*, 8> PredsScanned; typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; @@ -870,11 +878,16 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan the predecessor to see if the value is available in the pred. BBIt = PredBB->end(); - Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6); + MDNode *ThisTBAATag = 0; + Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, + 0, &ThisTBAATag); if (!PredAvailable) { OneUnavailablePred = PredBB; continue; } + + // If tbaa tags disagree or are not present, forget about them. + if (TBAATag != ThisTBAATag) TBAATag = 0; // If so, this load is partially redundant. Remember this info so that we // can create a PHI node. @@ -921,8 +934,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Split them out to their own block. UnavailablePred = - SplitBlockPredecessors(LoadBB, &PredsToSplit[0], PredsToSplit.size(), - "thread-pre-split", this); + SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split", this); } // If the value isn't available in all predecessors, then there will be @@ -935,6 +947,9 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { LI->getAlignment(), UnavailablePred->getTerminator()); NewVal->setDebugLoc(LI->getDebugLoc()); + if (TBAATag) + NewVal->setMetadata(LLVMContext::MD_tbaa, TBAATag); + AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal)); } @@ -1082,9 +1097,9 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, DestBB = 0; else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero()); - else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) - DestBB = SI->getSuccessor(SI->findCaseValue(cast<ConstantInt>(Val))); - else { + else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { + DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor(); + } else { assert(isa<IndirectBrInst>(BB->getTerminator()) && "Unexpected terminator"); DestBB = cast<BlockAddress>(Val)->getBasicBlock(); @@ -1334,8 +1349,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, &PredBBs[0], PredBBs.size(), - ".thr_comm", this); + PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this); } // And finally, do it! @@ -1479,8 +1493,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, &PredBBs[0], PredBBs.size(), - ".thr_comm", this); + PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this); } // Okay, we decided to do this! 
Clone all the instructions in BB onto the end diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index b79bb13..8795cd8 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -43,8 +43,11 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" @@ -84,6 +87,7 @@ namespace { AU.addPreserved<AliasAnalysis>(); AU.addPreserved("scalar-evolution"); AU.addPreservedID(LoopSimplifyID); + AU.addRequired<TargetLibraryInfo>(); } bool doFinalization() { @@ -96,6 +100,9 @@ namespace { LoopInfo *LI; // Current LoopInfo DominatorTree *DT; // Dominator Tree for the current Loop. + TargetData *TD; // TargetData for constant folding. + TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. + // State that is updated as we process loops. bool Changed; // Set to true when we change anything. BasicBlock *Preheader; // The preheader block of the current loop... @@ -177,6 +184,7 @@ INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) @@ -194,6 +202,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { AA = &getAnalysis<AliasAnalysis>(); DT = &getAnalysis<DominatorTree>(); + TD = getAnalysisIfAvailable<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + CurAST = new AliasSetTracker(*AA); // Collect Alias info from subloops. for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end(); @@ -333,7 +344,7 @@ void LICM::HoistRegion(DomTreeNode *N) { // Try constant folding this instruction. If all the operands are // constants, it is technically hoistable, but it would be better to just // fold it. - if (Constant *C = ConstantFoldInstruction(&I)) { + if (Constant *C = ConstantFoldInstruction(&I, TD, TLI)) { DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); CurAST->copyValue(&I, C); CurAST->deleteValue(&I); @@ -369,6 +380,8 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { // in the same alias set as something that ends up being modified. if (AA->pointsToConstantMemory(LI->getOperand(0))) return true; + if (LI->getMetadata("invariant.load")) + return true; // Don't hoist loads which have may-aliased stores in loop. uint64_t Size = 0; @@ -579,7 +592,7 @@ void LICM::hoist(Instruction &I) { /// bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { // If it is not a trapping instruction, it is always safe to hoist. 
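  // (isSafeToSpeculativelyExecute is now the free function from
  // Analysis/ValueTracking.h rather than an Instruction member, shared
  // with the latch-folding code added to LoopRotation above.)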
- if (Inst.isSafeToSpeculativelyExecute()) + if (isSafeToSpeculativelyExecute(&Inst)) return true; return isGuaranteedToExecute(Inst); diff --git a/lib/Transforms/Scalar/LLVMBuild.txt b/lib/Transforms/Scalar/LLVMBuild.txt new file mode 100644 index 0000000..cee9119 --- /dev/null +++ b/lib/Transforms/Scalar/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Transforms/Scalar/LLVMBuild.txt --------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = Scalar +parent = Transforms +library_name = ScalarOpts +required_libraries = Analysis Core InstCombine Support Target TransformUtils diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index af25c5c..f0f05e6 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/Statistic.h" @@ -43,6 +44,7 @@ namespace { AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); AU.addPreserved("scalar-evolution"); + AU.addRequired<TargetLibraryInfo>(); } }; } @@ -50,6 +52,7 @@ namespace { char LoopInstSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", "Simplify instructions in loops", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -64,6 +67,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); LoopInfo *LI = &getAnalysis<LoopInfo>(); const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SmallVector<BasicBlock*, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); @@ -104,7 +108,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Don't bother simplifying unused instructions. if (!I->use_empty()) { - Value *V = SimplifyInstruction(I, TD, DT); + Value *V = SimplifyInstruction(I, TD, TLI, DT); if (V && LI->replacementPreservesLCSSAForm(I, V)) { // Mark all uses for resimplification next time round the loop. 
for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 9fd0958..59aace9 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -52,13 +53,14 @@ namespace { } bool runOnLoop(Loop *L, LPPassManager &LPM); + void simplifyLoopLatch(Loop *L); bool rotateLoop(Loop *L); - + private: LoopInfo *LI; }; } - + char LoopRotate::ID = 0; INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfo) @@ -73,6 +75,11 @@ Pass *llvm::createLoopRotatePass() { return new LoopRotate(); } bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfo>(); + // Simplify the loop latch before attempting to rotate the header + // upward. Rotation may not be needed if the loop tail can be folded into the + // loop exit. + simplifyLoopLatch(L); + // One loop can be rotated multiple times. bool MadeChange = false; while (rotateLoop(L)) @@ -92,18 +99,18 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, BasicBlock::iterator I, E = OrigHeader->end(); for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); - + // Now fix up users of the instructions in OrigHeader, inserting PHI nodes // as necessary. SSAUpdater SSA; for (I = OrigHeader->begin(); I != E; ++I) { Value *OrigHeaderVal = I; - + // If there are no uses of the value (e.g. because it returns void), there // is nothing to rewrite. if (OrigHeaderVal->use_empty()) continue; - + Value *OrigPreHeaderVal = ValueMap[OrigHeaderVal]; // The value now exits in two versions: the initial value in the preheader @@ -111,27 +118,27 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal); - + // Visit each use of the OrigHeader instruction. for (Value::use_iterator UI = OrigHeaderVal->use_begin(), UE = OrigHeaderVal->use_end(); UI != UE; ) { // Grab the use before incrementing the iterator. Use &U = UI.getUse(); - + // Increment the iterator before removing the use from the list. ++UI; - + // SSAUpdater can't handle a non-PHI use in the same block as an // earlier def. We can easily handle those cases manually. Instruction *UserInst = cast<Instruction>(U.getUser()); if (!isa<PHINode>(UserInst)) { BasicBlock *UserBB = UserInst->getParent(); - + // The original users in the OrigHeader are already using the // original definitions. if (UserBB == OrigHeader) continue; - + // Users in the OrigPreHeader need to use the value to which the // original definitions are mapped. if (UserBB == OrigPreheader) { @@ -139,32 +146,128 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, continue; } } - + // Anything else can be handled by SSAUpdater. SSA.RewriteUse(U); } } -} +} + +/// Determine whether the instructions in this range my be safely and cheaply +/// speculated. This is not an important enough situation to develop complex +/// heuristics. 
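/// The shape in mind is the latch of a simple two-block loop
/// (illustrative C):
///   do {
///     s += A[i]; // exiting block: load, test, branch
///     ++i;       // latch: a lone increment, cheap to speculate
///   } while (i != n);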
We handle a single arithmetic instruction along with any type +/// conversions. +static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, + BasicBlock::iterator End) { + bool seenIncrement = false; + for (BasicBlock::iterator I = Begin; I != End; ++I) { + + if (!isSafeToSpeculativelyExecute(I)) + return false; + + if (isa<DbgInfoIntrinsic>(I)) + continue; + + switch (I->getOpcode()) { + default: + return false; + case Instruction::GetElementPtr: + // GEPs are cheap if all indices are constant. + if (!cast<GEPOperator>(I)->hasAllConstantIndices()) + return false; + // fall-thru to increment case + case Instruction::Add: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + if (seenIncrement) + return false; + seenIncrement = true; + break; + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + // ignore type conversions + break; + } + } + return true; +} + +/// Fold the loop tail into the loop exit by speculating the loop tail +/// instructions. Typically, this is a single post-increment. In the case of a +/// simple 2-block loop, hoisting the increment can be much better than +/// duplicating the entire loop header. In the cast of loops with early exits, +/// rotation will not work anyway, but simplifyLoopLatch will put the loop in +/// canonical form so downstream passes can handle it. +/// +/// I don't believe this invalidates SCEV. +void LoopRotate::simplifyLoopLatch(Loop *L) { + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || Latch->hasAddressTaken()) + return; + + BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); + if (!Jmp || !Jmp->isUnconditional()) + return; + + BasicBlock *LastExit = Latch->getSinglePredecessor(); + if (!LastExit || !L->isLoopExiting(LastExit)) + return; + + BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); + if (!BI) + return; + + if (!shouldSpeculateInstrs(Latch->begin(), Jmp)) + return; + + DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " + << LastExit->getName() << "\n"); + + // Hoist the instructions from Latch into LastExit. + LastExit->getInstList().splice(BI, Latch->getInstList(), Latch->begin(), Jmp); + + unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; + BasicBlock *Header = Jmp->getSuccessor(0); + assert(Header == L->getHeader() && "expected a backward branch"); + + // Remove Latch from the CFG so that LastExit becomes the new Latch. + BI->setSuccessor(FallThruPath, Header); + Latch->replaceSuccessorsPhiUsesWith(LastExit); + Jmp->eraseFromParent(); + + // Nuke the Latch block. + assert(Latch->empty() && "unable to evacuate Latch"); + LI->removeBlock(Latch); + if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) + DT->eraseNode(Latch); + Latch->eraseFromParent(); +} /// Rotate loop LP. Return true if the loop is rotated. bool LoopRotate::rotateLoop(Loop *L) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; - + BasicBlock *OrigHeader = L->getHeader(); - + BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); if (BI == 0 || BI->isUnconditional()) return false; - + // If the loop header is not one of the loop exiting blocks then // either this loop is already rotated or it is not // suitable for loop rotation transformations. if (!L->isLoopExiting(OrigHeader)) return false; - // Updating PHInodes in loops with multiple exits adds complexity. 
+ // Updating PHInodes in loops with multiple exits adds complexity. // Keep it simple, and restrict loop rotation to loops with one exit only. // In future, lift this restriction and support for multiple exits if // required. @@ -184,7 +287,7 @@ bool LoopRotate::rotateLoop(Loop *L) { // Now, this loop is suitable for rotation. BasicBlock *OrigPreheader = L->getLoopPreheader(); BasicBlock *OrigLatch = L->getLoopLatch(); - + // If the loop could not be converted to canonical form, it must have an // indirectbr in it, just give up. if (OrigPreheader == 0 || OrigLatch == 0) @@ -203,9 +306,9 @@ bool LoopRotate::rotateLoop(Loop *L) { if (L->contains(Exit)) std::swap(Exit, NewHeader); assert(NewHeader && "Unable to determine new loop header"); - assert(L->contains(NewHeader) && !L->contains(Exit) && + assert(L->contains(NewHeader) && !L->contains(Exit) && "Unable to determine loop header and exit blocks"); - + // This code assumes that the new header has exactly one predecessor. // Remove any single-entry PHI nodes in it. assert(NewHeader->getSinglePredecessor() && @@ -227,7 +330,7 @@ bool LoopRotate::rotateLoop(Loop *L) { TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); while (I != E) { Instruction *Inst = I++; - + // If the instruction's operands are invariant and it doesn't read or write // memory, then it is safe to hoist. Doing this doesn't change the order of // execution in the preheader, but does prevent the instruction from @@ -236,18 +339,19 @@ bool LoopRotate::rotateLoop(Loop *L) { // memory (without proving that the loop doesn't write). if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && !Inst->mayWriteToMemory() && - !isa<TerminatorInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) { + !isa<TerminatorInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst) && + !isa<AllocaInst>(Inst)) { Inst->moveBefore(LoopEntryBranch); continue; } - + // Otherwise, create a duplicate of the instruction. Instruction *C = Inst->clone(); - + // Eagerly remap the operands of the instruction. RemapInstruction(C, ValueMap, RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); - + // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI // nodes allows icmps and other instructions to fold. @@ -287,7 +391,7 @@ bool LoopRotate::rotateLoop(Loop *L) { L->moveToHeader(NewHeader); assert(L->getHeader() == NewHeader && "Latch block is our new header"); - + // At this point, we've finished our major CFG changes. As part of cloning // the loop into the preheader we've simplified instructions and the // duplicated conditional branch may now be branching on a constant. If it is @@ -308,16 +412,16 @@ bool LoopRotate::rotateLoop(Loop *L) { // the dominator of Exit. DT->changeImmediateDominator(Exit, OrigPreheader); DT->changeImmediateDominator(NewHeader, OrigPreheader); - + // Update OrigHeader to be dominated by the new header block. DT->changeImmediateDominator(OrigHeader, OrigLatch); } - + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and // thus is not a preheader anymore. Split the edge to form a real preheader. BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this); NewPH->setName(NewHeader->getName() + ".lr.ph"); - + // Preserve canonical loop form, which means that 'Exit' should have only one // predecessor. 
BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this); @@ -329,7 +433,7 @@ bool LoopRotate::rotateLoop(Loop *L) { BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); NewBI->setDebugLoc(PHBI->getDebugLoc()); PHBI->eraseFromParent(); - + // With our CFG finalized, update DomTree if it is available. if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { // Update OrigHeader to be dominated by the new header block. @@ -337,7 +441,7 @@ bool LoopRotate::rotateLoop(Loop *L) { DT->changeImmediateDominator(OrigHeader, OrigLatch); } } - + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); @@ -346,7 +450,7 @@ bool LoopRotate::rotateLoop(Loop *L) { // connected by an unconditional branch. This is just a cleanup so the // emitted code isn't too gross in this common case. MergeBlockIntoPredecessor(OrigHeader, this); - + ++NumRotated; return true; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 3e122c2..d57ec22 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -77,19 +77,22 @@ #include <algorithm> using namespace llvm; -namespace llvm { -cl::opt<bool> EnableNested( - "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops")); - -cl::opt<bool> EnableRetry( - "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry")); - // Temporary flag to cleanup congruent phis after LSR phi expansion. // It's currently disabled until we can determine whether it's truly useful or // not. The flag should be removed after the v3.0 release. -cl::opt<bool> EnablePhiElim( - "enable-lsr-phielim", cl::Hidden, cl::desc("Enable LSR phi elimination")); -} +// This is now needed for ivchains. +static cl::opt<bool> EnablePhiElim( + "enable-lsr-phielim", cl::Hidden, cl::init(true), + cl::desc("Enable LSR phi elimination")); + +#ifndef NDEBUG +// Stress test IV chain generation. +static cl::opt<bool> StressIVChain( + "stress-ivchain", cl::Hidden, cl::init(false), + cl::desc("Stress test LSR IV chains")); +#else +static bool StressIVChain = false; +#endif namespace { @@ -636,6 +639,91 @@ static Type *getAccessType(const Instruction *Inst) { return AccessTy; } +/// isExistingPhi - Return true if this AddRec is already a phi in its loop. +static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { + for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) { + if (SE.isSCEVable(PN->getType()) && + (SE.getEffectiveSCEVType(PN->getType()) == + SE.getEffectiveSCEVType(AR->getType())) && + SE.getSCEV(PN) == AR) + return true; + } + return false; +} + +/// Check if expanding this expression is likely to incur significant cost. This +/// is tricky because SCEV doesn't track which expressions are actually computed +/// by the current IR. +/// +/// We currently allow expansion of IV increments that involve adds, +/// multiplication by constants, and AddRecs from existing phis. +/// +/// TODO: Allow UDivExpr if we can find an existing IV increment that is an +/// obvious multiple of the UDivExpr. 
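/// Illustrative outcomes of the walk below: constants, unknowns, and sums
/// of them are cheap; an AddRec is cheap only when it already exists as a
/// phi in its loop; 4 * %n is cheap only if some existing mul instruction
/// computes it; udiv and min/max are always treated as high cost for now.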
+static bool isHighCostExpansion(const SCEV *S, + SmallPtrSet<const SCEV*, 8> &Processed, + ScalarEvolution &SE) { + // Zero/One operand expressions + switch (S->getSCEVType()) { + case scUnknown: + case scConstant: + return false; + case scTruncate: + return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(), + Processed, SE); + case scZeroExtend: + return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(), + Processed, SE); + case scSignExtend: + return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(), + Processed, SE); + } + + if (!Processed.insert(S)) + return false; + + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) { + if (isHighCostExpansion(*I, Processed, SE)) + return true; + } + return false; + } + + if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { + if (Mul->getNumOperands() == 2) { + // Multiplication by a constant is ok + if (isa<SCEVConstant>(Mul->getOperand(0))) + return isHighCostExpansion(Mul->getOperand(1), Processed, SE); + + // If we have the value of one operand, check if an existing + // multiplication already generates this expression. + if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) { + Value *UVal = U->getValue(); + for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end(); + UI != UE; ++UI) { + // If U is a constant, it may be used by a ConstantExpr. + Instruction *User = dyn_cast<Instruction>(*UI); + if (User && User->getOpcode() == Instruction::Mul + && SE.isSCEVable(User->getType())) { + return SE.getSCEV(User) == Mul; + } + } + } + } + } + + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + if (isExistingPhi(AR, SE)) + return false; + } + + // Fow now, consider any other type of expression (div/mul/min/max) high cost. + return true; +} + /// DeleteTriviallyDeadInstructions - If any of the instructions is the /// specified set are trivially dead, delete them and see if this makes any of /// their operands subsequently dead. @@ -705,7 +793,8 @@ public: const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, - ScalarEvolution &SE, DominatorTree &DT); + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); void print(raw_ostream &OS) const; void dump() const; @@ -718,7 +807,8 @@ private: void RatePrimaryRegister(const SCEV *Reg, SmallPtrSet<const SCEV *, 16> &Regs, const Loop *L, - ScalarEvolution &SE, DominatorTree &DT); + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet<const SCEV *, 16> *LoserRegs); }; } @@ -729,41 +819,20 @@ void Cost::RateRegister(const SCEV *Reg, const Loop *L, ScalarEvolution &SE, DominatorTree &DT) { if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { - if (AR->getLoop() == L) - AddRecCost += 1; /// TODO: This should be a function of the stride. - // If this is an addrec for another loop, don't second-guess its addrec phi // nodes. LSR isn't currently smart enough to reason about more than one - // loop at a time. LSR has either already run on inner loops, will not run - // on other loops, and cannot be expected to change sibling loops. If the - // AddRec exists, consider it's register free and leave it alone. Otherwise, - // do not consider this formula at all. - // FIXME: why do we need to generate such fomulae? 
- else if (!EnableNested || L->contains(AR->getLoop()) || - (!AR->getLoop()->contains(L) && - DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) { - for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); - PHINode *PN = dyn_cast<PHINode>(I); ++I) { - if (SE.isSCEVable(PN->getType()) && - (SE.getEffectiveSCEVType(PN->getType()) == - SE.getEffectiveSCEVType(AR->getType())) && - SE.getSCEV(PN) == AR) - return; - } - if (!EnableNested) { - Loose(); + // loop at a time. LSR has already run on inner loops, will not run on outer + // loops, and cannot be expected to change sibling loops. + if (AR->getLoop() != L) { + // If the AddRec exists, consider it's register free and leave it alone. + if (isExistingPhi(AR, SE)) return; - } - // If this isn't one of the addrecs that the loop already has, it - // would require a costly new phi and add. TODO: This isn't - // precisely modeled right now. - ++NumBaseAdds; - if (!Regs.count(AR->getStart())) { - RateRegister(AR->getStart(), Regs, L, SE, DT); - if (isLoser()) - return; - } + + // Otherwise, do not consider this formula at all. + Loose(); + return; } + AddRecCost += 1; /// TODO: This should be a function of the stride. // Add the step value register, if it needs one. // TODO: The non-affine case isn't precisely modeled here. @@ -791,13 +860,22 @@ void Cost::RateRegister(const SCEV *Reg, } /// RatePrimaryRegister - Record this register in the set. If we haven't seen it -/// before, rate it. +/// before, rate it. Optional LoserRegs provides a way to declare any formula +/// that refers to one of those regs an instant loser. void Cost::RatePrimaryRegister(const SCEV *Reg, SmallPtrSet<const SCEV *, 16> &Regs, const Loop *L, - ScalarEvolution &SE, DominatorTree &DT) { - if (Regs.insert(Reg)) + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet<const SCEV *, 16> *LoserRegs) { + if (LoserRegs && LoserRegs->count(Reg)) { + Loose(); + return; + } + if (Regs.insert(Reg)) { RateRegister(Reg, Regs, L, SE, DT); + if (isLoser()) + LoserRegs->insert(Reg); + } } void Cost::RateFormula(const Formula &F, @@ -805,14 +883,15 @@ void Cost::RateFormula(const Formula &F, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, - ScalarEvolution &SE, DominatorTree &DT) { + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet<const SCEV *, 16> *LoserRegs) { // Tally up the registers. if (const SCEV *ScaledReg = F.ScaledReg) { if (VisitedRegs.count(ScaledReg)) { Loose(); return; } - RatePrimaryRegister(ScaledReg, Regs, L, SE, DT); + RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs); if (isLoser()) return; } @@ -823,7 +902,7 @@ void Cost::RateFormula(const Formula &F, Loose(); return; } - RatePrimaryRegister(BaseReg, Regs, L, SE, DT); + RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs); if (isLoser()) return; } @@ -1105,7 +1184,6 @@ bool LSRUse::InsertFormula(const Formula &F) { Formulae.push_back(F); // Record registers now being used by this use. - if (F.ScaledReg) Regs.insert(F.ScaledReg); Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); return true; @@ -1116,7 +1194,6 @@ void LSRUse::DeleteFormula(Formula &F) { if (&F != &Formulae.back()) std::swap(F, Formulae.back()); Formulae.pop_back(); - assert(!Formulae.empty() && "LSRUse has no formulae left!"); } /// RecomputeRegs - Recompute the Regs field, and update RegUses. 
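// A sketch of the driver the new LoserRegs parameter is designed for
// (assumed shape and names, not part of this patch): one set is shared
// across a use's formulae, so a register rated a loser once instantly
// sinks every later formula that mentions it, with no re-rating.
SmallPtrSet<const SCEV *, 16> LoserRegs;
for (size_t i = 0; i != LU.Formulae.size(); /* stepped below */) {
  Cost CostF;
  SmallPtrSet<const SCEV *, 16> Regs;
  CostF.RateFormula(LU.Formulae[i], Regs, VisitedRegs, L, LU.Offsets,
                    SE, DT, &LoserRegs);
  if (CostF.isLoser())
    LU.DeleteFormula(LU.Formulae[i]); // swaps the last formula into slot i
  else
    ++i;
}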
@@ -1205,10 +1282,19 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. if (AM.BaseOffs != 0) { - if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs); - return false; + if (!TLI) + return false; + // We have one of: + // ICmpZero BaseReg + Offset => ICmp BaseReg, -Offset + // ICmpZero -1*ScaleReg + Offset => ICmp ScaleReg, Offset + // Offs is the ICmp immediate. + int64_t Offs = AM.BaseOffs; + if (AM.Scale == 0) + Offs = -(uint64_t)Offs; // The cast does the right thing with INT64_MIN. + return TLI->isLegalICmpImmediate(Offs); } + // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg return true; case LSRUse::Basic: @@ -1220,7 +1306,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, return AM.Scale == 0 || AM.Scale == -1; } - return false; + llvm_unreachable("Invalid LSRUse Kind!"); } static bool isLegalUse(TargetLowering::AddrMode AM, @@ -1327,6 +1413,36 @@ struct UseMapDenseMapInfo { } }; +/// IVInc - An individual increment in a Chain of IV increments. +/// Relate an IV user to an expression that computes the IV it uses from the IV +/// used by the previous link in the Chain. +/// +/// For the head of a chain, IncExpr holds the absolute SCEV expression for the +/// original IVOperand. The head of the chain's IVOperand is only valid during +/// chain collection, before LSR replaces IV users. During chain generation, +/// IncExpr can be used to find the new IVOperand that computes the same +/// expression. +struct IVInc { + Instruction *UserInst; + Value* IVOperand; + const SCEV *IncExpr; + + IVInc(Instruction *U, Value *O, const SCEV *E): + UserInst(U), IVOperand(O), IncExpr(E) {} +}; + +// IVChain - The list of IV increments in program order. +// We typically add the head of a chain without finding subsequent links. +typedef SmallVector<IVInc,1> IVChain; + +/// ChainUsers - Helper for CollectChains to track multiple IV increment uses. +/// Distinguish between FarUsers that definitely cross IV increments and +/// NearUsers that may be used between IV increments. +struct ChainUsers { + SmallPtrSet<Instruction*, 4> FarUsers; + SmallPtrSet<Instruction*, 4> NearUsers; +}; + /// LSRInstance - This class holds state for the main loop strength reduction /// logic. class LSRInstance { @@ -1359,11 +1475,29 @@ class LSRInstance { /// RegUses - Track which uses use which register candidates. RegUseTracker RegUses; + // Limit the number of chains to avoid quadratic behavior. We don't expect to + // have more than a few IV increment chains in a loop. Missing a Chain falls + // back to normal LSR behavior for those uses. + static const unsigned MaxChains = 8; + + /// IVChainVec - IV users can form a chain of IV increments. + SmallVector<IVChain, MaxChains> IVChainVec; + + /// IVIncSet - IV users that belong to profitable IVChains. 
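  /// Presumably consulted when fixups are collected, so that uses already
  /// covered by a profitable chain skip normal LSR formula generation.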
+ SmallPtrSet<Use*, MaxChains> IVIncSet; + void OptimizeShadowIV(); bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse); ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse); void OptimizeLoopTermCond(); + void ChainInstruction(Instruction *UserInst, Instruction *IVOper, + SmallVectorImpl<ChainUsers> &ChainUsersVec); + void FinalizeChain(IVChain &Chain); + void CollectChains(); + void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts); + void CollectInterestingTypesAndFactors(); void CollectFixupsAndInitialFormulae(); @@ -1389,7 +1523,6 @@ class LSRInstance { LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU); -public: void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); void CountRegisters(const Formula &F, size_t LUIdx); @@ -1428,9 +1561,11 @@ public: BasicBlock::iterator HoistInsertPosition(BasicBlock::iterator IP, const SmallVectorImpl<Instruction *> &Inputs) const; - BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP, - const LSRFixup &LF, - const LSRUse &LU) const; + BasicBlock::iterator + AdjustInsertPositionForExpand(BasicBlock::iterator IP, + const LSRFixup &LF, + const LSRUse &LU, + SCEVExpander &Rewriter) const; Value *Expand(const LSRFixup &LF, const Formula &F, @@ -1450,6 +1585,7 @@ public: void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, Pass *P); +public: LSRInstance(const TargetLowering *tli, Loop *l, Pass *P); bool getChanged() const { return Changed; } @@ -2045,7 +2181,8 @@ void LSRInstance::CollectInterestingTypesAndFactors() { do { const SCEV *S = Worklist.pop_back_val(); if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { - Strides.insert(AR->getStepRecurrence(SE)); + if (AR->getLoop() == L) + Strides.insert(AR->getStepRecurrence(SE)); Worklist.push_back(AR->getStart()); } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { Worklist.append(Add->op_begin(), Add->op_end()); @@ -2091,11 +2228,544 @@ void LSRInstance::CollectInterestingTypesAndFactors() { DEBUG(print_factors_and_types(dbgs())); } +/// findIVOperand - Helper for CollectChains that finds an IV operand (computed +/// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped +/// Instructions to IVStrideUses, we could partially skip this. +static User::op_iterator +findIVOperand(User::op_iterator OI, User::op_iterator OE, + Loop *L, ScalarEvolution &SE) { + for(; OI != OE; ++OI) { + if (Instruction *Oper = dyn_cast<Instruction>(*OI)) { + if (!SE.isSCEVable(Oper->getType())) + continue; + + if (const SCEVAddRecExpr *AR = + dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) { + if (AR->getLoop() == L) + break; + } + } + } + return OI; +} + +/// getWideOperand - IVChain logic must consistenctly peek base TruncInst +/// operands, so wrap it in a convenient helper. +static Value *getWideOperand(Value *Oper) { + if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper)) + return Trunc->getOperand(0); + return Oper; +} + +/// isCompatibleIVType - Return true if we allow an IV chain to include both +/// types. +static bool isCompatibleIVType(Value *LVal, Value *RVal) { + Type *LType = LVal->getType(); + Type *RType = RVal->getType(); + return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy()); +} + +/// getExprBase - Return an approximation of this SCEV expression's "base", or +/// NULL for any constant. Returning the expression itself is +/// conservative. 
Returning a deeper subexpression is more precise and valid as +/// long as it isn't less complex than another subexpression. For expressions +/// involving multiple unscaled values, we need to return the pointer-type +/// SCEVUnknown. This avoids forming chains across objects, such as: +/// PrevOper==a[i], IVOper==b[i], IVInc==b-a. +/// +/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost +/// SCEVUnknown, we simply return the rightmost SCEV operand. +static const SCEV *getExprBase(const SCEV *S) { + switch (S->getSCEVType()) { + default: // uncluding scUnknown. + return S; + case scConstant: + return 0; + case scTruncate: + return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand()); + case scZeroExtend: + return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand()); + case scSignExtend: + return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand()); + case scAddExpr: { + // Skip over scaled operands (scMulExpr) to follow add operands as long as + // there's nothing more complex. + // FIXME: not sure if we want to recognize negation. + const SCEVAddExpr *Add = cast<SCEVAddExpr>(S); + for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()), + E(Add->op_begin()); I != E; ++I) { + const SCEV *SubExpr = *I; + if (SubExpr->getSCEVType() == scAddExpr) + return getExprBase(SubExpr); + + if (SubExpr->getSCEVType() != scMulExpr) + return SubExpr; + } + return S; // all operands are scaled, be conservative. + } + case scAddRecExpr: + return getExprBase(cast<SCEVAddRecExpr>(S)->getStart()); + } +} + +/// Return true if the chain increment is profitable to expand into a loop +/// invariant value, which may require its own register. A profitable chain +/// increment will be an offset relative to the same base. We allow such offsets +/// to potentially be used as chain increment as long as it's not obviously +/// expensive to expand using real instructions. +static const SCEV * +getProfitableChainIncrement(Value *NextIV, Value *PrevIV, + const IVChain &Chain, Loop *L, + ScalarEvolution &SE, const TargetLowering *TLI) { + // Prune the solution space aggressively by checking that both IV operands + // are expressions that operate on the same unscaled SCEVUnknown. This + // "base" will be canceled by the subsequent getMinusSCEV call. Checking first + // avoids creating extra SCEV expressions. + const SCEV *OperExpr = SE.getSCEV(NextIV); + const SCEV *PrevExpr = SE.getSCEV(PrevIV); + if (getExprBase(OperExpr) != getExprBase(PrevExpr) && !StressIVChain) + return 0; + + const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr); + if (!SE.isLoopInvariant(IncExpr, L)) + return 0; + + // We are not able to expand an increment unless it is loop invariant, + // however, the following checks are purely for profitability. + if (StressIVChain) + return IncExpr; + + // Do not replace a constant offset from IV head with a nonconstant IV + // increment. + if (!isa<SCEVConstant>(IncExpr)) { + const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Chain[0].IVOperand)); + if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr))) + return 0; + } + + SmallPtrSet<const SCEV*, 8> Processed; + if (isHighCostExpansion(IncExpr, Processed, SE)) + return 0; + + return IncExpr; +} + +/// Return true if the number of registers needed for the chain is estimated to +/// be less than the number required for the individual IV users. First prohibit +/// any IV users that keep the IV live across increments (the Users set should +/// be empty). Next count the number and type of increments in the chain. 
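/// Worked instance (illustrative): a three-entry chain with two nonzero
/// constant increments starts at cost 1 for the chain's own register,
/// drops to 0 because multiple constant increments are assumed to fold
/// into addressing modes, and reaches -1 when a header phi completes the
/// chain: profitable.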
+///
+/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
+/// effectively use postinc addressing modes. Only consider it profitable if
+/// the increments can be computed in fewer registers when chained.
+///
+/// TODO: Consider IVInc free if it's already used in another chain.
+static bool
+isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
+                  ScalarEvolution &SE, const TargetLowering *TLI) {
+  if (StressIVChain)
+    return true;
+
+  if (Chain.size() <= 2)
+    return false;
+
+  if (!Users.empty()) {
+    DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " users:\n";
+          for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(),
+                 E = Users.end(); I != E; ++I) {
+            dbgs() << "  " << **I << "\n";
+          });
+    return false;
+  }
+  assert(!Chain.empty() && "empty IV chains are not allowed");
+
+  // The chain itself may require a register, so initialize cost to 1.
+  int cost = 1;
+
+  // A complete chain likely eliminates the need for keeping the original IV in
+  // a register. LSR does not currently know how to form a complete chain unless
+  // the header phi already exists.
+  if (isa<PHINode>(Chain.back().UserInst)
+      && SE.getSCEV(Chain.back().UserInst) == Chain[0].IncExpr) {
+    --cost;
+  }
+  const SCEV *LastIncExpr = 0;
+  unsigned NumConstIncrements = 0;
+  unsigned NumVarIncrements = 0;
+  unsigned NumReusedIncrements = 0;
+  for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end();
+       I != E; ++I) {
+
+    if (I->IncExpr->isZero())
+      continue;
+
+    // Incrementing by zero or some constant is neutral. We assume constants
+    // can be folded into an addressing mode or an add's immediate operand.
+    if (isa<SCEVConstant>(I->IncExpr)) {
+      ++NumConstIncrements;
+      continue;
+    }
+
+    if (I->IncExpr == LastIncExpr)
+      ++NumReusedIncrements;
+    else
+      ++NumVarIncrements;
+
+    LastIncExpr = I->IncExpr;
+  }
+  // An IV chain with a single increment is handled by LSR's postinc
+  // uses. However, a chain with multiple increments requires keeping the IV's
+  // value live longer than it needs to be if chained.
+  if (NumConstIncrements > 1)
+    --cost;
+
+  // Materializing increment expressions in the preheader that didn't exist in
+  // the original code may cost a register. For example, sign-extended array
+  // indices can produce ridiculous increments like this:
+  // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
+  cost += NumVarIncrements;
+
+  // Reusing variable increments likely saves a register to hold the multiple
+  // of the stride.
+  cost -= NumReusedIncrements;
+
+  DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " Cost: " << cost << "\n");
+
+  return cost < 0;
+}
+
+/// ChainInstruction - Add this IV user to an existing chain or make it the head
+/// of a new chain.
+void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+                                   SmallVectorImpl<ChainUsers> &ChainUsersVec) {
+  // When IVs are used as types of varying widths, they are generally converted
+  // to a wider type with some uses remaining narrow under a (free) trunc.
+  Value *NextIV = getWideOperand(IVOper);
+
+  // Visit all existing chains. Check if this IVOper can be computed as a
+  // profitable loop invariant increment from the last link in the Chain.
+  unsigned ChainIdx = 0, NChains = IVChainVec.size();
+  const SCEV *LastIncExpr = 0;
+  for (; ChainIdx < NChains; ++ChainIdx) {
+    Value *PrevIV = getWideOperand(IVChainVec[ChainIdx].back().IVOperand);
+    if (!isCompatibleIVType(PrevIV, NextIV))
+      continue;
+
+    // A phi node terminates a chain.
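+    // For example (an illustrative sketch), with a widened IV
+    //   %iv = phi i64 [ %start, %preheader ], [ %iv.next, %latch ]
+    // the back-edge value %iv.next closes the chain at the phi, so no user
+    // may be chained behind a phi.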
+    if (isa<PHINode>(UserInst)
+        && isa<PHINode>(IVChainVec[ChainIdx].back().UserInst))
+      continue;
+
+    if (const SCEV *IncExpr =
+        getProfitableChainIncrement(NextIV, PrevIV, IVChainVec[ChainIdx],
+                                    L, SE, TLI)) {
+      LastIncExpr = IncExpr;
+      break;
+    }
+  }
+  // If we haven't found a chain, create a new one, unless we hit the max. Don't
+  // bother for phi nodes, because they must be last in the chain.
+  if (ChainIdx == NChains) {
+    if (isa<PHINode>(UserInst))
+      return;
+    if (NChains >= MaxChains && !StressIVChain) {
+      DEBUG(dbgs() << "IV Chain Limit\n");
+      return;
+    }
+    LastIncExpr = SE.getSCEV(NextIV);
+    // IVUsers may have skipped over sign/zero extensions. We don't currently
+    // attempt to form chains involving extensions unless they can be hoisted
+    // into this loop's AddRec.
+    if (!isa<SCEVAddRecExpr>(LastIncExpr))
+      return;
+    ++NChains;
+    IVChainVec.resize(NChains);
+    ChainUsersVec.resize(NChains);
+    DEBUG(dbgs() << "IV Head: (" << *UserInst << ") IV=" << *LastIncExpr
+          << "\n");
+  }
+  else
+    DEBUG(dbgs() << "IV Inc: (" << *UserInst << ") IV+" << *LastIncExpr
+          << "\n");
+
+  // Add this IV user to the end of the chain.
+  IVChainVec[ChainIdx].push_back(IVInc(UserInst, IVOper, LastIncExpr));
+
+  SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
+  // This chain's NearUsers become FarUsers.
+  if (!LastIncExpr->isZero()) {
+    ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
+                                            NearUsers.end());
+    NearUsers.clear();
+  }
+
+  // All other uses of IVOperand become near uses of the chain.
+  // We currently ignore intermediate values within SCEV expressions, assuming
+  // they will eventually be used by the current chain, or can be computed
+  // from one of the chain increments. To be more precise we could
+  // transitively follow its users and only add leaf IV users to the set.
+  for (Value::use_iterator UseIter = IVOper->use_begin(),
+         UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) {
+    Instruction *OtherUse = dyn_cast<Instruction>(*UseIter);
+    if (!OtherUse || OtherUse == UserInst)
+      continue;
+    if (SE.isSCEVable(OtherUse->getType())
+        && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
+        && IU.isIVUserOrOperand(OtherUse)) {
+      continue;
+    }
+    NearUsers.insert(OtherUse);
+  }
+
+  // Since this user is part of the chain, it's no longer considered a use
+  // of the chain.
+  ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
+}
+
+/// CollectChains - Populate the vector of Chains.
+///
+/// This decreases ILP at the architecture level. Targets with ample registers,
+/// multiple memory ports, and no register renaming probably don't want
+/// this. However, such targets should probably disable LSR altogether.
+///
+/// The job of LSR is to make a reasonable choice of induction variables across
+/// the loop. Subsequent passes can easily "unchain" computation exposing more
+/// ILP *within the loop* if the target wants it.
+///
+/// Finding the best IV chain is potentially a scheduling problem. Since LSR
+/// will not reorder memory operations, it will recognize this as a chain, but
+/// will generate redundant IV increments. Ideally this would be corrected later
+/// by a smart scheduler:
+///        = A[i]
+///        = A[i+x]
+/// A[i]   =
+/// A[i+x] =
+///
+/// TODO: Walk the entire domtree within this loop, not just the path to the
+/// loop latch. This will discover chains on side paths, but requires
+/// maintaining multiple copies of the Chains state.
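+///
+/// In the sketch above (illustrative only), a chain would compute the address
+/// of A[i+x] from the address of A[i] with a single loop-invariant increment,
+/// instead of keeping a second scaled induction variable live.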
+void LSRInstance::CollectChains() {
+  SmallVector<ChainUsers, 8> ChainUsersVec;
+
+  SmallVector<BasicBlock *,8> LatchPath;
+  BasicBlock *LoopHeader = L->getHeader();
+  for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
+       Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
+    LatchPath.push_back(Rung->getBlock());
+  }
+  LatchPath.push_back(LoopHeader);
+
+  // Walk the instruction stream from the loop header to the loop latch.
+  for (SmallVectorImpl<BasicBlock *>::reverse_iterator
+         BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend();
+       BBIter != BBEnd; ++BBIter) {
+    for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end();
+         I != E; ++I) {
+      // Skip instructions that weren't seen by IVUsers analysis.
+      if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I))
+        continue;
+
+      // Ignore users that are part of a SCEV expression. This way we only
+      // consider leaf IV Users. This effectively rediscovers a portion of
+      // IVUsers analysis but in program order this time.
+      if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I)))
+        continue;
+
+      // Remove this instruction from any NearUsers set it may be in.
+      for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
+           ChainIdx < NChains; ++ChainIdx) {
+        ChainUsersVec[ChainIdx].NearUsers.erase(I);
+      }
+      // Search for operands that can be chained.
+      SmallPtrSet<Instruction*, 4> UniqueOperands;
+      User::op_iterator IVOpEnd = I->op_end();
+      User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
+      while (IVOpIter != IVOpEnd) {
+        Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
+        if (UniqueOperands.insert(IVOpInst))
+          ChainInstruction(I, IVOpInst, ChainUsersVec);
+        IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE);
+      }
+    } // Continue walking down the instructions.
+  } // Continue walking down the domtree.
+  // Visit phi backedges to determine if the chain can generate the IV postinc.
+  for (BasicBlock::iterator I = L->getHeader()->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    if (!SE.isSCEVable(PN->getType()))
+      continue;
+
+    Instruction *IncV =
+      dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
+    if (IncV)
+      ChainInstruction(PN, IncV, ChainUsersVec);
+  }
+  // Remove any unprofitable chains.
+  unsigned ChainIdx = 0;
+  for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
+       UsersIdx < NChains; ++UsersIdx) {
+    if (!isProfitableChain(IVChainVec[UsersIdx],
+                           ChainUsersVec[UsersIdx].FarUsers, SE, TLI))
+      continue;
+    // Preserve the chain at UsersIdx.
+    if (ChainIdx != UsersIdx)
+      IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
+    FinalizeChain(IVChainVec[ChainIdx]);
+    ++ChainIdx;
+  }
+  IVChainVec.resize(ChainIdx);
+}
+
+void LSRInstance::FinalizeChain(IVChain &Chain) {
+  assert(!Chain.empty() && "empty IV chains are not allowed");
+  DEBUG(dbgs() << "Final Chain: " << *Chain[0].UserInst << "\n");
+
+  for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end();
+       I != E; ++I) {
+    DEBUG(dbgs() << "        Inc: " << *I->UserInst << "\n");
+    User::op_iterator UseI =
+      std::find(I->UserInst->op_begin(), I->UserInst->op_end(), I->IVOperand);
+    assert(UseI != I->UserInst->op_end() && "cannot find IV operand");
+    IVIncSet.insert(UseI);
+  }
+}
+
+/// Return true if the IVInc can be folded into an addressing mode.
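+/// For example (illustrative): a chained increment of +8 feeding a load on a
+/// target with reg+imm addressing can be folded as [IVOperand + 8], so no
+/// separate add instruction needs to survive.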
+static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
+                             Value *Operand, const TargetLowering *TLI) {
+  const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
+  if (!IncConst || !isAddressUse(UserInst, Operand))
+    return false;
+
+  if (IncConst->getValue()->getValue().getMinSignedBits() > 64)
+    return false;
+
+  int64_t IncOffset = IncConst->getValue()->getSExtValue();
+  if (!isAlwaysFoldable(IncOffset, /*BaseGV=*/0, /*HasBaseReg=*/false,
+                        LSRUse::Address, getAccessType(UserInst), TLI))
+    return false;
+
+  return true;
+}
+
+/// GenerateIVChain - Generate an add or subtract for each IVInc in a chain to
+/// materialize the IV user's operand from the previous IV user's operand.
+void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+                                  SmallVectorImpl<WeakVH> &DeadInsts) {
+  // Find the new IVOperand for the head of the chain. It may have been
+  // replaced by LSR.
+  const IVInc &Head = Chain[0];
+  User::op_iterator IVOpEnd = Head.UserInst->op_end();
+  User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
+                                             IVOpEnd, L, SE);
+  Value *IVSrc = 0;
+  while (IVOpIter != IVOpEnd) {
+    IVSrc = getWideOperand(*IVOpIter);
+
+    // If this operand computes the expression that the chain needs, we may use
+    // it. (Check this after setting IVSrc which is used below.)
+    //
+    // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
+    // narrow for the chain, so we can no longer use it. We do allow using a
+    // wider phi, assuming LSR checked for free truncation. In that case we
+    // should already have a truncate on this operand such that
+    // getSCEV(IVSrc) == IncExpr.
+    if (SE.getSCEV(*IVOpIter) == Head.IncExpr
+        || SE.getSCEV(IVSrc) == Head.IncExpr) {
+      break;
+    }
+    IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE);
+  }
+  if (IVOpIter == IVOpEnd) {
+    // Gracefully give up on this chain.
+    DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
+    return;
+  }
+
+  DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
+  Type *IVTy = IVSrc->getType();
+  Type *IntTy = SE.getEffectiveSCEVType(IVTy);
+  const SCEV *LeftOverExpr = 0;
+  for (IVChain::const_iterator IncI = llvm::next(Chain.begin()),
+         IncE = Chain.end(); IncI != IncE; ++IncI) {
+
+    Instruction *InsertPt = IncI->UserInst;
+    if (isa<PHINode>(InsertPt))
+      InsertPt = L->getLoopLatch()->getTerminator();
+
+    // IVOper will replace the current IV User's operand. IVSrc is the IV
+    // value currently held in a register.
+    Value *IVOper = IVSrc;
+    if (!IncI->IncExpr->isZero()) {
+      // IncExpr was the result of subtraction of two narrow values, so must
+      // be signed.
+      const SCEV *IncExpr = SE.getNoopOrSignExtend(IncI->IncExpr, IntTy);
+      LeftOverExpr = LeftOverExpr ?
+        SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
+    }
+    if (LeftOverExpr && !LeftOverExpr->isZero()) {
+      // Expand the IV increment.
+      Rewriter.clearPostInc();
+      Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
+      const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
+                                             SE.getUnknown(IncV));
+      IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
+
+      // If an IV increment can't be folded, use it as the next IV value.
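+      // (Illustratively: constant increments keep accumulating into
+      // LeftOverExpr while they remain foldable; once the running offset can
+      // no longer fold, the expanded value becomes the new chain source.)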
+ if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand, + TLI)) { + assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); + IVSrc = IVOper; + LeftOverExpr = 0; + } + } + Type *OperTy = IncI->IVOperand->getType(); + if (IVTy != OperTy) { + assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) && + "cannot extend a chained IV"); + IRBuilder<> Builder(InsertPt); + IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain"); + } + IncI->UserInst->replaceUsesOfWith(IncI->IVOperand, IVOper); + DeadInsts.push_back(IncI->IVOperand); + } + // If LSR created a new, wider phi, we may also replace its postinc. We only + // do this if we also found a wide value for the head of the chain. + if (isa<PHINode>(Chain.back().UserInst)) { + for (BasicBlock::iterator I = L->getHeader()->begin(); + PHINode *Phi = dyn_cast<PHINode>(I); ++I) { + if (!isCompatibleIVType(Phi, IVSrc)) + continue; + Instruction *PostIncV = dyn_cast<Instruction>( + Phi->getIncomingValueForBlock(L->getLoopLatch())); + if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc))) + continue; + Value *IVOper = IVSrc; + Type *PostIncTy = PostIncV->getType(); + if (IVTy != PostIncTy) { + assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types"); + IRBuilder<> Builder(L->getLoopLatch()->getTerminator()); + Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc()); + IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain"); + } + Phi->replaceUsesOfWith(PostIncV, IVOper); + DeadInsts.push_back(PostIncV); + } + } +} + void LSRInstance::CollectFixupsAndInitialFormulae() { for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { + Instruction *UserInst = UI->getUser(); + // Skip IV users that are part of profitable IV Chains. + User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(), + UI->getOperandValToReplace()); + assert(UseI != UserInst->op_end() && "cannot find IV operand"); + if (IVIncSet.count(UseI)) + continue; + // Record the uses. LSRFixup &LF = getNewFixup(); - LF.UserInst = UI->getUser(); + LF.UserInst = UserInst; LF.OperandValToReplace = UI->getOperandValToReplace(); LF.PostIncLoops = UI->getPostIncLoops(); @@ -2914,6 +3584,7 @@ LSRInstance::GenerateAllReuseFormulae() { void LSRInstance::FilterOutUndesirableDedicatedRegisters() { DenseSet<const SCEV *> VisitedRegs; SmallPtrSet<const SCEV *, 16> Regs; + SmallPtrSet<const SCEV *, 16> LoserRegs; #ifndef NDEBUG bool ChangedFormulae = false; #endif @@ -2933,46 +3604,66 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { FIdx != NumForms; ++FIdx) { Formula &F = LU.Formulae[FIdx]; - SmallVector<const SCEV *, 2> Key; - for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(), - JE = F.BaseRegs.end(); J != JE; ++J) { - const SCEV *Reg = *J; - if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) - Key.push_back(Reg); + // Some formulas are instant losers. For example, they may depend on + // nonexistent AddRecs from other loops. These need to be filtered + // immediately, otherwise heuristics could choose them over others leading + // to an unsatisfactory solution. Passing LoserRegs into RateFormula here + // avoids the need to recompute this information across formulae using the + // same bad AddRec. Passing LoserRegs is also essential unless we remove + // the corresponding bad register from the Regs set. 
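+      // (For example, a formula built around an AddRec for a sibling loop can
+      // never be rated sensibly for this loop and is filtered here -- an
+      // illustrative case, not an exhaustive one.)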
+ Cost CostF; + Regs.clear(); + CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, + &LoserRegs); + if (CostF.isLoser()) { + // During initial formula generation, undesirable formulae are generated + // by uses within other loops that have some non-trivial address mode or + // use the postinc form of the IV. LSR needs to provide these formulae + // as the basis of rediscovering the desired formula that uses an AddRec + // corresponding to the existing phi. Once all formulae have been + // generated, these initial losers may be pruned. + DEBUG(dbgs() << " Filtering loser "; F.print(dbgs()); + dbgs() << "\n"); } - if (F.ScaledReg && - RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx)) - Key.push_back(F.ScaledReg); - // Unstable sort by host order ok, because this is only used for - // uniquifying. - std::sort(Key.begin(), Key.end()); - - std::pair<BestFormulaeTy::const_iterator, bool> P = - BestFormulae.insert(std::make_pair(Key, FIdx)); - if (!P.second) { + else { + SmallVector<const SCEV *, 2> Key; + for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(), + JE = F.BaseRegs.end(); J != JE; ++J) { + const SCEV *Reg = *J; + if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) + Key.push_back(Reg); + } + if (F.ScaledReg && + RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx)) + Key.push_back(F.ScaledReg); + // Unstable sort by host order ok, because this is only used for + // uniquifying. + std::sort(Key.begin(), Key.end()); + + std::pair<BestFormulaeTy::const_iterator, bool> P = + BestFormulae.insert(std::make_pair(Key, FIdx)); + if (P.second) + continue; + Formula &Best = LU.Formulae[P.first->second]; - Cost CostF; - CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT); - Regs.clear(); Cost CostBest; - CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); Regs.clear(); + CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); if (CostF < CostBest) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); dbgs() << "\n" " in favor of formula "; Best.print(dbgs()); dbgs() << '\n'); + } #ifndef NDEBUG - ChangedFormulae = true; + ChangedFormulae = true; #endif - LU.DeleteFormula(F); - --FIdx; - --NumForms; - Any = true; - continue; - } + LU.DeleteFormula(F); + --FIdx; + --NumForms; + Any = true; } // Now that we've filtered out some formulae, recompute the Regs set. @@ -3284,24 +3975,29 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, if (LU.Regs.count(*I)) ReqRegs.insert(*I); - bool AnySatisfiedReqRegs = false; SmallPtrSet<const SCEV *, 16> NewRegs; Cost NewCost; -retry: for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; // Ignore formulae which do not use any of the required registers. + bool SatisfiedReqReg = true; for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(), JE = ReqRegs.end(); J != JE; ++J) { const SCEV *Reg = *J; if ((!F.ScaledReg || F.ScaledReg != Reg) && std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) == - F.BaseRegs.end()) - goto skip; + F.BaseRegs.end()) { + SatisfiedReqReg = false; + break; + } + } + if (!SatisfiedReqReg) { + // If none of the formulae satisfied the required registers, then we could + // clear ReqRegs and try again. Currently, we simply give up in this case. + continue; } - AnySatisfiedReqRegs = true; // Evaluate the cost of the current formula. If it's already worse than // the current best, prune the search at that point. 
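+      // (Illustratively: if a partial solution already costs more registers
+      // than the best complete solution found so far, the entire subtree
+      // below it is abandoned -- a standard branch-and-bound cut.)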
@@ -3317,7 +4013,7 @@ retry: VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]); } else { DEBUG(dbgs() << "New best at "; NewCost.print(dbgs()); - dbgs() << ". Regs:"; + dbgs() << ".\n Regs:"; for (SmallPtrSet<const SCEV *, 16>::const_iterator I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I) dbgs() << ' ' << **I; @@ -3328,18 +4024,6 @@ retry: } Workspace.pop_back(); } - skip:; - } - - if (!EnableRetry && !AnySatisfiedReqRegs) - return; - - // If none of the formulae had all of the required registers, relax the - // constraint so that we don't exclude all formulae. - if (!AnySatisfiedReqRegs) { - assert(!ReqRegs.empty() && "Solver failed even without required registers"); - ReqRegs.clear(); - goto retry; } } @@ -3435,9 +4119,10 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, /// AdjustInsertPositionForExpand - Determine an input position which will be /// dominated by the operands and which will dominate the result. BasicBlock::iterator -LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP, +LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, const LSRFixup &LF, - const LSRUse &LU) const { + const LSRUse &LU, + SCEVExpander &Rewriter) const { // Collect some instructions which must be dominated by the // expanding replacement. These must be dominated by any operands that // will be required in the expansion. @@ -3472,9 +4157,13 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP, } } + assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP) + && !isa<DbgInfoIntrinsic>(LowestIP) && + "Insertion point must be a normal instruction"); + // Then, climb up the immediate dominator tree as far as we can go while // still being dominated by the input positions. - IP = HoistInsertPosition(IP, Inputs); + BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs); // Don't insert instructions before PHI nodes. while (isa<PHINode>(IP)) ++IP; @@ -3485,6 +4174,11 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP, // Ignore debug intrinsics. while (isa<DbgInfoIntrinsic>(IP)) ++IP; + // Set IP below instructions recently inserted by SCEVExpander. This keeps the + // IP consistent across expansions and allows the previously inserted + // instructions to be reused by subsequent expansion. + while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP; + return IP; } @@ -3499,7 +4193,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Determine an input position which will be dominated by the operands and // which will dominate the result. - IP = AdjustInsertPositionForExpand(IP, LF, LU); + IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter); // Inform the Rewriter if we have a post-increment use, so that it can // perform an advantageous expansion. @@ -3775,10 +4469,20 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, SmallVector<WeakVH, 16> DeadInsts; SCEVExpander Rewriter(SE, "lsr"); +#ifndef NDEBUG + Rewriter.setDebugType(DEBUG_TYPE); +#endif Rewriter.disableCanonicalMode(); Rewriter.enableLSRMode(); Rewriter.setIVIncInsertPos(L, IVIncInsertPos); + // Mark phi nodes that terminate chains so the expander tries to reuse them. + for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), + ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { + if (PHINode *PN = dyn_cast<PHINode>(ChainI->back().UserInst)) + Rewriter.setChainedPhi(PN); + } + // Expand the new value definitions and update the users. 
for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(), E = Fixups.end(); I != E; ++I) { @@ -3789,6 +4493,11 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, Changed = true; } + for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), + ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { + GenerateIVChain(*ChainI, Rewriter, DeadInsts); + Changed = true; + } // Clean up after ourselves. This must be done before deleting any // instructions. Rewriter.clear(); @@ -3804,11 +4513,29 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) TLI(tli), L(l), Changed(false), IVIncInsertPos(0) { // If LoopSimplify form is not available, stay out of trouble. - if (!L->isLoopSimplifyForm()) return; + if (!L->isLoopSimplifyForm()) + return; // If there's no interesting work to be done, bail early. if (IU.empty()) return; +#ifndef NDEBUG + // All dominating loops must have preheaders, or SCEVExpander may not be able + // to materialize an AddRecExpr whose Start is an outer AddRecExpr. + // + // IVUsers analysis should only create users that are dominated by simple loop + // headers. Since this loop should dominate all of its users, its user list + // should be empty if this loop itself is not within a simple loop nest. + for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader()); + Rung; Rung = Rung->getIDom()) { + BasicBlock *BB = Rung->getBlock(); + const Loop *DomLoop = LI.getLoopFor(BB); + if (DomLoop && DomLoop->getHeader() == BB) { + assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest"); + } + } +#endif // DEBUG + DEBUG(dbgs() << "\nLSR on loop "; WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false); dbgs() << ":\n"); @@ -3821,24 +4548,18 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) if (IU.empty()) return; // Skip nested loops until we can model them better with formulae. - if (!EnableNested && !L->empty()) { - - if (EnablePhiElim) { - // Remove any extra phis created by processing inner loops. - SmallVector<WeakVH, 16> DeadInsts; - SCEVExpander Rewriter(SE, "lsr"); - Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); - Changed |= DeleteTriviallyDeadInstructions(DeadInsts); - } + if (!L->empty()) { DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); return; } // Start collecting data and preparing for the solver. + CollectChains(); CollectInterestingTypesAndFactors(); CollectFixupsAndInitialFormulae(); CollectLoopInvariantFixupsAndFormulae(); + assert(!Uses.empty() && "IVUsers reported at least one use"); DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n"; print_uses(dbgs())); @@ -3875,14 +4596,6 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) // Now that we've decided what we want, make it so. ImplementSolution(Solution, P); - - if (EnablePhiElim) { - // Remove any extra phis created by processing inner loops. - SmallVector<WeakVH, 16> DeadInsts; - SCEVExpander Rewriter(SE, "lsr"); - Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); - Changed |= DeleteTriviallyDeadInstructions(DeadInsts); - } } void LSRInstance::print_factors_and_types(raw_ostream &OS) const { @@ -4008,9 +4721,21 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { // Run the main LSR transformation. Changed |= LSRInstance(TLI, L, this).getChanged(); - // At this point, it is worth checking to see if any recurrence PHIs are also - // dead, so that we can remove them as well. 
+ // Remove any extra phis created by processing inner loops. Changed |= DeleteDeadPHIs(L->getHeader()); - + if (EnablePhiElim) { + SmallVector<WeakVH, 16> DeadInsts; + SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr"); +#ifndef NDEBUG + Rewriter.setDebugType(DEBUG_TYPE); +#endif + unsigned numFolded = Rewriter. + replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, TLI); + if (numFolded) { + Changed = true; + DeleteTriviallyDeadInstructions(DeadInsts); + DeleteDeadPHIs(L->getHeader()); + } + } return Changed; } diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 91395b2..09a186f 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -40,10 +40,9 @@ UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached.")); -// Temporary flag to be removed in 3.0 static cl::opt<bool> -NoSCEVUnroll("disable-unroll-scev", cl::init(false), cl::Hidden, - cl::desc("Use ScalarEvolution to analyze loop trip counts for unrolling")); +UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden, + cl::desc("Unroll loops with run-time trip counts")); namespace { class LoopUnroll : public LoopPass { @@ -68,6 +67,10 @@ namespace { // explicit -unroll-threshold). static const unsigned OptSizeUnrollThreshold = 50; + // Default unroll count for loops with run-time trip count if + // -unroll-count is not set + static const unsigned UnrollRuntimeCount = 8; + unsigned CurrentCount; unsigned CurrentThreshold; bool CurrentAllowPartial; @@ -101,6 +104,7 @@ INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { @@ -147,23 +151,21 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Find trip count and trip multiple if count is not available unsigned TripCount = 0; unsigned TripMultiple = 1; - if (!NoSCEVUnroll) { - // Find "latch trip count". UnrollLoop assumes that control cannot exit - // via the loop latch on any iteration prior to TripCount. The loop may exit - // early via an earlier branch. - BasicBlock *LatchBlock = L->getLoopLatch(); - if (LatchBlock) { - TripCount = SE->getSmallConstantTripCount(L, LatchBlock); - TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); - } - } - else { - TripCount = L->getSmallConstantTripCount(); - if (TripCount == 0) - TripMultiple = L->getSmallConstantTripMultiple(); + // Find "latch trip count". UnrollLoop assumes that control cannot exit + // via the loop latch on any iteration prior to TripCount. The loop may exit + // early via an earlier branch. + BasicBlock *LatchBlock = L->getLoopLatch(); + if (LatchBlock) { + TripCount = SE->getSmallConstantTripCount(L, LatchBlock); + TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); } - // Automatically select an unroll count. + // Use a default unroll-count if the user doesn't specify a value + // and the trip count is a run-time value. The default is different + // for run-time or compile-time trip count loops. 
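+  // For example (illustrative): with -unroll-runtime and no explicit
+  // -unroll-count, a loop with unknown trip count N is unrolled by
+  // UnrollRuntimeCount (8), and the iterations left over at run time are
+  // executed separately.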
unsigned Count = CurrentCount; + if (UnrollRuntime && CurrentCount == 0 && TripCount == 0) + Count = UnrollRuntimeCount; + if (Count == 0) { // Conservative heuristic: if we know the trip count, see if we can // completely unroll (subject to the threshold, checked below); otherwise @@ -188,15 +190,23 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (TripCount != 1 && Size > Threshold) { DEBUG(dbgs() << " Too large to fully unroll with count: " << Count << " because size: " << Size << ">" << Threshold << "\n"); - if (!CurrentAllowPartial) { + if (!CurrentAllowPartial && !(UnrollRuntime && TripCount == 0)) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); return false; } - // Reduce unroll count to be modulo of TripCount for partial unrolling - Count = Threshold / LoopSize; - while (Count != 0 && TripCount%Count != 0) { - Count--; + if (TripCount) { + // Reduce unroll count to be modulo of TripCount for partial unrolling + Count = Threshold / LoopSize; + while (Count != 0 && TripCount%Count != 0) + Count--; + } + else if (UnrollRuntime) { + // Reduce unroll count to be a lower power-of-two value + while (Count != 0 && Size > Threshold) { + Count >>= 1; + Size = LoopSize*Count; + } } if (Count < 2) { DEBUG(dbgs() << " could not unroll partially\n"); @@ -207,7 +217,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. - if (!UnrollLoop(L, Count, TripCount, TripMultiple, LI, &LPM)) + if (!UnrollLoop(L, Count, TripCount, UnrollRuntime, TripMultiple, LI, &LPM)) return false; return true; diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 458949c..ee23268 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -32,7 +32,7 @@ #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/Instructions.h" -#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -48,6 +48,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> +#include <map> #include <set> using namespace llvm; @@ -56,14 +57,70 @@ STATISTIC(NumSwitches, "Number of switches unswitched"); STATISTIC(NumSelects , "Number of selects unswitched"); STATISTIC(NumTrivial , "Number of unswitches that are trivial"); STATISTIC(NumSimplify, "Number of simplifications of unswitched code"); +STATISTIC(TotalInsts, "Total number of instructions analyzed"); -// The specific value of 50 here was chosen based only on intuition and a +// The specific value of 100 here was chosen based only on intuition and a // few specific examples. static cl::opt<unsigned> Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), - cl::init(50), cl::Hidden); - + cl::init(100), cl::Hidden); + namespace { + + class LUAnalysisCache { + + typedef DenseMap<const SwitchInst*, SmallPtrSet<const Value *, 8> > + UnswitchedValsMap; + + typedef UnswitchedValsMap::iterator UnswitchedValsIt; + + struct LoopProperties { + unsigned CanBeUnswitchedCount; + unsigned SizeEstimation; + UnswitchedValsMap UnswitchedVals; + }; + + // Here we use std::map instead of DenseMap, since we need to keep valid + // LoopProperties pointer for current loop for better performance. 
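+    // (Unlike DenseMap, std::map never relocates its nodes on insertion, so
+    // the CurrentLoopProperties pointer below stays valid while new loops are
+    // added.)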
+    typedef std::map<const Loop*, LoopProperties> LoopPropsMap;
+    typedef LoopPropsMap::iterator LoopPropsMapIt;
+
+    LoopPropsMap LoopsProperties;
+    UnswitchedValsMap* CurLoopInstructions;
+    LoopProperties* CurrentLoopProperties;
+
+    // Max size of code we can produce on remaining iterations.
+    unsigned MaxSize;
+
+  public:
+
+    LUAnalysisCache() :
+      CurLoopInstructions(NULL), CurrentLoopProperties(NULL),
+      MaxSize(Threshold)
+    {}
+
+    // Analyze the loop: check its size and compute whether it is possible to
+    // unswitch it. Returns true if we can unswitch this loop.
+    bool countLoop(const Loop* L);
+
+    // Clean all data related to the given loop.
+    void forgetLoop(const Loop* L);
+
+    // Mark the case value as unswitched.
+    // Since a SwitchInst can be partly unswitched, keep track of all
+    // unswitched values to avoid extra unswitching in cloned loops.
+    void setUnswitched(const SwitchInst* SI, const Value* V);
+
+    // Check whether this case value was unswitched before.
+    bool isUnswitched(const SwitchInst* SI, const Value* V);
+
+    // Clone all loop-unswitch related loop properties and redistribute the
+    // unswitching quotas. Note that the new loop data is looked up via the
+    // VMap.
+    void cloneData(const Loop* NewLoop, const Loop* OldLoop,
+                   const ValueToValueMapTy& VMap);
+  };
+
   class LoopUnswitch : public LoopPass {
     LoopInfo *LI;  // Loop information
     LPPassManager *LPM;
@@ -71,8 +128,9 @@ namespace {
     // LoopProcessWorklist - Used to check if second loop needs processing
     // after RewriteLoopBodyWithConditionConstant rewrites first loop.
     std::vector<Loop*> LoopProcessWorklist;
-    SmallPtrSet<Value *,8> UnswitchedVals;
-
+
+    LUAnalysisCache BranchesInfo;
+
     bool OptimizeForSize;
     bool redoLoop;
 
@@ -80,9 +138,9 @@ namespace {
     DominatorTree *DT;
     BasicBlock *loopHeader;
     BasicBlock *loopPreheader;
-    
+
     // LoopBlocks contains all of the basic blocks of the loop, including the
-    // preheader of the loop, the body of the loop, and the exit blocks of the 
+    // preheader of the loop, the body of the loop, and the exit blocks of the
     // loop, in that order.
     std::vector<BasicBlock*> LoopBlocks;
     // NewBlocks contained cloned copy of basic blocks from LoopBlocks.
@@ -90,8 +148,8 @@ namespace {
   public:
     static char ID; // Pass ID, replacement for typeid
-    explicit LoopUnswitch(bool Os = false) : 
-      LoopPass(ID), OptimizeForSize(Os), redoLoop(false), 
+    explicit LoopUnswitch(bool Os = false) :
+      LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
       currentLoop(NULL), DT(NULL), loopHeader(NULL),
       loopPreheader(NULL) {
         initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
@@ -117,7 +175,7 @@ namespace {
   private:
 
     virtual void releaseMemory() {
-      UnswitchedVals.clear();
+      BranchesInfo.forgetLoop(currentLoop);
     }
 
     /// RemoveLoopFromWorklist - If the specified loop is on the loop worklist,
@@ -147,7 +205,7 @@ namespace {
                                   Constant *Val, bool isEqual);
 
     void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
-                                        BasicBlock *TrueDest, 
+                                        BasicBlock *TrueDest,
                                         BasicBlock *FalseDest,
                                         Instruction *InsertPt);
 
@@ -160,6 +218,112 @@ namespace {
   };
 }
 
+
+// Analyze the loop: check its size and compute whether it is possible to
+// unswitch it. Returns true if we can unswitch this loop.
+bool LUAnalysisCache::countLoop(const Loop* L) {
+
+  std::pair<LoopPropsMapIt, bool> InsertRes =
+    LoopsProperties.insert(std::make_pair(L, LoopProperties()));
+
+  LoopProperties& Props = InsertRes.first->second;
+
+  if (InsertRes.second) {
+    // New loop.
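+    // (Illustrative arithmetic: with the default threshold of 100 and a
+    // SizeEstimation of 20, the loop gets a quota of 100/20 = 5 unswitches
+    // and the shared budget drops to zero.)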
+
+    // Limit the number of instructions to avoid causing significant code
+    // expansion, and the number of basic blocks, to avoid loops with
+    // large numbers of branches which cause loop unswitching to go crazy.
+    // This is a very ad-hoc heuristic.
+
+    // FIXME: This is overly conservative because it does not take into
+    // consideration code simplification opportunities and code that can
+    // be shared by the resultant unswitched loops.
+    CodeMetrics Metrics;
+    for (Loop::block_iterator I = L->block_begin(),
+           E = L->block_end();
+         I != E; ++I)
+      Metrics.analyzeBasicBlock(*I);
+
+    Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5);
+    Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
+    MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
+  }
+
+  if (!Props.CanBeUnswitchedCount) {
+    DEBUG(dbgs() << "NOT unswitching loop %"
+          << L->getHeader()->getName() << ", cost too high: "
+          << L->getBlocks().size() << "\n");
+
+    return false;
+  }
+
+  // Be careful. These links are valid only before a new loop is added.
+  CurrentLoopProperties = &Props;
+  CurLoopInstructions = &Props.UnswitchedVals;
+
+  return true;
+}
+
+// Clean all data related to the given loop.
+void LUAnalysisCache::forgetLoop(const Loop* L) {
+
+  LoopPropsMapIt LIt = LoopsProperties.find(L);
+
+  if (LIt != LoopsProperties.end()) {
+    LoopProperties& Props = LIt->second;
+    MaxSize += Props.CanBeUnswitchedCount * Props.SizeEstimation;
+    LoopsProperties.erase(LIt);
+  }
+
+  CurrentLoopProperties = NULL;
+  CurLoopInstructions = NULL;
+}
+
+// Mark the case value as unswitched.
+// Since a SwitchInst can be partly unswitched, keep track of all unswitched
+// values to avoid extra unswitching in cloned loops.
+void LUAnalysisCache::setUnswitched(const SwitchInst* SI, const Value* V) {
+  (*CurLoopInstructions)[SI].insert(V);
+}
+
+// Check whether this case value was unswitched before.
+bool LUAnalysisCache::isUnswitched(const SwitchInst* SI, const Value* V) {
+  return (*CurLoopInstructions)[SI].count(V);
+}
+
+// Clone all loop-unswitch related loop properties and redistribute the
+// unswitching quotas. Note that the new loop data is looked up via the VMap.
+void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop,
+                                const ValueToValueMapTy& VMap) {
+
+  LoopProperties& NewLoopProps = LoopsProperties[NewLoop];
+  LoopProperties& OldLoopProps = *CurrentLoopProperties;
+  UnswitchedValsMap& Insts = OldLoopProps.UnswitchedVals;
+
+  // Reallocate the "can-be-unswitched" quota.
+
+  --OldLoopProps.CanBeUnswitchedCount;
+  unsigned Quota = OldLoopProps.CanBeUnswitchedCount;
+  NewLoopProps.CanBeUnswitchedCount = Quota / 2;
+  OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2;
+
+  NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation;
+
+  // Clone the unswitched-values info: for the new loop's switches, copy the
+  // info about values that were already unswitched and have redundant
+  // successors.
+  for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) {
+    const SwitchInst* OldInst = I->first;
+    Value* NewI = VMap.lookup(OldInst);
+    const SwitchInst* NewInst = cast_or_null<SwitchInst>(NewI);
+    assert(NewInst && "All instructions that are in SrcBB must be in VMap.");
+
+    NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst];
+  }
+}
+
 char LoopUnswitch::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
                       false, false)
@@ -169,14 +333,18 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
                       false, false)
 
-Pass *llvm::createLoopUnswitchPass(bool Os) { 
-  return new LoopUnswitch(Os); 
+Pass *llvm::createLoopUnswitchPass(bool Os) {
+  return new LoopUnswitch(Os);
 }
 
 /// FindLIVLoopCondition - Cond is a condition that occurs in L. If it is
 /// invariant in the loop, or has an invariant piece, return the invariant.
 /// Otherwise, return null.
 static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+
+  // We have started analyzing a new instruction; bump the counter of scanned
+  // instructions.
+  ++TotalInsts;
+
   // We can never unswitch on vector conditions.
   if (Cond->getType()->isVectorTy())
     return 0;
@@ -201,7 +369,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
     if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed))
       return RHS;
   }
-  
+
   return 0;
 }
 
@@ -226,16 +394,36 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
   return Changed;
 }
 
-/// processCurrentLoop - Do actual work and unswitch loop if possible 
+/// processCurrentLoop - Do actual work and unswitch loop if possible
 /// and profitable.
 bool LoopUnswitch::processCurrentLoop() {
   bool Changed = false;
-  LLVMContext &Context = currentLoop->getHeader()->getContext();
+
+  initLoopData();
+
+  // If LoopSimplify was unable to form a preheader, don't do any unswitching.
+  if (!loopPreheader)
+    return false;
+
+  // Loops with indirectbr cannot be cloned.
+  if (!currentLoop->isSafeToClone())
+    return false;
+
+  // Without dedicated exits, splitting the exit edge may fail.
+  if (!currentLoop->hasDedicatedExits())
+    return false;
+
+  LLVMContext &Context = loopHeader->getContext();
+
+  // We may have reached the quota of branches for this loop. If so, stop
+  // unswitching.
+  if (!BranchesInfo.countLoop(currentLoop))
+    return false;
 
   // Loop over all of the basic blocks in the loop. If we find an interior
   // block that is branching on a loop-invariant condition, we can unswitch this
   // loop.
-  for (Loop::block_iterator I = currentLoop->block_begin(), 
+  for (Loop::block_iterator I = currentLoop->block_begin(),
          E = currentLoop->block_end(); I != E; ++I) {
     TerminatorInst *TI = (*I)->getTerminator();
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
@@ -244,24 +432,37 @@ bool LoopUnswitch::processCurrentLoop() {
       if (BI->isConditional()) {
         // See if this, or some part of it, is loop invariant. If so, we can
         // unswitch on it if we desire.
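+        // For example (illustrative):
+        //   for (...) { if (Flag) f(); else g(); }
+        // has a loop-invariant condition Flag, so the loop can be split into
+        // a Flag==true version and a Flag==false version with the test
+        // hoisted out of the loop.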
- Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, Changed); - if (LoopCond && UnswitchIfProfitable(LoopCond, + if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { ++NumBranches; return true; } - } + } } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { - Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, Changed); - if (LoopCond && SI->getNumCases() > 1) { + unsigned NumCases = SI->getNumCases(); + if (LoopCond && NumCases) { // Find a value to unswitch on: // FIXME: this should chose the most expensive case! // FIXME: scan for a case with a non-critical edge? - Constant *UnswitchVal = SI->getCaseValue(1); + Constant *UnswitchVal = NULL; + // Do not process same value again and again. - if (!UnswitchedVals.insert(UnswitchVal)) + // At this point we have some cases already unswitched and + // some not yet unswitched. Let's find the first not yet unswitched one. + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + Constant* UnswitchValCandidate = i.getCaseValue(); + if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) { + UnswitchVal = UnswitchValCandidate; + break; + } + } + + if (!UnswitchVal) continue; if (UnswitchIfProfitable(LoopCond, UnswitchVal)) { @@ -270,14 +471,14 @@ bool LoopUnswitch::processCurrentLoop() { } } } - + // Scan the instructions to check for unswitchable values. - for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); + for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); BBI != E; ++BBI) if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) { - Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, Changed); - if (LoopCond && UnswitchIfProfitable(LoopCond, + if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { ++NumSelects; return true; @@ -297,7 +498,8 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, BasicBlock *&ExitBB, std::set<BasicBlock*> &Visited) { if (!Visited.insert(BB).second) { - // Already visited. Without more analysis, this could indicate an infinte loop. + // Already visited. Without more analysis, this could indicate an infinite + // loop. return false; } else if (!L->contains(BB)) { // Otherwise, this is a loop exit, this is fine so long as this is the @@ -306,7 +508,7 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, ExitBB = BB; return true; } - + // Otherwise, this is an unvisited intra-loop node. Check all successors. for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) { // Check to see if the successor is a trivial loop exit. @@ -319,12 +521,12 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) if (I->mayHaveSideEffects()) return false; - + return true; } /// isTrivialLoopExitBlock - Return true if the specified block unconditionally -/// leads to an exit from the specified loop, and has no side-effects in the +/// leads to an exit from the specified loop, and has no side-effects in the /// process. If so, return the block that is exited to, otherwise return null. 
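+/// For example (an illustrative sketch):
+///   loop:  br i1 %cond, label %exit, label %body
+///   exit:  ret void
+/// Here %exit leaves the loop immediately and has no side-effects, so the
+/// %cond successor is a trivial loop exit.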
static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
   std::set<BasicBlock*> Visited;
@@ -352,49 +554,61 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val,
   BasicBlock *Header = currentLoop->getHeader();
   TerminatorInst *HeaderTerm = Header->getTerminator();
   LLVMContext &Context = Header->getContext();
-  
+
   BasicBlock *LoopExitBB = 0;
   if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) {
     // If the header block doesn't end with a conditional branch on Cond, we
     // can't handle it.
     if (!BI->isConditional() || BI->getCondition() != Cond)
       return false;
-  
-    // Check to see if a successor of the branch is guaranteed to 
-    // exit through a unique exit block without having any 
+
+    // Check to see if a successor of the branch is guaranteed to
+    // exit through a unique exit block without having any
     // side-effects.  If so, determine the value of Cond that causes it to do
     // this.
-    if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, 
+    if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
                                              BI->getSuccessor(0)))) {
       if (Val) *Val = ConstantInt::getTrue(Context);
-    } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, 
+    } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
                                                     BI->getSuccessor(1)))) {
       if (Val) *Val = ConstantInt::getFalse(Context);
     }
   } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) {
     // If this isn't a switch on Cond, we can't handle it.
     if (SI->getCondition() != Cond) return false;
-    
+
     // Check to see if a successor of the switch is guaranteed to go to the
-    // latch block or exit through a one exit block without having any 
+    // latch block or exit through a unique exit block without having any
    // side-effects.  If so, determine the value of Cond that causes it to do
-    // this. Note that we can't trivially unswitch on the default case.
-    for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i)
-      if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, 
-                                               SI->getSuccessor(i)))) {
+    // this.
+    // Note that we can't trivially unswitch on the default case or
+    // on already unswitched cases.
+    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+         i != e; ++i) {
+      BasicBlock* LoopExitCandidate;
+      if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop,
+                                                      i.getCaseSuccessor()))) {
        // Okay, we found a trivial case, remember the value that is trivial.
-        if (Val) *Val = SI->getCaseValue(i);
+        ConstantInt* CaseVal = i.getCaseValue();
+
+        // Check that it was not unswitched before, since already-unswitched
+        // trivial values still look trivial.
+        if (BranchesInfo.isUnswitched(SI, CaseVal))
+          continue;
+        LoopExitBB = LoopExitCandidate;
+        if (Val) *Val = CaseVal;
         break;
       }
+    }
   }
 
   // If we didn't find a single unique LoopExit block, or if the loop exit block
   // contains phi nodes, this isn't trivial.
   if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
     return false;   // Can't handle this.
-  
+
   if (LoopExit) *LoopExit = LoopExitBB;
-  
+
   // We already know that nothing uses any scalar values defined inside of this
   // loop.  As such, we just have to check to see if this loop will execute any
   // side-effecting instructions (e.g. stores, calls, volatile loads) in the
@@ -411,12 +625,6 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val,
 /// unswitch the loop, reprocess the pieces, then return true.
 bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) {
 
-  initLoopData();
-
-  // If LoopSimplify was unable to form a preheader, don't do any unswitching.
- if (!loopPreheader) - return false; - Function *F = loopHeader->getParent(); Constant *CondVal = 0; @@ -434,28 +642,6 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { if (OptimizeForSize || F->hasFnAttr(Attribute::OptimizeForSize)) return false; - // FIXME: This is overly conservative because it does not take into - // consideration code simplification opportunities and code that can - // be shared by the resultant unswitched loops. - CodeMetrics Metrics; - for (Loop::block_iterator I = currentLoop->block_begin(), - E = currentLoop->block_end(); - I != E; ++I) - Metrics.analyzeBasicBlock(*I); - - // Limit the number of instructions to avoid causing significant code - // expansion, and the number of basic blocks, to avoid loops with - // large numbers of branches which cause loop unswitching to go crazy. - // This is a very ad-hoc heuristic. - if (Metrics.NumInsts > Threshold || - Metrics.NumBlocks * 5 > Threshold || - Metrics.containsIndirectBr || Metrics.isRecursive) { - DEBUG(dbgs() << "NOT unswitching loop %" - << currentLoop->getHeader()->getName() << ", cost too high: " - << currentLoop->getBlocks().size() << "\n"); - return false; - } - UnswitchNontrivialCondition(LoopCond, Val, currentLoop); return true; } @@ -508,17 +694,17 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, /// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable /// condition in it (a cond branch from its header block to its latch block, -/// where the path through the loop that doesn't execute its body has no +/// where the path through the loop that doesn't execute its body has no /// side-effects), unswitch it. This doesn't involve any code duplication, just /// moving the conditional branch outside of the loop and updating loop info. -void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, - Constant *Val, +void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, + Constant *Val, BasicBlock *ExitBlock) { DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %" << loopHeader->getName() << " [" << L->getBlocks().size() << " blocks] in Function " << L->getHeader()->getParent()->getName() << " on cond: " << *Val << " == " << *Cond << "\n"); - + // First step, split the preheader, so that we know that there is a safe place // to insert the conditional branch. We will change loopPreheader to have a // conditional branch on Cond. @@ -527,24 +713,24 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, // Now that we have a place to insert the conditional branch, create a place // to branch to: this is the exit block out of the loop that we should // short-circuit to. - + // Split this block now, so that the loop maintains its exit block, and so // that the jump from the preheader can execute the contents of the exit block // without actually branching to it (the exit block should be dominated by the // loop header, not the preheader). assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this); - - // Okay, now we have a position to branch from and a position to branch to, + + // Okay, now we have a position to branch from and a position to branch to, // insert the new conditional branch. 
-  EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, 
+  EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH,
                                  loopPreheader->getTerminator());
   LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L);
   loopPreheader->getTerminator()->eraseFromParent();
 
   // We need to reprocess this loop, it could be unswitched again.
   redoLoop = true;
-  
+
   // Now that we know that the loop is never entered when this condition is a
   // particular value, rewrite the loop with this info.  We know that this will
   // at least eliminate the old branch.
@@ -554,7 +740,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond,
 
 /// SplitExitEdges - Split all of the edges from inside the loop to their exit
 /// blocks.  Update the appropriate Phi nodes as we do so.
-void LoopUnswitch::SplitExitEdges(Loop *L, 
+void LoopUnswitch::SplitExitEdges(Loop *L,
                                 const SmallVector<BasicBlock *, 8> &ExitBlocks){
 
   for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
@@ -565,8 +751,7 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
     // Although SplitBlockPredecessors doesn't preserve loop-simplify in
     // general, if we call it on all predecessors of all exits then it does.
     if (!ExitBlock->isLandingPad()) {
-      SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(),
-                             ".us-lcssa", this);
+      SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", this);
     } else {
       SmallVector<BasicBlock*, 2> NewBBs;
       SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa",
@@ -575,10 +760,10 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
   }
 }
 
-/// UnswitchNontrivialCondition - We determined that the loop is profitable 
-/// to unswitch when LIC equal Val.  Split it into loop versions and test the 
+/// UnswitchNontrivialCondition - We determined that the loop is profitable
+/// to unswitch when LIC equal Val.  Split it into loop versions and test the
 /// condition outside of either loop.  Return the loops created as Out1/Out2.
-void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, 
+void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
                                                Loop *L) {
   Function *F = loopHeader->getParent();
   DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
@@ -621,6 +806,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
   ValueToValueMapTy VMap;
   for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
     BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F);
+
     NewBlocks.push_back(NewBB);
     VMap[LoopBlocks[i]] = NewBB;  // Keep the BB mapping.
     LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L);
@@ -633,6 +819,11 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
 
   // Now we create the new Loop object for the versioned loop.
   Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
+
+  // Recalculate the unswitching quota for the cloned loop and inherit the
+  // info about switch values that were already unswitched; the new loop's
+  // data is looked up through the VMap.
+  BranchesInfo.cloneData(NewLoop, L, VMap);
+
   Loop *ParentLoop = L->getParentLoop();
   if (ParentLoop) {
     // Make sure to add the cloned preheader and exit blocks to the parent loop
@@ -645,7 +836,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
     // The new exit block should be in the same loop as the old one.
if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i])) ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase()); - + assert(NewExit->getTerminator()->getNumSuccessors() == 1 && "Exit block should have been split to have one successor!"); BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0); @@ -680,7 +871,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); - + // Rewrite the original preheader to select between versions of the loop. BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] && @@ -699,7 +890,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // the condition that we're unswitching on), we don't rewrite the second // iteration. WeakVH LICHandle(LIC); - + // Now we rewrite the original code to know that the condition is true and the // new code to know that the condition is false. RewriteLoopBodyWithConditionConstant(L, LIC, Val, false); @@ -714,7 +905,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, /// RemoveFromWorklist - Remove all instances of I from the worklist vector /// specified. -static void RemoveFromWorklist(Instruction *I, +static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(), Worklist.end(), I); @@ -727,7 +918,7 @@ static void RemoveFromWorklist(Instruction *I, /// ReplaceUsesOfWith - When we find that I really equals V, remove I from the /// program, replacing all uses with V and update the worklist. -static void ReplaceUsesOfWith(Instruction *I, Value *V, +static void ReplaceUsesOfWith(Instruction *I, Value *V, std::vector<Instruction*> &Worklist, Loop *L, LPPassManager *LPM) { DEBUG(dbgs() << "Replace with '" << *V << "': " << *I); @@ -760,10 +951,10 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, if (BasicBlock *Pred = BB->getSinglePredecessor()) { // If it has one pred, fold phi nodes in BB. while (isa<PHINode>(BB->begin())) - ReplaceUsesOfWith(BB->begin(), - cast<PHINode>(BB->begin())->getIncomingValue(0), + ReplaceUsesOfWith(BB->begin(), + cast<PHINode>(BB->begin())->getIncomingValue(0), Worklist, L, LPM); - + // If this is the header of a loop and the only pred is the latch, we now // have an unreachable loop. if (Loop *L = LI->getLoopFor(BB)) @@ -774,15 +965,15 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L); Pred->getTerminator()->eraseFromParent(); new UnreachableInst(BB->getContext(), Pred); - + // The loop is now broken, remove it from LI. RemoveLoopFromHierarchy(L); - + // Reprocess the header, which now IS dead. RemoveBlockIfDead(BB, Worklist, L); return; } - + // If pred ends in a uncond branch, add uncond branch to worklist so that // the two blocks will get merged. if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator())) @@ -793,11 +984,11 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, } DEBUG(dbgs() << "Nuking dead block: " << *BB); - + // Remove the instructions in the basic block from the worklist. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { RemoveFromWorklist(I, Worklist); - + // Anything that uses the instructions in this basic block should have their // uses replaced with undefs. 
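RemoveFromWorklist above deletes every occurrence of I by repeated std::find and erase; the single-pass erase/remove idiom is an equivalent formulation, shown here only for reference (helper name invented):

    #include <algorithm>
    #include <vector>
    #include "llvm/Instruction.h"
    using namespace llvm;

    // Shift surviving entries down over each occurrence of I, then chop
    // off the tail in one erase call.
    static void removeAllFromWorklist(Instruction *I,
                                      std::vector<Instruction *> &Worklist) {
      Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I),
                     Worklist.end());
    }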
// If I is not void type then replaceAllUsesWith undef. @@ -805,7 +996,7 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, if (!I->getType()->isVoidTy()) I->replaceAllUsesWith(UndefValue::get(I->getType())); } - + // If this is the edge to the header block for a loop, remove the loop and // promote all subloops. if (Loop *BBLoop = LI->getLoopFor(BB)) { @@ -821,8 +1012,8 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, // Remove the block from the loop info, which removes it from any loops it // was in. LI->removeBlock(BB); - - + + // Remove phi node entries in successors for this block. TerminatorInst *TI = BB->getTerminator(); SmallVector<BasicBlock*, 4> Succs; @@ -830,13 +1021,13 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, Succs.push_back(TI->getSuccessor(i)); TI->getSuccessor(i)->removePredecessor(BB); } - + // Unique the successors, remove anything with multiple uses. array_pod_sort(Succs.begin(), Succs.end()); Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end()); - + // Remove the basic block, including all of the instructions contained in it. - LPM->deleteSimpleAnalysisValue(BB, L); + LPM->deleteSimpleAnalysisValue(BB, L); BB->eraseFromParent(); // Remove successor blocks here that are not dead, so that we know we only // have dead blocks in this list. Nondead blocks have a way of becoming dead, @@ -854,7 +1045,7 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, --i; } } - + for (unsigned i = 0, e = Succs.size(); i != e; ++i) RemoveBlockIfDead(Succs[i], Worklist, L); } @@ -877,14 +1068,14 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Constant *Val, bool IsEqual) { assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?"); - + // FIXME: Support correlated properties, like: // for (...) // if (li1 < li2) // ... // if (li1 > li2) // ... - + // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches, // selects, switches. std::vector<Instruction*> Worklist; @@ -899,21 +1090,25 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, if (IsEqual) Replacement = Val; else - Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), + Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), !cast<ConstantInt>(Val)->getZExtValue()); - + for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end(); UI != E; ++UI) { Instruction *U = dyn_cast<Instruction>(*UI); if (!U || !L->contains(U)) continue; - U->replaceUsesOfWith(LIC, Replacement); Worklist.push_back(U); } + + for (std::vector<Instruction*>::iterator UI = Worklist.begin(); + UI != Worklist.end(); ++UI) + (*UI)->replaceUsesOfWith(LIC, Replacement); + SimplifyCode(Worklist, L); return; } - + // Otherwise, we don't know the precise value of LIC, but we do know that it // is certainly NOT "Val". As such, simplify any uses in the loop that we // can. This case occurs when we unswitch switch statements. @@ -925,23 +1120,27 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Worklist.push_back(U); - // TODO: We could do other simplifications, for example, turning + // TODO: We could do other simplifications, for example, turning // 'icmp eq LIC, Val' -> false. // If we know that LIC is not Val, use this info to simplify code. SwitchInst *SI = dyn_cast<SwitchInst>(U); if (SI == 0 || !isa<ConstantInt>(Val)) continue; - - unsigned DeadCase = SI->findCaseValue(cast<ConstantInt>(Val)); - if (DeadCase == 0) continue; // Default case is live for multiple values. 
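The two-phase rewrite introduced just above is worth spelling out: replaceUsesOfWith unlinks the user from LIC's use list, so calling it while walking that list with a use_iterator invalidates the iterator. The shape of the fix as a self-contained sketch (helper name invented):

    #include <vector>
    #include "llvm/Instructions.h"
    #include "llvm/Analysis/LoopInfo.h"
    using namespace llvm;

    static void replaceInLoopUses(Value *LIC, Value *Replacement, Loop *L) {
      // Phase 1: record the in-loop users without touching the use list.
      std::vector<Instruction *> Users;
      for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end();
           UI != E; ++UI)
        if (Instruction *U = dyn_cast<Instruction>(*UI))
          if (L->contains(U))
            Users.push_back(U);
      // Phase 2: now each rewrite can safely unlink a use.
      for (unsigned i = 0, e = Users.size(); i != e; ++i)
        Users[i]->replaceUsesOfWith(LIC, Replacement);
    }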
- - // Found a dead case value. Don't remove PHI nodes in the + + SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val)); + // Default case is live for multiple values. + if (DeadCase == SI->case_default()) continue; + + // Found a dead case value. Don't remove PHI nodes in the // successor if they become single-entry, those PHI nodes may // be in the Users list. BasicBlock *Switch = SI->getParent(); - BasicBlock *SISucc = SI->getSuccessor(DeadCase); + BasicBlock *SISucc = DeadCase.getCaseSuccessor(); BasicBlock *Latch = L->getLoopLatch(); + + BranchesInfo.setUnswitched(SI, Val); + if (!SI->findCaseDest(SISucc)) continue; // Edge is critical. // If the DeadCase successor dominates the loop latch, then the // transformation isn't safe since it will delete the sole predecessor edge @@ -957,7 +1156,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // Compute the successors instead of relying on the return value // of SplitEdge, since it may have split the switch successor // after PHI nodes. - BasicBlock *NewSISucc = SI->getSuccessor(DeadCase); + BasicBlock *NewSISucc = DeadCase.getCaseSuccessor(); BasicBlock *OldSISucc = *succ_begin(NewSISucc); // Create an "unreachable" destination. BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable", @@ -981,7 +1180,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, if (DT) DT->addNewBlock(Abort, NewSISucc); } - + SimplifyCode(Worklist, L); } @@ -1002,7 +1201,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // Simple DCE. if (isInstructionTriviallyDead(I)) { DEBUG(dbgs() << "Remove dead instruction '" << *I); - + // Add uses to the worklist, which may be dead now. for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i))) @@ -1017,7 +1216,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // See if instruction simplification can hack this up. This is common for // things like "select false, X, Y" after unswitching made the condition be // 'false'. - if (Value *V = SimplifyInstruction(I, 0, DT)) + if (Value *V = SimplifyInstruction(I, 0, 0, DT)) if (LI->replacementPreservesLCSSAForm(I, V)) { ReplaceUsesOfWith(I, V, Worklist, L, LPM); continue; @@ -1034,24 +1233,24 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { if (!SinglePred) continue; // Nothing to do. assert(SinglePred == Pred && "CFG broken"); - DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " + DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " << Succ->getName() << "\n"); - + // Resolve any single entry PHI nodes in Succ. while (PHINode *PN = dyn_cast<PHINode>(Succ->begin())) ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM); - + // If Succ has any successors with PHI nodes, update them to have // entries coming from Pred instead of Succ. Succ->replaceAllUsesWith(Pred); - + // Move all of the successor contents from Succ to Pred. Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(), Succ->end()); LPM->deleteSimpleAnalysisValue(BI, L); BI->eraseFromParent(); RemoveFromWorklist(BI, Worklist); - + // Remove Succ from the loop tree. 
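The switch-related churn in this hunk is an API migration: SwitchInst case lookups now traffic in SwitchInst::CaseIt instead of unsigned indices, and the default case is detected by comparing against case_default() rather than against index 0. The new idiom in isolation (helper name invented):

    #include "llvm/Instructions.h"
    using namespace llvm;

    // Return the block a known constant would switch to, or null when only
    // the default edge matches (the default stays live for many values).
    static BasicBlock *caseSuccessorFor(SwitchInst *SI, ConstantInt *Val) {
      SwitchInst::CaseIt It = SI->findCaseValue(Val);
      if (It == SI->case_default())
        return 0;
      return It.getCaseSuccessor();
    }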
LI->removeBlock(Succ); LPM->deleteSimpleAnalysisValue(Succ, L); @@ -1059,7 +1258,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { ++NumSimplify; continue; } - + if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){ // Conditional branch. Turn it into an unconditional branch, then // remove dead blocks. diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index eeb8931..a87cce3 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -147,8 +147,8 @@ struct MemsetRange { } // end anon namespace bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { - // If we found more than 8 stores to merge or 64 bytes, use memset. - if (TheStores.size() >= 8 || End-Start >= 64) return true; + // If we found more than 4 stores to merge or 16 bytes, use memset. + if (TheStores.size() >= 4 || End-Start >= 16) return true; // If there is nothing to merge, don't do anything. if (TheStores.size() < 2) return false; @@ -806,21 +806,25 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { // a) memcpy-memcpy xform which exposes redundance for DSE. // b) call-memcpy xform for return slot optimization. MemDepResult DepInfo = MD->getDependency(M); - if (!DepInfo.isClobber()) - return false; - - if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst())) - return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); - - if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { - if (performCallSlotOptzn(M, M->getDest(), M->getSource(), - CopySize->getZExtValue(), C)) { - MD->removeInstruction(M); - M->eraseFromParent(); - return true; + if (DepInfo.isClobber()) { + if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { + if (performCallSlotOptzn(M, M->getDest(), M->getSource(), + CopySize->getZExtValue(), C)) { + MD->removeInstruction(M); + M->eraseFromParent(); + return true; + } } } - + + AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M); + MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, + M, M->getParent()); + if (SrcDepInfo.isClobber()) { + if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) + return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); + } + return false; } @@ -945,7 +949,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { RepeatInstruction = processMemMove(M); else if (CallSite CS = (Value*)I) { for (unsigned i = 0, e = CS.arg_size(); i != e; ++i) - if (CS.paramHasAttr(i+1, Attribute::ByVal)) + if (CS.isByValArgument(i)) MadeChange |= processByValArgument(CS, i); } diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index da74e9c..40b0b20 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -88,13 +88,14 @@ namespace { } #endif - ValueT &operator[](KeyT Arg) { + ValueT &operator[](const KeyT &Arg) { std::pair<typename MapTy::iterator, bool> Pair = Map.insert(std::make_pair(Arg, size_t(0))); if (Pair.second) { - Pair.first->second = Vector.size(); + size_t Num = Vector.size(); + Pair.first->second = Num; Vector.push_back(std::make_pair(Arg, ValueT())); - return Vector.back().second; + return Vector[Num].second; } return Vector[Pair.first->second].second; } @@ -104,14 +105,15 @@ namespace { std::pair<typename MapTy::iterator, bool> Pair = Map.insert(std::make_pair(InsertPair.first, size_t(0))); if (Pair.second) { - Pair.first->second = Vector.size(); + size_t Num = Vector.size(); + 
Pair.first->second = Num;
Vector.push_back(InsertPair);
- return std::make_pair(llvm::prior(Vector.end()), true);
+ return std::make_pair(Vector.begin() + Num, true);
}
return std::make_pair(Vector.begin() + Pair.first->second, false);
}
- const_iterator find(KeyT Key) const {
+ const_iterator find(const KeyT &Key) const {
typename MapTy::const_iterator It = Map.find(Key);
if (It == Map.end()) return Vector.end();
return Vector.begin() + It->second;
@@ -121,7 +123,7 @@ namespace {
/// from the vector, it just zeros out the key in the vector. This leaves
/// iterators intact, but clients must be prepared for zeroed-out keys when
/// iterating.
- void blot(KeyT Key) {
+ void blot(const KeyT &Key) {
typename MapTy::iterator It = Map.find(Key);
if (It == Map.end()) return;
Vector[It->second].first = KeyT();
@@ -179,9 +181,13 @@ static bool IsPotentialUse(const Value *Op) {
Arg->hasNestAttr() ||
Arg->hasStructRetAttr())
return false;
- // Only consider values with pointer types, and not function pointers.
+ // Only consider values with pointer types.
+ // It seems intuitive to exclude function pointer types as well, since
+ // functions are never reference-counted; however, clang occasionally
+ // bitcasts reference-counted pointers to function-pointer type
+ // temporarily.
PointerType *Ty = dyn_cast<PointerType>(Op->getType());
- if (!Ty || isa<FunctionType>(Ty->getElementType()))
+ if (!Ty)
return false;
// Conservatively assume anything else is a potential use.
return true;
@@ -371,7 +377,7 @@ static InstructionClass GetBasicInstructionClass(const Value *V) {
}

// Otherwise, be conservative.
- return IC_User;
+ return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User;
}

/// IsRetain - Test if the given class is objc_retain or
@@ -597,6 +603,46 @@ static bool ModuleHasARC(const Module &M) {
M.getNamedValue("objc_unretainedPointer");
}

+/// DoesObjCBlockEscape - Test whether the given pointer, which is an
+/// Objective C block pointer, escapes. This differs from regular
+/// escape analysis in that a use as an argument to a call is not considered
+/// an escape.
+static bool DoesObjCBlockEscape(const Value *BlockPtr) {
+ // Walk the def-use chains.
+ SmallVector<const Value *, 4> Worklist;
+ Worklist.push_back(BlockPtr);
+ do {
+ const Value *V = Worklist.pop_back_val();
+ for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
+ UI != UE; ++UI) {
+ const User *UUser = *UI;
+ // Special - Use by a call (callee or argument) is not considered
+ // to be an escape.
+ if (isa<CallInst>(UUser) || isa<InvokeInst>(UUser))
+ continue;
+ // Use by an instruction which copies the value is an escape if the
+ // result is an escape.
+ if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) ||
+ isa<PHINode>(UUser) || isa<SelectInst>(UUser)) {
+ Worklist.push_back(UUser);
+ continue;
+ }
+ // Use by a load is not an escape.
+ if (isa<LoadInst>(UUser))
+ continue;
+ // Use by a store is not an escape if the use is the address.
+ if (const StoreInst *SI = dyn_cast<StoreInst>(UUser))
+ if (V != SI->getValueOperand())
+ continue;
+ // Otherwise, conservatively assume an escape.
+ return true;
+ }
+ } while (!Worklist.empty());
+
+ // No escapes found.
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// ARC AliasAnalysis.
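A usage note on the blotting map-vector earlier in this hunk: because blot() only zeroes the key, clients iterating the vector must expect null keys. A hedged sketch of the client-side idiom, assuming the class exposes begin()/end() as its find() implies (helper name invented):

    // Count entries that have not been blotted out.
    template <class BlottingMapVector>
    static unsigned countLiveEntries(const BlottingMapVector &MV) {
      unsigned N = 0;
      for (typename BlottingMapVector::const_iterator I = MV.begin(),
           E = MV.end(); I != E; ++I)
        if (I->first) // a null key marks a blotted entry
          ++N;
      return N;
    }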
//===----------------------------------------------------------------------===// @@ -850,6 +896,139 @@ bool ObjCARCExpand::runOnFunction(Function &F) { } //===----------------------------------------------------------------------===// +// ARC autorelease pool elimination. +//===----------------------------------------------------------------------===// + +#include "llvm/Constants.h" + +namespace { + /// ObjCARCAPElim - Autorelease pool elimination. + class ObjCARCAPElim : public ModulePass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnModule(Module &M); + + bool MayAutorelease(CallSite CS, unsigned Depth = 0); + bool OptimizeBB(BasicBlock *BB); + + public: + static char ID; + ObjCARCAPElim() : ModulePass(ID) { + initializeObjCARCAPElimPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCAPElim::ID = 0; +INITIALIZE_PASS(ObjCARCAPElim, + "objc-arc-apelim", + "ObjC ARC autorelease pool elimination", + false, false) + +Pass *llvm::createObjCARCAPElimPass() { + return new ObjCARCAPElim(); +} + +void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +/// MayAutorelease - Interprocedurally determine if calls made by the +/// given call site can possibly produce autoreleases. +bool ObjCARCAPElim::MayAutorelease(CallSite CS, unsigned Depth) { + if (Function *Callee = CS.getCalledFunction()) { + if (Callee->isDeclaration() || Callee->mayBeOverridden()) + return true; + for (Function::iterator I = Callee->begin(), E = Callee->end(); + I != E; ++I) { + BasicBlock *BB = I; + for (BasicBlock::iterator J = BB->begin(), F = BB->end(); J != F; ++J) + if (CallSite JCS = CallSite(J)) + // This recursion depth limit is arbitrary. It's just great + // enough to cover known interesting testcases. + if (Depth < 3 && + !JCS.onlyReadsMemory() && + MayAutorelease(JCS, Depth + 1)) + return true; + } + return false; + } + + return true; +} + +bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { + bool Changed = false; + + Instruction *Push = 0; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + Instruction *Inst = I++; + switch (GetBasicInstructionClass(Inst)) { + case IC_AutoreleasepoolPush: + Push = Inst; + break; + case IC_AutoreleasepoolPop: + // If this pop matches a push and nothing in between can autorelease, + // zap the pair. + if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) { + Changed = true; + Inst->eraseFromParent(); + Push->eraseFromParent(); + } + Push = 0; + break; + case IC_CallOrUser: + if (MayAutorelease(CallSite(Inst))) + Push = 0; + break; + default: + break; + } + } + + return Changed; +} + +bool ObjCARCAPElim::runOnModule(Module &M) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + if (!ModuleHasARC(M)) + return false; + + // Find the llvm.global_ctors variable, as the first step in + // identifying the global constructors. + GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); + if (!GV) + return false; + + assert(GV->hasDefinitiveInitializer() && + "llvm.global_ctors is uncooperative!"); + + bool Changed = false; + + // Dig the constructor functions out of GV's initializer. + ConstantArray *Init = cast<ConstantArray>(GV->getInitializer()); + for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end(); + OI != OE; ++OI) { + Value *Op = *OI; + // llvm.global_ctors is an array of pairs where the second members + // are constructor functions. 
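The walk being completed in the next lines decodes llvm.global_ctors, an appending array of { i32 priority, void ()* ctor } structs. The same traversal as a standalone hedged sketch (helper name invented; it mirrors runOnModule's assumption that the initializer is a ConstantArray):

    #include "llvm/Constants.h"
    #include "llvm/Module.h"
    using namespace llvm;

    static void forEachGlobalCtor(Module &M, void (*Visit)(Function *)) {
      GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
      if (!GV || !GV->hasDefinitiveInitializer())
        return;
      ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
      for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
           OI != OE; ++OI)
        // Operand 1 of each { priority, ctor } struct is the function.
        Visit(cast<Function>(cast<ConstantStruct>(*OI)->getOperand(1)));
    }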
+ Function *F = cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
+ // Only look at function definitions.
+ if (F->isDeclaration())
+ continue;
+ // Only look at functions with one basic block.
+ if (llvm::next(F->begin()) != F->end())
+ continue;
+ // Ok, a single-block constructor function definition. Try to optimize it.
+ Changed |= OptimizeBB(F->begin());
+ }
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
// ARC optimization.
//===----------------------------------------------------------------------===//
@@ -896,8 +1075,9 @@ bool ObjCARCExpand::runOnFunction(Function &F) {
#include "llvm/LLVMContext.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/CFG.h"
-#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/DenseSet.h"

STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
@@ -1158,6 +1338,12 @@ namespace {
/// with the "tail" keyword.
bool IsTailCallRelease;

+ /// Partial - True if we've seen an opportunity for partial RR elimination,
+ /// such as pushing calls into a CFG triangle or into one side of a
+ /// CFG diamond.
+ /// TODO: Consider moving this to PtrState.
+ bool Partial;
+
/// ReleaseMetadata - If the Calls are objc_release calls and they all have
/// a clang.imprecise_release tag, this is the metadata tag.
MDNode *ReleaseMetadata;
@@ -1171,7 +1357,8 @@ namespace {
SmallPtrSet<Instruction *, 2> ReverseInsertPts;

RRInfo() :
- KnownSafe(false), IsRetainBlock(false), IsTailCallRelease(false),
+ KnownSafe(false), IsRetainBlock(false),
+ IsTailCallRelease(false), Partial(false),
ReleaseMetadata(0) {}

void clear();
@@ -1182,6 +1369,7 @@ void RRInfo::clear() {
KnownSafe = false;
IsRetainBlock = false;
IsTailCallRelease = false;
+ Partial = false;
ReleaseMetadata = 0;
Calls.clear();
ReverseInsertPts.clear();
@@ -1239,16 +1427,6 @@ namespace {
Seq = NewSeq;
}

- void SetSeqToRelease(MDNode *M) {
- if (Seq == S_None || Seq == S_Use) {
- Seq = M ? S_MovableRelease : S_Release;
- RRI.ReleaseMetadata = M;
- } else if (Seq != S_MovableRelease || RRI.ReleaseMetadata != M) {
- Seq = S_Release;
- RRI.ReleaseMetadata = 0;
- }
- }
-
Sequence GetSeq() const {
return Seq;
}
@@ -1272,8 +1450,16 @@ PtrState::Merge(const PtrState &Other, bool TopDown) {
if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock)
Seq = S_None;

+ // If we're not in a sequence (anymore), drop all associated state.
if (Seq == S_None) {
RRI.clear();
+ } else if (RRI.Partial || Other.RRI.Partial) {
+ // If we're doing a merge on a path that's previously seen a partial
+ // merge, conservatively drop the sequence, to avoid doing partial
+ // RR elimination. If the branch predicates for the two merges differ,
+ // mixing them is unsafe.
+ Seq = S_None;
+ RRI.clear();
} else {
// Conservatively merge the ReleaseMetadata information.
if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata)
@@ -1282,8 +1468,15 @@ PtrState::Merge(const PtrState &Other, bool TopDown) {
RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe;
RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease;
RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end());
-
- RRI.ReverseInsertPts.insert(Other.RRI.ReverseInsertPts.begin(),
- Other.RRI.ReverseInsertPts.end());
+
+ // Merge the insert point sets. If there are any differences,
+ // that makes this a partial merge.
+ RRI.Partial = RRI.ReverseInsertPts.size() != + Other.RRI.ReverseInsertPts.size(); + for (SmallPtrSet<Instruction *, 2>::const_iterator + I = Other.RRI.ReverseInsertPts.begin(), + E = Other.RRI.ReverseInsertPts.end(); I != E; ++I) + RRI.Partial |= RRI.ReverseInsertPts.insert(*I); } } @@ -1460,6 +1653,14 @@ namespace { /// metadata. unsigned ImpreciseReleaseMDKind; + /// CopyOnEscapeMDKind - The Metadata Kind for clang.arc.copy_on_escape + /// metadata. + unsigned CopyOnEscapeMDKind; + + /// NoObjCARCExceptionsMDKind - The Metadata Kind for + /// clang.arc.no_objc_arc_exceptions metadata. + unsigned NoObjCARCExceptionsMDKind; + Constant *getRetainRVCallee(Module *M); Constant *getAutoreleaseRVCallee(Module *M); Constant *getReleaseCallee(Module *M); @@ -1467,6 +1668,8 @@ namespace { Constant *getRetainBlockCallee(Module *M); Constant *getAutoreleaseCallee(Module *M); + bool IsRetainBlockOptimizable(const Instruction *Inst); + void OptimizeRetainCall(Function &F, Instruction *Retain); bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV); void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV); @@ -1475,9 +1678,16 @@ namespace { void CheckForCFGHazards(const BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, BBState &MyStates) const; + bool VisitInstructionBottomUp(Instruction *Inst, + BasicBlock *BB, + MapVector<Value *, RRInfo> &Retains, + BBState &MyStates); bool VisitBottomUp(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains); + bool VisitInstructionTopDown(Instruction *Inst, + DenseMap<Value *, RRInfo> &Releases, + BBState &MyStates); bool VisitTopDown(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, DenseMap<Value *, RRInfo> &Releases); @@ -1534,6 +1744,22 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); } +bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) { + // Without the magic metadata tag, we have to assume this might be an + // objc_retainBlock call inserted to convert a block pointer to an id, + // in which case it really is needed. + if (!Inst->getMetadata(CopyOnEscapeMDKind)) + return false; + + // If the pointer "escapes" (not including being used in a call), + // the copy may be needed. + if (DoesObjCBlockEscape(Inst)) + return false; + + // Otherwise, it's not needed. + return true; +} + Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { if (!RetainRVCallee) { LLVMContext &C = M->getContext(); @@ -1737,6 +1963,7 @@ namespace { /// use here. enum DependenceKind { NeedsPositiveRetainCount, + AutoreleasePoolBoundary, CanChangeRetainCount, RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease. RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue. @@ -1766,6 +1993,19 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, } } + case AutoreleasePoolBoundary: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + // These mark the end and begin of an autorelease pool scope. + return true; + default: + // Nothing else does this. 
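The partial-merge detection at the top of this hunk reduces to a set union that reports whether either side contributed anything the other lacked. Factored out as a sketch (helper name invented; in this era SmallPtrSet::insert returns true exactly when the element was newly added):

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/Instruction.h"
    using namespace llvm;

    static bool mergeInsertPts(SmallPtrSet<Instruction *, 2> &Mine,
                               const SmallPtrSet<Instruction *, 2> &Theirs) {
      // Different sizes already prove a difference; otherwise any element
      // of Theirs missing from Mine shows up as a successful insert.
      bool Partial = Mine.size() != Theirs.size();
      for (SmallPtrSet<Instruction *, 2>::const_iterator I = Theirs.begin(),
           E = Theirs.end(); I != E; ++I)
        Partial |= Mine.insert(*I);
      return Partial;
    }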
+ return false; + } + } + case CanChangeRetainCount: { InstructionClass Class = GetInstructionClass(Inst); switch (Class) { @@ -1783,6 +2023,7 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, case RetainAutoreleaseDep: switch (GetBasicInstructionClass(Inst)) { case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: // Don't merge an objc_autorelease with an objc_retain inside a different // autoreleasepool scope. return true; @@ -1794,7 +2035,6 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, // Nothing else matters for objc_retainAutorelease formation. return false; } - break; case RetainAutoreleaseRVDep: { InstructionClass Class = GetBasicInstructionClass(Inst); @@ -1808,7 +2048,6 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, // retainAutoreleaseReturnValue formation. return CanInterruptRV(Class); } - break; } case RetainRVDep: @@ -1816,7 +2055,6 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, } llvm_unreachable("Invalid dependence flavor"); - return true; } /// FindDependencies - Walk up the CFG from StartPos (which is in StartBB) and @@ -1920,17 +2158,26 @@ ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { /// return true. bool ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { - // Check for the argument being from an immediately preceding call. + // Check for the argument being from an immediately preceding call or invoke. Value *Arg = GetObjCArg(RetainRV); CallSite CS(Arg); - if (Instruction *Call = CS.getInstruction()) + if (Instruction *Call = CS.getInstruction()) { if (Call->getParent() == RetainRV->getParent()) { BasicBlock::iterator I = Call; ++I; while (isNoopInstruction(I)) ++I; if (&*I == RetainRV) return false; + } else if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + BasicBlock *RetainRVParent = RetainRV->getParent(); + if (II->getNormalDest() == RetainRVParent) { + BasicBlock::iterator I = RetainRVParent->begin(); + while (isNoopInstruction(I)) ++I; + if (&*I == RetainRV) + return false; + } } + } // Check for being preceded by an objc_autoreleaseReturnValue on the same // pointer. In this case, we can delete the pair. @@ -2144,9 +2391,34 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Check that there is nothing that cares about the reference // count between the call and the phi. - FindDependencies(NeedsPositiveRetainCount, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); + switch (Class) { + case IC_Retain: + case IC_RetainBlock: + // These can always be moved up. + break; + case IC_Release: + // These can't be moved across things that care about the retain count. + FindDependencies(NeedsPositiveRetainCount, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_Autorelease: + // These can't be moved across autorelease pool scope boundaries. + FindDependencies(AutoreleasePoolBoundary, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_RetainRV: + case IC_AutoreleaseRV: + // Don't move these; the RV optimization depends on the autoreleaseRV + // being tail called, and the retainRV being immediately after a call + // (which might still happen if we get lucky with codegen layout, but + // it's not worth taking the chance). 
+ continue; + default: + llvm_unreachable("Invalid dependence flavor"); + } + if (DependingInstructions.size() == 1 && *DependingInstructions.begin() == PN) { Changed = true; @@ -2186,7 +2458,7 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, BBState &MyStates) const { // If any top-down local-use or possible-dec has a succ which is earlier in // the sequence, forget it. - for (BBState::ptr_const_iterator I = MyStates.top_down_ptr_begin(), + for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(), E = MyStates.top_down_ptr_end(); I != E; ++I) switch (I->second.GetSeq()) { default: break; @@ -2195,14 +2467,32 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; - PtrState &S = MyStates.getPtrTopDownState(Arg); - for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { - PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg); - switch (SuccS.GetSeq()) { + PtrState &S = I->second; + succ_const_iterator SI(TI), SE(TI, false); + + // If the terminator is an invoke marked with the + // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be + // ignored, for ARC purposes. + if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) + --SE; + + for (; SI != SE; ++SI) { + Sequence SuccSSeq = S_None; + bool SuccSRRIKnownSafe = false; + // If VisitBottomUp has visited this successor, take what we know about it. + DenseMap<const BasicBlock *, BBState>::iterator BBI = BBStates.find(*SI); + if (BBI != BBStates.end()) { + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + SuccSSeq = SuccS.GetSeq(); + SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; + } + switch (SuccSSeq) { case S_None: case S_CanRelease: { - if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { S.ClearSequenceProgress(); + break; + } continue; } case S_Use: @@ -2211,7 +2501,7 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, case S_Stop: case S_Release: case S_MovableRelease: - if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) AllSuccsHaveSame = false; break; case S_Retain: @@ -2223,19 +2513,38 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, // guards against loops in the middle of a sequence. if (SomeSuccHasSame && !AllSuccsHaveSame) S.ClearSequenceProgress(); + break; } case S_CanRelease: { const Value *Arg = I->first; const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; - PtrState &S = MyStates.getPtrTopDownState(Arg); - for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { - PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg); - switch (SuccS.GetSeq()) { + PtrState &S = I->second; + succ_const_iterator SI(TI), SE(TI, false); + + // If the terminator is an invoke marked with the + // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be + // ignored, for ARC purposes. + if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) + --SE; + + for (; SI != SE; ++SI) { + Sequence SuccSSeq = S_None; + bool SuccSRRIKnownSafe = false; + // If VisitBottomUp has visited this successor, take what we know about it. 
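The '--SE' adjustments in this region rely on a layout fact: an invoke's unwind destination is its last successor, so shrinking the successor end iterator by one hides exactly the exceptional edge when the clang.arc.no_objc_arc_exceptions tag is present. In isolation (helper name invented):

    #include "llvm/Instructions.h"
    #include "llvm/Support/CFG.h"
    using namespace llvm;

    static void forEachNonEHSuccessor(const TerminatorInst *TI,
                                      unsigned NoARCExceptionsMDKind,
                                      void (*Visit)(const BasicBlock *)) {
      succ_const_iterator SI(TI), SE(TI, false); // begin and end
      if (isa<InvokeInst>(TI) && TI->getMetadata(NoARCExceptionsMDKind))
        --SE; // drop only the unwind edge
      for (; SI != SE; ++SI)
        Visit(*SI);
    }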
+ DenseMap<const BasicBlock *, BBState>::iterator BBI = BBStates.find(*SI);
+ if (BBI != BBStates.end()) {
+ const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+ SuccSSeq = SuccS.GetSeq();
+ SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
+ }
+ switch (SuccSSeq) {
case S_None: {
- if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe)
+ if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
S.ClearSequenceProgress();
+ break;
+ }
continue;
}
case S_CanRelease:
@@ -2245,7 +2554,7 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
case S_Release:
case S_MovableRelease:
case S_Use:
- if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe)
+ if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
AllSuccsHaveSame = false;
break;
case S_Retain:
@@ -2257,8 +2566,167 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
// guards against loops in the middle of a sequence.
if (SomeSuccHasSame && !AllSuccsHaveSame)
S.ClearSequenceProgress();
+ break;
+ }
+ }
+}
+
+bool
+ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
+ BasicBlock *BB,
+ MapVector<Value *, RRInfo> &Retains,
+ BBState &MyStates) {
+ bool NestingDetected = false;
+ InstructionClass Class = GetInstructionClass(Inst);
+ const Value *Arg = 0;
+
+ switch (Class) {
+ case IC_Release: {
+ Arg = GetObjCArg(Inst);
+
+ PtrState &S = MyStates.getPtrBottomUpState(Arg);
+
+ // If we see two releases in a row on the same pointer, make
+ // a note, and we'll circle back to revisit it after we've
+ // hopefully eliminated the second release, which may allow us to
+ // eliminate the first release too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease)
+ NestingDetected = true;
+
+ S.RRI.clear();
+
+ MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+ S.SetSeq(ReleaseMetadata ? S_MovableRelease : S_Release);
+ S.RRI.ReleaseMetadata = ReleaseMetadata;
+ S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented();
+ S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+ S.RRI.Calls.insert(Inst);
+
+ S.IncrementRefCount();
+ S.IncrementNestCount();
+ break;
+ }
+ case IC_RetainBlock:
+ // An objc_retainBlock call with just a use may need to be kept,
+ // because it may be copying a block from the stack to the heap.
+ if (!IsRetainBlockOptimizable(Inst))
+ break;
+ // FALLTHROUGH
+ case IC_Retain:
+ case IC_RetainRV: {
+ Arg = GetObjCArg(Inst);
+
+ PtrState &S = MyStates.getPtrBottomUpState(Arg);
+ S.DecrementRefCount();
+ S.SetAtLeastOneRefCount();
+ S.DecrementNestCount();
+
+ switch (S.GetSeq()) {
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Use:
+ S.RRI.ReverseInsertPts.clear();
+ // FALL THROUGH
+ case S_CanRelease:
+ // Don't do retain+release tracking for IC_RetainRV, because it's
+ // better to let it remain as the first instruction after a call.
+ if (Class != IC_RetainRV) {
+ S.RRI.IsRetainBlock = Class == IC_RetainBlock;
+ Retains[Inst] = S.RRI;
+ }
+ S.ClearSequenceProgress();
+ break;
+ case S_None:
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
}
+ return NestingDetected;
+ }
+ case IC_AutoreleasepoolPop:
+ // Conservatively, clear MyStates for all known pointers.
+ MyStates.clearBottomUpPointers();
+ return NestingDetected;
+ case IC_AutoreleasepoolPush:
+ case IC_None:
+ // These are irrelevant.
+ return NestingDetected; + default: + break; + } + + // Consider any other possible effects of this instruction on each + // pointer being tracked. + for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(), + ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) { + const Value *Ptr = MI->first; + if (Ptr == Arg) + continue; // Handled above. + PtrState &S = MI->second; + Sequence Seq = S.GetSeq(); + + // Check for possible releases. + if (CanAlterRefCount(Inst, Ptr, PA, Class)) { + S.DecrementRefCount(); + switch (Seq) { + case S_Use: + S.SetSeq(S_CanRelease); + continue; + case S_CanRelease: + case S_Release: + case S_MovableRelease: + case S_Stop: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } } + + // Check for possible direct uses. + switch (Seq) { + case S_Release: + case S_MovableRelease: + if (CanUse(Inst, Ptr, PA, Class)) { + assert(S.RRI.ReverseInsertPts.empty()); + // If this is an invoke instruction, we're scanning it as part of + // one of its successor blocks, since we can't insert code after it + // in its own block, and we don't want to split critical edges. + if (isa<InvokeInst>(Inst)) + S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); + else + S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); + S.SetSeq(S_Use); + } else if (Seq == S_Release && + (Class == IC_User || Class == IC_CallOrUser)) { + // Non-movable releases depend on any possible objc pointer use. + S.SetSeq(S_Stop); + assert(S.RRI.ReverseInsertPts.empty()); + // As above; handle invoke specially. + if (isa<InvokeInst>(Inst)) + S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); + else + S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); + } + break; + case S_Stop: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_CanRelease: + case S_Use: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + + return NestingDetected; } bool @@ -2274,7 +2742,13 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, succ_const_iterator SI(TI), SE(TI, false); if (SI == SE) MyStates.SetAsExit(); - else + else { + // If the terminator is an invoke marked with the + // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be + // ignored, for ARC purposes. + if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) + --SE; + do { const BasicBlock *Succ = *SI++; if (Succ == BB) @@ -2295,145 +2769,169 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, } break; } while (SI != SE); + } // Visit all the instructions, bottom-up. for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) { Instruction *Inst = llvm::prior(I); - InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; - switch (Class) { - case IC_Release: { - Arg = GetObjCArg(Inst); + // Invoke instructions are visited as part of their successors (below). + if (isa<InvokeInst>(Inst)) + continue; + + NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates); + } + + // If there's a predecessor with an invoke, visit the invoke as + // if it were part of this block, since we can't insert code after + // an invoke in its own block, and we don't want to split critical + // edges. 
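The insertion-point logic in this hunk encodes the constraint restated just above: no code can be inserted after an invoke within its own block. Factored out as a sketch (helper name invented; BB is the successor block in which the invoke is being scanned):

    #include "llvm/Instructions.h"
    #include "llvm/ADT/STLExtras.h"
    using namespace llvm;

    static Instruction *reverseInsertPoint(Instruction *Inst, BasicBlock *BB) {
      if (isa<InvokeInst>(Inst))
        return BB->getFirstInsertionPt(); // top of the block being scanned
      return llvm::next(BasicBlock::iterator(Inst)); // directly after Inst
    }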
+ for (pred_iterator PI(BB), PE(BB, false); PI != PE; ++PI) {
+ BasicBlock *Pred = *PI;
+ TerminatorInst *PredTI = cast<TerminatorInst>(&Pred->back());
+ if (isa<InvokeInst>(PredTI))
+ NestingDetected |= VisitInstructionBottomUp(PredTI, BB, Retains, MyStates);
+ }
+
+ return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
+ DenseMap<Value *, RRInfo> &Releases,
+ BBState &MyStates) {
+ bool NestingDetected = false;
+ InstructionClass Class = GetInstructionClass(Inst);
+ const Value *Arg = 0;
+
+ switch (Class) {
+ case IC_RetainBlock:
+ // An objc_retainBlock call with just a use may need to be kept,
+ // because it may be copying a block from the stack to the heap.
+ if (!IsRetainBlockOptimizable(Inst))
+ break;
+ // FALLTHROUGH
+ case IC_Retain:
+ case IC_RetainRV: {
+ Arg = GetObjCArg(Inst);
- PtrState &S = MyStates.getPtrBottomUpState(Arg);
+ PtrState &S = MyStates.getPtrTopDownState(Arg);
- // If we see two releases in a row on the same pointer. If so, make
+ // Don't do retain+release tracking for IC_RetainRV, because it's
+ // better to let it remain as the first instruction after a call.
+ if (Class != IC_RetainRV) {
+ // If we see two retains in a row on the same pointer, make
// a note, and we'll circle back to revisit it after we've
- // hopefully eliminated the second release, which may allow us to
- // eliminate the first release too.
+ // hopefully eliminated the second retain, which may allow us to
+ // eliminate the first retain too.
// Theoretically we could implement removal of nested retain+release
// pairs by making PtrState hold a stack of states, but this is
// simple and avoids adding overhead for the non-nested case.
- if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease)
+ if (S.GetSeq() == S_Retain)
NestingDetected = true;

- S.SetSeqToRelease(Inst->getMetadata(ImpreciseReleaseMDKind));
+ S.SetSeq(S_Retain);
S.RRI.clear();
- S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented();
- S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+ S.RRI.IsRetainBlock = Class == IC_RetainBlock;
+ // Don't check S.IsKnownIncremented() here because it's not
+ // sufficient.
+ S.RRI.KnownSafe = S.IsKnownNested();
S.RRI.Calls.insert(Inst);
+ }

- S.IncrementRefCount();
- S.IncrementNestCount();
+ S.SetAtLeastOneRefCount();
+ S.IncrementRefCount();
+ S.IncrementNestCount();
+ return NestingDetected;
+ }
+ case IC_Release: {
+ Arg = GetObjCArg(Inst);
+
+ PtrState &S = MyStates.getPtrTopDownState(Arg);
+ S.DecrementRefCount();
+ S.DecrementNestCount();
+
+ switch (S.GetSeq()) {
+ case S_Retain:
+ case S_CanRelease:
+ S.RRI.ReverseInsertPts.clear();
+ // FALL THROUGH
+ case S_Use:
+ S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+ S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+ Releases[Inst] = S.RRI;
+ S.ClearSequenceProgress();
+ break;
+ case S_None:
break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
}
- case IC_RetainBlock:
- case IC_Retain:
- case IC_RetainRV: {
- Arg = GetObjCArg(Inst);
+ break;
+ }
+ case IC_AutoreleasepoolPop:
+ // Conservatively, clear MyStates for all known pointers.
+ MyStates.clearTopDownPointers();
+ return NestingDetected;
+ case IC_AutoreleasepoolPush:
+ case IC_None:
+ // These are irrelevant.
+ return NestingDetected; + default: + break; + } - PtrState &S = MyStates.getPtrBottomUpState(Arg); + // Consider any other possible effects of this instruction on each + // pointer being tracked. + for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(), + ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) { + const Value *Ptr = MI->first; + if (Ptr == Arg) + continue; // Handled above. + PtrState &S = MI->second; + Sequence Seq = S.GetSeq(); + + // Check for possible releases. + if (CanAlterRefCount(Inst, Ptr, PA, Class)) { S.DecrementRefCount(); - S.SetAtLeastOneRefCount(); - S.DecrementNestCount(); - - // An objc_retainBlock call with just a use still needs to be kept, - // because it may be copying a block from the stack to the heap. - if (Class == IC_RetainBlock && S.GetSeq() == S_Use) + switch (Seq) { + case S_Retain: S.SetSeq(S_CanRelease); + assert(S.RRI.ReverseInsertPts.empty()); + S.RRI.ReverseInsertPts.insert(Inst); - switch (S.GetSeq()) { - case S_Stop: - case S_Release: - case S_MovableRelease: + // One call can't cause a transition from S_Retain to S_CanRelease + // and S_CanRelease to S_Use. If we've made the first transition, + // we're done. + continue; case S_Use: - S.RRI.ReverseInsertPts.clear(); - // FALL THROUGH case S_CanRelease: - // Don't do retain+release tracking for IC_RetainRV, because it's - // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) { - S.RRI.IsRetainBlock = Class == IC_RetainBlock; - Retains[Inst] = S.RRI; - } - S.ClearSequenceProgress(); - break; case S_None: break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - continue; - } - case IC_AutoreleasepoolPop: - // Conservatively, clear MyStates for all known pointers. - MyStates.clearBottomUpPointers(); - continue; - case IC_AutoreleasepoolPush: - case IC_None: - // These are irrelevant. - continue; - default: - break; - } - - // Consider any other possible effects of this instruction on each - // pointer being tracked. - for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(), - ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) { - const Value *Ptr = MI->first; - if (Ptr == Arg) - continue; // Handled above. - PtrState &S = MI->second; - Sequence Seq = S.GetSeq(); - - // Check for possible releases. - if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); - switch (Seq) { - case S_Use: - S.SetSeq(S_CanRelease); - continue; - case S_CanRelease: - case S_Release: - case S_MovableRelease: - case S_Stop: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - } - - // Check for possible direct uses. - switch (Seq) { + case S_Stop: case S_Release: case S_MovableRelease: - if (CanUse(Inst, Ptr, PA, Class)) { - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); - S.SetSeq(S_Use); - } else if (Seq == S_Release && - (Class == IC_User || Class == IC_CallOrUser)) { - // Non-movable releases depend on any possible objc pointer use. - S.SetSeq(S_Stop); - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); - } - break; - case S_Stop: - if (CanUse(Inst, Ptr, PA, Class)) - S.SetSeq(S_Use); - break; - case S_CanRelease: - case S_Use: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); + llvm_unreachable("top-down pointer in release state!"); } } + + // Check for possible direct uses. 
+ switch (Seq) { + case S_CanRelease: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_Retain: + case S_Use: + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } } return NestingDetected; @@ -2453,22 +2951,31 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, MyStates.SetAsEntry(); else do { - const BasicBlock *Pred = *PI++; + unsigned OperandNo = PI.getOperandNo(); + const Use &Us = PI.getUse(); + ++PI; + + // Skip invoke unwind edges on invoke instructions marked with + // clang.arc.no_objc_arc_exceptions. + if (const InvokeInst *II = dyn_cast<InvokeInst>(Us.getUser())) + if (OperandNo == II->getNumArgOperands() + 2 && + II->getMetadata(NoObjCARCExceptionsMDKind)) + continue; + + const BasicBlock *Pred = cast<TerminatorInst>(Us.getUser())->getParent(); if (Pred == BB) continue; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); - assert(I != BBStates.end()); // If we haven't seen this node yet, then we've found a CFG cycle. // Be optimistic here; it's CheckForCFGHazards' job detect trouble. - if (!I->second.isVisitedTopDown()) + if (I == BBStates.end() || !I->second.isVisitedTopDown()) continue; MyStates.InitFromPred(I->second); while (PI != PE) { Pred = *PI++; if (Pred != BB) { I = BBStates.find(Pred); - assert(I != BBStates.end()); - if (I->second.isVisitedTopDown()) + if (I != BBStates.end() && I->second.isVisitedTopDown()) MyStates.MergePred(I->second); } } @@ -2478,147 +2985,89 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, // Visit all the instructions, top-down. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { Instruction *Inst = I; - InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; + NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); + } - switch (Class) { - case IC_RetainBlock: - case IC_Retain: - case IC_RetainRV: { - Arg = GetObjCArg(Inst); + CheckForCFGHazards(BB, BBStates, MyStates); + return NestingDetected; +} - PtrState &S = MyStates.getPtrTopDownState(Arg); +static void +ComputePostOrders(Function &F, + SmallVectorImpl<BasicBlock *> &PostOrder, + SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder) { + /// Backedges - Backedges detected in the DFS. These edges will be + /// ignored in the reverse-CFG DFS, so that loops with multiple exits will be + /// traversed in the desired order. + DenseSet<std::pair<BasicBlock *, BasicBlock *> > Backedges; + + /// Visited - The visited set, for doing DFS walks. + SmallPtrSet<BasicBlock *, 16> Visited; - // Don't do retain+release tracking for IC_RetainRV, because it's - // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) { - // If we see two retains in a row on the same pointer. If so, make - // a note, and we'll cicle back to revisit it after we've - // hopefully eliminated the second retain, which may allow us to - // eliminate the first retain too. - // Theoretically we could implement removal of nested retain+release - // pairs by making PtrState hold a stack of states, but this is - // simple and avoids adding overhead for the non-nested case. - if (S.GetSeq() == S_Retain) - NestingDetected = true; - - S.SetSeq(S_Retain); - S.RRI.clear(); - S.RRI.IsRetainBlock = Class == IC_RetainBlock; - // Don't check S.IsKnownIncremented() here because it's not - // sufficient. - S.RRI.KnownSafe = S.IsKnownNested(); - S.RRI.Calls.insert(Inst); + // Do DFS, computing the PostOrder. 
+ SmallPtrSet<BasicBlock *, 16> OnStack; + SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack; + BasicBlock *EntryBB = &F.getEntryBlock(); + SuccStack.push_back(std::make_pair(EntryBB, succ_begin(EntryBB))); + Visited.insert(EntryBB); + OnStack.insert(EntryBB); + do { + dfs_next_succ: + TerminatorInst *TI = cast<TerminatorInst>(&SuccStack.back().first->back()); + succ_iterator End = succ_iterator(TI, true); + while (SuccStack.back().second != End) { + BasicBlock *BB = *SuccStack.back().second++; + if (Visited.insert(BB)) { + SuccStack.push_back(std::make_pair(BB, succ_begin(BB))); + OnStack.insert(BB); + goto dfs_next_succ; } - - S.SetAtLeastOneRefCount(); - S.IncrementRefCount(); - S.IncrementNestCount(); - continue; + if (OnStack.count(BB)) + Backedges.insert(std::make_pair(SuccStack.back().first, BB)); } - case IC_Release: { - Arg = GetObjCArg(Inst); + OnStack.erase(SuccStack.back().first); + PostOrder.push_back(SuccStack.pop_back_val().first); + } while (!SuccStack.empty()); - PtrState &S = MyStates.getPtrTopDownState(Arg); - S.DecrementRefCount(); - S.DecrementNestCount(); - - switch (S.GetSeq()) { - case S_Retain: - case S_CanRelease: - S.RRI.ReverseInsertPts.clear(); - // FALL THROUGH - case S_Use: - S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); - Releases[Inst] = S.RRI; - S.ClearSequenceProgress(); - break; - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - break; - } - case IC_AutoreleasepoolPop: - // Conservatively, clear MyStates for all known pointers. - MyStates.clearTopDownPointers(); - continue; - case IC_AutoreleasepoolPush: - case IC_None: - // These are irrelevant. - continue; - default: - break; - } + Visited.clear(); - // Consider any other possible effects of this instruction on each - // pointer being tracked. - for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(), - ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) { - const Value *Ptr = MI->first; - if (Ptr == Arg) - continue; // Handled above. - PtrState &S = MI->second; - Sequence Seq = S.GetSeq(); - - // Check for possible releases. - if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); - switch (Seq) { - case S_Retain: - S.SetSeq(S_CanRelease); - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); + // Compute the exits, which are the starting points for reverse-CFG DFS. + // This includes blocks where all the successors are backedges that + // we're skipping. + SmallVector<BasicBlock *, 4> Exits; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *BB = I; + TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + for (succ_iterator SI(TI), SE(TI, true); SI != SE; ++SI) + if (!Backedges.count(std::make_pair(BB, *SI))) + goto HasNonBackedgeSucc; + Exits.push_back(BB); + HasNonBackedgeSucc:; + } - // One call can't cause a transition from S_Retain to S_CanRelease - // and S_CanRelease to S_Use. If we've made the first transition, - // we're done. + // Do reverse-CFG DFS, computing the reverse-CFG PostOrder. 
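The forward DFS above uses an explicit stack and a goto so it cannot overflow the call stack on very large CFGs; its recursive equivalent may be easier to read (sketch, names invented):

    #include "llvm/ADT/DenseSet.h"
    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Support/CFG.h"
    using namespace llvm;

    static void dfsPostOrder(BasicBlock *BB,
                             SmallPtrSet<BasicBlock *, 16> &Visited,
                             SmallPtrSet<BasicBlock *, 16> &OnStack,
                             DenseSet<std::pair<BasicBlock *, BasicBlock *> > &Backedges,
                             SmallVectorImpl<BasicBlock *> &PostOrder) {
      Visited.insert(BB);
      OnStack.insert(BB);
      for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) {
        if (!Visited.count(*SI))
          dfsPostOrder(*SI, Visited, OnStack, Backedges, PostOrder);
        else if (OnStack.count(*SI))
          Backedges.insert(std::make_pair(BB, *SI)); // edge closing a cycle
      }
      OnStack.erase(BB);
      PostOrder.push_back(BB); // successors done: emit in postorder
    }

The recorded backedges are what the reverse-CFG walk below skips, so multi-exit loops come out in the intended order.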
+ SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> PredStack; + for (SmallVectorImpl<BasicBlock *>::iterator I = Exits.begin(), E = Exits.end(); + I != E; ++I) { + BasicBlock *ExitBB = *I; + PredStack.push_back(std::make_pair(ExitBB, pred_begin(ExitBB))); + Visited.insert(ExitBB); + while (!PredStack.empty()) { + reverse_dfs_next_succ: + pred_iterator End = pred_end(PredStack.back().first); + while (PredStack.back().second != End) { + BasicBlock *BB = *PredStack.back().second++; + // Skip backedges detected in the forward-CFG DFS. + if (Backedges.count(std::make_pair(BB, PredStack.back().first))) continue; - case S_Use: - case S_CanRelease: - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - } - - // Check for possible direct uses. - switch (Seq) { - case S_CanRelease: - if (CanUse(Inst, Ptr, PA, Class)) - S.SetSeq(S_Use); - break; - case S_Retain: - // An objc_retainBlock call may be responsible for copying the block - // data from the stack to the heap. Model this by moving it straight - // from S_Retain to S_Use. - if (S.RRI.IsRetainBlock && - CanUse(Inst, Ptr, PA, Class)) { - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); - S.SetSeq(S_Use); + if (Visited.insert(BB)) { + PredStack.push_back(std::make_pair(BB, pred_begin(BB))); + goto reverse_dfs_next_succ; } - break; - case S_Use: - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); } + ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first); } } - - CheckForCFGHazards(BB, BBStates, MyStates); - return NestingDetected; } // Visit - Visit the function both top-down and bottom-up. @@ -2627,43 +3076,29 @@ ObjCARCOpt::Visit(Function &F, DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases) { - // Use reverse-postorder on the reverse CFG for bottom-up, because we - // magically know that loops will be well behaved, i.e. they won't repeatedly - // call retain on a single pointer without doing a release. We can't use - // ReversePostOrderTraversal here because we want to walk up from each - // function exit point. - SmallPtrSet<BasicBlock *, 16> Visited; - SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> Stack; - SmallVector<BasicBlock *, 16> Order; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - BasicBlock *BB = I; - if (BB->getTerminator()->getNumSuccessors() == 0) - Stack.push_back(std::make_pair(BB, pred_begin(BB))); - } - while (!Stack.empty()) { - pred_iterator End = pred_end(Stack.back().first); - while (Stack.back().second != End) { - BasicBlock *BB = *Stack.back().second++; - if (Visited.insert(BB)) - Stack.push_back(std::make_pair(BB, pred_begin(BB))); - } - Order.push_back(Stack.pop_back_val().first); - } + + // Use reverse-postorder traversals, because we magically know that loops + // will be well behaved, i.e. they won't repeatedly call retain on a single + // pointer without doing a release. We can't use the ReversePostOrderTraversal + // class here because we want the reverse-CFG postorder to consider each + // function exit point, and we want to ignore selected cycle edges. + SmallVector<BasicBlock *, 16> PostOrder; + SmallVector<BasicBlock *, 16> ReverseCFGPostOrder; + ComputePostOrders(F, PostOrder, ReverseCFGPostOrder); + + // Use reverse-postorder on the reverse CFG for bottom-up. 
bool BottomUpNestingDetected = false; for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = - Order.rbegin(), E = Order.rend(); I != E; ++I) { - BasicBlock *BB = *I; - BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains); - } + ReverseCFGPostOrder.rbegin(), E = ReverseCFGPostOrder.rend(); + I != E; ++I) + BottomUpNestingDetected |= VisitBottomUp(*I, BBStates, Retains); - // Use regular reverse-postorder for top-down. + // Use reverse-postorder for top-down. bool TopDownNestingDetected = false; - typedef ReversePostOrderTraversal<Function *> RPOTType; - RPOTType RPOT(&F); - for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { - BasicBlock *BB = *I; - TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases); - } + for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = + PostOrder.rbegin(), E = PostOrder.rend(); + I != E; ++I) + TopDownNestingDetected |= VisitTopDown(*I, BBStates, Releases); return TopDownNestingDetected && BottomUpNestingDetected; } @@ -2691,40 +3126,26 @@ void ObjCARCOpt::MoveCalls(Value *Arg, getRetainBlockCallee(M) : getRetainCallee(M), MyArg, "", InsertPt); Call->setDoesNotThrow(); - if (!RetainsToMove.IsRetainBlock) + if (RetainsToMove.IsRetainBlock) + Call->setMetadata(CopyOnEscapeMDKind, + MDNode::get(M->getContext(), ArrayRef<Value *>())); + else Call->setTailCall(); } for (SmallPtrSet<Instruction *, 2>::const_iterator PI = RetainsToMove.ReverseInsertPts.begin(), PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) { - Instruction *LastUse = *PI; - Instruction *InsertPts[] = { 0, 0, 0 }; - if (InvokeInst *II = dyn_cast<InvokeInst>(LastUse)) { - // We can't insert code immediately after an invoke instruction, so - // insert code at the beginning of both successor blocks instead. - // The invoke's return value isn't available in the unwind block, - // but our releases will never depend on it, because they must be - // paired with retains from before the invoke. - InsertPts[0] = II->getNormalDest()->getFirstInsertionPt(); - InsertPts[1] = II->getUnwindDest()->getFirstInsertionPt(); - } else { - // Insert code immediately after the last use. - InsertPts[0] = llvm::next(BasicBlock::iterator(LastUse)); - } - - for (Instruction **I = InsertPts; *I; ++I) { - Instruction *InsertPt = *I; - Value *MyArg = ArgTy == ParamTy ? Arg : - new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, - "", InsertPt); - // Attach a clang.imprecise_release metadata tag, if appropriate. - if (MDNode *M = ReleasesToMove.ReleaseMetadata) - Call->setMetadata(ImpreciseReleaseMDKind, M); - Call->setDoesNotThrow(); - if (ReleasesToMove.IsTailCallRelease) - Call->setTailCall(); - } + Instruction *InsertPt = *PI; + Value *MyArg = ArgTy == ParamTy ? Arg : + new BitCastInst(Arg, ParamTy, "", InsertPt); + CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, + "", InsertPt); + // Attach a clang.imprecise_release metadata tag, if appropriate. + if (MDNode *M = ReleasesToMove.ReleaseMetadata) + Call->setMetadata(ImpreciseReleaseMDKind, M); + Call->setDoesNotThrow(); + if (ReleasesToMove.IsTailCallRelease) + Call->setTailCall(); } // Delete the original retain and release calls. 
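The insertion pattern MoveCalls uses at each reverse insertion point, gathered into one place (a sketch, not the patch itself: ReleaseFn, ReleaseMD, IsTail and InsertPt stand in for state threaded through the real code; the API calls match the ones in the hunk above):

    // Emit "objc_release(Arg)" immediately before InsertPt, bitcasting when
    // the declared parameter type differs from Arg's type, then propagate
    // the recorded imprecise-release metadata and tail flag.
    Value *MyArg = Arg->getType() == ParamTy
                       ? Arg : new BitCastInst(Arg, ParamTy, "", InsertPt);
    CallInst *Call = CallInst::Create(ReleaseFn, MyArg, "", InsertPt);
    Call->setDoesNotThrow();
    if (MDNode *M = ReleaseMD)
      Call->setMetadata(ImpreciseReleaseMDKind, M);
    if (IsTail)
      Call->setTailCall();

The invoke special case the old code needed (splitting the insertion across both successor blocks) disappears here, apparently because ReverseInsertPts now holds the computed insertion points themselves rather than the last uses.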
@@ -2765,17 +3186,11 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
 Instruction *Retain = cast<Instruction>(V);
 Value *Arg = GetObjCArg(Retain);
- // If the object being released is in static storage, we know it's
+ // If the object being released is in static or stack storage, we know it's
 // not being managed by ObjC reference counting, so we can delete pairs
 // regardless of what possible decrements or uses lie between them.
- bool KnownSafe = isa<Constant>(Arg);
+ bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
- // Same for stack storage, unless this is an objc_retainBlock call,
- // which is responsible for copying the block data from the stack to
- // the heap.
- if (!I->second.IsRetainBlock && isa<AllocaInst>(Arg))
- KnownSafe = true;
-
 // A constant pointer can't be pointing to an object on the heap. It may
 // be reference-counted, but it won't be deleted.
 if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
@@ -3091,7 +3506,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
 UE = Alloca->use_end(); UI != UE; ) {
 CallInst *UserInst = cast<CallInst>(*UI++);
 if (!UserInst->use_empty())
- UserInst->replaceAllUsesWith(UserInst->getOperand(1));
+ UserInst->replaceAllUsesWith(UserInst->getArgOperand(0));
 UserInst->eraseFromParent();
 }
 Alloca->eraseFromParent();
@@ -3243,6 +3658,10 @@ bool ObjCARCOpt::doInitialization(Module &M) {
 // Identify the imprecise release metadata kind.
 ImpreciseReleaseMDKind =
 M.getContext().getMDKindID("clang.imprecise_release");
+ CopyOnEscapeMDKind =
+ M.getContext().getMDKindID("clang.arc.copy_on_escape");
+ NoObjCARCExceptionsMDKind =
+ M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions");
 // Intuitively, objc_retain and others are nocapture, however in practice
 // they are not, because they return their argument value. And objc_release
@@ -3344,6 +3763,11 @@ namespace {
 /// RetainRV calls to make the optimization work on targets which need it.
 const MDString *RetainRVMarker;
+ /// StoreStrongCalls - The set of inserted objc_storeStrong calls. If
+ /// at the end of walking the function we have found no alloca
+ /// instructions, these calls can be marked "tail".
+ DenseSet<CallInst *> StoreStrongCalls;
+
 Constant *getStoreStrongCallee(Module *M);
 Constant *getRetainAutoreleaseCallee(Module *M);
 Constant *getRetainAutoreleaseRVCallee(Module *M);
@@ -3547,6 +3971,11 @@ void ObjCARCContract::ContractRelease(Instruction *Release,
 StoreStrong->setDoesNotThrow();
 StoreStrong->setDebugLoc(Store->getDebugLoc());
+ // We can't set the tail flag yet, because we haven't yet determined
+ // whether there are any escaping allocas. Remember this call, so that
+ // we can set the tail flag once we know it's safe.
+ StoreStrongCalls.insert(StoreStrong);
+
 if (&*Iter == Store) ++Iter;
 Store->eraseFromParent();
 Release->eraseFromParent();
@@ -3593,6 +4022,13 @@ bool ObjCARCContract::runOnFunction(Function &F) {
 PA.setAA(&getAnalysis<AliasAnalysis>());
+ // Track whether it's ok to mark objc_storeStrong calls with the "tail"
+ // keyword. Be conservative if the function has variadic arguments.
+ // It seems that functions which "return twice" are also unsafe for the
+ // "tail" argument, because such functions are setjmp-like and could need
+ // to return to an earlier stack state.
+ bool TailOkForStoreStrongs = !F.isVarArg() && !F.callsFunctionThatReturnsTwice();
+
 // For ObjC library calls which return their argument, replace uses of the
 // argument with uses of the call return value, if it dominates the use.
This // reduces register pressure. @@ -3649,6 +4085,13 @@ bool ObjCARCContract::runOnFunction(Function &F) { case IC_Release: ContractRelease(Inst, I); continue; + case IC_User: + // Be conservative if the function has any alloca instructions. + // Technically we only care about escaping alloca instructions, + // but this is sufficient to handle some interesting cases. + if (isa<AllocaInst>(Inst)) + TailOkForStoreStrongs = false; + continue; default: continue; } @@ -3666,36 +4109,37 @@ bool ObjCARCContract::runOnFunction(Function &F) { Use &U = UI.getUse(); unsigned OperandNo = UI.getOperandNo(); ++UI; // Increment UI now, because we may unlink its element. - if (Instruction *UserInst = dyn_cast<Instruction>(U.getUser())) - if (Inst != UserInst && DT->dominates(Inst, UserInst)) { - Changed = true; - Instruction *Replacement = Inst; - Type *UseTy = U.get()->getType(); - if (PHINode *PHI = dyn_cast<PHINode>(UserInst)) { - // For PHI nodes, insert the bitcast in the predecessor block. - unsigned ValNo = - PHINode::getIncomingValueNumForOperand(OperandNo); - BasicBlock *BB = - PHI->getIncomingBlock(ValNo); - if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", - &BB->back()); - for (unsigned i = 0, e = PHI->getNumIncomingValues(); - i != e; ++i) - if (PHI->getIncomingBlock(i) == BB) { - // Keep the UI iterator valid. - if (&PHI->getOperandUse( - PHINode::getOperandNumForIncomingValue(i)) == - &UI.getUse()) - ++UI; - PHI->setIncomingValue(i, Replacement); - } - } else { - if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", UserInst); - U.set(Replacement); - } + if (DT->isReachableFromEntry(U) && + DT->dominates(Inst, U)) { + Changed = true; + Instruction *Replacement = Inst; + Type *UseTy = U.get()->getType(); + if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { + // For PHI nodes, insert the bitcast in the predecessor block. + unsigned ValNo = + PHINode::getIncomingValueNumForOperand(OperandNo); + BasicBlock *BB = + PHI->getIncomingBlock(ValNo); + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + &BB->back()); + for (unsigned i = 0, e = PHI->getNumIncomingValues(); + i != e; ++i) + if (PHI->getIncomingBlock(i) == BB) { + // Keep the UI iterator valid. + if (&PHI->getOperandUse( + PHINode::getOperandNumForIncomingValue(i)) == + &UI.getUse()) + ++UI; + PHI->setIncomingValue(i, Replacement); + } + } else { + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + cast<Instruction>(U.getUser())); + U.set(Replacement); } + } } // If Arg is a no-op casted pointer, strip one level of casts and @@ -3713,5 +4157,13 @@ bool ObjCARCContract::runOnFunction(Function &F) { } } + // If this function has no escaping allocas or suspicious vararg usage, + // objc_storeStrong calls can be marked with the "tail" keyword. 
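Pulled together from the pieces above (the eligibility test in runOnFunction, the DenseSet populated by ContractRelease, and the IC_User alloca check), the deferred tail-marking idiom has this overall shape before the final flush shown next (a sketch only, using the names from the pass above):

    bool TailOk = !F.isVarArg() && !F.callsFunctionThatReturnsTwice();
    DenseSet<CallInst *> Candidates;       // Filled as calls are created.
    // ... while rewriting:  Candidates.insert(StoreStrong);
    // ... on any alloca:    TailOk = false;
    if (TailOk)                            // Flush once the body is known.
      for (DenseSet<CallInst *>::iterator I = Candidates.begin(),
             E = Candidates.end(); I != E; ++I)
        (*I)->setTailCall();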
+ if (TailOkForStoreStrongs) + for (DenseSet<CallInst *>::iterator I = StoreStrongCalls.begin(), + E = StoreStrongCalls.end(); I != E; ++I) + (*I)->setTailCall(); + StoreStrongCalls.clear(); + return Changed; } diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 8f98a5b..cb408a1 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -74,7 +74,7 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { namespace { class Reassociate : public FunctionPass { DenseMap<BasicBlock*, unsigned> RankMap; - DenseMap<AssertingVH<>, unsigned> ValueRankMap; + DenseMap<AssertingVH<Value>, unsigned> ValueRankMap; SmallVector<WeakVH, 8> RedoInsts; SmallVector<WeakVH, 8> DeadInsts; bool MadeChange; @@ -210,7 +210,7 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { /// LowerNegateToMultiply - Replace 0-X with X*-1. /// static Instruction *LowerNegateToMultiply(Instruction *Neg, - DenseMap<AssertingVH<>, unsigned> &ValueRankMap) { + DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { Constant *Cst = Constant::getAllOnesValue(Neg->getType()); Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); @@ -492,7 +492,7 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { /// only used by an add, transform this into (X+(0-Y)) to promote better /// reassociation. static Instruction *BreakUpSubtract(Instruction *Sub, - DenseMap<AssertingVH<>, unsigned> &ValueRankMap) { + DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // @@ -517,8 +517,8 @@ static Instruction *BreakUpSubtract(Instruction *Sub, /// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used /// by one, change this into a multiply by a constant to assist with further /// reassociation. -static Instruction *ConvertShiftToMul(Instruction *Shl, - DenseMap<AssertingVH<>, unsigned> &ValueRankMap) { +static Instruction *ConvertShiftToMul(Instruction *Shl, + DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { // If an operand of this shift is a reassociable multiply, or if the shift // is used by a reassociable multiply or add, turn into a multiply. if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) || diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 196a847..16b64a5 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -25,9 +25,9 @@ #include "llvm/Instructions.h" #include "llvm/Pass.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -39,9 +39,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include <algorithm> -#include <map> using namespace llvm; STATISTIC(NumInstRemoved, "Number of instructions removed"); @@ -59,7 +57,7 @@ class LatticeVal { enum LatticeValueTy { /// undefined - This LLVM Value has no known value yet. undefined, - + /// constant - This LLVM Value has a specific constant value. 
constant, @@ -68,7 +66,7 @@ class LatticeVal { /// with another (different) constant, it goes to overdefined, instead of /// asserting. forcedconstant, - + /// overdefined - This instruction is not known to be constant, and we know /// it has a value. overdefined @@ -77,30 +75,30 @@ class LatticeVal { /// Val: This stores the current lattice value along with the Constant* for /// the constant if this is a 'constant' or 'forcedconstant' value. PointerIntPair<Constant *, 2, LatticeValueTy> Val; - + LatticeValueTy getLatticeValue() const { return Val.getInt(); } - + public: LatticeVal() : Val(0, undefined) {} - + bool isUndefined() const { return getLatticeValue() == undefined; } bool isConstant() const { return getLatticeValue() == constant || getLatticeValue() == forcedconstant; } bool isOverdefined() const { return getLatticeValue() == overdefined; } - + Constant *getConstant() const { assert(isConstant() && "Cannot get the constant of a non-constant!"); return Val.getPointer(); } - + /// markOverdefined - Return true if this is a change in status. bool markOverdefined() { if (isOverdefined()) return false; - + Val.setInt(overdefined); return true; } @@ -111,17 +109,17 @@ public: assert(getConstant() == V && "Marking constant with different value"); return false; } - + if (isUndefined()) { Val.setInt(constant); assert(V && "Marking constant with NULL"); Val.setPointer(V); } else { - assert(getLatticeValue() == forcedconstant && + assert(getLatticeValue() == forcedconstant && "Cannot move from overdefined to constant!"); // Stay at forcedconstant if the constant is the same. if (V == getConstant()) return false; - + // Otherwise, we go to overdefined. Assumptions made based on the // forced value are possibly wrong. Assuming this is another constant // could expose a contradiction. @@ -137,7 +135,7 @@ public: return dyn_cast<ConstantInt>(getConstant()); return 0; } - + void markForcedConstant(Constant *V) { assert(isUndefined() && "Can't force a defined value!"); Val.setInt(forcedconstant); @@ -156,6 +154,7 @@ namespace { /// class SCCPSolver : public InstVisitor<SCCPSolver> { const TargetData *TD; + const TargetLibraryInfo *TLI; SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. @@ -163,7 +162,7 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { /// StructType, for example for formal arguments, calls, insertelement, etc. /// DenseMap<std::pair<Value*, unsigned>, LatticeVal> StructValueState; - + /// GlobalValue - If we are tracking any values for the contents of a global /// variable, we keep a mapping from the constant accessor to the element of /// the global, to the currently known value. If the value becomes @@ -178,7 +177,7 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions /// that return multiple values. DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals; - + /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is /// represented here for efficient lookup. SmallPtrSet<Function*, 16> MRVFunctionsTracked; @@ -187,7 +186,7 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { /// arguments we make optimistic assumptions about and try to prove as /// constants. 
SmallPtrSet<Function*, 16> TrackingIncomingArguments; - + /// The reason for two worklists is that overdefined is the lowest state /// on the lattice, and moving things to overdefined as fast as possible /// makes SCCP converge much faster. @@ -201,16 +200,13 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { SmallVector<BasicBlock*, 64> BBWorkList; // The BasicBlock work list - /// UsersOfOverdefinedPHIs - Keep track of any users of PHI nodes that are not - /// overdefined, despite the fact that the PHI node is overdefined. - std::multimap<PHINode*, Instruction*> UsersOfOverdefinedPHIs; - /// KnownFeasibleEdges - Entries in this set are edges which have already had /// PHI nodes retriggered. typedef std::pair<BasicBlock*, BasicBlock*> Edge; DenseSet<Edge> KnownFeasibleEdges; public: - SCCPSolver(const TargetData *td) : TD(td) {} + SCCPSolver(const TargetData *td, const TargetLibraryInfo *tli) + : TD(td), TLI(tli) {} /// MarkBlockExecutable - This method can be used by clients to mark all of /// the blocks that are known to be intrinsically live in the processed unit. @@ -253,7 +249,7 @@ public: void AddArgumentTrackedFunction(Function *F) { TrackingIncomingArguments.insert(F); } - + /// Solve - Solve for constants and executable blocks. /// void Solve(); @@ -274,9 +270,9 @@ public: assert(I != ValueState.end() && "V is not in valuemap!"); return I->second; } - + /*LatticeVal getStructLatticeValueFor(Value *V, unsigned i) const { - DenseMap<std::pair<Value*, unsigned>, LatticeVal>::const_iterator I = + DenseMap<std::pair<Value*, unsigned>, LatticeVal>::const_iterator I = StructValueState.find(std::make_pair(V, i)); assert(I != StructValueState.end() && "V is not in valuemap!"); return I->second; @@ -308,7 +304,7 @@ public: else markOverdefined(V); } - + private: // markConstant - Make a value be marked as "constant". If the value // is not already a constant, add it to the instruction work list so that @@ -322,7 +318,7 @@ private: else InstWorkList.push_back(V); } - + void markConstant(Value *V, Constant *C) { assert(!V->getType()->isStructTy() && "Should use other method"); markConstant(ValueState[V], V, C); @@ -338,14 +334,14 @@ private: else InstWorkList.push_back(V); } - - + + // markOverdefined - Make a value be marked as "overdefined". If the // value is not already overdefined, add it to the overdefined instruction // work list so that the users of the instruction are updated later. void markOverdefined(LatticeVal &IV, Value *V) { if (!IV.markOverdefined()) return; - + DEBUG(dbgs() << "markOverdefined: "; if (Function *F = dyn_cast<Function>(V)) dbgs() << "Function '" << F->getName() << "'\n"; @@ -365,7 +361,7 @@ private: else if (IV.getConstant() != MergeWithV.getConstant()) markOverdefined(IV, V); } - + void mergeInValue(Value *V, LatticeVal MergeWithV) { assert(!V->getType()->isStructTy() && "Should use other method"); mergeInValue(ValueState[V], V, MergeWithV); @@ -390,7 +386,7 @@ private: if (!isa<UndefValue>(V)) LV.markConstant(C); // Constants are constant } - + // All others are underdefined by default. return LV; } @@ -412,21 +408,20 @@ private: return LV; // Common case, already in the map. if (Constant *C = dyn_cast<Constant>(V)) { - if (isa<UndefValue>(C)) - ; // Undef values remain undefined. - else if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) - LV.markConstant(CS->getOperand(i)); // Constants are constant. 
- else if (isa<ConstantAggregateZero>(C)) { - Type *FieldTy = cast<StructType>(V->getType())->getElementType(i); - LV.markConstant(Constant::getNullValue(FieldTy)); - } else + Constant *Elt = C->getAggregateElement(i); + + if (Elt == 0) LV.markOverdefined(); // Unknown sort of constant. + else if (isa<UndefValue>(Elt)) + ; // Undef values remain undefined. + else + LV.markConstant(Elt); // Constants are constant. } - + // All others are underdefined by default. return LV; } - + /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB /// work list if it is not already executable. @@ -466,33 +461,6 @@ private: if (BBExecutable.count(I->getParent())) // Inst is executable? visit(*I); } - - /// RemoveFromOverdefinedPHIs - If I has any entries in the - /// UsersOfOverdefinedPHIs map for PN, remove them now. - void RemoveFromOverdefinedPHIs(Instruction *I, PHINode *PN) { - if (UsersOfOverdefinedPHIs.empty()) return; - typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; - std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN); - for (ItTy It = Range.first, E = Range.second; It != E;) { - if (It->second == I) - UsersOfOverdefinedPHIs.erase(It++); - else - ++It; - } - } - - /// InsertInOverdefinedPHIs - Insert an entry in the UsersOfOverdefinedPHIS - /// map for I and PN, but if one is there already, do not create another. - /// (Duplicate entries do not break anything directly, but can lead to - /// exponential growth of the table in rare cases.) - void InsertInOverdefinedPHIs(Instruction *I, PHINode *PN) { - typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; - std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN); - for (ItTy J = Range.first, E = Range.second; J != E; ++J) - if (J->second == I) - return; - UsersOfOverdefinedPHIs.insert(std::make_pair(PN, I)); - } private: friend class InstVisitor<SCCPSolver>; @@ -559,7 +527,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, Succs[0] = true; return; } - + LatticeVal BCValue = getValueState(BI->getCondition()); ConstantInt *CI = BCValue.getConstantInt(); if (CI == 0) { @@ -569,44 +537,44 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, Succs[0] = Succs[1] = true; return; } - + // Constant condition variables mean the branch can only go a single way. Succs[CI->isZero()] = true; return; } - + if (isa<InvokeInst>(TI)) { // Invoke instructions successors are always executable. Succs[0] = Succs[1] = true; return; } - + if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) { - if (TI.getNumSuccessors() < 2) { + if (!SI->getNumCases()) { Succs[0] = true; return; } LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); - + if (CI == 0) { // Overdefined or undefined condition? // All destinations are executable! if (!SCValue.isUndefined()) Succs.assign(TI.getNumSuccessors(), true); return; } - - Succs[SI->findCaseValue(CI)] = true; + + Succs[SI->findCaseValue(CI).getSuccessorIndex()] = true; return; } - + // TODO: This could be improved if the operand is a [cast of a] BlockAddress. if (isa<IndirectBrInst>(&TI)) { // Just mark all destinations executable! 
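For switches, by contrast with the indirect branches handled next, a constant condition pins down a single feasible successor via the case-iterator API this patch adopts (a sketch; SwitchInst::CaseIt is assumed to be the iterator type behind the findCaseValue calls seen above):

    if (ConstantInt *CI = SCValue.getConstantInt()) {
      // findCaseValue falls back to the default destination when no case
      // value matches, so exactly one successor is feasible either way.
      SwitchInst::CaseIt Case = SI->findCaseValue(CI);
      BasicBlock *OnlyDest = Case.getCaseSuccessor();
      // ... mark only the edge to OnlyDest executable.
    }

Indirect branches carry no such case table, hence the blanket marking below.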
Succs.assign(TI.getNumSuccessors(), true); return; } - + #ifndef NDEBUG dbgs() << "Unknown terminator instruction: " << TI << '\n'; #endif @@ -628,7 +596,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { if (BI->isUnconditional()) return true; - + LatticeVal BCValue = getValueState(BI->getCondition()); // Overdefined condition variables mean the branch could go either way, @@ -636,40 +604,33 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { ConstantInt *CI = BCValue.getConstantInt(); if (CI == 0) return !BCValue.isUndefined(); - + // Constant condition variables mean the branch can only go a single way. return BI->getSuccessor(CI->isZero()) == To; } - + // Invoke instructions successors are always executable. if (isa<InvokeInst>(TI)) return true; - + if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { - if (SI->getNumSuccessors() < 2) + if (SI->getNumCases() < 1) return true; LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); - + if (CI == 0) return !SCValue.isUndefined(); - // Make sure to skip the "default value" which isn't a value - for (unsigned i = 1, E = SI->getNumSuccessors(); i != E; ++i) - if (SI->getSuccessorValue(i) == CI) // Found the taken branch. - return SI->getSuccessor(i) == To; - - // If the constant value is not equal to any of the branches, we must - // execute default branch. - return SI->getDefaultDest() == To; + return SI->findCaseValue(CI).getCaseSuccessor() == To; } - + // Just mark all destinations executable! // TODO: This could be improved if the operand is a [cast of a] BlockAddress. if (isa<IndirectBrInst>(TI)) return true; - + #ifndef NDEBUG dbgs() << "Unknown terminator instruction: " << *TI << '\n'; #endif @@ -699,30 +660,15 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // TODO: We could do a lot better than this if code actually uses this. if (PN.getType()->isStructTy()) return markAnythingOverdefined(&PN); - - if (getValueState(&PN).isOverdefined()) { - // There may be instructions using this PHI node that are not overdefined - // themselves. If so, make sure that they know that the PHI node operand - // changed. - typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; - std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(&PN); - - if (Range.first == Range.second) - return; - - SmallVector<Instruction*, 16> Users; - for (ItTy I = Range.first, E = Range.second; I != E; ++I) - Users.push_back(I->second); - while (!Users.empty()) - visit(Users.pop_back_val()); + + if (getValueState(&PN).isOverdefined()) return; // Quick exit - } // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, // and slow us down a lot. Just mark them overdefined. if (PN.getNumIncomingValues() > 64) return markOverdefined(&PN); - + // Look at all of the executable operands of the PHI node. If any of them // are overdefined, the PHI becomes overdefined as well. If they are all // constant, and they agree with each other, the PHI becomes the identical @@ -736,7 +682,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) continue; - + if (IV.isOverdefined()) // PHI node becomes overdefined! return markOverdefined(&PN); @@ -744,11 +690,11 @@ void SCCPSolver::visitPHINode(PHINode &PN) { OperandVal = IV.getConstant(); continue; } - + // There is already a reachable operand. If we conflict with it, // then the PHI node becomes overdefined. 
If we agree with it, we // can continue on. - + // Check to see if there are two different constants merging, if so, the PHI // node is overdefined. if (IV.getConstant() != OperandVal) @@ -772,7 +718,7 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) { Function *F = I.getParent()->getParent(); Value *ResultOp = I.getOperand(0); - + // If we are tracking the return value of this function, merge it in. if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) { DenseMap<Function*, LatticeVal>::iterator TFRVI = @@ -782,7 +728,7 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) { return; } } - + // Handle functions that return multiple values. if (!TrackedMultipleRetVals.empty()) { if (StructType *STy = dyn_cast<StructType>(ResultOp->getType())) @@ -790,7 +736,7 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F, getStructValueState(ResultOp, i)); - + } } @@ -811,7 +757,7 @@ void SCCPSolver::visitCastInst(CastInst &I) { if (OpSt.isOverdefined()) // Inherit overdefinedness of operand markOverdefined(&I); else if (OpSt.isConstant()) // Propagate constant value - markConstant(&I, ConstantExpr::getCast(I.getOpcode(), + markConstant(&I, ConstantExpr::getCast(I.getOpcode(), OpSt.getConstant(), I.getType())); } @@ -821,7 +767,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { // structs in structs. if (EVI.getType()->isStructTy()) return markAnythingOverdefined(&EVI); - + // If this is extracting from more than one level of struct, we don't know. if (EVI.getNumIndices() != 1) return markOverdefined(&EVI); @@ -841,15 +787,15 @@ void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) { StructType *STy = dyn_cast<StructType>(IVI.getType()); if (STy == 0) return markOverdefined(&IVI); - + // If this has more than one index, we can't handle it, drive all results to // undef. if (IVI.getNumIndices() != 1) return markAnythingOverdefined(&IVI); - + Value *Aggr = IVI.getAggregateOperand(); unsigned Idx = *IVI.idx_begin(); - + // Compute the result based on what we're inserting. for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { // This passes through all values that aren't the inserted element. @@ -858,7 +804,7 @@ void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) { mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal); continue; } - + Value *Val = IVI.getInsertedValueOperand(); if (Val->getType()->isStructTy()) // We don't track structs in structs. @@ -875,25 +821,25 @@ void SCCPSolver::visitSelectInst(SelectInst &I) { // TODO: We could do a lot better than this if code actually uses this. if (I.getType()->isStructTy()) return markAnythingOverdefined(&I); - + LatticeVal CondValue = getValueState(I.getCondition()); if (CondValue.isUndefined()) return; - + if (ConstantInt *CondCB = CondValue.getConstantInt()) { Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue(); mergeInValue(&I, getValueState(OpVal)); return; } - + // Otherwise, the condition is overdefined or a constant we can't evaluate. // See if we can produce something better than overdefined based on the T/F // value. LatticeVal TVal = getValueState(I.getTrueValue()); LatticeVal FVal = getValueState(I.getFalseValue()); - + // select ?, C, C -> C. 
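The merge rule driving the PHI handling above (and the select folds below) is a three-level lattice join, applied only to operands arriving over feasible edges. As a toy model, with all types invented for illustration:

    // Undefined < Constant(v) < Overdefined: undefined is the identity of
    // the join, and disagreeing constants collapse to overdefined.
    struct Lattice {
      enum Kind { Undefined, Constant, Overdefined } K;
      int Val;  // Meaningful only when K == Constant.
    };

    Lattice Join(Lattice A, Lattice B) {
      if (A.K == Lattice::Undefined) return B;
      if (B.K == Lattice::Undefined) return A;
      if (A.K == Lattice::Constant && B.K == Lattice::Constant &&
          A.Val == B.Val)
        return A;
      Lattice Top = { Lattice::Overdefined, 0 };
      return Top;
    }

The "select ?, C, C -> C" fold noted above is this join applied to the select's two arms.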
- if (TVal.isConstant() && FVal.isConstant() && + if (TVal.isConstant() && FVal.isConstant() && TVal.getConstant() == FVal.getConstant()) return markConstant(&I, FVal.getConstant()); @@ -908,7 +854,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) { void SCCPSolver::visitBinaryOperator(Instruction &I) { LatticeVal V1State = getValueState(I.getOperand(0)); LatticeVal V2State = getValueState(I.getOperand(1)); - + LatticeVal &IV = ValueState[&I]; if (IV.isOverdefined()) return; @@ -916,14 +862,14 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { return markConstant(IV, &I, ConstantExpr::get(I.getOpcode(), V1State.getConstant(), V2State.getConstant())); - + // If something is undef, wait for it to resolve. if (!V1State.isOverdefined() && !V2State.isOverdefined()) return; - + // Otherwise, one of our operands is overdefined. Try to produce something // better than overdefined with some tricks. - + // If this is an AND or OR with 0 or -1, it doesn't matter that the other // operand is overdefined. if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) { @@ -945,7 +891,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { Constant::getAllOnesValue(I.getType())); return; } - + if (I.getOpcode() == Instruction::And) { // X and 0 = 0 if (NonOverdefVal->getConstant()->isNullValue()) @@ -959,64 +905,6 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { } - // If both operands are PHI nodes, it is possible that this instruction has - // a constant value, despite the fact that the PHI node doesn't. Check for - // this condition now. - if (PHINode *PN1 = dyn_cast<PHINode>(I.getOperand(0))) - if (PHINode *PN2 = dyn_cast<PHINode>(I.getOperand(1))) - if (PN1->getParent() == PN2->getParent()) { - // Since the two PHI nodes are in the same basic block, they must have - // entries for the same predecessors. Walk the predecessor list, and - // if all of the incoming values are constants, and the result of - // evaluating this expression with all incoming value pairs is the - // same, then this expression is a constant even though the PHI node - // is not a constant! - LatticeVal Result; - for (unsigned i = 0, e = PN1->getNumIncomingValues(); i != e; ++i) { - LatticeVal In1 = getValueState(PN1->getIncomingValue(i)); - BasicBlock *InBlock = PN1->getIncomingBlock(i); - LatticeVal In2 =getValueState(PN2->getIncomingValueForBlock(InBlock)); - - if (In1.isOverdefined() || In2.isOverdefined()) { - Result.markOverdefined(); - break; // Cannot fold this operation over the PHI nodes! - } - - if (In1.isConstant() && In2.isConstant()) { - Constant *V = ConstantExpr::get(I.getOpcode(), In1.getConstant(), - In2.getConstant()); - if (Result.isUndefined()) - Result.markConstant(V); - else if (Result.isConstant() && Result.getConstant() != V) { - Result.markOverdefined(); - break; - } - } - } - - // If we found a constant value here, then we know the instruction is - // constant despite the fact that the PHI nodes are overdefined. - if (Result.isConstant()) { - markConstant(IV, &I, Result.getConstant()); - // Remember that this instruction is virtually using the PHI node - // operands. - InsertInOverdefinedPHIs(&I, PN1); - InsertInOverdefinedPHIs(&I, PN2); - return; - } - - if (Result.isUndefined()) - return; - - // Okay, this really is overdefined now. 
Since we might have
- // speculatively thought that this was not overdefined before, and
- // added ourselves to the UsersOfOverdefinedPHIs list for the PHIs,
- // make sure to clean out any entries that we put there, for
- // efficiency.
- RemoveFromOverdefinedPHIs(&I, PN1);
- RemoveFromOverdefinedPHIs(&I, PN2);
- }
-
 markOverdefined(&I);
 }
@@ -1029,75 +917,13 @@ void SCCPSolver::visitCmpInst(CmpInst &I) {
 if (IV.isOverdefined()) return;
 if (V1State.isConstant() && V2State.isConstant())
- return markConstant(IV, &I, ConstantExpr::getCompare(I.getPredicate(),
- V1State.getConstant(),
+ return markConstant(IV, &I, ConstantExpr::getCompare(I.getPredicate(),
+ V1State.getConstant(),
 V2State.getConstant()));
-
+
 // If operands are still undefined, wait for them to resolve.
 if (!V1State.isOverdefined() && !V2State.isOverdefined())
 return;
-
- // If something is overdefined, use some tricks to avoid ending up and over
- // defined if we can.
-
- // If both operands are PHI nodes, it is possible that this instruction has
- // a constant value, despite the fact that the PHI node doesn't. Check for
- // this condition now.
- if (PHINode *PN1 = dyn_cast<PHINode>(I.getOperand(0)))
- if (PHINode *PN2 = dyn_cast<PHINode>(I.getOperand(1)))
- if (PN1->getParent() == PN2->getParent()) {
- // Since the two PHI nodes are in the same basic block, they must have
- // entries for the same predecessors. Walk the predecessor list, and
- // if all of the incoming values are constants, and the result of
- // evaluating this expression with all incoming value pairs is the
- // same, then this expression is a constant even though the PHI node
- // is not a constant!
- LatticeVal Result;
- for (unsigned i = 0, e = PN1->getNumIncomingValues(); i != e; ++i) {
- LatticeVal In1 = getValueState(PN1->getIncomingValue(i));
- BasicBlock *InBlock = PN1->getIncomingBlock(i);
- LatticeVal In2 =getValueState(PN2->getIncomingValueForBlock(InBlock));
-
- if (In1.isOverdefined() || In2.isOverdefined()) {
- Result.markOverdefined();
- break; // Cannot fold this operation over the PHI nodes!
- }
-
- if (In1.isConstant() && In2.isConstant()) {
- Constant *V = ConstantExpr::getCompare(I.getPredicate(),
- In1.getConstant(),
- In2.getConstant());
- if (Result.isUndefined())
- Result.markConstant(V);
- else if (Result.isConstant() && Result.getConstant() != V) {
- Result.markOverdefined();
- break;
- }
- }
- }
-
- // If we found a constant value here, then we know the instruction is
- // constant despite the fact that the PHI nodes are overdefined.
- if (Result.isConstant()) {
- markConstant(&I, Result.getConstant());
- // Remember that this instruction is virtually using the PHI node
- // operands.
- InsertInOverdefinedPHIs(&I, PN1);
- InsertInOverdefinedPHIs(&I, PN2);
- return;
- }
-
- if (Result.isUndefined())
- return;
-
- // Okay, this really is overdefined now. Since we might have
- // speculatively thought that this was not overdefined before, and
- // added ourselves to the UsersOfOverdefinedPHIs list for the PHIs,
- // make sure to clean out any entries that we put there, for
- // efficiency.
- RemoveFromOverdefinedPHIs(&I, PN1); - RemoveFromOverdefinedPHIs(&I, PN2); - } markOverdefined(&I); } @@ -1135,7 +961,7 @@ void SCCPSolver::visitInsertElementInst(InsertElementInst &I) { EltState.getConstant(), IdxState.getConstant())); else if (ValState.isUndefined() && EltState.isConstant() && - IdxState.isConstant()) + IdxState.isConstant()) markConstant(&I,ConstantExpr::getInsertElement(UndefValue::get(I.getType()), EltState.getConstant(), IdxState.getConstant())); @@ -1153,17 +979,17 @@ void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) { if (MaskState.isUndefined() || (V1State.isUndefined() && V2State.isUndefined())) return; // Undefined output if mask or both inputs undefined. - + if (V1State.isOverdefined() || V2State.isOverdefined() || MaskState.isOverdefined()) { markOverdefined(&I); } else { // A mix of constant/undef inputs. - Constant *V1 = V1State.isConstant() ? + Constant *V1 = V1State.isConstant() ? V1State.getConstant() : UndefValue::get(I.getType()); - Constant *V2 = V2State.isConstant() ? + Constant *V2 = V2State.isConstant() ? V2State.getConstant() : UndefValue::get(I.getType()); - Constant *Mask = MaskState.isConstant() ? + Constant *Mask = MaskState.isConstant() ? MaskState.getConstant() : UndefValue::get(I.getOperand(2)->getType()); markConstant(&I, ConstantExpr::getShuffleVector(V1, V2, Mask)); } @@ -1183,7 +1009,7 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { LatticeVal State = getValueState(I.getOperand(i)); if (State.isUndefined()) return; // Operands are not resolved yet. - + if (State.isOverdefined()) return markOverdefined(&I); @@ -1200,10 +1026,10 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) { // If this store is of a struct, ignore it. if (SI.getOperand(0)->getType()->isStructTy()) return; - + if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1))) return; - + GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1)); DenseMap<GlobalVariable*, LatticeVal>::iterator I = TrackedGlobals.find(GV); if (I == TrackedGlobals.end() || I->second.isOverdefined()) return; @@ -1221,22 +1047,22 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { // If this load is of a struct, just mark the result overdefined. if (I.getType()->isStructTy()) return markAnythingOverdefined(&I); - + LatticeVal PtrVal = getValueState(I.getOperand(0)); if (PtrVal.isUndefined()) return; // The pointer is not resolved yet! - + LatticeVal &IV = ValueState[&I]; if (IV.isOverdefined()) return; if (!PtrVal.isConstant() || I.isVolatile()) return markOverdefined(IV, &I); - + Constant *Ptr = PtrVal.getConstant(); // load null -> null if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0) return markConstant(IV, &I, Constant::getNullValue(I.getType())); - + // Transform load (constant global) into the value loaded. if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) { if (!TrackedGlobals.empty()) { @@ -1262,7 +1088,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) { void SCCPSolver::visitCallSite(CallSite CS) { Function *F = CS.getCalledFunction(); Instruction *I = CS.getInstruction(); - + // The common case is that we aren't tracking the callee, either because we // are not doing interprocedural analysis or the callee is indirect, or is // external. Handle these cases first. @@ -1270,17 +1096,17 @@ void SCCPSolver::visitCallSite(CallSite CS) { CallOverdefined: // Void return and not tracking callee, just bail. 
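Stepping back to visitLoadInst above: the interesting fold, loading through a pointer whose lattice value is a constant GlobalVariable, hinges on the usual initializer checks (a hedged sketch, since the exact code is elided by the hunk; the predicates here are the standard GlobalVariable ones):

    // A load from an untracked, genuinely constant global folds to its
    // initializer; anything weaker must stay overdefined.
    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr))
      if (GV->isConstant() && GV->hasDefinitiveInitializer())
        return markConstant(IV, &I, GV->getInitializer());

Returning to the call sites: when the callee is untracked, the code below either bails or constant-folds the call outright.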
if (I->getType()->isVoidTy()) return; - + // Otherwise, if we have a single return value case, and if the function is // a declaration, maybe we can constant fold it. if (F && F->isDeclaration() && !I->getType()->isStructTy() && canConstantFoldCallTo(F)) { - + SmallVector<Constant*, 8> Operands; for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end(); AI != E; ++AI) { LatticeVal State = getValueState(*AI); - + if (State.isUndefined()) return; // Operands are not resolved yet. if (State.isOverdefined()) @@ -1288,10 +1114,10 @@ CallOverdefined: assert(State.isConstant() && "Unknown state!"); Operands.push_back(State.getConstant()); } - + // If we can constant fold this, mark the result of the call as a // constant. - if (Constant *C = ConstantFoldCall(F, Operands)) + if (Constant *C = ConstantFoldCall(F, Operands, TLI)) return markConstant(I, C); } @@ -1304,7 +1130,7 @@ CallOverdefined: // the formal arguments of the function. if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){ MarkBlockExecutable(F->begin()); - + // Propagate information from this call site into the callee. CallSite::arg_iterator CAI = CS.arg_begin(); for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); @@ -1315,7 +1141,7 @@ CallOverdefined: markOverdefined(AI); continue; } - + if (StructType *STy = dyn_cast<StructType>(AI->getType())) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { LatticeVal CallArg = getStructValueState(*CAI, i); @@ -1326,22 +1152,22 @@ CallOverdefined: } } } - + // If this is a single/zero retval case, see if we're tracking the function. if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) { if (!MRVFunctionsTracked.count(F)) goto CallOverdefined; // Not tracking this callee. - + // If we are tracking this callee, propagate the result of the function // into this call site. for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) - mergeInValue(getStructValueState(I, i), I, + mergeInValue(getStructValueState(I, i), I, TrackedMultipleRetVals[std::make_pair(F, i)]); } else { DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F); if (TFRVI == TrackedRetVals.end()) goto CallOverdefined; // Not tracking this callee. - + // If so, propagate the return value of the callee into this call result. mergeInValue(I, TFRVI->second); } @@ -1370,7 +1196,7 @@ void SCCPSolver::Solve() { if (Instruction *I = dyn_cast<Instruction>(*UI)) OperandChangedState(I); } - + // Process the instruction work list. while (!InstWorkList.empty()) { Value *I = InstWorkList.pop_back_val(); @@ -1427,11 +1253,11 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { if (!BBExecutable.count(BB)) continue; - + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { // Look for instructions which produce undef values. if (I->getType()->isVoidTy()) continue; - + if (StructType *STy = dyn_cast<StructType>(I->getType())) { // Only a few things that can be structs matter for undef. @@ -1442,7 +1268,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { continue; // extractvalue and insertvalue don't need to be marked; they are - // tracked as precisely as their operands. + // tracked as precisely as their operands. if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I)) continue; @@ -1549,12 +1375,12 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // X / undef -> undef. No change. // X % undef -> undef. No change. 
if (Op1LV.isUndefined()) break; - + // undef / X -> 0. X could be maxint. // undef % X -> 0. X could be 1. markForcedConstant(I, Constant::getNullValue(ITy)); return true; - + case Instruction::AShr: // X >>a undef -> undef. if (Op1LV.isUndefined()) break; @@ -1587,7 +1413,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } else { // Leave Op1LV as Operand(1)'s LatticeValue. } - + if (Op1LV.isConstant()) markForcedConstant(I, Op1LV.getConstant()); else @@ -1627,7 +1453,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { return true; } } - + // Check to see if we have a branch or switch on an undefined value. If so // we force the branch to go one way or the other to make the successor // values live. It doesn't really matter which way we force it. @@ -1636,7 +1462,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { if (!BI->isConditional()) continue; if (!getValueState(BI->getCondition()).isUndefined()) continue; - + // If the input to SCCP is actually branch on undef, fix the undef to // false. if (isa<UndefValue>(BI->getCondition())) { @@ -1644,7 +1470,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { markEdgeExecutable(BB, TI->getSuccessor(1)); return true; } - + // Otherwise, it is a branch on a symbolic value which is currently // considered to be undef. Handle this by forcing the input value to the // branch to false. @@ -1652,22 +1478,22 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { ConstantInt::getFalse(TI->getContext())); return true; } - + if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { - if (SI->getNumSuccessors() < 2) // no cases + if (!SI->getNumCases()) continue; if (!getValueState(SI->getCondition()).isUndefined()) continue; - + // If the input to SCCP is actually switch on undef, fix the undef to // the first constant. if (isa<UndefValue>(SI->getCondition())) { - SI->setCondition(SI->getCaseValue(1)); - markEdgeExecutable(BB, TI->getSuccessor(1)); + SI->setCondition(SI->case_begin().getCaseValue()); + markEdgeExecutable(BB, SI->case_begin().getCaseSuccessor()); return true; } - - markForcedConstant(SI->getCondition(), SI->getCaseValue(1)); + + markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue()); return true; } } @@ -1683,6 +1509,9 @@ namespace { /// Sparse Conditional Constant Propagator. /// struct SCCP : public FunctionPass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetLibraryInfo>(); + } static char ID; // Pass identification, replacement for typeid SCCP() : FunctionPass(ID) { initializeSCCPPass(*PassRegistry::getPassRegistry()); @@ -1735,7 +1564,9 @@ static void DeleteInstructionInBlock(BasicBlock *BB) { // bool SCCP::runOnFunction(Function &F) { DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); - SCCPSolver Solver(getAnalysisIfAvailable<TargetData>()); + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + SCCPSolver Solver(TD, TLI); // Mark the first block of the function as being executable. Solver.MarkBlockExecutable(F.begin()); @@ -1764,7 +1595,7 @@ bool SCCP::runOnFunction(Function &F) { MadeChanges = true; continue; } - + // Iterate over all of the instructions in a function, replacing them with // constants if we have found them to be of constant values. // @@ -1772,25 +1603,25 @@ bool SCCP::runOnFunction(Function &F) { Instruction *Inst = BI++; if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst)) continue; - + // TODO: Reconstruct structs from their elements. 
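The undef-resolution rules above come together in the terminator handling, which resolves undef in two subtly different ways, worth seeing side by side (condensed from the code above; BI is the conditional branch being forced):

    if (isa<UndefValue>(BI->getCondition())) {
      // A literal "br i1 undef": arbitrarily treat it as false and make
      // the false successor live. Which way we force it doesn't matter.
      markEdgeExecutable(BB, TI->getSuccessor(1));
    } else {
      // A symbolic condition that is still undef in the lattice: force the
      // value itself to false so every user sees the same resolution.
      markForcedConstant(BI->getCondition(),
                         ConstantInt::getFalse(TI->getContext()));
    }

Back in the rewrite loop, struct-typed results are still skipped, as the TODO above notes.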
if (Inst->getType()->isStructTy()) continue; - + LatticeVal IV = Solver.getLatticeValueFor(Inst); if (IV.isOverdefined()) continue; - + Constant *Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(Inst->getType()); DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst); // Replaces all of the uses of a variable with uses of the constant. Inst->replaceAllUsesWith(Const); - + // Delete the instruction. Inst->eraseFromParent(); - + // Hey, we just changed something! MadeChanges = true; ++NumInstRemoved; @@ -1807,6 +1638,9 @@ namespace { /// Constant Propagation. /// struct IPSCCP : public ModulePass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetLibraryInfo>(); + } static char ID; IPSCCP() : ModulePass(ID) { initializeIPSCCPPass(*PassRegistry::getPassRegistry()); @@ -1816,7 +1650,11 @@ namespace { } // end anonymous namespace char IPSCCP::ID = 0; -INITIALIZE_PASS(IPSCCP, "ipsccp", +INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp", + "Interprocedural Sparse Conditional Constant Propagation", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(IPSCCP, "ipsccp", "Interprocedural Sparse Conditional Constant Propagation", false, false) @@ -1855,7 +1693,9 @@ static bool AddressIsTaken(const GlobalValue *GV) { } bool IPSCCP::runOnModule(Module &M) { - SCCPSolver Solver(getAnalysisIfAvailable<TargetData>()); + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + SCCPSolver Solver(TD, TLI); // AddressTakenFunctions - This set keeps track of the address-taken functions // that are in the input. As IPSCCP runs through and simplifies code, @@ -1863,19 +1703,19 @@ bool IPSCCP::runOnModule(Module &M) { // address-taken-ness. Because of this, we keep track of their addresses from // the first pass so we can use them for the later simplification pass. SmallPtrSet<Function*, 32> AddressTakenFunctions; - + // Loop over all functions, marking arguments to those with their addresses // taken or that are external as overdefined. // for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { if (F->isDeclaration()) continue; - + // If this is a strong or ODR definition of this function, then we can // propagate information about its result into callsites of it. if (!F->mayBeOverridden()) Solver.AddTrackedFunction(F); - + // If this function only has direct calls that we can see, we can track its // arguments and return value aggressively, and can assume it is not called // unless we see evidence to the contrary. @@ -1890,7 +1730,7 @@ bool IPSCCP::runOnModule(Module &M) { // Assume the function is called. Solver.MarkBlockExecutable(F->begin()); - + // Assume nothing about the incoming arguments. for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI) @@ -1928,17 +1768,17 @@ bool IPSCCP::runOnModule(Module &M) { for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI) { if (AI->use_empty() || AI->getType()->isStructTy()) continue; - + // TODO: Could use getStructLatticeValueFor to find out if the entire // result is a constant and replace it entirely if so. LatticeVal IV = Solver.getLatticeValueFor(AI); if (IV.isOverdefined()) continue; - + Constant *CST = IV.isConstant() ? IV.getConstant() : UndefValue::get(AI->getType()); DEBUG(dbgs() << "*** Arg " << *AI << " = " << *CST <<"\n"); - + // Replaces all of the uses of a variable with uses of the // constant. 
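Both rewrite loops in this file lean on the same iterator discipline when deleting instructions as they go; in isolation (ShouldReplace and GetConstantFor are hypothetical helpers standing in for the lattice queries):

    // Advance the iterator before mutating, so erasing Inst cannot
    // invalidate the loop's position.
    for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
      Instruction *Inst = BI++;                      // Step first ...
      if (ShouldReplace(Inst)) {
        Inst->replaceAllUsesWith(GetConstantFor(Inst));
        Inst->eraseFromParent();                     // ... then erase.
      }
    }

The argument replacement announced by the comment above needs no erase at all, so a plain RAUW follows.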
AI->replaceAllUsesWith(CST);
@@ -1967,19 +1807,19 @@ bool IPSCCP::runOnModule(Module &M) {
 new UnreachableInst(M.getContext(), BB);
 continue;
 }
-
+
 for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
 Instruction *Inst = BI++;
 if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy())
 continue;
-
+
 // TODO: Could use getStructLatticeValueFor to find out if the entire
 // result is a constant and replace it entirely if so.
-
+
 LatticeVal IV = Solver.getLatticeValueFor(Inst);
 if (IV.isOverdefined())
 continue;
-
+
 Constant *Const = IV.isConstant()
 ? IV.getConstant() : UndefValue::get(Inst->getType());
 DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst);
@@ -1987,7 +1827,7 @@ bool IPSCCP::runOnModule(Module &M) {
 // Replaces all of the uses of a variable with uses of the
 // constant.
 Inst->replaceAllUsesWith(Const);
-
+
 // Delete the instruction.
 if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
 Inst->eraseFromParent();
@@ -2029,15 +1869,15 @@ bool IPSCCP::runOnModule(Module &M) {
 llvm_unreachable("Didn't fold away reference to block!");
 }
 #endif
-
+
 // Make this an uncond branch to the first successor.
 TerminatorInst *TI = I->getParent()->getTerminator();
 BranchInst::Create(TI->getSuccessor(0), TI);
-
+
 // Remove entries in successor phi nodes to remove edges.
 for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
 TI->getSuccessor(i)->removePredecessor(TI->getParent());
-
+
 // Remove the old terminator.
 TI->eraseFromParent();
 }
@@ -2060,7 +1900,7 @@ bool IPSCCP::runOnModule(Module &M) {
 // last use of a function, the order of processing functions would affect
 // whether other functions are optimizable.
 SmallVector<ReturnInst*, 8> ReturnsToZap;
-
+
 // TODO: Process multiple value ret instructions also.
 const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
 for (DenseMap<Function*, LatticeVal>::const_iterator I = RV.begin(),
@@ -2068,11 +1908,11 @@ bool IPSCCP::runOnModule(Module &M) {
 Function *F = I->first;
 if (I->second.isOverdefined() || F->getReturnType()->isVoidTy())
 continue;
-
+
 // We can only do this if we know that nothing else can call the function.
 if (!F->hasLocalLinkage() || AddressTakenFunctions.count(F))
 continue;
-
+
 for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
 if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
 if (!isa<UndefValue>(RI->getOperand(0)))
@@ -2084,9 +1924,9 @@ bool IPSCCP::runOnModule(Module &M) {
 Function *F = ReturnsToZap[i]->getParent()->getParent();
 ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
 }
-
- // If we inferred constant or undef values for globals variables, we can delete
- // the global and any stores that remain to it.
+
+ // If we inferred constant or undef values for global variables, we can
+ // delete the global and any stores that remain to it.
const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals(); for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(), E = TG.end(); I != E; ++I) { diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index f6918de..7d65bcc 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -51,6 +51,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLowerExpectIntrinsicPass(Registry); initializeMemCpyOptPass(Registry); initializeObjCARCAliasAnalysisPass(Registry); + initializeObjCARCAPElimPass(Registry); initializeObjCARCExpandPass(Registry); initializeObjCARCContractPass(Registry); initializeObjCARCOptPass(Registry); diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index c6d9123..026fea1 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -13,7 +13,7 @@ // each member (if possible). Then, if possible, it transforms the individual // alloca instructions into nice clean scalar SSA form. // -// This combines a simple SRoA algorithm with the Mem2Reg algorithm because +// This combines a simple SRoA algorithm with the Mem2Reg algorithm because they // often interact, especially for C++ programs. As such, iterating between // SRoA, then Mem2Reg until we run out of things to promote works well. // @@ -453,6 +453,8 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); + if (!GEP->getPointerOperandType()->isPointerTy()) + return false; uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), Indices); // See if all uses can be converted. @@ -572,8 +574,9 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // transform it into a store of the expanded constant value. if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { assert(MSI->getRawDest() == Ptr && "Consistency error!"); - unsigned NumBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); - if (NumBytes != 0) { + int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue(); + if (SNumBytes > 0 && (SNumBytes >> 32) == 0) { + unsigned NumBytes = static_cast<unsigned>(SNumBytes); unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue(); // Compute the value replicated the right number of times. @@ -806,8 +809,10 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, return Builder.CreateBitCast(SV, AllocaType); // Must be an element insertion. - assert(SV->getType() == VTy->getElementType()); - uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); + Type *EltTy = VTy->getElementType(); + if (SV->getType() != EltTy) + SV = Builder.CreateBitCast(SV, EltTy); + uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy); unsigned Elt = Offset/EltSize; return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt)); } @@ -934,13 +939,14 @@ public: void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { // Remember which alloca we're promoting (for isInstInList). 
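A note on the memset handling earlier in this file: the new signedness check matters because a constant length is a 64-bit quantity that the old getZExtValue() path silently truncated. Condensed (a sketch, using the same names as the hunk above):

    // Read the constant length as signed and take the fast path only when
    // it is positive and fits in 32 bits; a negative or enormous length
    // (e.g. from a bogus memset) must not be truncated into a small count.
    int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue();
    if (SNumBytes > 0 && (SNumBytes >> 32) == 0) {
      unsigned NumBytes = static_cast<unsigned>(SNumBytes);
      // ... expand the memset into NumBytes worth of stores.
    }

Below, the promoter's bookkeeping for the alloca being rewritten continues.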
this->AI = AI; - if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) + if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) { for (Value::use_iterator UI = DebugNode->use_begin(), E = DebugNode->use_end(); UI != E; ++UI) if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) DDIs.push_back(DDI); else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) DVIs.push_back(DVI); + } LoadAndStorePromoter::run(Insts); AI->eraseFromParent(); @@ -975,30 +981,25 @@ public: for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; + Value *Arg = NULL; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - Instruction *DbgVal = NULL; // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. - Argument *ExtendedArg = NULL; if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) - ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0)); + Arg = dyn_cast<Argument>(ZExt->getOperand(0)); if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) - ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0)); - if (ExtendedArg) - DbgVal = DIB->insertDbgValueIntrinsic(ExtendedArg, 0, - DIVariable(DVI->getVariable()), - SI); - else - DbgVal = DIB->insertDbgValueIntrinsic(SI->getOperand(0), 0, - DIVariable(DVI->getVariable()), - SI); - DbgVal->setDebugLoc(DVI->getDebugLoc()); + Arg = dyn_cast<Argument>(SExt->getOperand(0)); + if (!Arg) + Arg = SI->getOperand(0); } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - Instruction *DbgVal = - DIB->insertDbgValueIntrinsic(LI->getOperand(0), 0, - DIVariable(DVI->getVariable()), LI); - DbgVal->setDebugLoc(DVI->getDebugLoc()); + Arg = LI->getOperand(0); + } else { + continue; } + Instruction *DbgVal = + DIB->insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), + Inst); + DbgVal->setDebugLoc(DVI->getDebugLoc()); } } }; @@ -1517,6 +1518,9 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); if (Length == 0) return MarkUnsafe(Info, User); + if (Length->isNegative()) + return MarkUnsafe(Info, User); + isSafeMemAccess(Offset, Length->getZExtValue(), 0, UI.getOperandNo() == 0, Info, MI, true /*AllowWholeAccess*/); @@ -1873,8 +1877,14 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, return; // The bitcast references the original alloca. Replace its uses with - // references to the first new element alloca. - Instruction *Val = NewElts[0]; + // references to the alloca containing offset zero (which is normally at + // index zero, but might not be in cases involving structs with elements + // of size zero). + Type *T = AI->getAllocatedType(); + uint64_t EltOffset = 0; + Type *IdxTy; + uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy); + Instruction *Val = NewElts[Idx]; if (Val->getType() != BC->getDestTy()) { Val = new BitCastInst(Val, BC->getDestTy(), "", BC); Val->takeName(BC); @@ -2146,8 +2156,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // If the requested value was a vector constant, create it. 
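The debug-value rewrite above prefers the original function argument over a value derived from it; the peeling step in isolation (condensed from the code above; SI is the store feeding the dbg.value):

    // If the stored value is just a zero/sign extension of an argument,
    // point the debug value at the argument itself: a later pass may delete
    // the extension, which would otherwise orphan the debug info.
    Value *Arg = 0;
    if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
      Arg = dyn_cast<Argument>(ZExt->getOperand(0));
    if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
      Arg = dyn_cast<Argument>(SExt->getOperand(0));
    if (!Arg)
      Arg = SI->getOperand(0);  // Fall back to the stored value itself.

The vector-constant case flagged by the comment above is then built with the new ConstantVector::getSplat helper shown next.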
if (EltTy->isVectorTy()) { unsigned NumElts = cast<VectorType>(EltTy)->getNumElements(); - SmallVector<Constant*, 16> Elts(NumElts, StoreVal); - StoreVal = ConstantVector::get(Elts); + StoreVal = ConstantVector::getSplat(NumElts, StoreVal); } } new StoreInst(StoreVal, EltPtr, MI); @@ -2158,6 +2167,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, } unsigned EltSize = TD->getTypeAllocSize(EltTy); + if (!EltSize) + continue; IRBuilder<> Builder(MI); @@ -2524,13 +2535,12 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, // ignore it if we know that the value isn't captured. unsigned ArgNo = CS.getArgumentNo(UI); if (CS.onlyReadsMemory() && - (CS.getInstruction()->use_empty() || - CS.paramHasAttr(ArgNo+1, Attribute::NoCapture))) + (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) continue; // If this is being passed as a byval argument, the caller is making a // copy, so it is only a read of the alloca. - if (CS.paramHasAttr(ArgNo+1, Attribute::ByVal)) + if (CS.isByValArgument(ArgNo)) continue; } diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index fbb9465..9c49ec1 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -256,19 +256,18 @@ struct StrChrOpt : public LibCallOptimization { ConstantInt::get(TD->getIntPtrType(*Context), Len), B, TD); } - + // Otherwise, the character is a constant, see if the first argument is // a string literal. If so, we can constant fold. - std::string Str; - if (!GetConstantStringInfo(SrcStr, Str)) + StringRef Str; + if (!getConstantStringInfo(SrcStr, Str)) return 0; - // strchr can find the nul character. - Str += '\0'; - - // Compute the offset. - size_t I = Str.find(CharC->getSExtValue()); - if (I == std::string::npos) // Didn't find the char. strchr returns null. + // Compute the offset, make sure to handle the case when we're searching for + // zero (a weird way to spell strlen). + size_t I = CharC->getSExtValue() == 0 ? + Str.size() : Str.find(CharC->getSExtValue()); + if (I == StringRef::npos) // Didn't find the char. strchr returns null. return Constant::getNullValue(CI->getType()); // strchr(s+n,c) -> gep(s+n+i,c) @@ -296,20 +295,18 @@ struct StrRChrOpt : public LibCallOptimization { if (!CharC) return 0; - std::string Str; - if (!GetConstantStringInfo(SrcStr, Str)) { + StringRef Str; + if (!getConstantStringInfo(SrcStr, Str)) { // strrchr(s, 0) -> strchr(s, 0) if (TD && CharC->isZero()) return EmitStrChr(SrcStr, '\0', B, TD); return 0; } - // strrchr can find the nul character. - Str += '\0'; - // Compute the offset. - size_t I = Str.rfind(CharC->getSExtValue()); - if (I == std::string::npos) // Didn't find the char. Return null. + size_t I = CharC->getSExtValue() == 0 ? + Str.size() : Str.rfind(CharC->getSExtValue()); + if (I == StringRef::npos) // Didn't find the char. Return null. 
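The StrChr/StrRChr hunks above move from std::string (which appended an explicit NUL) to getConstantStringInfo's StringRef, which carries no terminator, so searching for '\0' becomes an explicit special case. A sketch of the offset computation with std::string_view standing in for StringRef; the function name is mine:

    #include <cassert>
    #include <string_view>

    // strchr can find the NUL terminator itself, at offset strlen(s);
    // npos corresponds to the fold returning a null pointer.
    size_t strchrOffset(std::string_view Str, char C) {
      return C == '\0' ? Str.size() : Str.find(C);
    }

    int main() {
      assert(strchrOffset("abc", 'b') == 1);
      assert(strchrOffset("abc", '\0') == 3);  // the terminator
      assert(strchrOffset("abc", 'z') == std::string_view::npos);
    }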
return Constant::getNullValue(CI->getType()); // strrchr(s+n,c) -> gep(s+n+i,c) @@ -334,14 +331,13 @@ struct StrCmpOpt : public LibCallOptimization { if (Str1P == Str2P) // strcmp(x,x) -> 0 return ConstantInt::get(CI->getType(), 0); - std::string Str1, Str2; - bool HasStr1 = GetConstantStringInfo(Str1P, Str1); - bool HasStr2 = GetConstantStringInfo(Str2P, Str2); + StringRef Str1, Str2; + bool HasStr1 = getConstantStringInfo(Str1P, Str1); + bool HasStr2 = getConstantStringInfo(Str2P, Str2); // strcmp(x, y) -> cnst (if both x and y are constant strings) if (HasStr1 && HasStr2) - return ConstantInt::get(CI->getType(), - StringRef(Str1).compare(Str2)); + return ConstantInt::get(CI->getType(), Str1.compare(Str2)); if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), @@ -397,14 +393,14 @@ struct StrNCmpOpt : public LibCallOptimization { if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD); - std::string Str1, Str2; - bool HasStr1 = GetConstantStringInfo(Str1P, Str1); - bool HasStr2 = GetConstantStringInfo(Str2P, Str2); + StringRef Str1, Str2; + bool HasStr1 = getConstantStringInfo(Str1P, Str1); + bool HasStr2 = getConstantStringInfo(Str2P, Str2); // strncmp(x, y) -> cnst (if both x and y are constant strings) if (HasStr1 && HasStr2) { - StringRef SubStr1 = StringRef(Str1).substr(0, Length); - StringRef SubStr2 = StringRef(Str2).substr(0, Length); + StringRef SubStr1 = Str1.substr(0, Length); + StringRef SubStr2 = Str2.substr(0, Length); return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); } @@ -549,9 +545,9 @@ struct StrPBrkOpt : public LibCallOptimization { FT->getReturnType() != FT->getParamType(0)) return 0; - std::string S1, S2; - bool HasS1 = GetConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = GetConstantStringInfo(CI->getArgOperand(1), S2); + StringRef S1, S2; + bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); // strpbrk(s, "") -> NULL // strpbrk("", s) -> NULL @@ -609,9 +605,9 @@ struct StrSpnOpt : public LibCallOptimization { !FT->getReturnType()->isIntegerTy()) return 0; - std::string S1, S2; - bool HasS1 = GetConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = GetConstantStringInfo(CI->getArgOperand(1), S2); + StringRef S1, S2; + bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); // strspn(s, "") -> 0 // strspn("", s) -> 0 @@ -619,8 +615,11 @@ struct StrSpnOpt : public LibCallOptimization { return Constant::getNullValue(CI->getType()); // Constant folding. - if (HasS1 && HasS2) - return ConstantInt::get(CI->getType(), strspn(S1.c_str(), S2.c_str())); + if (HasS1 && HasS2) { + size_t Pos = S1.find_first_not_of(S2); + if (Pos == StringRef::npos) Pos = S1.size(); + return ConstantInt::get(CI->getType(), Pos); + } return 0; } @@ -638,17 +637,20 @@ struct StrCSpnOpt : public LibCallOptimization { !FT->getReturnType()->isIntegerTy()) return 0; - std::string S1, S2; - bool HasS1 = GetConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = GetConstantStringInfo(CI->getArgOperand(1), S2); + StringRef S1, S2; + bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); // strcspn("", s) -> 0 if (HasS1 && S1.empty()) return Constant::getNullValue(CI->getType()); // Constant folding. 
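The strspn fold above (and the strcspn fold that continues below) replaces host calls to strspn/strcspn with StringRef scans, mapping npos back to the full string length. A host-side model, with std::string_view standing in for StringRef and the function names mine:

    #include <cassert>
    #include <string_view>

    // strspn: length of the leading prefix drawn only from S2.
    size_t foldStrSpn(std::string_view S1, std::string_view S2) {
      size_t Pos = S1.find_first_not_of(S2);
      return Pos == std::string_view::npos ? S1.size() : Pos;
    }

    // strcspn: length of the leading prefix containing nothing from S2.
    size_t foldStrCSpn(std::string_view S1, std::string_view S2) {
      size_t Pos = S1.find_first_of(S2);
      return Pos == std::string_view::npos ? S1.size() : Pos;
    }

    int main() {
      assert(foldStrSpn("aabbcc", "ab") == 4);   // agrees with strspn
      assert(foldStrCSpn("aabbcc", "c") == 4);   // agrees with strcspn
    }

Dropping the c_str() round trip also sidesteps the truncation that embedded NULs could cause in the old host calls, since StringRef is length-delimited.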
- if (HasS1 && HasS2) - return ConstantInt::get(CI->getType(), strcspn(S1.c_str(), S2.c_str())); + if (HasS1 && HasS2) { + size_t Pos = S1.find_first_of(S2); + if (Pos == StringRef::npos) Pos = S1.size(); + return ConstantInt::get(CI->getType(), Pos); + } // strcspn(s, "") -> strlen(s) if (TD && HasS2 && S2.empty()) @@ -692,9 +694,9 @@ struct StrStrOpt : public LibCallOptimization { } // See if either input string is a constant string. - std::string SearchStr, ToFindStr; - bool HasStr1 = GetConstantStringInfo(CI->getArgOperand(0), SearchStr); - bool HasStr2 = GetConstantStringInfo(CI->getArgOperand(1), ToFindStr); + StringRef SearchStr, ToFindStr; + bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr); + bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr); // fold strstr(x, "") -> x. if (HasStr2 && ToFindStr.empty()) @@ -704,7 +706,7 @@ struct StrStrOpt : public LibCallOptimization { if (HasStr1 && HasStr2) { std::string::size_type Offset = SearchStr.find(ToFindStr); - if (Offset == std::string::npos) // strstr("foo", "bar") -> null + if (Offset == StringRef::npos) // strstr("foo", "bar") -> null return Constant::getNullValue(CI->getType()); // strstr("abcd", "bc") -> gep((char*)"abcd", 1) @@ -756,11 +758,11 @@ struct MemCmpOpt : public LibCallOptimization { } // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) - std::string LHSStr, RHSStr; - if (GetConstantStringInfo(LHS, LHSStr) && - GetConstantStringInfo(RHS, RHSStr)) { + StringRef LHSStr, RHSStr; + if (getConstantStringInfo(LHS, LHSStr) && + getConstantStringInfo(RHS, RHSStr)) { // Make sure we're not reading out-of-bounds memory. - if (Len > LHSStr.length() || Len > RHSStr.length()) + if (Len > LHSStr.size() || Len > RHSStr.size()) return 0; uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len); return ConstantInt::get(CI->getType(), Ret); @@ -841,6 +843,28 @@ struct MemSetOpt : public LibCallOptimization { //===----------------------------------------------------------------------===// //===---------------------------------------===// +// 'cos*' Optimizations + +struct CosOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 1 argument of FP type, which matches the + // result type. + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return 0; + + // cos(-x) -> cos(x) + Value *Op1 = CI->getArgOperand(0); + if (BinaryOperator::isFNeg(Op1)) { + BinaryOperator *BinExpr = cast<BinaryOperator>(Op1); + return B.CreateCall(Callee, BinExpr->getOperand(1), "cos"); + } + return 0; + } +}; + +//===---------------------------------------===// // 'pow*' Optimizations struct PowOpt : public LibCallOptimization { @@ -870,7 +894,7 @@ struct PowOpt : public LibCallOptimization { if (Op2C->isExactlyValue(0.5)) { // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). // This is faster than calling pow, and still handles negative zero - // and negative infinite correctly. + // and negative infinity correctly. // TODO: In fast-math mode, this could be just sqrt(x). // TODO: In finite-only mode, this could be just fabs(sqrt(x)). 
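The PowOpt comment above compresses a subtle bit of floating-point semantics: sqrt alone does not match pow(x, 0.5) at two awkward inputs. A worked model of the expansion, with the helper name mine:

    #include <cassert>
    #include <cmath>
    #include <limits>

    // pow(-0.0, 0.5) is +0.0 but sqrt(-0.0) is -0.0, so fabs fixes the sign;
    // pow(-inf, 0.5) is +inf but sqrt(-inf) is NaN, so -inf is checked first.
    double expandPowHalf(double X) {
      if (X == -std::numeric_limits<double>::infinity())
        return std::numeric_limits<double>::infinity();
      return std::fabs(std::sqrt(X));
    }

    int main() {
      assert(!std::signbit(expandPowHalf(-0.0)));                // +0.0, not -0.0
      assert(std::isinf(expandPowHalf(-INFINITY)) && expandPowHalf(-INFINITY) > 0);
      assert(expandPowHalf(4.0) == 2.0);
    }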
Value *Inf = ConstantFP::getInfinity(CI->getType()); @@ -963,8 +987,7 @@ struct UnaryDoubleFPOpt : public LibCallOptimization { // floor((double)floatval) -> (double)floorf(floatval) Value *V = Cast->getOperand(0); - V = EmitUnaryFloatFnCall(V, Callee->getName().data(), B, - Callee->getAttributes()); + V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes()); return B.CreateFPExt(V, B.getDoubleTy()); } }; @@ -1000,7 +1023,7 @@ struct FFSOpt : public LibCallOptimization { Type *ArgType = Op->getType(); Value *F = Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType); - Value *V = B.CreateCall(F, Op, "cttz"); + Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz"); V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); V = B.CreateIntCast(V, B.getInt32Ty(), false); @@ -1095,8 +1118,8 @@ struct PrintFOpt : public LibCallOptimization { Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Check for a fixed format string. - std::string FormatStr; - if (!GetConstantStringInfo(CI->getArgOperand(0), FormatStr)) + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr)) return 0; // Empty format string -> noop. @@ -1122,11 +1145,9 @@ struct PrintFOpt : public LibCallOptimization { FormatStr.find('%') == std::string::npos) { // no format characters. // Create a string literal with no \n on it. We expect the constant merge // pass to be run after this pass, to merge duplicate strings. - FormatStr.erase(FormatStr.end()-1); - Constant *C = ConstantArray::get(*Context, FormatStr, true); - C = new GlobalVariable(*Callee->getParent(), C->getType(), true, - GlobalVariable::InternalLinkage, C, "str"); - EmitPutS(C, B, TD); + FormatStr = FormatStr.drop_back(); + Value *GV = B.CreateGlobalString(FormatStr, "str"); + EmitPutS(GV, B, TD); return CI->use_empty() ? (Value*)CI : ConstantInt::get(CI->getType(), FormatStr.size()+1); } @@ -1184,8 +1205,8 @@ struct SPrintFOpt : public LibCallOptimization { Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Check for a fixed format string. - std::string FormatStr; - if (!GetConstantStringInfo(CI->getArgOperand(1), FormatStr)) + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) return 0; // If we just have a format string (nothing else crazy) transform it. @@ -1296,7 +1317,8 @@ struct FWriteOpt : public LibCallOptimization { return ConstantInt::get(CI->getType(), 0); // If this is writing one byte, turn it into fputc. - if (Bytes == 1) { // fwrite(S,1,1,F) -> fputc(S[0],F) + // This optimization is only valid if the return value is unused. + if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); EmitFPutC(Char, CI->getArgOperand(3), B, TD); return ConstantInt::get(CI->getType(), 1); @@ -1326,7 +1348,7 @@ struct FPutsOpt : public LibCallOptimization { if (!Len) return 0; EmitFWrite(CI->getArgOperand(0), ConstantInt::get(TD->getIntPtrType(*Context), Len-1), - CI->getArgOperand(1), B, TD); + CI->getArgOperand(1), B, TD, TLI); return CI; // Known to have no uses (see above). } }; @@ -1338,8 +1360,8 @@ struct FPrintFOpt : public LibCallOptimization { Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, IRBuilder<> &B) { // All the optimizations depend on the format string.
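In the FFSOpt hunk above, the cttz intrinsic call gains a second operand, B.getFalse(), selecting the variant that is fully defined on a zero input (it returns the bit width) rather than the is-zero-undef one. A portable model of the resulting ffs lowering; the helper name and the explicit loop are mine, and the final compare-and-select lives outside the quoted hunk:

    #include <cassert>
    #include <cstdint>

    int ffsViaCttz(uint32_t X) {
      unsigned Ctz = 0;                     // cttz with defined-on-zero semantics
      while (Ctz < 32 && !((X >> Ctz) & 1u))
        ++Ctz;
      int V = static_cast<int>(Ctz) + 1;    // the CreateAdd(..., 1) above
      return X != 0 ? V : 0;                // icmp ne + select later in FFSOpt
    }

    int main() {
      assert(ffsViaCttz(0) == 0);
      assert(ffsViaCttz(1) == 1);
      assert(ffsViaCttz(8) == 4);
    }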
- std::string FormatStr; - if (!GetConstantStringInfo(CI->getArgOperand(1), FormatStr)) + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) return 0; // fprintf(F, "foo") --> fwrite("foo", 3, 1, F) @@ -1354,7 +1376,7 @@ struct FPrintFOpt : public LibCallOptimization { EmitFWrite(CI->getArgOperand(1), ConstantInt::get(TD->getIntPtrType(*Context), FormatStr.size()), - CI->getArgOperand(0), B, TD); + CI->getArgOperand(0), B, TD, TLI); return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -1376,7 +1398,7 @@ struct FPrintFOpt : public LibCallOptimization { // fprintf(F, "%s", str) --> fputs(str, F) if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty()) return 0; - EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD); + EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); return CI; } return 0; @@ -1422,8 +1444,8 @@ struct PutsOpt : public LibCallOptimization { return 0; // Check for a constant string. - std::string Str; - if (!GetConstantStringInfo(CI->getArgOperand(0), Str)) + StringRef Str; + if (!getConstantStringInfo(CI->getArgOperand(0), Str)) return 0; if (Str.empty() && CI->use_empty()) { @@ -1457,7 +1479,7 @@ namespace { StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr; MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet; // Math Library Optimizations - PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP; + CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP; // Integer Optimizations FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii; ToAsciiOpt ToAscii; @@ -1472,6 +1494,7 @@ namespace { SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true) { initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } + void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); void InitOptimizations(); bool runOnFunction(Function &F); @@ -1502,6 +1525,11 @@ FunctionPass *llvm::createSimplifyLibCallsPass() { return new SimplifyLibCalls(); } +void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) { + if (TLI->has(F)) + Optimizations[TLI->getName(F)] = Opt; +} + /// Optimizations - Populate the Optimizations map with all the optimizations /// we know. void SimplifyLibCalls::InitOptimizations() { @@ -1527,14 +1555,17 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["strcspn"] = &StrCSpn; Optimizations["strstr"] = &StrStr; Optimizations["memcmp"] = &MemCmp; - if (TLI->has(LibFunc::memcpy)) Optimizations["memcpy"] = &MemCpy; + AddOpt(LibFunc::memcpy, &MemCpy); Optimizations["memmove"] = &MemMove; - if (TLI->has(LibFunc::memset)) Optimizations["memset"] = &MemSet; + AddOpt(LibFunc::memset, &MemSet); // _chk variants of String and Memory LibCall Optimizations. 
Optimizations["__strcpy_chk"] = &StrCpyChk; // Math Library Optimizations + Optimizations["cosf"] = &Cos; + Optimizations["cos"] = &Cos; + Optimizations["cosl"] = &Cos; Optimizations["powf"] = &Pow; Optimizations["pow"] = &Pow; Optimizations["powl"] = &Pow; @@ -1582,8 +1613,8 @@ void SimplifyLibCalls::InitOptimizations() { // Formatting and IO Optimizations Optimizations["sprintf"] = &SPrintF; Optimizations["printf"] = &PrintF; - Optimizations["fwrite"] = &FWrite; - Optimizations["fputs"] = &FPuts; + AddOpt(LibFunc::fwrite, &FWrite); + AddOpt(LibFunc::fputs, &FPuts); Optimizations["fprintf"] = &FPrintF; Optimizations["puts"] = &Puts; } @@ -2348,9 +2379,6 @@ bool SimplifyLibCalls::doInitialization(Module &M) { // * cbrt(sqrt(x)) -> pow(x,1/6) // * cbrt(sqrt(x)) -> pow(x,1/9) // -// cos, cosf, cosl: -// * cos(-x) -> cos(x) -// // exp, expf, expl: // * exp(log(x)) -> x // @@ -2387,6 +2415,8 @@ bool SimplifyLibCalls::doInitialization(Module &M) { // * stpcpy(str, "literal") -> // llvm.memcpy(str,"literal",strlen("literal")+1,1) // +// strchr: +// * strchr(p, 0) -> strlen(p) // tan, tanf, tanl: // * tan(atan(x)) -> x // diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index c83f56c..ef65c0a 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CFG.h" @@ -240,7 +241,7 @@ bool Sinking::SinkInstruction(Instruction *Inst, if (SuccToSinkTo->getUniquePredecessor() != ParentBlock) { // We cannot sink a load across a critical edge - there may be stores in // other code paths. - if (!Inst->isSafeToSpeculativelyExecute()) { + if (!isSafeToSpeculativelyExecute(Inst)) { DEBUG(dbgs() << " *** PUNTING: Wont sink load along critical edge.\n"); return false; } diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp index 8e5a1eb..d831452 100644 --- a/lib/Transforms/Utils/AddrModeMatcher.cpp +++ b/lib/Transforms/Utils/AddrModeMatcher.cpp @@ -473,14 +473,7 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, // Check to see if this value is already used in the memory instruction's // block. If so, it's already live into the block at the very least, so we // can reasonably fold it. - BasicBlock *MemBB = MemoryInst->getParent(); - for (Value::use_iterator UI = Val->use_begin(), E = Val->use_end(); - UI != E; ++UI) - // We know that uses of arguments and instructions have to be instructions. - if (cast<Instruction>(*UI)->getParent() == MemBB) - return true; - - return false; + return Val->isUsedInBasicBlock(MemoryInst->getParent()); } diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index a7f9efd..3859a1a 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -249,7 +249,6 @@ unsigned llvm::GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ) { if (Term->getSuccessor(i) == Succ) return i; } - return 0; } /// SplitEdge - Split the edge connecting specified block. Pass P must @@ -453,9 +452,8 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, /// of the edges being split is an exit of a loop with other exits). 
/// BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, - BasicBlock *const *Preds, - unsigned NumPreds, const char *Suffix, - Pass *P) { + ArrayRef<BasicBlock*> Preds, + const char *Suffix, Pass *P) { // Create new basic block, insert right before the original block. BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix, BB->getParent(), BB); @@ -464,7 +462,7 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, BranchInst *BI = BranchInst::Create(BB, NewBB); // Move the edges from Preds to point to NewBB instead of BB. - for (unsigned i = 0; i != NumPreds; ++i) { + for (unsigned i = 0, e = Preds.size(); i != e; ++i) { // This is slightly more strict than necessary; the minimum requirement // is that there be no more than one indirectbr branching to BB. And // all BlockAddress uses would need to be updated. @@ -477,7 +475,7 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, // node becomes an incoming value for BB's phi node. However, if the Preds // list is empty, we need to insert dummy entries into the PHI nodes in BB to // account for the newly created predecessor. - if (NumPreds == 0) { + if (Preds.size() == 0) { // Insert dummy values as the incoming value. for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I) cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB); @@ -486,12 +484,10 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, // Update DominatorTree, LoopInfo, and LCCSA analysis information. bool HasLoopExit = false; - UpdateAnalysisInformation(BB, NewBB, ArrayRef<BasicBlock*>(Preds, NumPreds), - P, HasLoopExit); + UpdateAnalysisInformation(BB, NewBB, Preds, P, HasLoopExit); // Update the PHI nodes in BB with the values coming from NewBB. - UpdatePHINodes(BB, NewBB, ArrayRef<BasicBlock*>(Preds, NumPreds), BI, - P, HasLoopExit); + UpdatePHINodes(BB, NewBB, Preds, BI, P, HasLoopExit); return NewBB; } diff --git a/lib/Transforms/Utils/BasicInliner.cpp b/lib/Transforms/Utils/BasicInliner.cpp deleted file mode 100644 index 23a30cc5..0000000 --- a/lib/Transforms/Utils/BasicInliner.cpp +++ /dev/null @@ -1,182 +0,0 @@ -//===- BasicInliner.cpp - Basic function level inliner --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines a simple function based inliner that does not use -// call graph information. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "basicinliner" -#include "llvm/Module.h" -#include "llvm/Function.h" -#include "llvm/Transforms/Utils/BasicInliner.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallPtrSet.h" -#include <vector> - -using namespace llvm; - -static cl::opt<unsigned> -BasicInlineThreshold("basic-inline-threshold", cl::Hidden, cl::init(200), - cl::desc("Control the amount of basic inlining to perform (default = 200)")); - -namespace llvm { - - /// BasicInlinerImpl - BasicInliner implemantation class. This hides - /// container info, used by basic inliner, from public interface. 
- struct BasicInlinerImpl { - - BasicInlinerImpl(const BasicInlinerImpl&); // DO NOT IMPLEMENT - void operator=(const BasicInlinerImpl&); // DO NO IMPLEMENT - public: - BasicInlinerImpl(TargetData *T) : TD(T) {} - - /// addFunction - Add function into the list of functions to process. - /// All functions must be inserted using this interface before invoking - /// inlineFunctions(). - void addFunction(Function *F) { - Functions.push_back(F); - } - - /// neverInlineFunction - Sometimes a function is never to be inlined - /// because of one or other reason. - void neverInlineFunction(Function *F) { - NeverInline.insert(F); - } - - /// inlineFuctions - Walk all call sites in all functions supplied by - /// client. Inline as many call sites as possible. Delete completely - /// inlined functions. - void inlineFunctions(); - - private: - TargetData *TD; - std::vector<Function *> Functions; - SmallPtrSet<const Function *, 16> NeverInline; - SmallPtrSet<Function *, 8> DeadFunctions; - InlineCostAnalyzer CA; - }; - -/// inlineFuctions - Walk all call sites in all functions supplied by -/// client. Inline as many call sites as possible. Delete completely -/// inlined functions. -void BasicInlinerImpl::inlineFunctions() { - - // Scan through and identify all call sites ahead of time so that we only - // inline call sites in the original functions, not call sites that result - // from inlining other functions. - std::vector<CallSite> CallSites; - - for (std::vector<Function *>::iterator FI = Functions.begin(), - FE = Functions.end(); FI != FE; ++FI) { - Function *F = *FI; - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { - CallSite CS(cast<Value>(I)); - if (CS && CS.getCalledFunction() - && !CS.getCalledFunction()->isDeclaration()) - CallSites.push_back(CS); - } - } - - DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n"); - - // Inline call sites. - bool Changed = false; - do { - Changed = false; - for (unsigned index = 0; index != CallSites.size() && !CallSites.empty(); - ++index) { - CallSite CS = CallSites[index]; - if (Function *Callee = CS.getCalledFunction()) { - - // Eliminate calls that are never inlinable. - if (Callee->isDeclaration() || - CS.getInstruction()->getParent()->getParent() == Callee) { - CallSites.erase(CallSites.begin() + index); - --index; - continue; - } - InlineCost IC = CA.getInlineCost(CS, NeverInline); - if (IC.isAlways()) { - DEBUG(dbgs() << " Inlining: cost=always" - <<", call: " << *CS.getInstruction()); - } else if (IC.isNever()) { - DEBUG(dbgs() << " NOT Inlining: cost=never" - <<", call: " << *CS.getInstruction()); - continue; - } else { - int Cost = IC.getValue(); - - if (Cost >= (int) BasicInlineThreshold) { - DEBUG(dbgs() << " NOT Inlining: cost = " << Cost - << ", call: " << *CS.getInstruction()); - continue; - } else { - DEBUG(dbgs() << " Inlining: cost = " << Cost - << ", call: " << *CS.getInstruction()); - } - } - - // Inline - InlineFunctionInfo IFI(0, TD); - if (InlineFunction(CS, IFI)) { - if (Callee->use_empty() && (Callee->hasLocalLinkage() || - Callee->hasAvailableExternallyLinkage())) - DeadFunctions.insert(Callee); - Changed = true; - CallSites.erase(CallSites.begin() + index); - --index; - } - } - } - } while (Changed); - - // Remove completely inlined functions from module. 
- for(SmallPtrSet<Function *, 8>::iterator I = DeadFunctions.begin(), - E = DeadFunctions.end(); I != E; ++I) { - Function *D = *I; - Module *M = D->getParent(); - M->getFunctionList().remove(D); - } -} - -BasicInliner::BasicInliner(TargetData *TD) { - Impl = new BasicInlinerImpl(TD); -} - -BasicInliner::~BasicInliner() { - delete Impl; -} - -/// addFunction - Add function into the list of functions to process. -/// All functions must be inserted using this interface before invoking -/// inlineFunctions(). -void BasicInliner::addFunction(Function *F) { - Impl->addFunction(F); -} - -/// neverInlineFunction - Sometimes a function is never to be inlined because -/// of one or other reason. -void BasicInliner::neverInlineFunction(Function *F) { - Impl->neverInlineFunction(F); -} - -/// inlineFuctions - Walk all call sites in all functions supplied by -/// client. Inline as many call sites as possible. Delete completely -/// inlined functions. -void BasicInliner::inlineFunctions() { - Impl->inlineFunctions(); -} - -} diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index c052910..f752d79 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -372,8 +372,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // form, which we're in the process of restoring! if (!Preds.empty() && HasPredOutsideOfLoop) { BasicBlock *NewExitBB = - SplitBlockPredecessors(Exit, Preds.data(), Preds.size(), - "split", P); + SplitBlockPredecessors(Exit, Preds, "split", P); if (P->mustPreserveAnalysisID(LCSSAID)) CreatePHIsForSplitLoopExit(Preds, NewExitBB, Exit); } diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index 4b5f45b..a808303 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -15,11 +15,15 @@ #include "llvm/Type.h" #include "llvm/Constants.h" #include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Support/IRBuilder.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/LLVMContext.h" #include "llvm/Intrinsics.h" +#include "llvm/ADT/SmallString.h" using namespace llvm; @@ -206,19 +210,16 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, /// 'floor'). This function is known to take a single of type matching 'Op' and /// returns one value with the same type. If 'Op' is a long double, 'l' is /// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix. -Value *llvm::EmitUnaryFloatFnCall(Value *Op, const char *Name, - IRBuilder<> &B, const AttrListPtr &Attrs) { - char NameBuffer[20]; +Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B, + const AttrListPtr &Attrs) { + SmallString<20> NameBuffer; if (!Op->getType()->isDoubleTy()) { // If we need to add a suffix, copy into NameBuffer. - unsigned NameLen = strlen(Name); - assert(NameLen < sizeof(NameBuffer)-2); - memcpy(NameBuffer, Name, NameLen); + NameBuffer += Name; if (Op->getType()->isFloatTy()) - NameBuffer[NameLen] = 'f'; // floorf + NameBuffer += 'f'; // floorf else - NameBuffer[NameLen] = 'l'; // floorl - NameBuffer[NameLen+1] = 0; + NameBuffer += 'l'; // floorl Name = NameBuffer; } @@ -299,20 +300,21 @@ void llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, /// EmitFPutS - Emit a call to the puts function. Str is required to be a /// pointer and File is a pointer to FILE. 
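The EmitUnaryFloatFnCall change above replaces manual strlen/memcpy buffer surgery with SmallString appends when deriving the float and long double entry points of a libm function. A std::string model of the naming scheme; the enum and helper name are mine:

    #include <cassert>
    #include <string>

    enum class FPKind { Float, Double, LongDouble };

    // A double operand keeps the plain name; float gains 'f', long double 'l'.
    std::string unaryFloatFnName(std::string Name, FPKind K) {
      if (K == FPKind::Float)           Name += 'f';   // e.g. floorf
      else if (K == FPKind::LongDouble) Name += 'l';   // e.g. floorl
      return Name;
    }

    int main() {
      assert(unaryFloatFnName("floor", FPKind::Float) == "floorf");
      assert(unaryFloatFnName("floor", FPKind::LongDouble) == "floorl");
      assert(unaryFloatFnName("floor", FPKind::Double) == "floor");
    }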
void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, - const TargetData *TD) { + const TargetData *TD, const TargetLibraryInfo *TLI) { Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + StringRef FPutsName = TLI->getName(LibFunc::fputs); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction("fputs", AttrListPtr::get(AWI, 3), + F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI, 3), B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), NULL); else - F = M->getOrInsertFunction("fputs", B.getInt32Ty(), + F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), NULL); CallInst *CI = B.CreateCall2(F, CastToCStr(Str, B), File, "fputs"); @@ -324,23 +326,25 @@ void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, /// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is /// a pointer, Size is an 'intptr_t', and File is a pointer to FILE. void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, - IRBuilder<> &B, const TargetData *TD) { + IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI) { Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(4, Attribute::NoCapture); AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); + StringRef FWriteName = TLI->getName(LibFunc::fwrite); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction("fwrite", AttrListPtr::get(AWI, 3), + F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI, 3), TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), TD->getIntPtrType(Context), File->getType(), NULL); else - F = M->getOrInsertFunction("fwrite", TD->getIntPtrType(Context), + F = M->getOrInsertFunction(FWriteName, TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), TD->getIntPtrType(Context), diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index 7adc5f1..7f5cb5e 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -1,11 +1,11 @@ add_llvm_library(LLVMTransformUtils AddrModeMatcher.cpp BasicBlockUtils.cpp - BasicInliner.cpp BreakCriticalEdges.cpp BuildLibCalls.cpp CloneFunction.cpp CloneModule.cpp + CmpInstAnalysis.cpp CodeExtractor.cpp DemoteRegToStack.cpp InlineFunction.cpp @@ -14,10 +14,12 @@ add_llvm_library(LLVMTransformUtils Local.cpp LoopSimplify.cpp LoopUnroll.cpp + LoopUnrollRuntime.cpp LowerExpectIntrinsic.cpp LowerInvoke.cpp LowerSwitch.cpp Mem2Reg.cpp + ModuleUtils.cpp PromoteMemoryToRegister.cpp SSAUpdater.cpp SimplifyCFG.cpp @@ -27,11 +29,3 @@ add_llvm_library(LLVMTransformUtils Utils.cpp ValueMapper.cpp ) - -add_llvm_library_dependencies(LLVMTransformUtils - LLVMAnalysis - LLVMCore - LLVMSupport - LLVMTarget - LLVMipa - ) diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index cf21f1e..20052a4 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -23,8 +23,11 @@ #include "llvm/LLVMContext.h" #include "llvm/Metadata.h" #include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include 
"llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/SmallVector.h" #include <map> @@ -60,7 +63,6 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, if (CodeInfo) { CodeInfo->ContainsCalls |= hasCalls; - CodeInfo->ContainsUnwinds |= isa<UnwindInst>(BB->getTerminator()); CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && BB != &BB->getParent()->getEntryBlock(); @@ -75,7 +77,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl<ReturnInst*> &Returns, - const char *NameSuffix, ClonedCodeInfo *CodeInfo) { + const char *NameSuffix, ClonedCodeInfo *CodeInfo, + ValueMapTypeRemapper *TypeMapper) { assert(NameSuffix && "NameSuffix cannot be null!"); #ifndef NDEBUG @@ -113,8 +116,23 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, // Create a new basic block and copy instructions into it! BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo); - VMap[&BB] = CBB; // Add basic block mapping. + // Add basic block mapping. + VMap[&BB] = CBB; + + // It is only legal to clone a function if a block address within that + // function is never referenced outside of the function. Given that, we + // want to map block addresses from the old function to block addresses in + // the clone. (This is different from the generic ValueMapper + // implementation, which generates an invalid blockaddress when + // cloning a function.) + if (BB.hasAddressTaken()) { + Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc), + const_cast<BasicBlock*>(&BB)); + VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); + } + + // Note return instructions for the caller. if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator())) Returns.push_back(RI); } @@ -126,7 +144,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, // Loop over all instructions, fixing each one as we find it... for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II) RemapInstruction(II, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper); } /// CloneFunction - Return a copy of the specified function, but without @@ -181,7 +200,6 @@ namespace { const Function *OldFunc; ValueToValueMapTy &VMap; bool ModuleLevelChanges; - SmallVectorImpl<ReturnInst*> &Returns; const char *NameSuffix; ClonedCodeInfo *CodeInfo; const TargetData *TD; @@ -189,24 +207,18 @@ namespace { PruningFunctionCloner(Function *newFunc, const Function *oldFunc, ValueToValueMapTy &valueMap, bool moduleLevelChanges, - SmallVectorImpl<ReturnInst*> &returns, const char *nameSuffix, ClonedCodeInfo *codeInfo, const TargetData *td) : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), - Returns(returns), NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { + NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { } /// CloneBlock - The specified block is found to be reachable, clone it and /// anything that it can reach. void CloneBlock(const BasicBlock *BB, std::vector<const BasicBlock*> &ToClone); - - public: - /// ConstantFoldMappedInstruction - Constant fold the specified instruction, - /// mapping its operands through VMap if they are available. 
- Constant *ConstantFoldMappedInstruction(const Instruction *I); }; } @@ -214,7 +226,7 @@ namespace { /// anything that it can reach. void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, std::vector<const BasicBlock*> &ToClone){ - TrackingVH<Value> &BBEntry = VMap[BB]; + WeakVH &BBEntry = VMap[BB]; // Have we already cloned this block? if (BBEntry) return; @@ -224,25 +236,55 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, BBEntry = NewBB = BasicBlock::Create(BB->getContext()); if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix); + // It is only legal to clone a function if a block address within that + // function is never referenced outside of the function. Given that, we + // want to map block addresses from the old function to block addresses in + // the clone. (This is different from the generic ValueMapper + // implementation, which generates an invalid blockaddress when + // cloning a function.) + // + // Note that we don't need to fix the mapping for unreachable blocks; + // the default mapping there is safe. + if (BB->hasAddressTaken()) { + Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc), + const_cast<BasicBlock*>(BB)); + VMap[OldBBAddr] = BlockAddress::get(NewFunc, NewBB); + } + + bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false; // Loop over all instructions, and copy them over, DCE'ing as we go. This // loop doesn't include the terminator. for (BasicBlock::const_iterator II = BB->begin(), IE = --BB->end(); II != IE; ++II) { - // If this instruction constant folds, don't bother cloning the instruction, - // instead, just add the constant to the value map. - if (Constant *C = ConstantFoldMappedInstruction(II)) { - VMap[II] = C; - continue; + Instruction *NewInst = II->clone(); + + // Eagerly remap operands to the newly cloned instruction, except for PHI + // nodes for which we defer processing until we update the CFG. + if (!isa<PHINode>(NewInst)) { + RemapInstruction(NewInst, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); + + // If we can simplify this instruction to some other value, simply add + // a mapping to that value rather than inserting a new instruction into + // the basic block. + if (Value *V = SimplifyInstruction(NewInst, TD)) { + // On the off-chance that this simplifies to an instruction in the old + // function, map it back into the new function. + if (Value *MappedV = VMap.lookup(V)) + V = MappedV; + + VMap[II] = V; + delete NewInst; + continue; + } } - Instruction *NewInst = II->clone(); if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); - NewBB->getInstList().push_back(NewInst); VMap[II] = NewInst; // Add instruction map to value. - + NewBB->getInstList().push_back(NewInst); hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { if (isa<ConstantInt>(AI->getArraySize())) @@ -281,7 +323,8 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, Cond = dyn_cast_or_null<ConstantInt>(V); } if (Cond) { // Constant fold to uncond branch! 
- BasicBlock *Dest = SI->getSuccessor(SI->findCaseValue(Cond)); + SwitchInst::ConstCaseIt Case = SI->findCaseValue(Cond); + BasicBlock *Dest = const_cast<BasicBlock*>(Case.getCaseSuccessor()); VMap[OldTI] = BranchInst::Create(Dest, NewBB); ToClone.push_back(Dest); TerminatorDone = true; @@ -303,38 +346,10 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, if (CodeInfo) { CodeInfo->ContainsCalls |= hasCalls; - CodeInfo->ContainsUnwinds |= isa<UnwindInst>(OldTI); CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && BB != &BB->getParent()->front(); } - - if (ReturnInst *RI = dyn_cast<ReturnInst>(NewBB->getTerminator())) - Returns.push_back(RI); -} - -/// ConstantFoldMappedInstruction - Constant fold the specified instruction, -/// mapping its operands through VMap if they are available. -Constant *PruningFunctionCloner:: -ConstantFoldMappedInstruction(const Instruction *I) { - SmallVector<Constant*, 8> Ops; - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) - if (Constant *Op = dyn_cast_or_null<Constant>(MapValue(I->getOperand(i), - VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges))) - Ops.push_back(Op); - else - return 0; // All operands not constant! - - if (const CmpInst *CI = dyn_cast<CmpInst>(I)) - return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1], - TD); - - if (const LoadInst *LI = dyn_cast<LoadInst>(I)) - if (!LI->isVolatile()) - return ConstantFoldLoadFromConstPtr(Ops[0], TD); - - return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Ops, TD); } /// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto, @@ -361,7 +376,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, #endif PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges, - Returns, NameSuffix, CodeInfo, TD); + NameSuffix, CodeInfo, TD); // Clone the entry block, and anything recursively reachable from it. std::vector<const BasicBlock*> CloneWorklist; @@ -386,29 +401,19 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // Add the new block to the new function. NewFunc->getBasicBlockList().push_back(NewBB); - - // Loop over all of the instructions in the block, fixing up operand - // references as we go. This uses VMap to do all the hard work. - // - BasicBlock::iterator I = NewBB->begin(); - - DebugLoc TheCallDL; - if (TheCall) - TheCallDL = TheCall->getDebugLoc(); - + // Handle PHI nodes specially, as we have to remove references to dead // blocks. - if (PHINode *PN = dyn_cast<PHINode>(I)) { - // Skip over all PHI nodes, remembering them for later. - BasicBlock::const_iterator OldI = BI->begin(); - for (; (PN = dyn_cast<PHINode>(I)); ++I, ++OldI) - PHIToResolve.push_back(cast<PHINode>(OldI)); - } - - // Otherwise, remap the rest of the instructions normally. - for (; I != NewBB->end(); ++I) - RemapInstruction(I, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); + for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I) + if (const PHINode *PN = dyn_cast<PHINode>(I)) + PHIToResolve.push_back(PN); + else + break; + + // Finally, remap the terminator instructions, as those can't be remapped + // until all BBs are mapped. + RemapInstruction(NewBB->getTerminator(), VMap, + ModuleLevelChanges ? 
RF_None : RF_NoModuleLevelChanges); } // Defer PHI resolution until rest of function is resolved, PHI resolution @@ -490,31 +495,55 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, ++OldI; } } - // NOTE: We cannot eliminate single entry phi nodes here, because of - // VMap. Single entry phi nodes can have multiple VMap entries - // pointing at them. Thus, deleting one would require scanning the VMap - // to update any entries in it that would require that. This would be - // really slow. } - + + // Make a second pass over the PHINodes now that all of them have been + // remapped into the new function, simplifying the PHINode and performing any + // recursive simplifications exposed. This will transparently update the + // WeakVH in the VMap. Notably, we rely on that so that if we coalesce + // two PHINodes, the iteration over the old PHIs remains valid, and the + // mapping will just map us to the new node (which may not even be a PHI + // node). + for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx) + if (PHINode *PN = dyn_cast<PHINode>(VMap[PHIToResolve[Idx]])) + recursivelySimplifyInstruction(PN, TD); + // Now that the inlined function body has been fully constructed, go through // and zap unconditional fall-through branches. This happens all the time when // specializing code: code specialization turns conditional branches into // uncond branches, and this code folds them. - Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]); + Function::iterator Begin = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]); + Function::iterator I = Begin; while (I != NewFunc->end()) { + // Check if this block has become dead during inlining or other + // simplifications. Note that the first block will appear dead, as it has + // not yet been wired up properly. + if (I != Begin && (pred_begin(I) == pred_end(I) || + I->getSinglePredecessor() == I)) { + BasicBlock *DeadBB = I++; + DeleteDeadBlock(DeadBB); + continue; + } + + // We need to simplify conditional branches and switches with a constant + // operand. We try to prune these out when cloning, but if the + // simplification required looking through PHI nodes, those are only + // available after forming the full basic block. That may leave some here, + // and we still want to prune the dead code as early as possible. + ConstantFoldTerminator(I); + BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator()); if (!BI || BI->isConditional()) { ++I; continue; } - // Note that we can't eliminate uncond branches if the destination has - // single-entry PHI nodes. Eliminating the single-entry phi nodes would - // require scanning the VMap to update any entries that point to the phi - // node. BasicBlock *Dest = BI->getSuccessor(0); - if (!Dest->getSinglePredecessor() || isa<PHINode>(Dest->begin())) { + if (!Dest->getSinglePredecessor()) { ++I; continue; } - + + // We shouldn't be able to get single-entry PHI nodes here, as instsimplify + // above should have zapped all of them. + assert(!isa<PHINode>(Dest->begin())); + // We know all single-entry PHI nodes in the inlined function have been // removed, so we just need to splice the blocks. BI->eraseFromParent(); @@ -530,4 +559,13 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // Do not increment I, iteratively merge all things this block branches to. } + + // Make a final pass over the basic blocks from the old function to gather + // any return instructions which survived folding.
We have to do this here + // because we can iteratively remove and merge returns above. + for (Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]), + E = NewFunc->end(); + I != E; ++I) + if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) + Returns.push_back(RI); } diff --git a/lib/Transforms/Utils/CmpInstAnalysis.cpp b/lib/Transforms/Utils/CmpInstAnalysis.cpp new file mode 100644 index 0000000..9b09915 --- /dev/null +++ b/lib/Transforms/Utils/CmpInstAnalysis.cpp @@ -0,0 +1,96 @@ +//===- CmpInstAnalysis.cpp - Utils to help fold compares ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file holds routines to help analyse compare instructions +// and fold them into constants or other compare instructions +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/CmpInstAnalysis.h" +#include "llvm/Constants.h" +#include "llvm/Instructions.h" + +using namespace llvm; + +/// getICmpCode - Encode an icmp predicate into a three bit mask. These bits +/// are carefully arranged to allow folding of expressions such as: +/// +/// (A < B) | (A > B) --> (A != B) +/// +/// Note that this is only valid if the first and second predicates have the +/// same sign. It is illegal to do: (A u< B) | (A s> B) +/// +/// Three bits are used to represent the condition, as follows: +/// 0 A > B +/// 1 A == B +/// 2 A < B +/// +/// <=> Value Definition +/// 000 0 Always false +/// 001 1 A > B +/// 010 2 A == B +/// 011 3 A >= B +/// 100 4 A < B +/// 101 5 A != B +/// 110 6 A <= B +/// 111 7 Always true +/// +unsigned llvm::getICmpCode(const ICmpInst *ICI, bool InvertPred) { + ICmpInst::Predicate Pred = InvertPred ? ICI->getInversePredicate() + : ICI->getPredicate(); + switch (Pred) { + // False -> 0 + case ICmpInst::ICMP_UGT: return 1; // 001 + case ICmpInst::ICMP_SGT: return 1; // 001 + case ICmpInst::ICMP_EQ: return 2; // 010 + case ICmpInst::ICMP_UGE: return 3; // 011 + case ICmpInst::ICMP_SGE: return 3; // 011 + case ICmpInst::ICMP_ULT: return 4; // 100 + case ICmpInst::ICMP_SLT: return 4; // 100 + case ICmpInst::ICMP_NE: return 5; // 101 + case ICmpInst::ICMP_ULE: return 6; // 110 + case ICmpInst::ICMP_SLE: return 6; // 110 + // True -> 7 + default: + llvm_unreachable("Invalid ICmp predicate!"); + } +} + +/// getICmpValue - This is the complement of getICmpCode, which turns an +/// opcode and two operands into either a constant true or false, or the +/// predicate for a new ICmp instruction. The sign is passed in to determine +/// which kind of predicate to use in the new icmp instruction. +/// Non-NULL return value will be a true or false constant. +/// NULL return means a new ICmp is needed. The predicate for which is +/// output in NewICmpPred. +Value *llvm::getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, + CmpInst::Predicate &NewICmpPred) { + switch (Code) { + default: llvm_unreachable("Illegal ICmp code!"); + case 0: // False. + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); + case 1: NewICmpPred = Sign ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break; + case 2: NewICmpPred = ICmpInst::ICMP_EQ; break; + case 3: NewICmpPred = Sign ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break; + case 4: NewICmpPred = Sign ?
ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break; + case 5: NewICmpPred = ICmpInst::ICMP_NE; break; + case 6: NewICmpPred = Sign ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break; + case 7: // True. + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1); + } + return NULL; +} + +/// PredicatesFoldable - Return true if both predicates match sign or if at +/// least one of them is an equality comparison (which is signless). +bool llvm::PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) { + return (CmpInst::isSigned(p1) == CmpInst::isSigned(p2)) || + (CmpInst::isSigned(p1) && ICmpInst::isEquality(p2)) || + (CmpInst::isSigned(p2) && ICmpInst::isEquality(p1)); +} diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 5f47ebb..e8c0b80 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -615,9 +615,10 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, default: // Otherwise, make the default destination of the switch instruction be one // of the other successors. - TheSwitch->setOperand(0, call); - TheSwitch->setSuccessor(0, TheSwitch->getSuccessor(NumExitBlocks)); - TheSwitch->removeCase(NumExitBlocks); // Remove redundant case + TheSwitch->setCondition(call); + TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks)); + // Remove redundant case + TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1)); break; } } diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp index 8cc2649..99b5830 100644 --- a/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -6,21 +6,12 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file provide the function DemoteRegToStack(). This function takes a -// virtual register computed by an Instruction and replaces it with a slot in -// the stack frame, allocated via alloca. It returns the pointer to the -// AllocaInst inserted. After this function is called on an instruction, we are -// guaranteed that the only user of the instruction is a store that is -// immediately after it. -// -//===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Local.h" #include "llvm/Function.h" #include "llvm/Instructions.h" #include "llvm/Type.h" -#include <map> +#include "llvm/ADT/DenseMap.h" using namespace llvm; /// DemoteRegToStack - This function takes a virtual register computed by an @@ -28,8 +19,7 @@ using namespace llvm; /// alloca. This allows the CFG to be changed around without fear of /// invalidating the SSA information for the value. It returns the pointer to /// the alloca inserted to create a stack slot for I. -/// -AllocaInst* llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, +AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, Instruction *AllocaPoint) { if (I.use_empty()) { I.eraseFromParent(); @@ -47,21 +37,20 @@ AllocaInst* llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, F->getEntryBlock().begin()); } - // Change all of the users of the instruction to read from the stack slot - // instead. + // Change all of the users of the instruction to read from the stack slot. 
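The three-bit encoding introduced in CmpInstAnalysis.cpp above is arranged so that bitwise OR (and AND) of two codes computes the disjunction (and conjunction) of same-signedness comparisons. A worked check of the table; the enum names are mine:

    #include <cassert>

    // Bit 0: A > B, bit 1: A == B, bit 2: A < B.
    enum : unsigned { CodeGT = 1, CodeEQ = 2, CodeLT = 4 };

    int main() {
      assert((CodeLT | CodeGT) == 5);  // 101: (A < B) | (A > B) -> A != B
      assert((CodeLT | CodeEQ) == 6);  // 110: A <= B
      assert((CodeGT | CodeEQ) == 3);  // 011: A >= B
      assert((CodeLT & CodeGT) == 0);  // 000: (A < B) & (A > B) -> false
    }

Feeding the combined code back through getICmpValue then yields the folded predicate, or constant true/false for codes 7 and 0.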
while (!I.use_empty()) { Instruction *U = cast<Instruction>(I.use_back()); if (PHINode *PN = dyn_cast<PHINode>(U)) { // If this is a PHI node, we can't insert a load of the value before the - // use. Instead, insert the load in the predecessor block corresponding + // use. Instead insert the load in the predecessor block corresponding // to the incoming value. // // Note that if there are multiple edges from a basic block to this PHI - // node that we cannot multiple loads. The problem is that the resultant - // PHI node will have multiple values (from each load) coming in from the - // same block, which is illegal SSA form. For this reason, we keep track - // and reuse loads we insert. - std::map<BasicBlock*, Value*> Loads; + // node that we cannot have multiple loads. The problem is that the + // resulting PHI node will have multiple values (from each load) coming in + // from the same block, which is illegal SSA form. For this reason, we + // keep track of and reuse loads we insert. + DenseMap<BasicBlock*, Value*> Loads; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (PN->getIncomingValue(i) == &I) { Value *&V = Loads[PN->getIncomingBlock(i)]; @@ -81,9 +70,9 @@ AllocaInst* llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, } - // Insert stores of the computed value into the stack slot. We have to be - // careful is I is an invoke instruction though, because we can't insert the - // store AFTER the terminator instruction. + // Insert stores of the computed value into the stack slot. We have to be + // careful if I is an invoke instruction, because we can't insert the store + // AFTER the terminator instruction. BasicBlock::iterator InsertPt; if (!isa<TerminatorInst>(I)) { InsertPt = &I; @@ -97,18 +86,17 @@ AllocaInst* llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, InsertPt = II.getNormalDest()->begin(); } - for (; isa<PHINode>(InsertPt); ++InsertPt) - /* empty */; // Don't insert before any PHI nodes. - new StoreInst(&I, Slot, InsertPt); + for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) + /* empty */; // Don't insert before PHI nodes or landingpad instrs. + new StoreInst(&I, Slot, InsertPt); return Slot; } - -/// DemotePHIToStack - This function takes a virtual register computed by a phi -/// node and replaces it with a slot in the stack frame, allocated via alloca. -/// The phi node is deleted and it returns the pointer to the alloca inserted. -AllocaInst* llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { +/// DemotePHIToStack - This function takes a virtual register computed by a PHI +/// node and replaces it with a slot in the stack frame allocated via alloca. +/// The PHI node is deleted. It returns the pointer to the alloca inserted. +AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { if (P->use_empty()) { P->eraseFromParent(); return 0; @@ -125,7 +113,7 @@ AllocaInst* llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { F->getEntryBlock().begin()); } - // Iterate over each operand, insert store in each predecessor. + // Iterate over each operand inserting a store in each predecessor. for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) { if (InvokeInst *II = dyn_cast<InvokeInst>(P->getIncomingValue(i))) { assert(II->getParent() != P->getIncomingBlock(i) && @@ -135,12 +123,11 @@ AllocaInst* llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { P->getIncomingBlock(i)->getTerminator()); } - // Insert load in place of the phi and replace all uses. 
+ // Insert a load in place of the PHI and replace all uses. Value *V = new LoadInst(Slot, P->getName()+".reload", P); P->replaceAllUsesWith(V); - // Delete phi. + // Delete PHI. P->eraseFromParent(); - return Slot; } diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 5464dbc..d2b167a 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -10,13 +10,6 @@ // This file implements inlining of a function into a call site, resolving // parameters and the return value as appropriate. // -// The code in this file for handling inlines through invoke -// instructions preserves semantics only under some assumptions about -// the behavior of unwinders which correspond to gcc-style libUnwind -// exception personality functions. Eventually the IR will be -// improved to make this unnecessary, but until then, this code is -// marked [LIBUNWIND]. -// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" @@ -38,271 +31,52 @@ #include "llvm/Support/IRBuilder.h" using namespace llvm; -bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI) { - return InlineFunction(CallSite(CI), IFI); -} -bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI) { - return InlineFunction(CallSite(II), IFI); -} - -// FIXME: New EH - Remove the functions marked [LIBUNWIND] when new EH is -// turned on. - -/// [LIBUNWIND] Look for an llvm.eh.exception call in the given block. -static EHExceptionInst *findExceptionInBlock(BasicBlock *bb) { - for (BasicBlock::iterator i = bb->begin(), e = bb->end(); i != e; i++) { - EHExceptionInst *exn = dyn_cast<EHExceptionInst>(i); - if (exn) return exn; - } - - return 0; -} - -/// [LIBUNWIND] Look for the 'best' llvm.eh.selector instruction for -/// the given llvm.eh.exception call. -static EHSelectorInst *findSelectorForException(EHExceptionInst *exn) { - BasicBlock *exnBlock = exn->getParent(); - - EHSelectorInst *outOfBlockSelector = 0; - for (Instruction::use_iterator - ui = exn->use_begin(), ue = exn->use_end(); ui != ue; ++ui) { - EHSelectorInst *sel = dyn_cast<EHSelectorInst>(*ui); - if (!sel) continue; - - // Immediately accept an eh.selector in the same block as the - // excepton call. - if (sel->getParent() == exnBlock) return sel; - - // Otherwise, use the first selector we see. - if (!outOfBlockSelector) outOfBlockSelector = sel; - } - - return outOfBlockSelector; +bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, + bool InsertLifetime) { + return InlineFunction(CallSite(CI), IFI, InsertLifetime); } - -/// [LIBUNWIND] Find the (possibly absent) call to @llvm.eh.selector -/// in the given landing pad. In principle, llvm.eh.exception is -/// required to be in the landing pad; in practice, SplitCriticalEdge -/// can break that invariant, and then inlining can break it further. -/// There's a real need for a reliable solution here, but until that -/// happens, we have some fragile workarounds here. -static EHSelectorInst *findSelectorForLandingPad(BasicBlock *lpad) { - // Look for an exception call in the actual landing pad. - EHExceptionInst *exn = findExceptionInBlock(lpad); - if (exn) return findSelectorForException(exn); - - // Okay, if that failed, look for one in an obvious successor. If - // we find one, we'll fix the IR by moving things back to the - // landing pad. 
- - bool dominates = true; // does the lpad dominate the exn call - BasicBlock *nonDominated = 0; // if not, the first non-dominated block - BasicBlock *lastDominated = 0; // and the block which branched to it - - BasicBlock *exnBlock = lpad; - - // We need to protect against lpads that lead into infinite loops. - SmallPtrSet<BasicBlock*,4> visited; - visited.insert(exnBlock); - - do { - // We're not going to apply this hack to anything more complicated - // than a series of unconditional branches, so if the block - // doesn't terminate in an unconditional branch, just fail. More - // complicated cases can arise when, say, sinking a call into a - // split unwind edge and then inlining it; but that can do almost - // *anything* to the CFG, including leaving the selector - // completely unreachable. The only way to fix that properly is - // to (1) prohibit transforms which move the exception or selector - // values away from the landing pad, e.g. by producing them with - // instructions that are pinned to an edge like a phi, or - // producing them with not-really-instructions, and (2) making - // transforms which split edges deal with that. - BranchInst *branch = dyn_cast<BranchInst>(&exnBlock->back()); - if (!branch || branch->isConditional()) return 0; - - BasicBlock *successor = branch->getSuccessor(0); - - // Fail if we found an infinite loop. - if (!visited.insert(successor)) return 0; - - // If the successor isn't dominated by exnBlock: - if (!successor->getSinglePredecessor()) { - // We don't want to have to deal with threading the exception - // through multiple levels of phi, so give up if we've already - // followed a non-dominating edge. - if (!dominates) return 0; - - // Otherwise, remember this as a non-dominating edge. - dominates = false; - nonDominated = successor; - lastDominated = exnBlock; - } - - exnBlock = successor; - - // Can we stop here? - exn = findExceptionInBlock(exnBlock); - } while (!exn); - - // Look for a selector call for the exception we found. - EHSelectorInst *selector = findSelectorForException(exn); - if (!selector) return 0; - - // The easy case is when the landing pad still dominates the - // exception call, in which case we can just move both calls back to - // the landing pad. - if (dominates) { - selector->moveBefore(lpad->getFirstNonPHI()); - exn->moveBefore(selector); - return selector; - } - - // Otherwise, we have to split at the first non-dominating block. - // The CFG looks basically like this: - // lpad: - // phis_0 - // insnsAndBranches_1 - // br label %nonDominated - // nonDominated: - // phis_2 - // insns_3 - // %exn = call i8* @llvm.eh.exception() - // insnsAndBranches_4 - // %selector = call @llvm.eh.selector(i8* %exn, ... - // We need to turn this into: - // lpad: - // phis_0 - // %exn0 = call i8* @llvm.eh.exception() - // %selector0 = call @llvm.eh.selector(i8* %exn0, ... - // insnsAndBranches_1 - // br label %split // from lastDominated - // nonDominated: - // phis_2 (without edge from lastDominated) - // %exn1 = call i8* @llvm.eh.exception() - // %selector1 = call i8* @llvm.eh.selector(i8* %exn1, ... - // br label %split - // split: - // phis_2 (edge from lastDominated, edge from split) - // %exn = phi ... - // %selector = phi ... - // insns_3 - // insnsAndBranches_4 - - assert(nonDominated); - assert(lastDominated); - - // First, make clones of the intrinsics to go in lpad. 
- EHExceptionInst *lpadExn = cast<EHExceptionInst>(exn->clone()); - EHSelectorInst *lpadSelector = cast<EHSelectorInst>(selector->clone()); - lpadSelector->setArgOperand(0, lpadExn); - lpadSelector->insertBefore(lpad->getFirstNonPHI()); - lpadExn->insertBefore(lpadSelector); - - // Split the non-dominated block. - BasicBlock *split = - nonDominated->splitBasicBlock(nonDominated->getFirstNonPHI(), - nonDominated->getName() + ".lpad-fix"); - - // Redirect the last dominated branch there. - cast<BranchInst>(lastDominated->back()).setSuccessor(0, split); - - // Move the existing intrinsics to the end of the old block. - selector->moveBefore(&nonDominated->back()); - exn->moveBefore(selector); - - Instruction *splitIP = &split->front(); - - // For all the phis in nonDominated, make a new phi in split to join - // that phi with the edge from lastDominated. - for (BasicBlock::iterator - i = nonDominated->begin(), e = nonDominated->end(); i != e; ++i) { - PHINode *phi = dyn_cast<PHINode>(i); - if (!phi) break; - - PHINode *splitPhi = PHINode::Create(phi->getType(), 2, phi->getName(), - splitIP); - phi->replaceAllUsesWith(splitPhi); - splitPhi->addIncoming(phi, nonDominated); - splitPhi->addIncoming(phi->removeIncomingValue(lastDominated), - lastDominated); - } - - // Make new phis for the exception and selector. - PHINode *exnPhi = PHINode::Create(exn->getType(), 2, "", splitIP); - exn->replaceAllUsesWith(exnPhi); - selector->setArgOperand(0, exn); // except for this use - exnPhi->addIncoming(exn, nonDominated); - exnPhi->addIncoming(lpadExn, lastDominated); - - PHINode *selectorPhi = PHINode::Create(selector->getType(), 2, "", splitIP); - selector->replaceAllUsesWith(selectorPhi); - selectorPhi->addIncoming(selector, nonDominated); - selectorPhi->addIncoming(lpadSelector, lastDominated); - - return lpadSelector; +bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, + bool InsertLifetime) { + return InlineFunction(CallSite(II), IFI, InsertLifetime); } namespace { /// A class for recording information about inlining through an invoke. class InvokeInliningInfo { - BasicBlock *OuterUnwindDest; - EHSelectorInst *OuterSelector; - BasicBlock *InnerUnwindDest; - PHINode *InnerExceptionPHI; - PHINode *InnerSelectorPHI; - SmallVector<Value*, 8> UnwindDestPHIValues; - - // FIXME: New EH - These will replace the analogous ones above. BasicBlock *OuterResumeDest; //< Destination of the invoke's unwind. BasicBlock *InnerResumeDest; //< Destination for the callee's resume. LandingPadInst *CallerLPad; //< LandingPadInst associated with the invoke. PHINode *InnerEHValuesPHI; //< PHI for EH values from landingpad insts. + SmallVector<Value*, 8> UnwindDestPHIValues; public: InvokeInliningInfo(InvokeInst *II) - : OuterUnwindDest(II->getUnwindDest()), OuterSelector(0), - InnerUnwindDest(0), InnerExceptionPHI(0), InnerSelectorPHI(0), - OuterResumeDest(II->getUnwindDest()), InnerResumeDest(0), + : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(0), CallerLPad(0), InnerEHValuesPHI(0) { // If there are PHI nodes in the unwind destination block, we need to keep // track of which values came into them from the invoke before removing // the edge from this block. llvm::BasicBlock *InvokeBB = II->getParent(); - BasicBlock::iterator I = OuterUnwindDest->begin(); + BasicBlock::iterator I = OuterResumeDest->begin(); for (; isa<PHINode>(I); ++I) { // Save the value to use for this edge. 
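 // These saved values flow in from the invoke's own block; that edge goes
 // away once inlining rewrites the invoke, so the values must be re-added
 // for every new block that branches to the unwind destination.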
PHINode *PHI = cast<PHINode>(I); UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB)); } - // FIXME: With the new EH, this if/dyn_cast should be a 'cast'. - if (LandingPadInst *LPI = dyn_cast<LandingPadInst>(I)) { - CallerLPad = LPI; - } + CallerLPad = cast<LandingPadInst>(I); } - /// The outer unwind destination is the target of unwind edges - /// introduced for calls within the inlined function. - BasicBlock *getOuterUnwindDest() const { - return OuterUnwindDest; + /// getOuterResumeDest - The outer unwind destination is the target of + /// unwind edges introduced for calls within the inlined function. + BasicBlock *getOuterResumeDest() const { + return OuterResumeDest; } - EHSelectorInst *getOuterSelector() { - if (!OuterSelector) - OuterSelector = findSelectorForLandingPad(OuterUnwindDest); - return OuterSelector; - } - - BasicBlock *getInnerUnwindDest(); - - // FIXME: New EH - Rename when new EH is turned on. - BasicBlock *getInnerUnwindDestNewEH(); + BasicBlock *getInnerResumeDest(); LandingPadInst *getLandingPadInst() const { return CallerLPad; } - bool forwardEHResume(CallInst *call, BasicBlock *src); - /// forwardResume - Forward the 'resume' instruction to the caller's landing /// pad block. When the landing pad block has only one predecessor, this is /// a simple branch. When there is more than one predecessor, we need to @@ -314,7 +88,7 @@ namespace { /// destination block for the given basic block, using the values for the /// original invoke's source block. void addIncomingPHIValuesFor(BasicBlock *BB) const { - addIncomingPHIValuesForInto(BB, OuterUnwindDest); + addIncomingPHIValuesForInto(BB, OuterResumeDest); } void addIncomingPHIValuesForInto(BasicBlock *src, BasicBlock *dest) const { @@ -327,113 +101,8 @@ namespace { }; } -/// [LIBUNWIND] Get or create a target for the branch out of rewritten calls to -/// llvm.eh.resume. -BasicBlock *InvokeInliningInfo::getInnerUnwindDest() { - if (InnerUnwindDest) return InnerUnwindDest; - - // Find and hoist the llvm.eh.exception and llvm.eh.selector calls - // in the outer landing pad to immediately following the phis. - EHSelectorInst *selector = getOuterSelector(); - if (!selector) return 0; - - // The call to llvm.eh.exception *must* be in the landing pad. - Instruction *exn = cast<Instruction>(selector->getArgOperand(0)); - assert(exn->getParent() == OuterUnwindDest); - - // TODO: recognize when we've already done this, so that we don't - // get a linear number of these when inlining calls into lots of - // invokes with the same landing pad. - - // Do the hoisting. - Instruction *splitPoint = exn->getParent()->getFirstNonPHI(); - assert(splitPoint != selector && "selector-on-exception dominance broken!"); - if (splitPoint == exn) { - selector->removeFromParent(); - selector->insertAfter(exn); - splitPoint = selector->getNextNode(); - } else { - exn->moveBefore(splitPoint); - selector->moveBefore(splitPoint); - } - - // Split the landing pad. - InnerUnwindDest = OuterUnwindDest->splitBasicBlock(splitPoint, - OuterUnwindDest->getName() + ".body"); - - // The number of incoming edges we expect to the inner landing pad. - const unsigned phiCapacity = 2; - - // Create corresponding new phis for all the phis in the outer landing pad. 
- BasicBlock::iterator insertPoint = InnerUnwindDest->begin(); - BasicBlock::iterator I = OuterUnwindDest->begin(); - for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) { - PHINode *outerPhi = cast<PHINode>(I); - PHINode *innerPhi = PHINode::Create(outerPhi->getType(), phiCapacity, - outerPhi->getName() + ".lpad-body", - insertPoint); - outerPhi->replaceAllUsesWith(innerPhi); - innerPhi->addIncoming(outerPhi, OuterUnwindDest); - } - - // Create a phi for the exception value... - InnerExceptionPHI = PHINode::Create(exn->getType(), phiCapacity, - "exn.lpad-body", insertPoint); - exn->replaceAllUsesWith(InnerExceptionPHI); - selector->setArgOperand(0, exn); // restore this use - InnerExceptionPHI->addIncoming(exn, OuterUnwindDest); - - // ...and the selector. - InnerSelectorPHI = PHINode::Create(selector->getType(), phiCapacity, - "selector.lpad-body", insertPoint); - selector->replaceAllUsesWith(InnerSelectorPHI); - InnerSelectorPHI->addIncoming(selector, OuterUnwindDest); - - // All done. - return InnerUnwindDest; -} - -/// [LIBUNWIND] Try to forward the given call, which logically occurs -/// at the end of the given block, as a branch to the inner unwind -/// block. Returns true if the call was forwarded. -bool InvokeInliningInfo::forwardEHResume(CallInst *call, BasicBlock *src) { - // First, check whether this is a call to the intrinsic. - Function *fn = dyn_cast<Function>(call->getCalledValue()); - if (!fn || fn->getName() != "llvm.eh.resume") - return false; - - // At this point, we need to return true on all paths, because - // otherwise we'll construct an invoke of the intrinsic, which is - // not well-formed. - - // Try to find or make an inner unwind dest, which will fail if we - // can't find a selector call for the outer unwind dest. - BasicBlock *dest = getInnerUnwindDest(); - bool hasSelector = (dest != 0); - - // If we failed, just use the outer unwind dest, dropping the - // exception and selector on the floor. - if (!hasSelector) - dest = OuterUnwindDest; - - // Make a branch. - BranchInst::Create(dest, src); - - // Update the phis in the destination. They were inserted in an - // order which makes this work. - addIncomingPHIValuesForInto(src, dest); - - if (hasSelector) { - InnerExceptionPHI->addIncoming(call->getArgOperand(0), src); - InnerSelectorPHI->addIncoming(call->getArgOperand(1), src); - } - - return true; -} - -/// Get or create a target for the branch from ResumeInsts. -BasicBlock *InvokeInliningInfo::getInnerUnwindDestNewEH() { - // FIXME: New EH - rename this function when new EH is turned on. +/// getInnerResumeDest - Get or create a target for the branch from ResumeInsts. +BasicBlock *InvokeInliningInfo::getInnerResumeDest() { if (InnerResumeDest) return InnerResumeDest; // Split the landing pad. @@ -472,7 +141,7 @@ BasicBlock *InvokeInliningInfo::getInnerUnwindDestNewEH() { /// branch. When there is more than one predecessor, we need to split the /// landing pad block after the landingpad instruction and jump to there. 
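/// Roughly, a caller landing pad of the form
///   lpad:
///     %lp = landingpad { i8*, i32 } personality ... cleanup
///     <handler code>
/// is split after the landingpad instruction into 'lpad' and 'lpad.body';
/// every inlined 'resume' becomes a branch to 'lpad.body', and a PHI there
/// merges %lp with the forwarded resume operands.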
void InvokeInliningInfo::forwardResume(ResumeInst *RI) { - BasicBlock *Dest = getInnerUnwindDestNewEH(); + BasicBlock *Dest = getInnerResumeDest(); BasicBlock *Src = RI->getParent(); BranchInst::Create(Dest, Src); @@ -485,14 +154,6 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI) { RI->eraseFromParent(); } -/// [LIBUNWIND] Check whether this selector is "only cleanups": -/// call i32 @llvm.eh.selector(blah, blah, i32 0) -static bool isCleanupOnlySelector(EHSelectorInst *selector) { - if (selector->getNumArgOperands() != 3) return false; - ConstantInt *val = dyn_cast<ConstantInt>(selector->getArgOperand(2)); - return (val && val->isZero()); -} - /// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into /// an invoke, we have to turn all of the calls that can throw into /// invokes. This function analyze BB to see if there are any calls, and if so, @@ -507,77 +168,34 @@ static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { Instruction *I = BBI++; - if (LPI) // FIXME: New EH - This won't be NULL in the new EH. - if (LandingPadInst *L = dyn_cast<LandingPadInst>(I)) { - unsigned NumClauses = LPI->getNumClauses(); - L->reserveClauses(NumClauses); - for (unsigned i = 0; i != NumClauses; ++i) - L->addClause(LPI->getClause(i)); - } + if (LandingPadInst *L = dyn_cast<LandingPadInst>(I)) { + unsigned NumClauses = LPI->getNumClauses(); + L->reserveClauses(NumClauses); + for (unsigned i = 0; i != NumClauses; ++i) + L->addClause(LPI->getClause(i)); + } // We only need to check for function calls: inlined invoke // instructions require no special handling. CallInst *CI = dyn_cast<CallInst>(I); - if (CI == 0) continue; - - // LIBUNWIND: merge selector instructions. - if (EHSelectorInst *Inner = dyn_cast<EHSelectorInst>(CI)) { - EHSelectorInst *Outer = Invoke.getOuterSelector(); - if (!Outer) continue; - - bool innerIsOnlyCleanup = isCleanupOnlySelector(Inner); - bool outerIsOnlyCleanup = isCleanupOnlySelector(Outer); - - // If both selectors contain only cleanups, we don't need to do - // anything. TODO: this is really just a very specific instance - // of a much more general optimization. - if (innerIsOnlyCleanup && outerIsOnlyCleanup) continue; - - // Otherwise, we just append the outer selector to the inner selector. - SmallVector<Value*, 16> NewSelector; - for (unsigned i = 0, e = Inner->getNumArgOperands(); i != e; ++i) - NewSelector.push_back(Inner->getArgOperand(i)); - for (unsigned i = 2, e = Outer->getNumArgOperands(); i != e; ++i) - NewSelector.push_back(Outer->getArgOperand(i)); - - CallInst *NewInner = - IRBuilder<>(Inner).CreateCall(Inner->getCalledValue(), NewSelector); - // No need to copy attributes, calling convention, etc. - NewInner->takeName(Inner); - Inner->replaceAllUsesWith(NewInner); - Inner->eraseFromParent(); - continue; - } - + // If this call cannot unwind, don't convert it to an invoke. - if (CI->doesNotThrow()) + if (!CI || CI->doesNotThrow()) continue; - - // Convert this function call into an invoke instruction. - // First, split the basic block. + + // Convert this function call into an invoke instruction. First, split the + // basic block. BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc"); // Delete the unconditional branch inserted by splitBasicBlock BB->getInstList().pop_back(); - // LIBUNWIND: If this is a call to @llvm.eh.resume, just branch - // directly to the new landing pad. 
- if (Invoke.forwardEHResume(CI, BB)) { - // TODO: 'Split' is now unreachable; clean it up. - - // We want to leave the original call intact so that the call - // graph and other structures won't get misled. We also have to - // avoid processing the next block, or we'll iterate here forever. - return true; - } - - // Otherwise, create the new invoke instruction. + // Create the new invoke instruction. ImmutableCallSite CS(CI); SmallVector<Value*, 8> InvokeArgs(CS.arg_begin(), CS.arg_end()); - InvokeInst *II = - InvokeInst::Create(CI->getCalledValue(), Split, - Invoke.getOuterUnwindDest(), - InvokeArgs, CI->getName(), BB); + InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split, + Invoke.getOuterResumeDest(), + InvokeArgs, CI->getName(), BB); II->setCallingConv(CI->getCallingConv()); II->setAttributes(CI->getAttributes()); @@ -585,21 +203,20 @@ static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, // updates the CallGraph if present, because it uses a WeakVH. CI->replaceAllUsesWith(II); - Split->getInstList().pop_front(); // Delete the original call + // Delete the original call + Split->getInstList().pop_front(); - // Update any PHI nodes in the exceptional block to indicate that - // there is now a new entry in them. + // Update any PHI nodes in the exceptional block to indicate that there is + // now a new entry in them. Invoke.addIncomingPHIValuesFor(BB); return false; } return false; } - /// HandleInlinedInvoke - If we inlined an invoke site, we need to convert calls -/// in the body of the inlined function into invokes and turn unwind -/// instructions into branches to the invoke unwind dest. +/// in the body of the inlined function into invokes. /// /// II is the invoke instruction being inlined. FirstNewBlock is the first /// block of the inlined code (the last block is the end of the function), @@ -614,7 +231,7 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, // start of the inlined code to its end, checking for stuff we need to // rewrite. If the code doesn't have calls or unwinds, we know there is // nothing to rewrite. - if (!InlinedCodeInfo.ContainsCalls && !InlinedCodeInfo.ContainsUnwinds) { + if (!InlinedCodeInfo.ContainsCalls) { // Now that everything is happy, we have one final detail. The PHI nodes in // the exception destination block still have entries due to the original // invoke instruction. Eliminate these entries (which might even delete the @@ -628,30 +245,13 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){ if (InlinedCodeInfo.ContainsCalls) if (HandleCallsInBlockInlinedThroughInvoke(BB, Invoke)) { - // Honor a request to skip the next block. We don't need to - // consider UnwindInsts in this case either. + // Honor a request to skip the next block. ++BB; continue; } - if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) { - // An UnwindInst requires special handling when it gets inlined into an - // invoke site. Once this happens, we know that the unwind would cause - // a control transfer to the invoke exception destination, so we can - // transform it into a direct branch to the exception destination. - BranchInst::Create(InvokeDest, UI); - - // Delete the unwind instruction! - UI->eraseFromParent(); - - // Update any PHI nodes in the exceptional block to indicate that - // there is now a new entry in them. 
- Invoke.addIncomingPHIValuesFor(BB); - } - - if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) { + if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) Invoke.forwardResume(RI); - } } // Now that everything is happy, we have one final detail. The PHI nodes in @@ -836,8 +436,8 @@ static bool hasLifetimeMarkers(AllocaInst *AI) { return false; } -/// updateInlinedAtInfo - Helper function used by fixupLineNumbers to recursively -/// update InlinedAtEntry of a DebugLoc. +/// updateInlinedAtInfo - Helper function used by fixupLineNumbers to +/// recursively update InlinedAtEntry of a DebugLoc. static DebugLoc updateInlinedAtInfo(const DebugLoc &DL, const DebugLoc &InlinedAtDL, LLVMContext &Ctx) { @@ -847,16 +447,15 @@ static DebugLoc updateInlinedAtInfo(const DebugLoc &DL, return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx), NewInlinedAtDL.getAsMDNode(Ctx)); } - + return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx), InlinedAtDL.getAsMDNode(Ctx)); } - /// fixupLineNumbers - Update inlined instructions' line numbers to /// to encode location where these instructions are inlined. static void fixupLineNumbers(Function *Fn, Function::iterator FI, - Instruction *TheCall) { + Instruction *TheCall) { DebugLoc TheCallDL = TheCall->getDebugLoc(); if (TheCallDL.isUnknown()) return; @@ -878,18 +477,18 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, } } -// InlineFunction - This function inlines the called function into the basic -// block of the caller. This returns false if it is not possible to inline this -// call. The program is still in a well defined state if this occurs though. -// -// Note that this only does one level of inlining. For example, if the -// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now -// exists in the instruction stream. Similarly this will inline a recursive -// function by one level. -// -bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { +/// InlineFunction - This function inlines the called function into the basic +/// block of the caller. This returns false if it is not possible to inline +/// this call. The program is still in a well defined state if this occurs +/// though. +/// +/// Note that this only does one level of inlining. For example, if the +/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now +/// exists in the instruction stream. Similarly this will inline a recursive +/// function by one level. +bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, + bool InsertLifetime) { Instruction *TheCall = CS.getInstruction(); - LLVMContext &Context = TheCall->getContext(); assert(TheCall->getParent() && TheCall->getParent()->getParent() && "Instruction not in function!"); @@ -924,43 +523,40 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { return false; } - // Find the personality function used by the landing pads of the caller. If it - // exists, then check to see that it matches the personality function used in - // the callee. - for (Function::const_iterator - I = Caller->begin(), E = Caller->end(); I != E; ++I) + // Get the personality function from the callee if it contains a landing pad. 
+ Value *CalleePersonality = 0; + for (Function::const_iterator I = CalledFunc->begin(), E = CalledFunc->end(); + I != E; ++I) if (const InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) { const BasicBlock *BB = II->getUnwindDest(); - // FIXME: This 'isa' here should become go away once the new EH system is - // in place. - if (!isa<LandingPadInst>(BB->getFirstNonPHI())) - continue; - const LandingPadInst *LP = cast<LandingPadInst>(BB->getFirstNonPHI()); - const Value *CallerPersFn = LP->getPersonalityFn(); - - // If the personality functions match, then we can perform the - // inlining. Otherwise, we can't inline. - // TODO: This isn't 100% true. Some personality functions are proper - // supersets of others and can be used in place of the other. - for (Function::const_iterator - I = CalledFunc->begin(), E = CalledFunc->end(); I != E; ++I) - if (const InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) { - const BasicBlock *BB = II->getUnwindDest(); - // FIXME: This 'if/dyn_cast' here should become a normal 'cast' once - // the new EH system is in place. - if (const LandingPadInst *LP = - dyn_cast<LandingPadInst>(BB->getFirstNonPHI())) - if (CallerPersFn != LP->getPersonalityFn()) - return false; - break; - } - + const LandingPadInst *LP = BB->getLandingPadInst(); + CalleePersonality = LP->getPersonalityFn(); break; } + // Find the personality function used by the landing pads of the caller. If it + // exists, then check to see that it matches the personality function used in + // the callee. + if (CalleePersonality) { + for (Function::const_iterator I = Caller->begin(), E = Caller->end(); + I != E; ++I) + if (const InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) { + const BasicBlock *BB = II->getUnwindDest(); + const LandingPadInst *LP = BB->getLandingPadInst(); + + // If the personality functions match, then we can perform the + // inlining. Otherwise, we can't inline. + // TODO: This isn't 100% true. Some personality functions are proper + // supersets of others and can be used in place of the other. + if (LP->getPersonalityFn() != CalleePersonality) + return false; + + break; + } + } + // Get an iterator to the last basic block in the function, which will have // the new function inlined after it. - // Function::iterator LastBlock = &Caller->back(); // Make sure to capture all of the return instructions from the cloned @@ -987,7 +583,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { // by them explicit. However, we don't do this if the callee is readonly // or readnone, because the copy would be unneeded: the callee doesn't // modify the struct. - if (CalledFunc->paramHasAttr(ArgNo+1, Attribute::ByVal)) { + if (CS.isByValArgument(ArgNo)) { ActualArg = HandleByValArgument(ActualArg, TheCall, CalledFunc, IFI, CalledFunc->getParamAlignment(ArgNo+1)); @@ -1023,7 +619,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { // block for the callee, move them to the entry block of the caller. First // calculate which instruction they should be inserted before. We insert the // instructions at the end of the current alloca list. - // { BasicBlock::iterator InsertPoint = Caller->begin()->begin(); for (BasicBlock::iterator I = FirstNewBlock->begin(), @@ -1063,7 +658,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { // Leave lifetime markers for the static alloca's, scoping them to the // function we just inlined. 
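A minimal sketch of a caller driving the new InsertLifetime parameter; the wrapper name is hypothetical, and the InlineFunctionInfo defaults (no CallGraph, no TargetData) are assumed:

#include "llvm/Support/CallSite.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;

// Inline a single call site, but skip the llvm.lifetime.start/end markers
// that would otherwise scope the callee's static allocas in the caller.
static bool inlineWithoutLifetimeMarkers(CallSite CS) {
  InlineFunctionInfo IFI; // no CallGraph or TargetData on hand
  return InlineFunction(CS, IFI, /*InsertLifetime=*/false);
}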
- if (!IFI.StaticAllocas.empty()) { + if (InsertLifetime && !IFI.StaticAllocas.empty()) { IRBuilder<> builder(FirstNewBlock->begin()); for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) { AllocaInst *AI = IFI.StaticAllocas[ai]; @@ -1098,20 +693,6 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { for (unsigned i = 0, e = Returns.size(); i != e; ++i) { IRBuilder<>(Returns[i]).CreateCall(StackRestore, SavedPtr); } - - // Count the number of StackRestore calls we insert. - unsigned NumStackRestores = Returns.size(); - - // If we are inlining an invoke instruction, insert restores before each - // unwind. These unwinds will be rewritten into branches later. - if (InlinedFunctionInfo.ContainsUnwinds && isa<InvokeInst>(TheCall)) { - for (Function::iterator BB = FirstNewBlock, E = Caller->end(); - BB != E; ++BB) - if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) { - IRBuilder<>(UI).CreateCall(StackRestore, SavedPtr); - ++NumStackRestores; - } - } } // If we are inlining tail call instruction through a call site that isn't @@ -1131,21 +712,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { } } - // If we are inlining through a 'nounwind' call site then any inlined 'unwind' - // instructions are unreachable. - if (InlinedFunctionInfo.ContainsUnwinds && MarkNoUnwind) - for (Function::iterator BB = FirstNewBlock, E = Caller->end(); - BB != E; ++BB) { - TerminatorInst *Term = BB->getTerminator(); - if (isa<UnwindInst>(Term)) { - new UnreachableInst(Context, Term); - BB->getInstList().erase(Term); - } - } - // If we are inlining for an invoke instruction, we must make sure to rewrite - // any inlined 'unwind' instructions into branches to the invoke exception - // destination, and call instructions into invoke instructions. + // any call instructions into invoke instructions. if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) HandleInlinedInvoke(II, FirstNewBlock, InlinedFunctionInfo); @@ -1308,11 +876,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { // If we inserted a phi node, check to see if it has a single value (e.g. all // the entries are the same or undef). If so, remove the PHI so it doesn't // block other optimizations. - if (PHI) + if (PHI) { if (Value *V = SimplifyInstruction(PHI, IFI.TD)) { PHI->replaceAllUsesWith(V); PHI->eraseFromParent(); } + } return true; } diff --git a/lib/Transforms/Utils/LLVMBuild.txt b/lib/Transforms/Utils/LLVMBuild.txt new file mode 100644 index 0000000..88b2ffe --- /dev/null +++ b/lib/Transforms/Utils/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Transforms/Utils/LLVMBuild.txt ---------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = TransformUtils +parent = Transforms +required_libraries = Analysis Core IPA Support Target diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 7034feb..d1c4d59 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/DIBuilder.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Target/TargetData.h" @@ -105,33 +106,32 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { // If we are switching on a constant, we can convert the switch into a // single branch instruction! ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition()); - BasicBlock *TheOnlyDest = SI->getSuccessor(0); // The default dest + BasicBlock *TheOnlyDest = SI->getDefaultDest(); BasicBlock *DefaultDest = TheOnlyDest; - assert(TheOnlyDest == SI->getDefaultDest() && - "Default destination is not successor #0?"); // Figure out which case it goes to. - for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i) { + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { // Found case matching a constant operand? - if (SI->getSuccessorValue(i) == CI) { - TheOnlyDest = SI->getSuccessor(i); + if (i.getCaseValue() == CI) { + TheOnlyDest = i.getCaseSuccessor(); break; } // Check to see if this branch is going to the same place as the default // dest. If so, eliminate it as an explicit compare. - if (SI->getSuccessor(i) == DefaultDest) { + if (i.getCaseSuccessor() == DefaultDest) { // Remove this entry. DefaultDest->removePredecessor(SI->getParent()); SI->removeCase(i); - --i; --e; // Don't skip an entry... + --i; --e; continue; } // Otherwise, check to see if the switch only branches to one destination. // We do this by reseting "TheOnlyDest" to null when we find two non-equal // destinations. - if (SI->getSuccessor(i) != TheOnlyDest) TheOnlyDest = 0; + if (i.getCaseSuccessor() != TheOnlyDest) TheOnlyDest = 0; } if (CI && !TheOnlyDest) { @@ -165,14 +165,16 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { return true; } - if (SI->getNumSuccessors() == 2) { + if (SI->getNumCases() == 1) { // Otherwise, we can fold this switch into a conditional branch // instruction if it has only one non-default destination. + SwitchInst::CaseIt FirstCase = SI->case_begin(); Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), - SI->getSuccessorValue(1), "cond"); + FirstCase.getCaseValue(), "cond"); // Insert the new branch. - Builder.CreateCondBr(Cond, SI->getSuccessor(1), SI->getSuccessor(0)); + Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(), + SI->getDefaultDest()); // Delete the old switch. 
SI->eraseFromParent(); @@ -257,6 +259,13 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) { II->getIntrinsicID() == Intrinsic::lifetime_end) return isa<UndefValue>(II->getArgOperand(1)); } + + if (extractMallocCall(I)) return true; + + if (CallInst *CI = isFreeCall(I)) + if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0))) + return C->isNullValue() || isa<UndefValue>(C); + return false; } @@ -346,22 +355,27 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) { /// instructions in other blocks as well in this block. bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD) { bool MadeChange = false; - for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { + +#ifndef NDEBUG + // In debug builds, ensure that the terminator of the block is never replaced + // or deleted by these simplifications. The idea of simplification is that it + // cannot introduce new instructions, and there is no way to replace the + // terminator of a block without introducing a new instruction. + AssertingVH<Instruction> TerminatorVH(--BB->end()); +#endif + + for (BasicBlock::iterator BI = BB->begin(), E = --BB->end(); BI != E; ) { + assert(!BI->isTerminator()); Instruction *Inst = BI++; - - if (Value *V = SimplifyInstruction(Inst, TD)) { - WeakVH BIHandle(BI); - ReplaceAndSimplifyAllUses(Inst, V, TD); + + WeakVH BIHandle(BI); + if (recursivelySimplifyInstruction(Inst, TD)) { MadeChange = true; if (BIHandle != BI) BI = BB->begin(); continue; } - if (Inst->isTerminator()) - break; - - WeakVH BIHandle(BI); MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst); if (BIHandle != BI) BI = BB->begin(); @@ -399,17 +413,11 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, WeakVH PhiIt = &BB->front(); while (PHINode *PN = dyn_cast<PHINode>(PhiIt)) { PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt)); + Value *OldPhiIt = PhiIt; - Value *PNV = SimplifyInstruction(PN, TD); - if (PNV == 0) continue; + if (!recursivelySimplifyInstruction(PN, TD)) + continue; - // If we're able to simplify the phi to a single value, substitute the new - // value into all of its uses. - assert(PNV != PN && "SimplifyInstruction broken!"); - - Value *OldPhiIt = PhiIt; - ReplaceAndSimplifyAllUses(PN, PNV, TD); - // If recursive simplification ended up deleting the next PHI node we would // iterate to, then our iterator is invalid, restart scanning from the top // of the block. 
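A minimal standalone sketch of the case-iterator idiom these hunks adopt; the helper is hypothetical, not part of the patch:

#include "llvm/Instructions.h"
using namespace llvm;

// Return the block a switch transfers to for a known constant condition,
// walking the non-default cases with the new SwitchInst::CaseIt API.
static BasicBlock *findCaseDest(SwitchInst *SI, ConstantInt *CI) {
  for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
       i != e; ++i)
    if (i.getCaseValue() == CI)
      return i.getCaseSuccessor();
  return SI->getDefaultDest();
}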
@@ -486,22 +494,8 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) { if (Succ->getSinglePredecessor()) return true; // Make a list of the predecessors of BB - typedef SmallPtrSet<BasicBlock*, 16> BlockSet; - BlockSet BBPreds(pred_begin(BB), pred_end(BB)); - - // Use that list to make another list of common predecessors of BB and Succ - BlockSet CommonPreds; - for (pred_iterator PI = pred_begin(Succ), PE = pred_end(Succ); - PI != PE; ++PI) { - BasicBlock *P = *PI; - if (BBPreds.count(P)) - CommonPreds.insert(P); - } + SmallPtrSet<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB)); - // Shortcut, if there are no common predecessors, merging is always safe - if (CommonPreds.empty()) - return true; - // Look at all the phi nodes in Succ, to see if they present a conflict when // merging these blocks for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) { @@ -512,28 +506,28 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) { // merge the phi nodes and then the blocks can still be merged PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB)); if (BBPN && BBPN->getParent() == BB) { - for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end(); - PI != PE; PI++) { - if (BBPN->getIncomingValueForBlock(*PI) - != PN->getIncomingValueForBlock(*PI)) { + for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) { + BasicBlock *IBB = PN->getIncomingBlock(PI); + if (BBPreds.count(IBB) && + BBPN->getIncomingValueForBlock(IBB) != PN->getIncomingValue(PI)) { DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in " << Succ->getName() << " is conflicting with " << BBPN->getName() << " with regard to common predecessor " - << (*PI)->getName() << "\n"); + << IBB->getName() << "\n"); return false; } } } else { Value* Val = PN->getIncomingValueForBlock(BB); - for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end(); - PI != PE; PI++) { + for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) { // See if the incoming value for the common predecessor is equal to the // one for BB, in which case this phi node will not prevent the merging // of the block. - if (Val != PN->getIncomingValueForBlock(*PI)) { + BasicBlock *IBB = PN->getIncomingBlock(PI); + if (BBPreds.count(IBB) && Val != PN->getIncomingValue(PI)) { DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in " << Succ->getName() << " is conflicting with regard to common " - << "predecessor " << (*PI)->getName() << "\n"); + << "predecessor " << IBB->getName() << "\n"); return false; } } @@ -740,6 +734,10 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, // If there is a large requested alignment and we can, bump up the alignment // of the global. if (GV->isDeclaration()) return Align; + // If the memory we set aside for the global may not be the memory used by + // the final program then it is impossible for us to reliably enforce the + // preferred alignment. + if (GV->isWeakForLinker()) return Align; if (GV->getAlignment() >= PrefAlign) return GV->getAlignment(); @@ -764,9 +762,8 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, assert(V->getType()->isPointerTy() && "getOrEnforceKnownAlignment expects a pointer!"); unsigned BitWidth = TD ? 
TD->getPointerSizeInBits() : 64; - APInt Mask = APInt::getAllOnesValue(BitWidth); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD); + ComputeMaskedBits(V, KnownZero, KnownOne, TD); unsigned TrailZ = KnownZero.countTrailingOnes(); // Avoid trouble with rediculously large TrailZ values, such as diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index cbd54a8..0bc185d 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -99,7 +99,8 @@ namespace { bool ProcessLoop(Loop *L, LPPassManager &LPM); BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit); BasicBlock *InsertPreheaderForLoop(Loop *L); - Loop *SeparateNestedLoop(Loop *L, LPPassManager &LPM); + Loop *SeparateNestedLoop(Loop *L, LPPassManager &LPM, + BasicBlock *Preheader); BasicBlock *InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader); void PlaceSplitBlockCarefully(BasicBlock *NewBB, SmallVectorImpl<BasicBlock*> &SplitPreds, @@ -240,7 +241,7 @@ ReprocessLoop: // this for loops with a giant number of backedges, just factor them into a // common backedge instead. if (L->getNumBackEdges() < 8) { - if (SeparateNestedLoop(L, LPM)) { + if (SeparateNestedLoop(L, LPM, Preheader)) { ++NumNested; // This is a big restructuring change, reprocess the whole loop. Changed = true; @@ -265,7 +266,7 @@ ReprocessLoop: PHINode *PN; for (BasicBlock::iterator I = L->getHeader()->begin(); (PN = dyn_cast<PHINode>(I++)); ) - if (Value *V = SimplifyInstruction(PN, 0, DT)) { + if (Value *V = SimplifyInstruction(PN, 0, 0, DT)) { if (AA) AA->deleteValue(PN); if (SE) SE->forgetValue(PN); PN->replaceAllUsesWith(V); @@ -379,19 +380,27 @@ BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) { } // Split out the loop pre-header. - BasicBlock *NewBB = - SplitBlockPredecessors(Header, &OutsideBlocks[0], OutsideBlocks.size(), - ".preheader", this); + BasicBlock *PreheaderBB; + if (!Header->isLandingPad()) { + PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", + this); + } else { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(Header, OutsideBlocks, ".preheader", + ".split-lp", this, NewBBs); + PreheaderBB = NewBBs[0]; + } - NewBB->getTerminator()->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc()); - DEBUG(dbgs() << "LoopSimplify: Creating pre-header " << NewBB->getName() - << "\n"); + PreheaderBB->getTerminator()->setDebugLoc( + Header->getFirstNonPHI()->getDebugLoc()); + DEBUG(dbgs() << "LoopSimplify: Creating pre-header " + << PreheaderBB->getName() << "\n"); // Make sure that NewBB is put someplace intelligent, which doesn't mess up // code layout too horribly. 
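 // (PlaceSplitBlockCarefully tries to place the new preheader next to one
 //  of the split-off predecessors or to the loop it feeds, so the change
 //  does not wreck the fallthrough block layout.)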
- PlaceSplitBlockCarefully(NewBB, OutsideBlocks, L); + PlaceSplitBlockCarefully(PreheaderBB, OutsideBlocks, L); - return NewBB; + return PreheaderBB; } /// RewriteLoopExitBlock - Ensure that the loop preheader dominates all exit @@ -420,9 +429,7 @@ BasicBlock *LoopSimplify::RewriteLoopExitBlock(Loop *L, BasicBlock *Exit) { this, NewBBs); NewExitBB = NewBBs[0]; } else { - NewExitBB = SplitBlockPredecessors(Exit, &LoopBlocks[0], - LoopBlocks.size(), ".loopexit", - this); + NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", this); } DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " @@ -456,7 +463,7 @@ static PHINode *FindPHIToPartitionLoops(Loop *L, DominatorTree *DT, for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) { PHINode *PN = cast<PHINode>(I); ++I; - if (Value *V = SimplifyInstruction(PN, 0, DT)) { + if (Value *V = SimplifyInstruction(PN, 0, 0, DT)) { // This is a degenerate PHI already, don't modify it! PN->replaceAllUsesWith(V); if (AA) AA->deleteValue(PN); @@ -529,7 +536,16 @@ void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB, /// If we are able to separate out a loop, return the new outer loop that was /// created. /// -Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM) { +Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM, + BasicBlock *Preheader) { + // Don't try to separate loops without a preheader. + if (!Preheader) + return 0; + + // The header is not a landing pad; preheader insertion should ensure this. + assert(!L->getHeader()->isLandingPad() && + "Can't insert backedge to landing pad"); + PHINode *PN = FindPHIToPartitionLoops(L, DT, AA, LI); if (PN == 0) return 0; // No known way to partition. @@ -537,16 +553,15 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM) { // handles the case when a PHI node has multiple instances of itself as // arguments. SmallVector<BasicBlock*, 8> OuterLoopPreds; - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { if (PN->getIncomingValue(i) != PN || !L->contains(PN->getIncomingBlock(i))) { // We can't split indirectbr edges. if (isa<IndirectBrInst>(PN->getIncomingBlock(i)->getTerminator())) return 0; - OuterLoopPreds.push_back(PN->getIncomingBlock(i)); } - + } DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n"); // If ScalarEvolution is around and knows anything about values in @@ -556,9 +571,8 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM) { SE->forgetLoop(L); BasicBlock *Header = L->getHeader(); - BasicBlock *NewBB = SplitBlockPredecessors(Header, &OuterLoopPreds[0], - OuterLoopPreds.size(), - ".outer", this); + BasicBlock *NewBB = + SplitBlockPredecessors(Header, OuterLoopPreds, ".outer", this); // Make sure that NewBB is put someplace intelligent, which doesn't mess up // code layout too horribly. @@ -640,6 +654,9 @@ LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) { if (!Preheader) return 0; + // The header is not a landing pad; preheader insertion should ensure this. + assert(!Header->isLandingPad() && "Can't insert backedge to landing pad"); + // Figure out which basic blocks contain back-edges to the loop header. 
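 // (With a preheader guaranteed above, these are exactly the header's
 //  predecessors other than the preheader itself.)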
std::vector<BasicBlock*> BackedgeBlocks; for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I){ diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index 62e4fa2..e15497a 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -135,7 +135,8 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, /// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are /// available it must also preserve those analyses. bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, - unsigned TripMultiple, LoopInfo *LI, LPPassManager *LPM) { + bool AllowRuntime, unsigned TripMultiple, + LoopInfo *LI, LPPassManager *LPM) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); @@ -148,6 +149,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, return false; } + // Loops with indirectbr cannot be cloned. + if (!L->isSafeToClone()) { + DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n"); + return false; + } + BasicBlock *Header = L->getHeader(); BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); @@ -165,12 +172,6 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, return false; } - // Notify ScalarEvolution that the loop will be substantially changed, - // if not outright eliminated. - ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); - if (SE) - SE->forgetLoop(L); - if (TripCount != 0) DEBUG(dbgs() << " Trip Count = " << TripCount << "\n"); if (TripMultiple != 1) @@ -181,6 +182,11 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, if (TripCount != 0 && Count > TripCount) Count = TripCount; + // Don't enter the unroll code if there is nothing to do. This way we don't + // need to support "partial unrolling by 1". + if (TripCount == 0 && Count < 2) + return false; + assert(Count > 0); assert(TripMultiple > 0); assert(TripCount == 0 || TripCount % TripMultiple == 0); @@ -188,6 +194,20 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // Are we eliminating the loop control altogether? bool CompletelyUnroll = Count == TripCount; + // We assume a run-time trip count if the compiler cannot + // figure out the loop trip count and the unroll-runtime + // flag is specified. + bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime); + + if (RuntimeTripCount && !UnrollRuntimeLoopProlog(L, Count, LI, LPM)) + return false; + + // Notify ScalarEvolution that the loop will be substantially changed, + // if not outright eliminated. + ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); + if (SE) + SE->forgetLoop(L); + // If we know the trip count, we know the multiple... 
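A minimal sketch of requesting run-time unrolling through the widened UnrollLoop signature; the wrapper is hypothetical, and the analyses are assumed to come from the caller's pass:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
using namespace llvm;

// Unroll L by 4; with TripCount == 0 (unknown at compile time) the new
// AllowRuntime flag lets UnrollLoop emit the run-time prolog instead of
// simply refusing to unroll.
static bool unrollByFourWithRuntime(Loop *L, LoopInfo *LI,
                                    LPPassManager &LPM) {
  return UnrollLoop(L, /*Count=*/4, /*TripCount=*/0, /*AllowRuntime=*/true,
                    /*TripMultiple=*/1, LI, &LPM);
}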
unsigned BreakoutTrip = 0;
 if (TripCount != 0) {
@@ -209,6 +229,8 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
 DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
 } else if (TripMultiple != 1) {
 DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
+ } else if (RuntimeTripCount) {
+ DEBUG(dbgs() << " with run-time trip count");
 }
 DEBUG(dbgs() << "!\n");
 }
@@ -332,6 +354,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
 BasicBlock *Dest = Headers[j];
 bool NeedConditional = true;
+ if (RuntimeTripCount && j != 0) {
+ NeedConditional = false;
+ }
+
 // For a complete unroll, make the last iteration end with a branch
 // to the exit block.
 if (CompletelyUnroll && j == 0) {
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
new file mode 100644
index 0000000..3aa6bef
--- /dev/null
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -0,0 +1,372 @@
+//===-- LoopUnrollRuntime.cpp - Runtime Loop unrolling utilities ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities for loops with run-time
+// trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
+// trip counts.
+//
+// The functions in this file are used to generate extra code when the
+// run-time trip count modulo the unroll factor is not 0. When this is the
+// case, we need to generate code to execute these 'left over' iterations.
+//
+// The current strategy generates an if-then-else sequence prior to the
+// unrolled loop to execute the 'left over' iterations. Other strategies
+// include generating a loop before or after the unrolled loop.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unroll"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <algorithm>
+
+using namespace llvm;
+
+STATISTIC(NumRuntimeUnrolled,
+ "Number of loops unrolled with run-time trip counts");
+
+/// Connect the unrolling prolog code to the original loop.
+/// The unrolling prolog code contains code to execute the
+/// 'extra' iterations if the run-time trip count modulo the
+/// unroll count is non-zero.
+///
+/// This function performs the following:
+/// - Create PHI nodes at the prolog end block to combine values
+/// that exit the prolog code and jump around the prolog.
+/// - Add a PHI operand to a PHI node at the loop exit block
+/// for values that exit the prolog and go around the loop.
+/// - Branch around the original loop if the trip count is less
+/// than the unroll factor.
+///
+static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count,
+ BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
+ BasicBlock *OrigPH, BasicBlock *NewPH,
+ ValueToValueMapTy &LVMap, Pass *P) {
+ BasicBlock *Latch = L->getLoopLatch();
+ assert(Latch != 0 && "Loop must have a latch");
+
+ // Create a PHI node for each outgoing value from the original loop
+ // (which means it is an outgoing value from the prolog code too).
+ // The new PHI node is inserted in the prolog end basic block.
+ // The new PHI node is then added as an operand of a PHI node in either
+ // the loop header or the loop exit block.
+ for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch);
+ SBI != SBE; ++SBI) {
+ for (BasicBlock::iterator BBI = (*SBI)->begin();
+ PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) {
+
+ // Add a new PHI node to the prolog end block and add the
+ // appropriate incoming values.
+ PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr",
+ PrologEnd->getTerminator());
+ // Add a value to the new PHI node from the original loop preheader.
+ // This is the value that skips all the prolog code.
+ if (L->contains(PN)) {
+ NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH);
+ } else {
+ NewPN->addIncoming(Constant::getNullValue(PN->getType()), OrigPH);
+ }
+
+ Value *V = PN->getIncomingValueForBlock(Latch);
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (L->contains(I)) {
+ V = LVMap[I];
+ }
+ }
+ // Add a value to the new PHI node from the last prolog block
+ // that was created.
+ NewPN->addIncoming(V, LastPrologBB);
+
+ // Update the existing PHI node operand with the value from the
+ // new PHI node. How this is done depends on whether the existing
+ // PHI node is in the original loop block or the exit block.
+ if (L->contains(PN)) {
+ PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN);
+ } else {
+ PN->addIncoming(NewPN, PrologEnd);
+ }
+ }
+ }
+
+ // Create a branch around the original loop, which is taken if the
+ // trip count is less than the unroll factor.
+ Instruction *InsertPt = PrologEnd->getTerminator();
+ Instruction *BrLoopExit =
+ new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount,
+ ConstantInt::get(TripCount->getType(), Count));
+ BasicBlock *Exit = L->getUniqueExitBlock();
+ assert(Exit != 0 && "Loop must have a single exit block only");
+ // Split the exit to maintain loop canonicalization guarantees.
+ SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit));
+ if (!Exit->isLandingPad()) {
+ SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", P);
+ } else {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(Exit, Preds, ".unr1-lcssa", ".unr2-lcssa",
+ P, NewBBs);
+ }
+ // Add the branch to the exit block (around the unrolled loop).
+ BranchInst::Create(Exit, NewPH, BrLoopExit, InsertPt);
+ InsertPt->eraseFromParent();
+}
+
+/// Create a clone of the blocks in a loop and connect them together.
+/// This function doesn't create a clone of the loop structure.
+///
+/// There are two value maps that are defined and used. VMap is
+/// for the values in the current loop instance. LVMap contains
+/// the values from the last loop instance. We need the LVMap values
+/// to update the initial values for the current loop instance.
+/// +static void CloneLoopBlocks(Loop *L, + bool FirstCopy, + BasicBlock *InsertTop, + BasicBlock *InsertBot, + std::vector<BasicBlock *> &NewBlocks, + LoopBlocksDFS &LoopBlocks, + ValueToValueMapTy &VMap, + ValueToValueMapTy &LVMap, + LoopInfo *LI) { + + BasicBlock *Preheader = L->getLoopPreheader(); + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + Function *F = Header->getParent(); + LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO(); + LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); + // For each block in the original loop, create a new copy, + // and update the value map with the newly created values. + for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { + BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".unr", F); + NewBlocks.push_back(NewBB); + + if (Loop *ParentLoop = L->getParentLoop()) + ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + + VMap[*BB] = NewBB; + if (Header == *BB) { + // For the first block, add a CFG connection to this newly + // created block + InsertTop->getTerminator()->setSuccessor(0, NewBB); + + // Change the incoming values to the ones defined in the + // previously cloned loop. + for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { + PHINode *NewPHI = cast<PHINode>(VMap[I]); + if (FirstCopy) { + // We replace the first phi node with the value from the preheader + VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); + NewBB->getInstList().erase(NewPHI); + } else { + // Update VMap with values from the previous block + unsigned idx = NewPHI->getBasicBlockIndex(Latch); + Value *InVal = NewPHI->getIncomingValue(idx); + if (Instruction *I = dyn_cast<Instruction>(InVal)) + if (L->contains(I)) + InVal = LVMap[InVal]; + NewPHI->setIncomingValue(idx, InVal); + NewPHI->setIncomingBlock(idx, InsertTop); + } + } + } + + if (Latch == *BB) { + VMap.erase((*BB)->getTerminator()); + NewBB->getTerminator()->eraseFromParent(); + BranchInst::Create(InsertBot, NewBB); + } + } + // LastValueMap is updated with the values for the current loop + // which are used the next time this function is called. + for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); + VI != VE; ++VI) { + LVMap[VI->first] = VI->second; + } +} + +/// Insert prolog code when unrolling a loop with a +/// run-time trip count. +/// +/// This method assumes that the loop unroll factor is the total number +/// of loop bodies in the loop after unrolling. (Some folks refer +/// to the unroll factor as the number of *extra* copies added). +/// We also assume that the loop unroll factor is a power-of-two. So, after +/// unrolling the loop, the number of loop bodies executed is 2, +/// 4, 8, etc. Note - LLVM converts the if-then sequence to a switch +/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for +/// the switch instruction is generated. +/// +/// extraiters = tripcount % loopfactor +/// if (extraiters == 0) jump Loop: +/// if (extraiters == loopfactor-1) jump L1 +/// if (extraiters == loopfactor-2) jump L2 +/// ... +/// L1: LoopBody; +/// L2: LoopBody; +/// ... +/// if tripcount < loopfactor jump End +/// Loop: +/// ... +/// End: +/// +bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, + LPPassManager *LPM) { + // for now, only unroll loops that contain a single exit + if (!L->getExitingBlock()) + return false; + + // Make sure the loop is in canonical form, and there is a single + // exit block only.
+ if (!L->isLoopSimplifyForm() || L->getUniqueExitBlock() == 0) + return false; + + // Use Scalar Evolution to compute the trip count. This allows more + // loops to be unrolled than relying on induction var simplification + ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); + if (SE == 0) + return false; + + // Only unroll loops with a computable trip count and the trip count needs + // to be an int value (allowing a pointer type is a TODO item) + const SCEV *BECount = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy()) + return false; + + // Add 1 since the backedge count doesn't include the first loop iteration + const SCEV *TripCountSC = + SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1)); + if (isa<SCEVCouldNotCompute>(TripCountSC)) + return false; + + // We only handle cases when the unroll factor is a power of 2. + // Count is the loop unroll factor, the number of extra copies added + 1. + if ((Count & (Count-1)) != 0) + return false; + + // If this loop is nested, then the loop unroller changes the code in + // the parent loop, so the Scalar Evolution pass needs to be run again + if (Loop *ParentLoop = L->getParentLoop()) + SE->forgetLoop(ParentLoop); + + BasicBlock *PH = L->getLoopPreheader(); + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + // It helps to split the original preheader twice, once for the end of the + // prolog code and once for a new loop preheader + BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass()); + BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass()); + BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator()); + + // Compute the number of extra iterations required, which is: + // extra iterations = run-time trip count % loop unroll factor + SCEVExpander Expander(*SE, "loop-unroll"); + Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), + PreHeaderBR); + Type *CountTy = TripCount->getType(); + BinaryOperator *ModVal = + BinaryOperator::CreateURem(TripCount, + ConstantInt::get(CountTy, Count), + "xtraiter"); + ModVal->insertBefore(PreHeaderBR); + + // If there are no extra iterations, jump to the unrolled loop + Value *BranchVal = new ICmpInst(PreHeaderBR, + ICmpInst::ICMP_NE, ModVal, + ConstantInt::get(CountTy, 0), "lcmp"); + // Branch to either the extra iterations or the unrolled loop + // We will fix up the true branch label when adding loop body copies + BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR); + assert(PreHeaderBR->isUnconditional() && + PreHeaderBR->getSuccessor(0) == PEnd && + "CFG edges in Preheader are not correct"); + PreHeaderBR->eraseFromParent(); + + ValueToValueMapTy LVMap; + Function *F = Header->getParent(); + // These variables are used to update the CFG links in each iteration + BasicBlock *CompareBB = 0; + BasicBlock *LastLoopBB = PH; + // Get an ordered list of blocks in the loop to help with the ordering of the + // cloned blocks in the prolog code + LoopBlocksDFS LoopBlocks(L); + LoopBlocks.perform(LI); + + // + // For each extra loop iteration, create a copy of the loop's basic blocks + // and generate a condition that branches to the copy depending on the + // number of 'left over' iterations.
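To make the generated control flow concrete, the transformation corresponds to the C sketch below. This is a minimal rendering under stated assumptions: body() and n are hypothetical stand-ins, and the pass itself builds IR basic blocks rather than source code.

extern void body(unsigned i);

/* Run-time unrolling by Count == 4 (sketch): the prolog runs the
   n % 4 'left over' iterations, then the unrolled loop runs the rest. */
static void unrolled_by_4(unsigned n) {
  unsigned xtraiter = n % 4;   /* the "xtraiter" value computed above */
  unsigned i = 0;
  switch (xtraiter) {          /* the if-then-else chain; SimplifyCFG may */
  case 3: body(i++);           /* later turn it into a real switch */
  case 2: body(i++);
  case 1: body(i++);
  }
  if (n >= 4)                  /* ConnectProlog's branch around the loop */
    for (; i < n; i += 4) {    /* when the trip count is < unroll factor */
      body(i); body(i+1); body(i+2); body(i+3);
    }
}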
+ // + for (unsigned leftOverIters = Count-1; leftOverIters > 0; --leftOverIters) { + std::vector<BasicBlock*> NewBlocks; + ValueToValueMapTy VMap; + + // Clone all the basic blocks in the loop, but we don't clone the loop + // structure. This function adds the appropriate CFG connections. + CloneLoopBlocks(L, (leftOverIters == Count-1), LastLoopBB, PEnd, NewBlocks, + LoopBlocks, VMap, LVMap, LI); + LastLoopBB = cast<BasicBlock>(VMap[Latch]); + + // Insert the cloned blocks into the function just before the original loop + F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), + NewBlocks[0], F->end()); + + // Generate the code for the comparison that determines whether the loop + // prolog code needs to be executed. + if (leftOverIters == Count-1) { + // There is no compare block for the fall-through case of the last + // left-over iteration + CompareBB = NewBlocks[0]; + } else { + // Create a new block for the comparison + BasicBlock *NewBB = BasicBlock::Create(CompareBB->getContext(), "unr.cmp", + F, CompareBB); + if (Loop *ParentLoop = L->getParentLoop()) { + // Add the new block to the parent loop, if needed + ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + } + + // The comparison with the extra iteration value, and the branch + Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal, + ConstantInt::get(CountTy, leftOverIters), + "un.tmp"); + // Branch to either the extra iterations or the unrolled loop + BranchInst::Create(NewBlocks[0], CompareBB, + BranchVal, NewBB); + CompareBB = NewBB; + PH->getTerminator()->setSuccessor(0, NewBB); + VMap[NewPH] = CompareBB; + } + + // Rewrite the cloned instruction operands to use the values + // created when the clones were made. + for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { + for (BasicBlock::iterator I = NewBlocks[i]->begin(), + E = NewBlocks[i]->end(); I != E; ++I) { + RemapInstruction(I, VMap, + RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); + } + } + } + + // Connect the prolog code to the original loop and update the + // PHI nodes. + ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, LVMap, + LPM->getAsPass()); + NumRuntimeUnrolled++; + return true; +} diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp index 61ab3f6..c70ced1 100644 --- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp @@ -1,3 +1,16 @@ +//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers the 'expect' intrinsic to LLVM metadata.
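A small end-to-end illustration of what that means, in abbreviated IR; the 64/4 weights are assumed to be the pass's default likely/unlikely options:

// C source:
//   if (__builtin_expect(x == 0, 1)) hot(); else cold();
//
// Front ends emit a call to the expect intrinsic:
//   %expval = call i64 @llvm.expect.i64(i64 %conv, i64 1)
//
// This pass removes the call and attaches branch weights instead:
//   br i1 %tobool, label %if.then, label %if.else, !prof !0
//   !0 = metadata !{metadata !"branch_weights", i32 64, i32 4}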
+// +//===----------------------------------------------------------------------===// + #define DEBUG_TYPE "lower-expect-intrinsic" #include "llvm/Constants.h" #include "llvm/Function.h" @@ -60,14 +73,17 @@ bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) { LLVMContext &Context = CI->getContext(); Type *Int32Ty = Type::getInt32Ty(Context); - unsigned caseNo = SI->findCaseValue(ExpectedValue); + SwitchInst::CaseIt Case = SI->findCaseValue(ExpectedValue); std::vector<Value *> Vec; unsigned n = SI->getNumCases(); - Vec.resize(n + 1); // +1 for MDString + Vec.resize(n + 1 + 1); // +1 for MDString and +1 for default case Vec[0] = MDString::get(Context, "branch_weights"); + Vec[1] = ConstantInt::get(Int32Ty, Case == SI->case_default() ? + LikelyBranchWeight : UnlikelyBranchWeight); for (unsigned i = 0; i < n; ++i) { - Vec[i + 1] = ConstantInt::get(Int32Ty, i == caseNo ? LikelyBranchWeight : UnlikelyBranchWeight); + Vec[i + 1 + 1] = ConstantInt::get(Int32Ty, i == Case.getCaseIndex() ? + LikelyBranchWeight : UnlikelyBranchWeight); } MDNode *WeightsNode = llvm::MDNode::get(Context, Vec); diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index c96c8fc..9305554 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -54,7 +54,6 @@ using namespace llvm; STATISTIC(NumInvokes, "Number of invokes replaced"); -STATISTIC(NumUnwinds, "Number of unwinds replaced"); STATISTIC(NumSpilled, "Number of registers live across unwind edges"); static cl::opt<bool> ExpensiveEHSupport("enable-correct-eh-support", @@ -193,20 +192,6 @@ bool LowerInvoke::insertCheapEHSupport(Function &F) { BB->getInstList().erase(II); ++NumInvokes; Changed = true; - } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) { - // Insert a call to abort() - CallInst::Create(AbortFn, "", UI)->setTailCall(); - - // Insert a return instruction. This really should be a "barrier", as it - // is unreachable. - ReturnInst::Create(F.getContext(), - F.getReturnType()->isVoidTy() ? - 0 : Constant::getNullValue(F.getReturnType()), UI); - - // Remove the unwind instruction now. - BB->getInstList().erase(UI); - - ++NumUnwinds; Changed = true; } return Changed; } @@ -404,7 +389,6 @@ splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) { bool LowerInvoke::insertExpensiveEHSupport(Function &F) { SmallVector<ReturnInst*,16> Returns; - SmallVector<UnwindInst*,16> Unwinds; SmallVector<InvokeInst*,16> Invokes; UnreachableInst* UnreachablePlaceholder = 0; @@ -415,14 +399,11 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { Returns.push_back(RI); } else if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { Invokes.push_back(II); - } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) { - Unwinds.push_back(UI); } - if (Unwinds.empty() && Invokes.empty()) return false; + if (Invokes.empty()) return false; NumInvokes += Invokes.size(); - NumUnwinds += Unwinds.size(); // TODO: This is not an optimal way to do this. In particular, this always // inserts setjmp calls into the entries of functions with invoke instructions @@ -572,13 +553,6 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { CallInst::Create(AbortFn, "", TermBlock->getTerminator())->setTailCall(); - - // Replace all unwinds with a branch to the unwind handler. 
- for (unsigned i = 0, e = Unwinds.size(); i != e; ++i) { - BranchInst::Create(UnwindHandler, Unwinds[i]); - Unwinds[i]->eraseFromParent(); - } - // Replace the inserted unreachable with a branch to the unwind handler. if (UnreachablePlaceholder) { BranchInst::Create(UnwindHandler, UnreachablePlaceholder); diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index 686178c..a16130d 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -237,10 +237,10 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { unsigned numCmps = 0; // Start with "simple" cases - for (unsigned i = 1; i < SI->getNumSuccessors(); ++i) - Cases.push_back(CaseRange(SI->getSuccessorValue(i), - SI->getSuccessorValue(i), - SI->getSuccessor(i))); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) + Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(), + i.getCaseSuccessor())); + std::sort(Cases.begin(), Cases.end(), CaseCmp()); // Merge case into clusters @@ -281,7 +281,7 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { BasicBlock* Default = SI->getDefaultDest(); // If there is only the default destination, don't bother with the code below. - if (SI->getNumCases() == 1) { + if (!SI->getNumCases()) { BranchInst::Create(SI->getDefaultDest(), CurBlock); CurBlock->getInstList().erase(SI); return; diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp new file mode 100644 index 0000000..8491c55 --- /dev/null +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -0,0 +1,64 @@ +//===-- ModuleUtils.cpp - Functions to manipulate Modules -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This family of functions perform manipulations on Modules. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Module.h" +#include "llvm/Support/IRBuilder.h" + +using namespace llvm; + +static void appendToGlobalArray(const char *Array, + Module &M, Function *F, int Priority) { + IRBuilder<> IRB(M.getContext()); + FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false); + StructType *Ty = StructType::get( + IRB.getInt32Ty(), PointerType::getUnqual(FnTy), NULL); + + Constant *RuntimeCtorInit = ConstantStruct::get( + Ty, IRB.getInt32(Priority), F, NULL); + + // Get the current set of static global constructors and add the new ctor + // to the list. + SmallVector<Constant *, 16> CurrentCtors; + if (GlobalVariable * GVCtor = M.getNamedGlobal(Array)) { + if (Constant *Init = GVCtor->getInitializer()) { + unsigned n = Init->getNumOperands(); + CurrentCtors.reserve(n + 1); + for (unsigned i = 0; i != n; ++i) + CurrentCtors.push_back(cast<Constant>(Init->getOperand(i))); + } + GVCtor->eraseFromParent(); + } + + CurrentCtors.push_back(RuntimeCtorInit); + + // Create a new initializer. + ArrayType *AT = ArrayType::get(RuntimeCtorInit->getType(), + CurrentCtors.size()); + Constant *NewInit = ConstantArray::get(AT, CurrentCtors); + + // Create the new global variable and replace all uses of + // the old global variable with the new one. 
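For reference, the appending array being rebuilt here has the following shape in IR; @ctor is a hypothetical module constructor registered with the conventional default priority 65535:

@llvm.global_ctors = appending global [1 x { i32, void ()* }]
                     [{ i32, void ()* } { i32 65535, void ()* @ctor }]

Calling appendToGlobalCtors(M, F, 65535) replaces this initializer with one that has an extra { priority, function } entry for F; appendToGlobalDtors does the same for @llvm.global_dtors.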
+ (void)new GlobalVariable(M, NewInit->getType(), false, + GlobalValue::AppendingLinkage, NewInit, Array); +} + +void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority) { + appendToGlobalArray("llvm.global_ctors", M, F, Priority); +} + +void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority) { + appendToGlobalArray("llvm.global_dtors", M, F, Priority); +} diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index db3e942..2357d81 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -41,6 +41,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -66,7 +67,8 @@ struct DenseMapInfo<std::pair<BasicBlock*, unsigned> > { return EltTy(reinterpret_cast<BasicBlock*>(-2), 0U); } static unsigned getHashValue(const std::pair<BasicBlock*, unsigned> &Val) { - return DenseMapInfo<void*>::getHashValue(Val.first) + Val.second*2; + using llvm::hash_value; + return static_cast<unsigned>(hash_value(Val)); } static bool isEqual(const EltTy &LHS, const EltTy &RHS) { return LHS == RHS; @@ -423,7 +425,8 @@ void PromoteMem2Reg::run() { // Finally, after the scan, check to see if the store is all that is left. if (Info.UsingBlocks.empty()) { - // Record debuginfo for the store and remove the declaration's debuginfo. + // Record debuginfo for the store and remove the declaration's + // debuginfo. if (DbgDeclareInst *DDI = Info.DbgDeclare) { if (!DIB) DIB = new DIBuilder(*DDI->getParent()->getParent()->getParent()); @@ -590,7 +593,7 @@ void PromoteMem2Reg::run() { PHINode *PN = I->second; // If this PHI node merges one value and/or undefs, get the value. 
- if (Value *V = SimplifyInstruction(PN, 0, &DT)) { + if (Value *V = SimplifyInstruction(PN, 0, 0, &DT)) { if (AST && PN->getType()->isPointerTy()) AST->deleteValue(PN); PN->replaceAllUsesWith(V); diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index fa8061c..e60a41b 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -518,3 +518,10 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { User->eraseFromParent(); } } + +bool +LoadAndStorePromoter::isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &Insts) + const { + return std::find(Insts.begin(), Insts.end(), I) != Insts.end(); +} diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index b8c3ab4..66dd2c9 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -14,16 +14,20 @@ #define DEBUG_TYPE "simplifycfg" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" +#include "llvm/Operator.h" #include "llvm/Type.h" -#include "llvm/DerivedTypes.h" -#include "llvm/GlobalVariable.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -63,9 +67,8 @@ class SimplifyCFGOpt { bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI, IRBuilder<> &Builder); - bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder); bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder); - bool SimplifyUnwind(UnwindInst *UI, IRBuilder<> &Builder); + bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder); bool SimplifyUnreachable(UnreachableInst *UI); bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder); bool SimplifyIndirectBr(IndirectBrInst *IBI); @@ -205,6 +208,42 @@ static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, return BI->getCondition(); } +/// ComputeSpeculationCost - Compute an abstract "cost" of speculating the +/// given instruction, which is assumed to be safe to speculate. 1 means +/// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive. +static unsigned ComputeSpeculationCost(const User *I) { + assert(isSafeToSpeculativelyExecute(I) && + "Instruction is not safe to speculatively execute!"); + switch (Operator::getOpcode(I)) { + default: + // When in doubt, be conservative. + return UINT_MAX; + case Instruction::GetElementPtr: + // GEPs are cheap if all indices are constant. + if (!cast<GEPOperator>(I)->hasAllConstantIndices()) + return UINT_MAX; + return 1; + case Instruction::Load: + case Instruction::Add: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::ICmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + return 1; // These are all cheap. + + case Instruction::Call: + case Instruction::Select: + return 2; + } +} + /// DominatesMergePoint - If we have a merge point of an "if condition" as /// accepted above, return true if the specified value dominates the block.
We /// don't handle the true generality of domination here, just a special case @@ -257,46 +296,10 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // Okay, it looks like the instruction IS in the "condition". Check to // see if it's a cheap instruction to unconditionally compute, and if it // only uses stuff defined outside of the condition. If so, hoist it out. - if (!I->isSafeToSpeculativelyExecute()) + if (!isSafeToSpeculativelyExecute(I)) return false; - unsigned Cost = 0; - - switch (I->getOpcode()) { - default: return false; // Cannot hoist this out safely. - case Instruction::Load: - // We have to check to make sure there are no instructions before the - // load in its basic block, as we are going to hoist the load out to its - // predecessor. - if (PBB->getFirstNonPHIOrDbg() != I) - return false; - Cost = 1; - break; - case Instruction::GetElementPtr: - // GEPs are cheap if all indices are constant. - if (!cast<GetElementPtrInst>(I)->hasAllConstantIndices()) - return false; - Cost = 1; - break; - case Instruction::Add: - case Instruction::Sub: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::ICmp: - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - Cost = 1; - break; // These are all cheap and non-trapping instructions. - - case Instruction::Select: - Cost = 2; - break; - } + unsigned Cost = ComputeSpeculationCost(I); if (Cost > CostRemaining) return false; @@ -373,9 +376,7 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, Span = Span.inverse(); // If there are a ton of values, we don't want to make a ginormous switch. - if (Span.getSetSize().ugt(8) || Span.isEmptySet() || - // We don't handle wrapped sets yet. 
- Span.isWrappedSet()) + if (Span.getSetSize().ugt(8) || Span.isEmptySet()) return 0; for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp) @@ -430,9 +431,9 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, return 0; } - + static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) { - Instruction* Cond = 0; + Instruction *Cond = 0; if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { Cond = dyn_cast<Instruction>(SI->getCondition()); } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { @@ -479,8 +480,9 @@ GetValueEqualityComparisonCases(TerminatorInst *TI, BasicBlock*> > &Cases) { if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { Cases.reserve(SI->getNumCases()); - for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) - Cases.push_back(std::make_pair(SI->getCaseValue(i), SI->getSuccessor(i))); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) + Cases.push_back(std::make_pair(i.getCaseValue(), + i.getCaseSuccessor())); return SI->getDefaultDest(); } @@ -603,11 +605,13 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() << "Through successor TI: " << *TI); - for (unsigned i = SI->getNumCases()-1; i != 0; --i) - if (DeadCases.count(SI->getCaseValue(i))) { - SI->getSuccessor(i)->removePredecessor(TI->getParent()); + for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) { + --i; + if (DeadCases.count(i.getCaseValue())) { + i.getCaseSuccessor()->removePredecessor(TI->getParent()); SI->removeCase(i); } + } DEBUG(dbgs() << "Leaving: " << *TI << "\n"); return true; @@ -951,6 +955,20 @@ HoistTerminator: /// and a BB2 and the only successor of BB1 is BB2, hoist simple code /// (for now, restricted to a single instruction that's side effect free) from /// the BB1 into the branch block to speculatively execute it. +/// +/// Turn +/// BB: +/// %t1 = icmp +/// br i1 %t1, label %BB1, label %BB2 +/// BB1: +/// %t3 = add %t2, c +/// br label BB2 +/// BB2: +/// => +/// BB: +/// %t1 = icmp +/// %t4 = add %t2, c +/// %t3 = select i1 %t1, %t2, %t3 static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) { // Only speculatively execute a single instruction (not counting the // terminator) for now. if (I == BB1->begin()) return false; HInst = I; } - if (!HInst) - return false; + + BasicBlock *BIParent = BI->getParent(); + + // Check the instruction to be hoisted, if there is one. + if (HInst) { + // Don't hoist the instruction if it's unsafe or expensive. + if (!isSafeToSpeculativelyExecute(HInst)) + return false; + if (ComputeSpeculationCost(HInst) > PHINodeFoldingThreshold) + return false; + + // Do not hoist the instruction if any of its operands are defined but not + // used in this BB. The transformation will prevent the operand from + // being sunk into the use block. + for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end(); + i != e; ++i) { + Instruction *OpI = dyn_cast<Instruction>(*i); + if (OpI && OpI->getParent() == BIParent && + !OpI->mayHaveSideEffects() && + !OpI->isUsedInBasicBlock(BIParent)) + return false; + } + } // Be conservative for now. FP select instructions can often be expensive.
Value *BrCond = BI->getCondition(); @@ -983,130 +1022,78 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) { Invert = true; } - // Turn - // BB: - // %t1 = icmp - // br i1 %t1, label %BB1, label %BB2 - // BB1: - // %t3 = add %t2, c - // br label BB2 - // BB2: - // => - // BB: - // %t1 = icmp - // %t4 = add %t2, c - // %t3 = select i1 %t1, %t2, %t3 - switch (HInst->getOpcode()) { - default: return false; // Not safe / profitable to hoist. - case Instruction::Add: - case Instruction::Sub: - // Not worth doing for vector ops. - if (HInst->getType()->isVectorTy()) - return false; - break; - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - // Don't mess with vector operations. - if (HInst->getType()->isVectorTy()) - return false; - break; // These are all cheap and non-trapping instructions. - } - - // If the instruction is obviously dead, don't try to predicate it. - if (HInst->use_empty()) { - HInst->eraseFromParent(); - return true; + // Collect interesting PHIs, and scan for hazards. + SmallSetVector<std::pair<Value *, Value *>, 4> PHIs; + BasicBlock *BB2 = BB1->getTerminator()->getSuccessor(0); + for (BasicBlock::iterator I = BB2->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) { + Value *BB1V = PN->getIncomingValueForBlock(BB1); + Value *BIParentV = PN->getIncomingValueForBlock(BIParent); + + // Skip PHIs which are trivial. + if (BB1V == BIParentV) + continue; + + // Check for safety. + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BB1V)) { + // An unfolded ConstantExpr could end up getting expanded into + // Instructions. Don't speculate this and another instruction at + // the same time. + if (HInst) + return false; + if (!isSafeToSpeculativelyExecute(CE)) + return false; + if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold) + return false; + } + + // Ok, we may insert a select for this PHI. + PHIs.insert(std::make_pair(BB1V, BIParentV)); } - // Can we speculatively execute the instruction? And what is the value - // if the condition is false? Consider the phi uses, if the incoming value - // from the "if" block are all the same V, then V is the value of the - // select if the condition is false. - BasicBlock *BIParent = BI->getParent(); - SmallVector<PHINode*, 4> PHIUses; - Value *FalseV = NULL; + // If there are no PHIs to process, bail early. This helps ensure idempotence + // as well. + if (PHIs.empty()) + return false; - BasicBlock *BB2 = BB1->getTerminator()->getSuccessor(0); - for (Value::use_iterator UI = HInst->use_begin(), E = HInst->use_end(); - UI != E; ++UI) { - // Ignore any user that is not a PHI node in BB2. These can only occur in - // unreachable blocks, because they would not be dominated by the instr. - PHINode *PN = dyn_cast<PHINode>(*UI); - if (!PN || PN->getParent() != BB2) - return false; - PHIUses.push_back(PN); - - Value *PHIV = PN->getIncomingValueForBlock(BIParent); - if (!FalseV) - FalseV = PHIV; - else if (FalseV != PHIV) - return false; // Inconsistent value when condition is false. - } - - assert(FalseV && "Must have at least one user, and it must be a PHI"); - - // Do not hoist the instruction if any of its operands are defined but not - // used in this BB. The transformation will prevent the operand from - // being sunk into the use block.
- for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end(); - i != e; ++i) { - Instruction *OpI = dyn_cast<Instruction>(*i); - if (OpI && OpI->getParent() == BIParent && - !OpI->isUsedInBasicBlock(BIParent)) - return false; - } + // If we get here, we can hoist the instruction and if-convert. + DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *BB1 << "\n";); - // If we get here, we can hoist the instruction. Try to place it - // before the icmp instruction preceding the conditional branch. - BasicBlock::iterator InsertPos = BI; - if (InsertPos != BIParent->begin()) - --InsertPos; - // Skip debug info between condition and branch. - while (InsertPos != BIParent->begin() && isa<DbgInfoIntrinsic>(InsertPos)) - --InsertPos; - if (InsertPos == BrCond && !isa<PHINode>(BrCond)) { - SmallPtrSet<Instruction *, 4> BB1Insns; - for(BasicBlock::iterator BB1I = BB1->begin(), BB1E = BB1->end(); - BB1I != BB1E; ++BB1I) - BB1Insns.insert(BB1I); - for(Value::use_iterator UI = BrCond->use_begin(), UE = BrCond->use_end(); - UI != UE; ++UI) { - Instruction *Use = cast<Instruction>(*UI); - if (!BB1Insns.count(Use)) continue; - - // If BrCond uses the instruction that place it just before - // branch instruction. - InsertPos = BI; - break; - } - } else - InsertPos = BI; - BIParent->getInstList().splice(InsertPos, BB1->getInstList(), HInst); + // Hoist the instruction. + if (HInst) + BIParent->getInstList().splice(BI, BB1->getInstList(), HInst); - // Create a select whose true value is the speculatively executed value and - // false value is the previously determined FalseV. + // Insert selects and rewrite the PHI operands. IRBuilder<true, NoFolder> Builder(BI); - SelectInst *SI; - if (Invert) - SI = cast<SelectInst> - (Builder.CreateSelect(BrCond, FalseV, HInst, - FalseV->getName() + "." + HInst->getName())); - else - SI = cast<SelectInst> - (Builder.CreateSelect(BrCond, HInst, FalseV, - HInst->getName() + "." + FalseV->getName())); - - // Make the PHI node use the select for all incoming values for "then" and - // "if" blocks. - for (unsigned i = 0, e = PHIUses.size(); i != e; ++i) { - PHINode *PN = PHIUses[i]; - for (unsigned j = 0, ee = PN->getNumIncomingValues(); j != ee; ++j) - if (PN->getIncomingBlock(j) == BB1 || PN->getIncomingBlock(j) == BIParent) - PN->setIncomingValue(j, SI); + for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { + Value *TrueV = PHIs[i].first; + Value *FalseV = PHIs[i].second; + + // Create a select whose true value is the speculatively executed value and + // false value is the previously determined FalseV. + SelectInst *SI; + if (Invert) + SI = cast<SelectInst> + (Builder.CreateSelect(BrCond, FalseV, TrueV, + FalseV->getName() + "." + TrueV->getName())); + else + SI = cast<SelectInst> + (Builder.CreateSelect(BrCond, TrueV, FalseV, + TrueV->getName() + "." + FalseV->getName())); + + // Make the PHI node use the select for all incoming values for "then" and + // "if" blocks. 
+ for (BasicBlock::iterator I = BB2->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) { + unsigned BB1I = PN->getBasicBlockIndex(BB1); + unsigned BIParentI = PN->getBasicBlockIndex(BIParent); + Value *BB1V = PN->getIncomingValue(BB1I); + Value *BIParentV = PN->getIncomingValue(BIParentI); + if (TrueV == BB1V && FalseV == BIParentV) { + PN->setIncomingValue(BB1I, SI); + PN->setIncomingValue(BIParentI, SI); + } + } } ++NumSpeculations; @@ -1461,6 +1448,49 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, return true; } +/// ExtractBranchMetadata - Given a conditional branch instruction, retrieve the +/// probabilities of the branch taking each edge. Fills in the two APInt +/// parameters and returns true, or returns false if the metadata is missing +/// or invalid. +static bool ExtractBranchMetadata(BranchInst *BI, + APInt &ProbTrue, APInt &ProbFalse) { + assert(BI->isConditional() && + "Looking for probabilities on unconditional branch?"); + MDNode *ProfileData = BI->getMetadata(LLVMContext::MD_prof); + if (!ProfileData || ProfileData->getNumOperands() != 3) return false; + ConstantInt *CITrue = dyn_cast<ConstantInt>(ProfileData->getOperand(1)); + ConstantInt *CIFalse = dyn_cast<ConstantInt>(ProfileData->getOperand(2)); + if (!CITrue || !CIFalse) return false; + ProbTrue = CITrue->getValue(); + ProbFalse = CIFalse->getValue(); + assert(ProbTrue.getBitWidth() == 32 && ProbFalse.getBitWidth() == 32 && + "Branch probability metadata must be 32-bit integers"); + return true; +} + +/// MultiplyAndLosePrecision - Multiplies A and B, then returns the result. In +/// the event of overflow, logically shifts all four inputs right until the +/// multiply fits. +static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D, + unsigned &BitsLost) { + BitsLost = 0; + bool Overflow = false; + APInt Result = A.umul_ov(B, Overflow); + if (Overflow) { + APInt MaxB = APInt::getMaxValue(A.getBitWidth()).udiv(A); + do { + B = B.lshr(1); + ++BitsLost; + } while (B.ugt(MaxB)); + A = A.lshr(BitsLost); + C = C.lshr(BitsLost); + D = D.lshr(BitsLost); + Result = A * B; + } + return Result; +} + + /// FoldBranchToCommonDest - If this basic block is simple enough, and if a /// predecessor branches to us and one of our successors, fold the block into /// the predecessor and use logical operations to pick the right destination. @@ -1479,7 +1509,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt; - + // Allow a single instruction to be hoisted in addition to the compare // that feeds the branch.
We later ensure that any values that _it_ uses // were also live in the predecessor, so that we don't unnecessarily create @@ -1487,7 +1517,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { Instruction *BonusInst = 0; if (&*FrontIt != Cond && FrontIt->hasOneUse() && *FrontIt->use_begin() == Cond && - FrontIt->isSafeToSpeculativelyExecute()) { + isSafeToSpeculativelyExecute(FrontIt)) { BonusInst = &*FrontIt; ++FrontIt; @@ -1557,7 +1587,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { SmallPtrSet<Value*, 4> UsedValues; for (Instruction::op_iterator OI = BonusInst->op_begin(), OE = BonusInst->op_end(); OI != OE; ++OI) { - Value* V = *OI; + Value *V = *OI; if (!isa<Constant>(V)) UsedValues.insert(V); } @@ -1602,10 +1632,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { } PBI->setCondition(NewCond); - BasicBlock *OldTrue = PBI->getSuccessor(0); - BasicBlock *OldFalse = PBI->getSuccessor(1); - PBI->setSuccessor(0, OldFalse); - PBI->setSuccessor(1, OldTrue); + PBI->swapSuccessors(); } // If we have a bonus inst, clone it into the predecessor block. @@ -1638,6 +1665,94 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { PBI->setSuccessor(1, FalseDest); } + // TODO: If BB is reachable from all paths through PredBlock, then we + // could replace PBI's branch probabilities with BI's. + + // Merge probability data into PredBlock's branch. + APInt A, B, C, D; + if (ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) { + // Given IR which does: + // bbA: + // br i1 %x, label %bbB, label %bbC + // bbB: + // br i1 %y, label %bbD, label %bbC + // Let's call the probability that we take the edge from %bbA to %bbB + // 'a', from %bbA to %bbC, 'b', from %bbB to %bbD 'c' and from %bbB to + // %bbC probability 'd'. + // + // We transform the IR into: + // bbA: + // br i1 %z, label %bbD, label %bbC + // where the probability of going to %bbD is (a*c) and going to bbC is + // (b+a*d). + // + // Probabilities aren't stored as ratios directly. Using branch weights, + // we get: + // (a*c)% = A*C, (b+(a*d))% = A*D+B*C+B*D. + + // In the event of overflow, we want to drop the LSB of the input + // probabilities. + unsigned BitsLost; + + // Ignore overflow result on ProbTrue. + APInt ProbTrue = MultiplyAndLosePrecision(A, C, B, D, BitsLost); + + APInt Tmp1 = MultiplyAndLosePrecision(B, D, A, C, BitsLost); + if (BitsLost) { + ProbTrue = ProbTrue.lshr(BitsLost*2); + } + + APInt Tmp2 = MultiplyAndLosePrecision(A, D, C, B, BitsLost); + if (BitsLost) { + ProbTrue = ProbTrue.lshr(BitsLost*2); + Tmp1 = Tmp1.lshr(BitsLost*2); + } + + APInt Tmp3 = MultiplyAndLosePrecision(B, C, A, D, BitsLost); + if (BitsLost) { + ProbTrue = ProbTrue.lshr(BitsLost*2); + Tmp1 = Tmp1.lshr(BitsLost*2); + Tmp2 = Tmp2.lshr(BitsLost*2); + } + + bool Overflow1 = false, Overflow2 = false; + APInt Tmp4 = Tmp2.uadd_ov(Tmp3, Overflow1); + APInt ProbFalse = Tmp4.uadd_ov(Tmp1, Overflow2); + + if (Overflow1 || Overflow2) { + ProbTrue = ProbTrue.lshr(1); + Tmp1 = Tmp1.lshr(1); + Tmp2 = Tmp2.lshr(1); + Tmp3 = Tmp3.lshr(1); + Tmp4 = Tmp2 + Tmp3; + ProbFalse = Tmp4 + Tmp1; + } + + // The sum of branch weights must fit in 32-bits. + if (ProbTrue.isNegative() && ProbFalse.isNegative()) { + ProbTrue = ProbTrue.lshr(1); + ProbFalse = ProbFalse.lshr(1); + } + + if (ProbTrue != ProbFalse) { + // Normalize the result. 
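A worked instance of the weight-merging formula above may help. Suppose PBI carries weights C = 3, D = 1 (so the edge into this block has probability 3/4) and BI carries A = 1, B = 1 (each successor 1/2). Then ProbTrue = A*C = 3 and ProbFalse = A*D + B*C + B*D = 1 + 3 + 1 = 5, so the merged branch is taken 3/8 of the time, matching a*c = (3/4)*(1/2). The numbers are illustrative only; with weights this small none of the overflow shifting above kicks in, and the GCD normalization below leaves 3 and 5 unchanged.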
+ APInt GCD = APIntOps::GreatestCommonDivisor(ProbTrue, ProbFalse); + ProbTrue = ProbTrue.udiv(GCD); + ProbFalse = ProbFalse.udiv(GCD); + + LLVMContext &Context = BI->getContext(); + Value *Ops[3]; + Ops[0] = BI->getMetadata(LLVMContext::MD_prof)->getOperand(0); + Ops[1] = ConstantInt::get(Context, ProbTrue); + Ops[2] = ConstantInt::get(Context, ProbFalse); + PBI->setMetadata(LLVMContext::MD_prof, MDNode::get(Context, Ops)); + } else { + PBI->setMetadata(LLVMContext::MD_prof, NULL); + } + } else { + PBI->setMetadata(LLVMContext::MD_prof, NULL); + } + // Copy any debug value intrinsics into the end of PredBlock. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) if (isa<DbgInfoIntrinsic>(*I)) @@ -1894,8 +2009,8 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) { // Find the relevant condition and destinations. Value *Condition = Select->getCondition(); - BasicBlock *TrueBB = SI->getSuccessor(SI->findCaseValue(TrueVal)); - BasicBlock *FalseBB = SI->getSuccessor(SI->findCaseValue(FalseVal)); + BasicBlock *TrueBB = SI->findCaseValue(TrueVal).getCaseSuccessor(); + BasicBlock *FalseBB = SI->findCaseValue(FalseVal).getCaseSuccessor(); // Perform the actual simplification. return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB); @@ -1979,7 +2094,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, // Ok, the block is reachable from the default dest. If the constant we're // comparing exists in one of the other edges, then we can constant fold ICI // and zap it. - if (SI->findCaseValue(Cst) != 0) { + if (SI->findCaseValue(Cst) != SI->case_default()) { Value *V; if (ICI->getPredicate() == ICmpInst::ICMP_EQ) V = ConstantInt::getFalse(BB->getContext()); @@ -2235,52 +2350,6 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { return false; } -bool SimplifyCFGOpt::SimplifyUnwind(UnwindInst *UI, IRBuilder<> &Builder) { - // Check to see if the first instruction in this block is just an unwind. - // If so, replace any invoke instructions which use this as an exception - // destination with call instructions. - BasicBlock *BB = UI->getParent(); - if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false; - - bool Changed = false; - SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB)); - while (!Preds.empty()) { - BasicBlock *Pred = Preds.back(); - InvokeInst *II = dyn_cast<InvokeInst>(Pred->getTerminator()); - if (II && II->getUnwindDest() == BB) { - // Insert a new branch instruction before the invoke, because this - // is now a fall through. - Builder.SetInsertPoint(II); - BranchInst *BI = Builder.CreateBr(II->getNormalDest()); - Pred->getInstList().remove(II); // Take out of symbol table - - // Insert the call now. - SmallVector<Value*,8> Args(II->op_begin(), II->op_end()-3); - Builder.SetInsertPoint(BI); - CallInst *CI = Builder.CreateCall(II->getCalledValue(), - Args, II->getName()); - CI->setCallingConv(II->getCallingConv()); - CI->setAttributes(II->getAttributes()); - // If the invoke produced a value, the Call now does instead. - II->replaceAllUsesWith(CI); - delete II; - Changed = true; - } - - Preds.pop_back(); - } - - // If this block is now dead (and isn't the entry block), remove it. - if (pred_begin(BB) == pred_end(BB) && - BB != &BB->getParent()->getEntryBlock()) { - // We know there are no successors, so just nuke the block. 
- BB->eraseFromParent(); - return true; - } - - return Changed; -} - bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { BasicBlock *BB = UI->getParent(); @@ -2352,8 +2421,9 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { } } } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { - for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) - if (SI->getSuccessor(i) == BB) { + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) + if (i.getCaseSuccessor() == BB) { BB->removePredecessor(SI->getParent()); SI->removeCase(i); --i; --e; } // If the default value is unreachable, figure out the most popular // destination and make it the default. - if (SI->getSuccessor(0) == BB) { + if (SI->getDefaultDest() == BB) { std::map<BasicBlock*, std::pair<unsigned, unsigned> > Popularity; - for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) { - std::pair<unsigned, unsigned>& entry = - Popularity[SI->getSuccessor(i)]; + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + std::pair<unsigned, unsigned> &entry = + Popularity[i.getCaseSuccessor()]; if (entry.first == 0) { entry.first = 1; - entry.second = i; + entry.second = i.getCaseIndex(); } else { entry.first++; } @@ -2390,7 +2461,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { if (MaxBlock) { // Make this the new default, allowing us to delete any explicit // edges to it. - SI->setSuccessor(0, MaxBlock); + SI->setDefaultDest(MaxBlock); Changed = true; // If MaxBlock has phinodes in it, remove MaxPop-1 entries from @@ -2399,8 +2470,9 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { for (unsigned i = 0; i != MaxPop-1; ++i) MaxBlock->removePredecessor(SI->getParent()); - for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) - if (SI->getSuccessor(i) == MaxBlock) { + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) + if (i.getCaseSuccessor() == MaxBlock) { SI->removeCase(i); --i; --e; } @@ -2442,17 +2514,19 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { /// TurnSwitchRangeIntoICmp - Turns a switch that contains only an /// integer range comparison into a sub, an icmp and a branch. static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { - assert(SI->getNumCases() > 2 && "Degenerate switch?"); + assert(SI->getNumCases() > 1 && "Degenerate switch?"); // Make sure all cases point to the same destination and gather the values. SmallVector<ConstantInt *, 16> Cases; - Cases.push_back(SI->getCaseValue(1)); - for (unsigned I = 2, E = SI->getNumCases(); I != E; ++I) { - if (SI->getSuccessor(I-1) != SI->getSuccessor(I)) + SwitchInst::CaseIt I = SI->case_begin(); + Cases.push_back(I.getCaseValue()); + SwitchInst::CaseIt PrevI = I++; + for (SwitchInst::CaseIt E = SI->case_end(); I != E; PrevI = I++) { + if (PrevI.getCaseSuccessor() != I.getCaseSuccessor()) return false; - Cases.push_back(SI->getCaseValue(I)); + Cases.push_back(I.getCaseValue()); } - assert(Cases.size() == SI->getNumCases()-1 && "Not all cases gathered"); + assert(Cases.size() == SI->getNumCases() && "Not all cases gathered"); // Sort the case values, then check if they form a range we can transform.
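Roughly, for a switch whose cases 3, 4 and 5 all branch to the same block, this rewrite produces the following (abbreviated IR, hypothetical value names):

  switch i32 %x, label %default [
    i32 3, label %dest
    i32 4, label %dest
    i32 5, label %dest
  ]
=>
  %x.off = add i32 %x, -3
  %switch = icmp ult i32 %x.off, 3
  br i1 %switch, label %dest, label %default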
array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate); @@ -2462,18 +2536,19 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { } Constant *Offset = ConstantExpr::getNeg(Cases.back()); - Constant *NumCases = ConstantInt::get(Offset->getType(), SI->getNumCases()-1); + Constant *NumCases = ConstantInt::get(Offset->getType(), SI->getNumCases()); Value *Sub = SI->getCondition(); if (!Offset->isNullValue()) Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off"); Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch"); - Builder.CreateCondBr(Cmp, SI->getSuccessor(1), SI->getDefaultDest()); + Builder.CreateCondBr( + Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest()); // Prune obsolete incoming values off the successor's PHI nodes. - for (BasicBlock::iterator BBI = SI->getSuccessor(1)->begin(); + for (BasicBlock::iterator BBI = SI->case_begin().getCaseSuccessor()->begin(); isa<PHINode>(BBI); ++BBI) { - for (unsigned I = 0, E = SI->getNumCases()-2; I != E; ++I) + for (unsigned I = 0, E = SI->getNumCases()-1; I != E; ++I) cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); } SI->eraseFromParent(); @@ -2487,24 +2562,26 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI) { Value *Cond = SI->getCondition(); unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth(); APInt KnownZero(Bits, 0), KnownOne(Bits, 0); - ComputeMaskedBits(Cond, APInt::getAllOnesValue(Bits), KnownZero, KnownOne); + ComputeMaskedBits(Cond, KnownZero, KnownOne); // Gather dead cases. SmallVector<ConstantInt*, 8> DeadCases; - for (unsigned I = 1, E = SI->getNumCases(); I != E; ++I) { - if ((SI->getCaseValue(I)->getValue() & KnownZero) != 0 || - (SI->getCaseValue(I)->getValue() & KnownOne) != KnownOne) { - DeadCases.push_back(SI->getCaseValue(I)); + for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E; ++I) { + if ((I.getCaseValue()->getValue() & KnownZero) != 0 || + (I.getCaseValue()->getValue() & KnownOne) != KnownOne) { + DeadCases.push_back(I.getCaseValue()); DEBUG(dbgs() << "SimplifyCFG: switch case '" - << SI->getCaseValue(I)->getValue() << "' is dead.\n"); + << I.getCaseValue() << "' is dead.\n"); } } // Remove dead cases from the switch. for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) { - unsigned Case = SI->findCaseValue(DeadCases[I]); + SwitchInst::CaseIt Case = SI->findCaseValue(DeadCases[I]); + assert(Case != SI->case_default() && + "Case was not found. Probably mistake in DeadCases forming."); // Prune unused values from PHI nodes. - SI->getSuccessor(Case)->removePredecessor(SI->getParent()); + Case.getCaseSuccessor()->removePredecessor(SI->getParent()); SI->removeCase(Case); } @@ -2553,9 +2630,9 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) { typedef DenseMap<PHINode*, SmallVector<int,4> > ForwardingNodesMap; ForwardingNodesMap ForwardingNodes; - for (unsigned I = 1; I < SI->getNumCases(); ++I) { // 0 is the default case. 
- ConstantInt *CaseValue = SI->getCaseValue(I); - BasicBlock *CaseDest = SI->getSuccessor(I); + for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E; ++I) { + ConstantInt *CaseValue = I.getCaseValue(); + BasicBlock *CaseDest = I.getCaseSuccessor(); int PhiIndex; PHINode *PHI = FindPHIForConditionForwarding(CaseValue, CaseDest, @@ -2676,8 +2753,8 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) { for (++I; isa<DbgInfoIntrinsic>(I); ++I) ; - if (I->isTerminator() - && TryToSimplifyUncondBranchWithICmpInIt(ICI, TD, Builder)) + if (I->isTerminator() && + TryToSimplifyUncondBranchWithICmpInIt(ICI, TD, Builder)) return true; } @@ -2720,6 +2797,12 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (SimplifyBranchOnICmpChain(BI, TD, Builder)) return true; + // If this basic block is ONLY a compare and a branch, and if a predecessor + // branches to us and one of our successors, fold the comparison into the + // predecessor and use logical operations to pick the right destination. + if (FoldBranchToCommonDest(BI)) + return SimplifyCFG(BB) | true; + // We have a conditional branch to two blocks that are only reachable // from BI. We know that the condbr dominates the two blocks, so see if // there is any identical code in the "then" and "else" blocks. If so, we @@ -2754,12 +2837,6 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (FoldCondBranchOnPHI(BI, TD)) return SimplifyCFG(BB) | true; - // If this basic block is ONLY a setcc and a branch, and if a predecessor - // branches to us and one of our successors, fold the setcc into the - // predecessor and use logical operations to pick the right destination. - if (FoldBranchToCommonDest(BI)) - return SimplifyCFG(BB) | true; - // Scan predecessor blocks for conditional branches. for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) @@ -2809,7 +2886,7 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { } /// If BB has an incoming value that will always trigger undefined behavior -/// (eg. null pointer derefence), remove the branch leading here. +/// (eg. null pointer dereference), remove the branch leading here. 
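The pattern in question, abbreviated to its essentials: the phi's null incoming value is dereferenced, so the edge from %pred cannot be taken in any well-defined execution and the branch that creates it can be removed.

  bb:
    %p = phi i32* [ null, %pred ], [ %q, %other ]
    %v = load i32* %p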
static bool removeUndefIntroducingPredecessor(BasicBlock *BB) { for (BasicBlock::iterator i = BB->begin(); PHINode *PHI = dyn_cast<PHINode>(i); ++i) @@ -2883,17 +2960,15 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { } else { if (SimplifyCondBranch(BI, Builder)) return true; } - } else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) { - if (SimplifyResume(RI, Builder)) return true; } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { if (SimplifyReturn(RI, Builder)) return true; + } else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) { + if (SimplifyResume(RI, Builder)) return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { if (SimplifySwitch(SI, Builder)) return true; } else if (UnreachableInst *UI = dyn_cast<UnreachableInst>(BB->getTerminator())) { if (SimplifyUnreachable(UI)) return true; - } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) { - if (SimplifyUnwind(UI, Builder)) return true; } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(BB->getTerminator())) { if (SimplifyIndirectBr(IBI)) return true; diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index 76289c0..4030bef 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -46,7 +46,6 @@ namespace { LoopInfo *LI; DominatorTree *DT; ScalarEvolution *SE; - IVUsers *IU; // NULL for DisableIVRewrite const TargetData *TD; // May be NULL SmallVectorImpl<WeakVH> &DeadInsts; @@ -59,7 +58,6 @@ namespace { L(Loop), LI(LPM->getAnalysisIfAvailable<LoopInfo>()), SE(SE), - IU(IVU), TD(LPM->getAnalysisIfAvailable<TargetData>()), DeadInsts(Dead), Changed(false) { @@ -107,8 +105,8 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) // Attempt to fold a binary operator with constant operand. // e.g. ((I + 1) >> 2) => I >> 2 - if (IVOperand->getNumOperands() != 2 || - !isa<ConstantInt>(IVOperand->getOperand(1))) + if (!isa<BinaryOperator>(IVOperand) + || !isa<ConstantInt>(IVOperand->getOperand(1))) return 0; IVSrc = IVOperand->getOperand(0); @@ -229,11 +227,6 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem, Rem->replaceAllUsesWith(Sel); } - // Inform IVUsers about the new users. - if (IU) { - if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) - IU->AddUsersIfInteresting(I); - } DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); ++NumElimRem; Changed = true; @@ -375,6 +368,8 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) { namespace llvm { +void IVVisitor::anchor() { } + /// simplifyUsersOfIV - Simplify instructions that use this induction variable /// by using ScalarEvolution to analyze the IV's recurrence. bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM, @@ -397,36 +392,4 @@ bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, LPPassManager *LPM, return Changed; } -/// simplifyIVUsers - Perform simplification on instructions recorded by the -/// IVUsers pass. -/// -/// This is the old approach to IV simplification to be replaced by -/// SimplifyLoopIVs. -bool simplifyIVUsers(IVUsers *IU, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl<WeakVH> &Dead) { - SimplifyIndvar SIV(IU->getLoop(), SE, LPM, Dead); - - // Each round of simplification involves a round of eliminating operations - // followed by a round of widening IVs. A single IVUsers worklist is used - // across all rounds. The inner loop advances the user. 
If widening exposes - // more uses, then another pass through the outer loop is triggered. - for (IVUsers::iterator I = IU->begin(); I != IU->end(); ++I) { - Instruction *UseInst = I->getUser(); - Value *IVOperand = I->getOperandValToReplace(); - - if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - SIV.eliminateIVComparison(ICmp, IVOperand); - continue; - } - if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { - bool IsSigned = Rem->getOpcode() == Instruction::SRem; - if (IsSigned || Rem->getOpcode() == Instruction::URem) { - SIV.eliminateIVRemainder(Rem, IVOperand, IsSigned); - continue; - } - } - } - return SIV.hasChanged(); -} - } // namespace llvm diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp index ac005f9..81eb9e0 100644 --- a/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -39,12 +40,14 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfo>(); } /// runOnFunction - Remove instructions that simplify. bool runOnFunction(Function &F) { const DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; bool Changed = false; @@ -60,7 +63,7 @@ namespace { continue; // Don't waste time simplifying unused instructions. if (!I->use_empty()) - if (Value *V = SimplifyInstruction(I, TD, DT)) { + if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) { // Mark all uses for resimplification next time round the loop. for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; ++UI) @@ -84,8 +87,11 @@ namespace { } char InstSimplifier::ID = 0; -INITIALIZE_PASS(InstSimplifier, "instsimplify", "Remove redundant instructions", - false, false) +INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify", + "Remove redundant instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(InstSimplifier, "instsimplify", + "Remove redundant instructions", false, false) char &llvm::InstructionSimplifierID = InstSimplifier::ID; // Public interface to the simplify instructions pass. diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 46d4ada..b1cad06 100644 --- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -50,33 +50,13 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { // return. // std::vector<BasicBlock*> ReturningBlocks; - std::vector<BasicBlock*> UnwindingBlocks; std::vector<BasicBlock*> UnreachableBlocks; for(Function::iterator I = F.begin(), E = F.end(); I != E; ++I) if (isa<ReturnInst>(I->getTerminator())) ReturningBlocks.push_back(I); - else if (isa<UnwindInst>(I->getTerminator())) - UnwindingBlocks.push_back(I); else if (isa<UnreachableInst>(I->getTerminator())) UnreachableBlocks.push_back(I); - // Handle unwinding blocks first. 
-  if (UnwindingBlocks.empty()) {
-    UnwindBlock = 0;
-  } else if (UnwindingBlocks.size() == 1) {
-    UnwindBlock = UnwindingBlocks.front();
-  } else {
-    UnwindBlock = BasicBlock::Create(F.getContext(), "UnifiedUnwindBlock", &F);
-    new UnwindInst(F.getContext(), UnwindBlock);
-
-    for (std::vector<BasicBlock*>::iterator I = UnwindingBlocks.begin(),
-           E = UnwindingBlocks.end(); I != E; ++I) {
-      BasicBlock *BB = *I;
-      BB->getInstList().pop_back();  // Remove the unwind insn
-      BranchInst::Create(UnwindBlock, BB);
-    }
-  }
-
-  // Then unreachable blocks.
   if (UnreachableBlocks.empty()) {
     UnreachableBlock = 0;
   } else if (UnreachableBlocks.size() == 1) {
     UnreachableBlock = UnreachableBlocks.front();
   } else {
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
new file mode 100644
index 0000000..286b54f
--- /dev/null
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -0,0 +1,1907 @@
+//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a basic-block vectorization pass. The algorithm was
+// inspired by that used by the Vienna MAP Vectorizer by Franchetti and Kral,
+// et al. It works by looking for chains of pairable operations and then
+// pairing them.
+//
+//===----------------------------------------------------------------------===//
+
+#define BBV_NAME "bb-vectorize"
+#define DEBUG_TYPE BBV_NAME
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+static cl::opt<unsigned>
+ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
+  cl::desc("The required chain depth for vectorization"));
+
+static cl::opt<unsigned>
+SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
+  cl::desc("The maximum search distance for instruction pairs"));
+
+static cl::opt<bool>
+SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
+  cl::desc("Replicating one element to a pair breaks the chain"));
+
+static cl::opt<unsigned>
+VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
+  cl::desc("The size of the native vector registers"));
+
+static cl::opt<unsigned>
+MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
+  cl::desc("The maximum number of pairing iterations"));
+
+static cl::opt<unsigned>
+MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
+  cl::desc("The maximum number of pairable instructions per group"));
+
+static cl::opt<unsigned>
+MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), + cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" + " a full cycle check")); + +static cl::opt<bool> +NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize integer values")); + +static cl::opt<bool> +NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize floating-point values")); + +static cl::opt<bool> +NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize casting (conversion) operations")); + +static cl::opt<bool> +NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize floating-point math intrinsics")); + +static cl::opt<bool> +NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); + +static cl::opt<bool> +NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize loads and stores")); + +static cl::opt<bool> +AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, + cl::desc("Only generate aligned loads and stores")); + +static cl::opt<bool> +NoMemOpBoost("bb-vectorize-no-mem-op-boost", + cl::init(false), cl::Hidden, + cl::desc("Don't boost the chain-depth contribution of loads and stores")); + +static cl::opt<bool> +FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, + cl::desc("Use a fast instruction dependency analysis")); + +#ifndef NDEBUG +static cl::opt<bool> +DebugInstructionExamination("bb-vectorize-debug-instruction-examination", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " instruction-examination process")); +static cl::opt<bool> +DebugCandidateSelection("bb-vectorize-debug-candidate-selection", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " candidate-selection process")); +static cl::opt<bool> +DebugPairSelection("bb-vectorize-debug-pair-selection", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " pair-selection process")); +static cl::opt<bool> +DebugCycleCheck("bb-vectorize-debug-cycle-check", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " cycle-checking process")); +#endif + +STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); + +namespace { + struct BBVectorize : public BasicBlockPass { + static char ID; // Pass identification, replacement for typeid + + const VectorizeConfig Config; + + BBVectorize(const VectorizeConfig &C = VectorizeConfig()) + : BasicBlockPass(ID), Config(C) { + initializeBBVectorizePass(*PassRegistry::getPassRegistry()); + } + + BBVectorize(Pass *P, const VectorizeConfig &C) + : BasicBlockPass(ID), Config(C) { + AA = &P->getAnalysis<AliasAnalysis>(); + SE = &P->getAnalysis<ScalarEvolution>(); + TD = P->getAnalysisIfAvailable<TargetData>(); + } + + typedef std::pair<Value *, Value *> ValuePair; + typedef std::pair<ValuePair, size_t> ValuePairWithDepth; + typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair + typedef std::pair<std::multimap<Value *, Value *>::iterator, + std::multimap<Value *, Value *>::iterator> VPIteratorPair; + typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, + std::multimap<ValuePair, ValuePair>::iterator> + VPPIteratorPair; + + AliasAnalysis *AA; + 
ScalarEvolution *SE; + TargetData *TD; + + // FIXME: const correct? + + bool vectorizePairs(BasicBlock &BB); + + bool getCandidatePairs(BasicBlock &BB, + BasicBlock::iterator &Start, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts); + + void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs); + + void buildDepMap(BasicBlock &BB, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &PairableInstUsers); + + void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *>& ChosenPairs); + + void fuseChosenPairs(BasicBlock &BB, + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *>& ChosenPairs); + + bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); + + bool areInstsCompatible(Instruction *I, Instruction *J, + bool IsSimpleLoadStore); + + bool trackUsesOfI(DenseSet<Value *> &Users, + AliasSetTracker &WriteSet, Instruction *I, + Instruction *J, bool UpdateUsers = true, + std::multimap<Value *, Value *> *LoadMoveSet = 0); + + void computePairsConnectedTo( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + ValuePair P); + + bool pairsConflict(ValuePair P, ValuePair Q, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); + + bool pairWillFormCycle(ValuePair P, + std::multimap<ValuePair, ValuePair> &PairableInstUsers, + DenseSet<ValuePair> &CurrentPairs); + + void pruneTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> &PairableInstUserMap, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &Tree, + DenseSet<ValuePair> &PrunedTree, ValuePair J, + bool UseCycleCheck); + + void buildInitialTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &Tree, ValuePair J); + + void findBestTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> &PairableInstUserMap, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, + size_t &BestEffSize, VPIteratorPair ChoiceRange, + bool UseCycleCheck); + + Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool &FlipMemInputs); + + void fillNewShuffleMask(LLVMContext& Context, Instruction *J, + unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, + unsigned IdxOffset, std::vector<Constant*> &Mask); + + Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, + Instruction *J); + + Value *getReplacementInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool FlipMemInputs); + + void 
getReplacementInputsForPair(LLVMContext& Context, Instruction *I, + Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, + bool &FlipMemInputs); + + void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, + Instruction *J, Instruction *K, + Instruction *&InsertionPt, Instruction *&K1, + Instruction *&K2, bool &FlipMemInputs); + + void collectPairLoadMoveSet(BasicBlock &BB, + DenseMap<Value *, Value *> &ChosenPairs, + std::multimap<Value *, Value *> &LoadMoveSet, + Instruction *I); + + void collectLoadMoveSet(BasicBlock &BB, + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + std::multimap<Value *, Value *> &LoadMoveSet); + + bool canMoveUsesOfIAfterJ(BasicBlock &BB, + std::multimap<Value *, Value *> &LoadMoveSet, + Instruction *I, Instruction *J); + + void moveUsesOfIAfterJ(BasicBlock &BB, + std::multimap<Value *, Value *> &LoadMoveSet, + Instruction *&InsertionPt, + Instruction *I, Instruction *J); + + bool vectorizeBB(BasicBlock &BB) { + bool changed = false; + // Iterate a sufficient number of times to merge types of size 1 bit, + // then 2 bits, then 4, etc. up to half of the target vector width of the + // target vector register. + for (unsigned v = 2, n = 1; + v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); + v *= 2, ++n) { + DEBUG(dbgs() << "BBV: fusing loop #" << n << + " for " << BB.getName() << " in " << + BB.getParent()->getName() << "...\n"); + if (vectorizePairs(BB)) + changed = true; + else + break; + } + + DEBUG(dbgs() << "BBV: done!\n"); + return changed; + } + + virtual bool runOnBasicBlock(BasicBlock &BB) { + AA = &getAnalysis<AliasAnalysis>(); + SE = &getAnalysis<ScalarEvolution>(); + TD = getAnalysisIfAvailable<TargetData>(); + + return vectorizeBB(BB); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + BasicBlockPass::getAnalysisUsage(AU); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolution>(); + AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<ScalarEvolution>(); + AU.setPreservesCFG(); + } + + // This returns the vector type that holds a pair of the provided type. + // If the provided type is already a vector, then its length is doubled. + static inline VectorType *getVecTypeForPair(Type *ElemTy) { + if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { + unsigned numElem = VTy->getNumElements(); + return VectorType::get(ElemTy->getScalarType(), numElem*2); + } + + return VectorType::get(ElemTy, 2); + } + + // Returns the weight associated with the provided value. A chain of + // candidate pairs has a length given by the sum of the weights of its + // members (one weight per pair; the weight of each member of the pair + // is assumed to be the same). This length is then compared to the + // chain-length threshold to determine if a given chain is significant + // enough to be vectorized. The length is also used in comparing + // candidate chains where longer chains are considered to be better. + // Note: when this function returns 0, the resulting instructions are + // not actually fused. + inline size_t getDepthFactor(Value *V) { + // InsertElement and ExtractElement have a depth factor of zero. This is + // for two reasons: First, they cannot be usefully fused. Second, because + // the pass generates a lot of these, they can confuse the simple metric + // used to compare the trees in the next iteration. 
Thus, giving them a + // weight of zero allows the pass to essentially ignore them in + // subsequent iterations when looking for vectorization opportunities + // while still tracking dependency chains that flow through those + // instructions. + if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) + return 0; + + // Give a load or store half of the required depth so that load/store + // pairs will vectorize. + if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) + return Config.ReqChainDepth/2; + + return 1; + } + + // This determines the relative offset of two loads or stores, returning + // true if the offset could be determined to be some constant value. + // For example, if OffsetInElmts == 1, then J accesses the memory directly + // after I; if OffsetInElmts == -1 then I accesses the memory + // directly after J. This function assumes that both instructions + // have the same type. + bool getPairPtrInfo(Instruction *I, Instruction *J, + Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, + int64_t &OffsetInElmts) { + OffsetInElmts = 0; + if (isa<LoadInst>(I)) { + IPtr = cast<LoadInst>(I)->getPointerOperand(); + JPtr = cast<LoadInst>(J)->getPointerOperand(); + IAlignment = cast<LoadInst>(I)->getAlignment(); + JAlignment = cast<LoadInst>(J)->getAlignment(); + } else { + IPtr = cast<StoreInst>(I)->getPointerOperand(); + JPtr = cast<StoreInst>(J)->getPointerOperand(); + IAlignment = cast<StoreInst>(I)->getAlignment(); + JAlignment = cast<StoreInst>(J)->getAlignment(); + } + + const SCEV *IPtrSCEV = SE->getSCEV(IPtr); + const SCEV *JPtrSCEV = SE->getSCEV(JPtr); + + // If this is a trivial offset, then we'll get something like + // 1*sizeof(type). With target data, which we need anyway, this will get + // constant folded into a number. + const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); + if (const SCEVConstant *ConstOffSCEV = + dyn_cast<SCEVConstant>(OffsetSCEV)) { + ConstantInt *IntOff = ConstOffSCEV->getValue(); + int64_t Offset = IntOff->getSExtValue(); + + Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); + int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); + + assert(VTy == cast<PointerType>(JPtr->getType())->getElementType()); + + OffsetInElmts = Offset/VTyTSS; + return (abs64(Offset) % VTyTSS) == 0; + } + + return false; + } + + // Returns true if the provided CallInst represents an intrinsic that can + // be vectorized. + bool isVectorizableIntrinsic(CallInst* I) { + Function *F = I->getCalledFunction(); + if (!F) return false; + + unsigned IID = F->getIntrinsicID(); + if (!IID) return false; + + switch(IID) { + default: + return false; + case Intrinsic::sqrt: + case Intrinsic::powi: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::pow: + return Config.VectorizeMath; + case Intrinsic::fma: + return Config.VectorizeFMA; + } + } + + // Returns true if J is the second element in some pair referenced by + // some multimap pair iterator pair. + template <typename V> + bool isSecondInIteratorPair(V J, std::pair< + typename std::multimap<V, V>::iterator, + typename std::multimap<V, V>::iterator> PairRange) { + for (typename std::multimap<V, V>::iterator K = PairRange.first; + K != PairRange.second; ++K) + if (K->second == J) return true; + + return false; + } + }; + + // This function implements one vectorization iteration on the provided + // basic block. 
It returns true if the block is changed.
+  bool BBVectorize::vectorizePairs(BasicBlock &BB) {
+    bool ShouldContinue;
+    BasicBlock::iterator Start = BB.getFirstInsertionPt();
+
+    std::vector<Value *> AllPairableInsts;
+    DenseMap<Value *, Value *> AllChosenPairs;
+
+    do {
+      std::vector<Value *> PairableInsts;
+      std::multimap<Value *, Value *> CandidatePairs;
+      ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
+                                         PairableInsts);
+      if (PairableInsts.empty()) continue;
+
+      // Now we have a map of all of the pairable instructions and we need to
+      // select the best possible pairing. A good pairing is one such that the
+      // users of the pair are also paired. This defines a (directed) forest
+      // over the pairs such that two pairs are connected iff the second pair
+      // uses the first.
+
+      // Note that it only matters that both members of the second pair use some
+      // element of the first pair (to allow for splatting).
+
+      std::multimap<ValuePair, ValuePair> ConnectedPairs;
+      computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
+      if (ConnectedPairs.empty()) continue;
+
+      // Build the pairable-instruction dependency map
+      DenseSet<ValuePair> PairableInstUsers;
+      buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
+
+      // There is now a graph of the connected pairs. For each variable, pick
+      // the pairing with the largest tree meeting the depth requirement on at
+      // least one branch. Then select all pairings that are part of that tree
+      // and remove them from the list of available pairings and pairable
+      // variables.
+
+      DenseMap<Value *, Value *> ChosenPairs;
+      choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
+                  PairableInstUsers, ChosenPairs);
+
+      if (ChosenPairs.empty()) continue;
+      AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
+                              PairableInsts.end());
+      AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
+    } while (ShouldContinue);
+
+    if (AllChosenPairs.empty()) return false;
+    NumFusedOps += AllChosenPairs.size();
+
+    // A set of pairs has now been selected. It is now necessary to replace the
+    // paired instructions with vector instructions. For this procedure each
+    // operand must be replaced with a vector operand. This vector is formed
+    // by using build_vector on the old operands. The replaced values are then
+    // replaced with a vector_extract on the result. Subsequent optimization
+    // passes should coalesce the build/extract combinations.
+
+    fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs);
+    return true;
+  }
+
+  // This function returns true if the provided instruction is capable of being
+  // fused into a vector instruction. This determination is based only on the
+  // type and other attributes of the instruction.
+  bool BBVectorize::isInstVectorizable(Instruction *I,
+                                       bool &IsSimpleLoadStore) {
+    IsSimpleLoadStore = false;
+
+    if (CallInst *C = dyn_cast<CallInst>(I)) {
+      if (!isVectorizableIntrinsic(C))
+        return false;
+    } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+      // Vectorize simple loads if possible:
+      IsSimpleLoadStore = L->isSimple();
+      if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
+        return false;
+    } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
+      // Vectorize simple stores if possible:
+      IsSimpleLoadStore = S->isSimple();
+      if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
+        return false;
+    } else if (CastInst *C = dyn_cast<CastInst>(I)) {
+      // We can vectorize casts, but not casts of pointer types, etc.
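+      // For example (illustrative only): a pair of independent
+      // 'fptrunc double -> float' instructions can become a single
+      // 'fptrunc <2 x double> -> <2 x float>', but casts to or from
+      // pointer types are rejected below because pointers are not
+      // treated as vectorizable element types here.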
+ if (!Config.VectorizeCasts) + return false; + + Type *SrcTy = C->getSrcTy(); + if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy()) + return false; + + Type *DestTy = C->getDestTy(); + if (!DestTy->isSingleValueType() || DestTy->isPointerTy()) + return false; + } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || + isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { + return false; + } + + // We can't vectorize memory operations without target data + if (TD == 0 && IsSimpleLoadStore) + return false; + + Type *T1, *T2; + if (isa<StoreInst>(I)) { + // For stores, it is the value type, not the pointer type that matters + // because the value is what will come from a vector register. + + Value *IVal = cast<StoreInst>(I)->getValueOperand(); + T1 = IVal->getType(); + } else { + T1 = I->getType(); + } + + if (I->isCast()) + T2 = cast<CastInst>(I)->getSrcTy(); + else + T2 = T1; + + // Not every type can be vectorized... + if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || + !(VectorType::isValidElementType(T2) || T2->isVectorTy())) + return false; + + if (!Config.VectorizeInts + && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + return false; + + if (!Config.VectorizeFloats + && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) + return false; + + if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 || + T2->getPrimitiveSizeInBits() > Config.VectorBits/2) + return false; + + return true; + } + + // This function returns true if the two provided instructions are compatible + // (meaning that they can be fused into a vector instruction). This assumes + // that I has already been determined to be vectorizable and that J is not + // in the use tree of I. + bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, + bool IsSimpleLoadStore) { + DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << + " <-> " << *J << "\n"); + + // Loads and stores can be merged if they have different alignments, + // but are otherwise the same. + LoadInst *LI, *LJ; + StoreInst *SI, *SJ; + if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) { + if (I->getType() != J->getType()) + return false; + + if (LI->getPointerOperand()->getType() != + LJ->getPointerOperand()->getType() || + LI->isVolatile() != LJ->isVolatile() || + LI->getOrdering() != LJ->getOrdering() || + LI->getSynchScope() != LJ->getSynchScope()) + return false; + } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) { + if (SI->getValueOperand()->getType() != + SJ->getValueOperand()->getType() || + SI->getPointerOperand()->getType() != + SJ->getPointerOperand()->getType() || + SI->isVolatile() != SJ->isVolatile() || + SI->getOrdering() != SJ->getOrdering() || + SI->getSynchScope() != SJ->getSynchScope()) + return false; + } else if (!J->isSameOperationAs(I)) { + return false; + } + // FIXME: handle addsub-type operations! + + if (IsSimpleLoadStore) { + Value *IPtr, *JPtr; + unsigned IAlignment, JAlignment; + int64_t OffsetInElmts = 0; + if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + OffsetInElmts) && abs64(OffsetInElmts) == 1) { + if (Config.AlignedOnly) { + Type *aType = isa<StoreInst>(I) ? + cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); + // An aligned load or store is possible only if the instruction + // with the lower offset has an alignment suitable for the + // vector type. 
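+          // Worked example (illustrative only): fusing two i32 loads at
+          // offsets 0 and 4 produces one <2 x i32> load. The instruction
+          // with the lower offset supplies the address, so its alignment
+          // must meet the preferred alignment of <2 x i32> (often 8 bytes),
+          // even though 4 bytes sufficed for each scalar load.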
+
+          unsigned BottomAlignment = IAlignment;
+          if (OffsetInElmts < 0) BottomAlignment = JAlignment;
+
+          Type *VType = getVecTypeForPair(aType);
+          unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
+          if (BottomAlignment < VecAlignment)
+            return false;
+        }
+      } else {
+        return false;
+      }
+    } else if (isa<ShuffleVectorInst>(I)) {
+      // Only merge two shuffles if they're both constant
+      return isa<Constant>(I->getOperand(2)) &&
+             isa<Constant>(J->getOperand(2));
+      // FIXME: We may want to vectorize non-constant shuffles also.
+    }
+
+    // The powi intrinsic is special because only the first argument is
+    // vectorized; the second argument must be equal in both members of
+    // the pair.
+    CallInst *CI = dyn_cast<CallInst>(I);
+    Function *FI;
+    if (CI && (FI = CI->getCalledFunction()) &&
+        FI->getIntrinsicID() == Intrinsic::powi) {
+
+      Value *A1I = CI->getArgOperand(1),
+            *A1J = cast<CallInst>(J)->getArgOperand(1);
+      const SCEV *A1ISCEV = SE->getSCEV(A1I),
+                 *A1JSCEV = SE->getSCEV(A1J);
+      return (A1ISCEV == A1JSCEV);
+    }
+
+    return true;
+  }
+
+  // Figure out whether or not J uses I and update the users and write-set
+  // structures associated with I. Specifically, Users represents the set of
+  // instructions that depend on I. WriteSet represents the set
+  // of memory locations that are dependent on I. If UpdateUsers is true,
+  // and J uses I, then Users is updated to contain J and WriteSet is updated
+  // to contain any memory locations to which J writes. The function returns
+  // true if J uses I. By default, alias analysis is used to determine
+  // whether J reads from memory that overlaps with a location in WriteSet.
+  // If LoadMoveSet is not null, then it is a previously-computed multimap
+  // where the key is the memory-based user instruction and the value is
+  // the instruction to be compared with I. So, if LoadMoveSet is provided,
+  // then the alias analysis is not used. This is necessary because this
+  // function is called during the process of moving instructions during
+  // vectorization and the results of the alias analysis are not stable during
+  // that process.
+  bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
+                       AliasSetTracker &WriteSet, Instruction *I,
+                       Instruction *J, bool UpdateUsers,
+                       std::multimap<Value *, Value *> *LoadMoveSet) {
+    bool UsesI = false;
+
+    // This instruction may already be marked as a user due, for example, to
+    // being a member of a selected pair.
+    if (Users.count(J))
+      UsesI = true;
+
+    if (!UsesI)
+      for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
+           JU != JE; ++JU) {
+        Value *V = *JU;
+        if (I == V || Users.count(V)) {
+          UsesI = true;
+          break;
+        }
+      }
+    if (!UsesI && J->mayReadFromMemory()) {
+      if (LoadMoveSet) {
+        VPIteratorPair JPairRange = LoadMoveSet->equal_range(J);
+        UsesI = isSecondInIteratorPair<Value*>(I, JPairRange);
+      } else {
+        for (AliasSetTracker::iterator W = WriteSet.begin(),
+             WE = WriteSet.end(); W != WE; ++W) {
+          if (W->aliasesUnknownInst(J, *AA)) {
+            UsesI = true;
+            break;
+          }
+        }
+      }
+    }
+
+    if (UsesI && UpdateUsers) {
+      if (J->mayWriteToMemory()) WriteSet.add(J);
+      Users.insert(J);
+    }
+
+    return UsesI;
+  }
+
+  // This function iterates over all instruction pairs in the provided
+  // basic block and collects all candidate pairs for vectorization.
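+  // A sketch of the scan below, in pseudocode (editorial summary, not part
+  // of the original patch):
+  //
+  //   for each instruction I in the block:
+  //     for each later J within Config.SearchLimit:
+  //       if J does not (directly or transitively) use I,
+  //          and I and J are compatible:
+  //         record the candidate pair (I, J)
+  //
+  // The loop below also threads the Start iterator forward so that, once
+  // Config.MaxInsts pairable instructions have been found, a later call can
+  // resume the scan after the last pair selected here.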
+ bool BBVectorize::getCandidatePairs(BasicBlock &BB, + BasicBlock::iterator &Start, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts) { + BasicBlock::iterator E = BB.end(); + if (Start == E) return false; + + bool ShouldContinue = false, IAfterStart = false; + for (BasicBlock::iterator I = Start++; I != E; ++I) { + if (I == Start) IAfterStart = true; + + bool IsSimpleLoadStore; + if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; + + // Look for an instruction with which to pair instruction *I... + DenseSet<Value *> Users; + AliasSetTracker WriteSet(*AA); + bool JAfterStart = IAfterStart; + BasicBlock::iterator J = llvm::next(I); + for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { + if (J == Start) JAfterStart = true; + + // Determine if J uses I, if so, exit the loop. + bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); + if (Config.FastDep) { + // Note: For this heuristic to be effective, independent operations + // must tend to be intermixed. This is likely to be true from some + // kinds of grouped loop unrolling (but not the generic LLVM pass), + // but otherwise may require some kind of reordering pass. + + // When using fast dependency analysis, + // stop searching after first use: + if (UsesI) break; + } else { + if (UsesI) continue; + } + + // J does not use I, and comes before the first use of I, so it can be + // merged with I if the instructions are compatible. + if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue; + + // J is a candidate for merging with I. + if (!PairableInsts.size() || + PairableInsts[PairableInsts.size()-1] != I) { + PairableInsts.push_back(I); + } + + CandidatePairs.insert(ValuePair(I, J)); + + // The next call to this function must start after the last instruction + // selected during this invocation. + if (JAfterStart) { + Start = llvm::next(J); + IAfterStart = JAfterStart = false; + } + + DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " + << *I << " <-> " << *J << "\n"); + + // If we have already found too many pairs, break here and this function + // will be called again starting after the last instruction selected + // during this invocation. + if (PairableInsts.size() >= Config.MaxInsts) { + ShouldContinue = true; + break; + } + } + + if (ShouldContinue) + break; + } + + DEBUG(dbgs() << "BBV: found " << PairableInsts.size() + << " instructions with candidate pairs\n"); + + return ShouldContinue; + } + + // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that + // it looks for pairs such that both members have an input which is an + // output of PI or PJ. + void BBVectorize::computePairsConnectedTo( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + ValuePair P) { + // For each possible pairing for this variable, look at the uses of + // the first value... + for (Value::use_iterator I = P.first->use_begin(), + E = P.first->use_end(); I != E; ++I) { + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); + + // For each use of the first variable, look for uses of the second + // variable... 
+ for (Value::use_iterator J = P.second->use_begin(), + E2 = P.second->use_end(); J != E2; ++J) { + VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); + + // Look for <I, J>: + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + + // Look for <J, I>: + if (isSecondInIteratorPair<Value*>(*I, JPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); + } + + if (Config.SplatBreaksChain) continue; + // Look for cases where just the first value in the pair is used by + // both members of another pair (splatting). + for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + } + } + + if (Config.SplatBreaksChain) return; + // Look for cases where just the second value in the pair is used by + // both members of another pair (splatting). + for (Value::use_iterator I = P.second->use_begin(), + E = P.second->use_end(); I != E; ++I) { + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); + + for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + } + } + } + + // This function figures out which pairs are connected. Two pairs are + // connected if some output of the first pair forms an input to both members + // of the second pair. + void BBVectorize::computeConnectedPairs( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs) { + + for (std::vector<Value *>::iterator PI = PairableInsts.begin(), + PE = PairableInsts.end(); PI != PE; ++PI) { + VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI); + + for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; + P != choiceRange.second; ++P) + computePairsConnectedTo(CandidatePairs, PairableInsts, + ConnectedPairs, *P); + } + + DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() + << " pair connections.\n"); + } + + // This function builds a set of use tuples such that <A, B> is in the set + // if B is in the use tree of A. If B is in the use tree of A, then B + // depends on the output of A. + void BBVectorize::buildDepMap( + BasicBlock &BB, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &PairableInstUsers) { + DenseSet<Value *> IsInPair; + for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(), + E = CandidatePairs.end(); C != E; ++C) { + IsInPair.insert(C->first); + IsInPair.insert(C->second); + } + + // Iterate through the basic block, recording all Users of each + // pairable instruction. + + BasicBlock::iterator E = BB.end(); + for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { + if (IsInPair.find(I) == IsInPair.end()) continue; + + DenseSet<Value *> Users; + AliasSetTracker WriteSet(*AA); + for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) + (void) trackUsesOfI(Users, WriteSet, I, J); + + for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); + U != E; ++U) + PairableInstUsers.insert(ValuePair(I, *U)); + } + } + + // Returns true if an input to pair P is an output of pair Q and also an + // input of pair Q is an output of pair P. If this is the case, then these + // two pairs cannot be simultaneously fused. 
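+  // Concretely (illustrative only): take pairs P = (a, b) and Q = (x, y)
+  // where x uses a's result and b uses y's result. The fused x/y must come
+  // after the fused a/b (it consumes a) and also before it (b consumes y),
+  // which is impossible, so P and Q may not both be selected.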
+  bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
+                       DenseSet<ValuePair> &PairableInstUsers,
+                       std::multimap<ValuePair, ValuePair> *PairableInstUserMap) {
+    // Two pairs are in conflict if they are mutual Users of each other.
+    bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
+                  PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
+                  PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
+                  PairableInstUsers.count(ValuePair(P.second, Q.second));
+    bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
+                  PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
+                  PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
+                  PairableInstUsers.count(ValuePair(Q.second, P.second));
+    if (PairableInstUserMap) {
+      // FIXME: The expensive part of the cycle check is not so much the cycle
+      // check itself but this edge insertion procedure. This needs some
+      // profiling and probably a different data structure (same is true of
+      // most uses of std::multimap).
+      if (PUsesQ) {
+        VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q);
+        if (!isSecondInIteratorPair(P, QPairRange))
+          PairableInstUserMap->insert(VPPair(Q, P));
+      }
+      if (QUsesP) {
+        VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P);
+        if (!isSecondInIteratorPair(Q, PPairRange))
+          PairableInstUserMap->insert(VPPair(P, Q));
+      }
+    }
+
+    return (QUsesP && PUsesQ);
+  }
+
+  // This function walks the use graph of current pairs to see if, starting
+  // from P, the walk returns to P.
+  bool BBVectorize::pairWillFormCycle(ValuePair P,
+                       std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+                       DenseSet<ValuePair> &CurrentPairs) {
+    DEBUG(if (DebugCycleCheck)
+            dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
+                   << *P.second << "\n");
+    // A lookup table of visited pairs is kept because the PairableInstUserMap
+    // contains non-direct associations.
+    DenseSet<ValuePair> Visited;
+    SmallVector<ValuePair, 32> Q;
+    // General depth-first post-order traversal:
+    Q.push_back(P);
+    do {
+      ValuePair QTop = Q.pop_back_val();
+      Visited.insert(QTop);
+
+      DEBUG(if (DebugCycleCheck)
+              dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
+                     << *QTop.second << "\n");
+      VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop);
+      for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first;
+           C != QPairRange.second; ++C) {
+        if (C->second == P) {
+          DEBUG(dbgs()
+                << "BBV: rejected to prevent non-trivial cycle formation: "
+                << *C->first.first << " <-> " << *C->first.second << "\n");
+          return true;
+        }
+
+        if (CurrentPairs.count(C->second) && !Visited.count(C->second))
+          Q.push_back(C->second);
+      }
+    } while (!Q.empty());
+
+    return false;
+  }
+
+  // This function builds the initial tree of connected pairs with the
+  // pair J at the root.
+  void BBVectorize::buildInitialTreeFor(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseSet<ValuePair> &PairableInstUsers,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      DenseMap<ValuePair, size_t> &Tree, ValuePair J) {
+    // Each of these pairs is viewed as the root node of a Tree. The Tree
+    // is then walked (depth-first). As this happens, we keep track of
+    // the pairs that compose the Tree and the maximum depth of the Tree.
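+    // For example (illustrative only): a chain of four connected add pairs,
+    // each contributing a depth factor of 1, produces a tree of depth 4;
+    // under the default required chain depth of 6 such a tree would, by
+    // itself, be considered too shallow to vectorize.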
+ SmallVector<ValuePairWithDepth, 32> Q; + // General depth-first post-order traversal: + Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); + do { + ValuePairWithDepth QTop = Q.back(); + + // Push each child onto the queue: + bool MoreChildren = false; + size_t MaxChildDepth = QTop.second; + VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first); + for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first; + k != qtRange.second; ++k) { + // Make sure that this child pair is still a candidate: + bool IsStillCand = false; + VPIteratorPair checkRange = + CandidatePairs.equal_range(k->second.first); + for (std::multimap<Value *, Value *>::iterator m = checkRange.first; + m != checkRange.second; ++m) { + if (m->second == k->second.second) { + IsStillCand = true; + break; + } + } + + if (IsStillCand) { + DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second); + if (C == Tree.end()) { + size_t d = getDepthFactor(k->second.first); + Q.push_back(ValuePairWithDepth(k->second, QTop.second+d)); + MoreChildren = true; + } else { + MaxChildDepth = std::max(MaxChildDepth, C->second); + } + } + } + + if (!MoreChildren) { + // Record the current pair as part of the Tree: + Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); + Q.pop_back(); + } + } while (!Q.empty()); + } + + // Given some initial tree, prune it by removing conflicting pairs (pairs + // that cannot be simultaneously chosen for vectorization). + void BBVectorize::pruneTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> &PairableInstUserMap, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &Tree, + DenseSet<ValuePair> &PrunedTree, ValuePair J, + bool UseCycleCheck) { + SmallVector<ValuePairWithDepth, 32> Q; + // General depth-first post-order traversal: + Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); + do { + ValuePairWithDepth QTop = Q.pop_back_val(); + PrunedTree.insert(QTop.first); + + // Visit each child, pruning as necessary... + DenseMap<ValuePair, size_t> BestChildren; + VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first); + for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first; + K != QTopRange.second; ++K) { + DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second); + if (C == Tree.end()) continue; + + // This child is in the Tree, now we need to make sure it is the + // best of any conflicting children. There could be multiple + // conflicting children, so first, determine if we're keeping + // this child, then delete conflicting children as necessary. + + // It is also necessary to guard against pairing-induced + // dependencies. Consider instructions a .. x .. y .. b + // such that (a,b) are to be fused and (x,y) are to be fused + // but a is an input to x and b is an output from y. This + // means that y cannot be moved after b but x must be moved + // after b for (a,b) to be fused. In other words, after + // fusing (a,b) we have y .. a/b .. x where y is an input + // to a/b and x is an output to a/b: x and y can no longer + // be legally fused. To prevent this condition, we must + // make sure that a child pair added to the Tree is not + // both an input and output of an already-selected pair. + + // Pairing-induced dependencies can also form from more complicated + // cycles. The pair vs. 
pair conflicts are easy to check, and so + // that is done explicitly for "fast rejection", and because for + // child vs. child conflicts, we may prefer to keep the current + // pair in preference to the already-selected child. + DenseSet<ValuePair> CurrentPairs; + + bool CanAdd = true; + for (DenseMap<ValuePair, size_t>::iterator C2 + = BestChildren.begin(), E2 = BestChildren.end(); + C2 != E2; ++C2) { + if (C2->first.first == C->first.first || + C2->first.first == C->first.second || + C2->first.second == C->first.first || + C2->first.second == C->first.second || + pairsConflict(C2->first, C->first, PairableInstUsers, + UseCycleCheck ? &PairableInstUserMap : 0)) { + if (C2->second >= C->second) { + CanAdd = false; + break; + } + + CurrentPairs.insert(C2->first); + } + } + if (!CanAdd) continue; + + // Even worse, this child could conflict with another node already + // selected for the Tree. If that is the case, ignore this child. + for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(), + E2 = PrunedTree.end(); T != E2; ++T) { + if (T->first == C->first.first || + T->first == C->first.second || + T->second == C->first.first || + T->second == C->first.second || + pairsConflict(*T, C->first, PairableInstUsers, + UseCycleCheck ? &PairableInstUserMap : 0)) { + CanAdd = false; + break; + } + + CurrentPairs.insert(*T); + } + if (!CanAdd) continue; + + // And check the queue too... + for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(), + E2 = Q.end(); C2 != E2; ++C2) { + if (C2->first.first == C->first.first || + C2->first.first == C->first.second || + C2->first.second == C->first.first || + C2->first.second == C->first.second || + pairsConflict(C2->first, C->first, PairableInstUsers, + UseCycleCheck ? &PairableInstUserMap : 0)) { + CanAdd = false; + break; + } + + CurrentPairs.insert(C2->first); + } + if (!CanAdd) continue; + + // Last but not least, check for a conflict with any of the + // already-chosen pairs. + for (DenseMap<Value *, Value *>::iterator C2 = + ChosenPairs.begin(), E2 = ChosenPairs.end(); + C2 != E2; ++C2) { + if (pairsConflict(*C2, C->first, PairableInstUsers, + UseCycleCheck ? &PairableInstUserMap : 0)) { + CanAdd = false; + break; + } + + CurrentPairs.insert(*C2); + } + if (!CanAdd) continue; + + // To check for non-trivial cycles formed by the addition of the + // current pair we've formed a list of all relevant pairs, now use a + // graph walk to check for a cycle. We start from the current pair and + // walk the use tree to see if we again reach the current pair. If we + // do, then the current pair is rejected. + + // FIXME: It may be more efficient to use a topological-ordering + // algorithm to improve the cycle check. This should be investigated. + if (UseCycleCheck && + pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) + continue; + + // This child can be added, but we may have chosen it in preference + // to an already-selected child. Check for this here, and if a + // conflict is found, then remove the previously-selected child + // before adding this one in its place. 
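+        // For instance (illustrative only): if children (c, d) and (c, e)
+        // both connect to the current pair, they conflict (each wants to
+        // fuse c). The child with the greater recorded depth survives; on a
+        // tie the incumbent already in BestChildren is kept.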
+        for (DenseMap<ValuePair, size_t>::iterator C2
+              = BestChildren.begin(); C2 != BestChildren.end();) {
+          if (C2->first.first == C->first.first ||
+              C2->first.first == C->first.second ||
+              C2->first.second == C->first.first ||
+              C2->first.second == C->first.second ||
+              pairsConflict(C2->first, C->first, PairableInstUsers))
+            BestChildren.erase(C2++);
+          else
+            ++C2;
+        }
+
+        BestChildren.insert(ValuePairWithDepth(C->first, C->second));
+      }
+
+      for (DenseMap<ValuePair, size_t>::iterator C
+            = BestChildren.begin(), E2 = BestChildren.end();
+           C != E2; ++C) {
+        size_t DepthF = getDepthFactor(C->first.first);
+        Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
+      }
+    } while (!Q.empty());
+  }
+
+  // This function finds the best tree of mutually-compatible connected
+  // pairs, given the choice of root pairs as an iterator range.
+  void BBVectorize::findBestTreeFor(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseSet<ValuePair> &PairableInstUsers,
+                      std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
+                      size_t &BestEffSize, VPIteratorPair ChoiceRange,
+                      bool UseCycleCheck) {
+    for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first;
+         J != ChoiceRange.second; ++J) {
+
+      // Before going any further, make sure that this pair does not
+      // conflict with any already-selected pairs (see comment below
+      // near the Tree pruning for more details).
+      DenseSet<ValuePair> ChosenPairSet;
+      bool DoesConflict = false;
+      for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
+           E = ChosenPairs.end(); C != E; ++C) {
+        if (pairsConflict(*C, *J, PairableInstUsers,
+                          UseCycleCheck ? &PairableInstUserMap : 0)) {
+          DoesConflict = true;
+          break;
+        }
+
+        ChosenPairSet.insert(*C);
+      }
+      if (DoesConflict) continue;
+
+      if (UseCycleCheck &&
+          pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet))
+        continue;
+
+      DenseMap<ValuePair, size_t> Tree;
+      buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+                          PairableInstUsers, ChosenPairs, Tree, *J);
+
+      // Because we'll keep the child with the largest depth, the largest
+      // depth is still the same in the unpruned Tree.
+      size_t MaxDepth = Tree.lookup(*J);
+
+      DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {"
+             << *J->first << " <-> " << *J->second << "} of depth " <<
+             MaxDepth << " and size " << Tree.size() << "\n");
+
+      // At this point the Tree has been constructed, but it may contain
+      // contradictory children (meaning that different children of
+      // some tree node may be attempting to fuse the same instruction).
+      // So now we walk the tree again and, in the case of a conflict,
+      // keep only the child with the largest depth. To break a tie,
+      // favor the first child.
+ + DenseSet<ValuePair> PrunedTree; + pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, + PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, + PrunedTree, *J, UseCycleCheck); + + size_t EffSize = 0; + for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) + EffSize += getDepthFactor(S->first); + + DEBUG(if (DebugPairSelection) + dbgs() << "BBV: found pruned Tree for pair {" + << *J->first << " <-> " << *J->second << "} of depth " << + MaxDepth << " and size " << PrunedTree.size() << + " (effective size: " << EffSize << ")\n"); + if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) { + BestMaxDepth = MaxDepth; + BestEffSize = EffSize; + BestTree = PrunedTree; + } + } + } + + // Given the list of candidate pairs, this function selects those + // that will be fused into vector instructions. + void BBVectorize::choosePairs( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *>& ChosenPairs) { + bool UseCycleCheck = + CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck; + std::multimap<ValuePair, ValuePair> PairableInstUserMap; + for (std::vector<Value *>::iterator I = PairableInsts.begin(), + E = PairableInsts.end(); I != E; ++I) { + // The number of possible pairings for this variable: + size_t NumChoices = CandidatePairs.count(*I); + if (!NumChoices) continue; + + VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); + + // The best pair to choose and its tree: + size_t BestMaxDepth = 0, BestEffSize = 0; + DenseSet<ValuePair> BestTree; + findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, + PairableInstUsers, PairableInstUserMap, ChosenPairs, + BestTree, BestMaxDepth, BestEffSize, ChoiceRange, + UseCycleCheck); + + // A tree has been chosen (or not) at this point. If no tree was + // chosen, then this instruction, I, cannot be paired (and is no longer + // considered). + + DEBUG(if (BestTree.size() > 0) + dbgs() << "BBV: selected pairs in the best tree for: " + << *cast<Instruction>(*I) << "\n"); + + for (DenseSet<ValuePair>::iterator S = BestTree.begin(), + SE2 = BestTree.end(); S != SE2; ++S) { + // Insert the members of this tree into the list of chosen pairs. + ChosenPairs.insert(ValuePair(S->first, S->second)); + DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << + *S->second << "\n"); + + // Remove all candidate pairs that have values in the chosen tree. + for (std::multimap<Value *, Value *>::iterator K = + CandidatePairs.begin(); K != CandidatePairs.end();) { + if (K->first == S->first || K->second == S->first || + K->second == S->second || K->first == S->second) { + // Don't remove the actual pair chosen so that it can be used + // in subsequent tree selections. + if (!(K->first == S->first && K->second == S->second)) + CandidatePairs.erase(K++); + else + ++K; + } else { + ++K; + } + } + } + } + + DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); + } + + std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, + unsigned n = 0) { + if (!I->hasName()) + return ""; + + return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + + (n > 0 ? "." + utostr(n) : "")).str(); + } + + // Returns the value that is to be used as the pointer input to the vector + // instruction that fuses I with J. 
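+  // For example (illustrative only): when fusing two adjacent i32 loads from
+  // %p and %p+4, the operand %p (the lower address) is chosen and bitcast
+  // from i32* to <2 x i32>* so that a single vector load can replace the
+  // pair. If the second instruction holds the lower address, FlipMemInputs
+  // is set so that later steps swap the element order accordingly.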
+ Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, + Instruction *I, Instruction *J, unsigned o, + bool &FlipMemInputs) { + Value *IPtr, *JPtr; + unsigned IAlignment, JAlignment; + int64_t OffsetInElmts; + (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + OffsetInElmts); + + // The pointer value is taken to be the one with the lowest offset. + Value *VPtr; + if (OffsetInElmts > 0) { + VPtr = IPtr; + } else { + FlipMemInputs = true; + VPtr = JPtr; + } + + Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType(); + Type *VArgType = getVecTypeForPair(ArgType); + Type *VArgPtrType = PointerType::get(VArgType, + cast<PointerType>(IPtr->getType())->getAddressSpace()); + return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), + /* insert before */ FlipMemInputs ? J : I); + } + + void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, + unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, + unsigned IdxOffset, std::vector<Constant*> &Mask) { + for (unsigned v = 0; v < NumElem/2; ++v) { + int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); + if (m < 0) { + Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); + } else { + unsigned mm = m + (int) IdxOffset; + if (m >= (int) NumInElem) + mm += (int) NumInElem; + + Mask[v+MaskOffset] = + ConstantInt::get(Type::getInt32Ty(Context), mm); + } + } + } + + // Returns the value that is to be used as the vector-shuffle mask to the + // vector instruction that fuses I with J. + Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, + Instruction *I, Instruction *J) { + // This is the shuffle mask. We need to append the second + // mask to the first, and the numbers need to be adjusted. + + Type *ArgType = I->getType(); + Type *VArgType = getVecTypeForPair(ArgType); + + // Get the total number of elements in the fused vector type. + // By definition, this must equal the number of elements in + // the final mask. + unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); + std::vector<Constant*> Mask(NumElem); + + Type *OpType = I->getOperand(0)->getType(); + unsigned NumInElem = cast<VectorType>(OpType)->getNumElements(); + + // For the mask from the first pair... + fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask); + + // For the mask from the second pair... + fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem, + Mask); + + return ConstantVector::get(Mask); + } + + // Returns the value to be used as the specified operand of the vector + // instruction that fuses I with J. 
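+  // In the common scalar case (illustrative only), fusing 'c = a + b' with
+  // 'f = d + e' builds the first vector operand roughly as:
+  //   %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
+  //   %v1 = insertelement <2 x i32> %v0, i32 %d, i32 1
+  // The shuffle-based paths below are shortcuts for operands that already
+  // live in vector registers.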
+  Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
+                     Instruction *J, unsigned o, bool FlipMemInputs) {
+    Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+    Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
+
+    // Compute the fused vector type for this operand
+    Type *ArgType = I->getOperand(o)->getType();
+    VectorType *VArgType = getVecTypeForPair(ArgType);
+
+    Instruction *L = I, *H = J;
+    if (FlipMemInputs) {
+      L = J;
+      H = I;
+    }
+
+    if (ArgType->isVectorTy()) {
+      unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
+      std::vector<Constant*> Mask(numElem);
+      for (unsigned v = 0; v < numElem; ++v)
+        Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+
+      Instruction *BV = new ShuffleVectorInst(L->getOperand(o),
+                                              H->getOperand(o),
+                                              ConstantVector::get(Mask),
+                                              getReplacementName(I, true, o));
+      BV->insertBefore(J);
+      return BV;
+    }
+
+    // If these two inputs are the output of another vector instruction,
+    // then we should use that output directly. It might be necessary to
+    // permute it first. [When pairings are fused recursively, you can
+    // end up with cases where a large vector is decomposed into scalars
+    // using extractelement instructions, then built into size-2
+    // vectors using insertelement and then into larger vectors using
+    // shuffles. InstCombine does not simplify all of these cases well,
+    // and so we make sure that shuffles are generated here when possible.]
+    ExtractElementInst *LEE
+      = dyn_cast<ExtractElementInst>(L->getOperand(o));
+    ExtractElementInst *HEE
+      = dyn_cast<ExtractElementInst>(H->getOperand(o));
+
+    if (LEE && HEE &&
+        LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) {
+      VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType());
+      unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue();
+      unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue();
+      if (LEE->getOperand(0) == HEE->getOperand(0)) {
+        if (LowIndx == 0 && HighIndx == 1)
+          return LEE->getOperand(0);
+
+        std::vector<Constant*> Mask(2);
+        Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
+        Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
+
+        Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
+                                          UndefValue::get(EEType),
+                                          ConstantVector::get(Mask),
+                                          getReplacementName(I, true, o));
+        BV->insertBefore(J);
+        return BV;
+      }
+
+      std::vector<Constant*> Mask(2);
+      HighIndx += EEType->getNumElements();
+      Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
+      Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
+
+      Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
+                                        HEE->getOperand(0),
+                                        ConstantVector::get(Mask),
+                                        getReplacementName(I, true, o));
+      BV->insertBefore(J);
+      return BV;
+    }
+
+    Instruction *BV1 = InsertElementInst::Create(
+                                          UndefValue::get(VArgType),
+                                          L->getOperand(o), CV0,
+                                          getReplacementName(I, true, o, 1));
+    BV1->insertBefore(I);
+    Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o),
+                                          CV1,
+                                          getReplacementName(I, true, o, 2));
+    BV2->insertBefore(J);
+    return BV2;
+  }
+
+  // This function creates an array of values that will be used as the inputs
+  // to the vector instruction that fuses I with J.
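+  // For example (illustrative only): fusing two calls to llvm.sqrt.f32
+  // rewrites the callee operand to the llvm.sqrt.v2f32 declaration, while
+  // for llvm.powi the scalar i32 exponent is carried over unchanged (the
+  // compatibility check has already established that both exponents are
+  // equal).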
+  void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
+                     Instruction *I, Instruction *J,
+                     SmallVector<Value *, 3> &ReplacedOperands,
+                     bool &FlipMemInputs) {
+    FlipMemInputs = false;
+    unsigned NumOperands = I->getNumOperands();
+
+    for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
+      // Iterate backward so that we look at the store pointer
+      // first and know whether or not we need to flip the inputs.
+
+      if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
+        // This is the pointer for a load/store instruction.
+        ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o,
+                                FlipMemInputs);
+        continue;
+      } else if (isa<CallInst>(I)) {
+        Function *F = cast<CallInst>(I)->getCalledFunction();
+        unsigned IID = F->getIntrinsicID();
+        if (o == NumOperands-1) {
+          BasicBlock &BB = *I->getParent();
+
+          Module *M = BB.getParent()->getParent();
+          Type *ArgType = I->getType();
+          Type *VArgType = getVecTypeForPair(ArgType);
+
+          // FIXME: is it safe to do this here?
+          ReplacedOperands[o] = Intrinsic::getDeclaration(M,
+            (Intrinsic::ID) IID, VArgType);
+          continue;
+        } else if (IID == Intrinsic::powi && o == 1) {
+          // The second argument of powi is a single integer and we've already
+          // checked that both arguments are equal. As a result, we just keep
+          // I's second argument.
+          ReplacedOperands[o] = I->getOperand(o);
+          continue;
+        }
+      } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
+        ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
+        continue;
+      }
+
+      ReplacedOperands[o] =
+        getReplacementInput(Context, I, J, o, FlipMemInputs);
+    }
+  }
+
+  // This function creates two values that represent the outputs of the
+  // original I and J instructions. These are generally vector shuffles
+  // or extracts. In many cases, these will end up being unused and, thus,
+  // eliminated by later passes.
+  void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+                     Instruction *J, Instruction *K,
+                     Instruction *&InsertionPt,
+                     Instruction *&K1, Instruction *&K2,
+                     bool &FlipMemInputs) {
+    Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+    Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
+
+    if (isa<StoreInst>(I)) {
+      AA->replaceWithNewValue(I, K);
+      AA->replaceWithNewValue(J, K);
+    } else {
+      Type *IType = I->getType();
+      Type *VType = getVecTypeForPair(IType);
+
+      if (IType->isVectorTy()) {
+        unsigned numElem = cast<VectorType>(IType)->getNumElements();
+        std::vector<Constant*> Mask1(numElem), Mask2(numElem);
+        for (unsigned v = 0; v < numElem; ++v) {
+          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v);
+        }
+
+        K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                   ConstantVector::get(
+                                     FlipMemInputs ? Mask2 : Mask1),
+                                   getReplacementName(K, false, 1));
+        K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                   ConstantVector::get(
+                                     FlipMemInputs ? Mask1 : Mask2),
+                                   getReplacementName(K, false, 2));
+      } else {
+        K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0,
+                                        getReplacementName(K, false, 1));
+        K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1,
+                                        getReplacementName(K, false, 2));
+      }
+
+      K1->insertAfter(K);
+      K2->insertAfter(K1);
+      InsertionPt = K2;
+    }
+  }
+
+  // Returns true if all uses of the instruction I (including pairing-induced
+  // uses) can be moved to a point after J.
+  // Returns true if all uses of the instruction I (including
+  // pairing-induced uses) can be moved after J.
+  bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
+                     std::multimap<Value *, Value *> &LoadMoveSet,
+                     Instruction *I, Instruction *J) {
+    // Skip to the first instruction past I.
+    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+
+    DenseSet<Value *> Users;
+    AliasSetTracker WriteSet(*AA);
+    for (; cast<Instruction>(L) != J; ++L)
+      (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet);
+
+    assert(cast<Instruction>(L) == J &&
+      "Tracking has not proceeded far enough to check for dependencies");
+    // If J is now in the use set of I, then trackUsesOfI will return true
+    // and we have a dependency cycle (and the fusing operation must abort).
+    return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet);
+  }
+
+  // Move all uses of the instruction I (including pairing-induced uses)
+  // after J.
+  void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
+                     std::multimap<Value *, Value *> &LoadMoveSet,
+                     Instruction *&InsertionPt,
+                     Instruction *I, Instruction *J) {
+    // Skip to the first instruction past I.
+    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+
+    DenseSet<Value *> Users;
+    AliasSetTracker WriteSet(*AA);
+    for (; cast<Instruction>(L) != J;) {
+      if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) {
+        // Move this instruction
+        Instruction *InstToMove = L; ++L;
+
+        DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
+                " to after " << *InsertionPt << "\n");
+        InstToMove->removeFromParent();
+        InstToMove->insertAfter(InsertionPt);
+        InsertionPt = InstToMove;
+      } else {
+        ++L;
+      }
+    }
+  }
+
+  // Collect all load instructions that are in the move set of a given first
+  // pair member. These loads depend on the first instruction, I, and so need
+  // to be moved after J (the second instruction) when the pair is fused.
+  void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
+                     DenseMap<Value *, Value *> &ChosenPairs,
+                     std::multimap<Value *, Value *> &LoadMoveSet,
+                     Instruction *I) {
+    // Skip to the first instruction past I.
+    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+
+    DenseSet<Value *> Users;
+    AliasSetTracker WriteSet(*AA);
+
+    // Note: We cannot end the loop when we reach J because J could be moved
+    // farther down the use chain by another instruction pairing. Also, J
+    // could be before I if this is an inverted input.
+    for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
+      if (trackUsesOfI(Users, WriteSet, I, L)) {
+        if (L->mayReadFromMemory())
+          LoadMoveSet.insert(ValuePair(L, I));
+      }
+    }
+  }
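A deliberately simplified, hypothetical sketch of the test behind canMoveUsesOfIAfterJ: record every instruction strictly between I and J that (transitively) uses I; if J itself uses anything in that set, fusing would create a dependency cycle. The real trackUsesOfI also tracks memory dependencies through the AliasSetTracker, which this sketch omits; the helper names are illustrative only.

    #include "llvm/BasicBlock.h"
    #include "llvm/Instruction.h"
    #include "llvm/ADT/DenseSet.h"
    #include "llvm/ADT/STLExtras.h"

    using namespace llvm;

    // True if X has an operand that is I or an already-tracked user of I.
    static bool usesTracked(Instruction *X, DenseSet<Value*> &Users,
                            Instruction *I) {
      for (unsigned i = 0, e = X->getNumOperands(); i != e; ++i)
        if (X->getOperand(i) == I || Users.count(X->getOperand(i)))
          return true;
      return false;
    }

    // Def-use-only version of the cycle check; memory deps are omitted.
    static bool fusionWouldCycle(Instruction *I, Instruction *J) {
      DenseSet<Value*> Users;
      for (BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
           cast<Instruction>(L) != J; ++L)
        if (usesTracked(L, Users, I))
          Users.insert(L);
      return usesTracked(J, Users, I); // true => the fusion must abort
    }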
+  // In cases where both loads/stores and the computation of their pointers
+  // are chosen for vectorization, we can end up in a situation where the
+  // aliasing analysis starts returning different query results as the
+  // process of fusing instruction pairs continues. Because the algorithm
+  // relies on finding the same use trees here as were found earlier, we'll
+  // need to precompute the necessary aliasing information here and then
+  // manually update it during the fusion process.
+  void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
+                     std::vector<Value *> &PairableInsts,
+                     DenseMap<Value *, Value *> &ChosenPairs,
+                     std::multimap<Value *, Value *> &LoadMoveSet) {
+    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
+         PIE = PairableInsts.end(); PI != PIE; ++PI) {
+      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
+      if (P == ChosenPairs.end()) continue;
+
+      Instruction *I = cast<Instruction>(P->first);
+      collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I);
+    }
+  }
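The "manual update" mentioned above happens during fusion: once two loads I and J become a single instruction K, their move-set entries must be re-keyed under K. A standalone illustration with a plain std::multimap over ints (the pass performs the same re-keying with Value* keys inside fuseChosenPairs below):

    #include <map>
    #include <utility>
    #include <vector>

    typedef std::multimap<int, int> MoveSet;  // stand-in for Value* keys

    // Re-key the entries of I and J under the fused instruction K, so
    // later, yet-to-be-fused pairs still find their move-set entries.
    static void mergeMoveSetKeys(MoveSet &S, int I, int J, int K) {
      std::vector<std::pair<int, int> > NewMembers;
      std::pair<MoveSet::iterator, MoveSet::iterator> R = S.equal_range(I);
      for (MoveSet::iterator N = R.first; N != R.second; ++N)
        NewMembers.push_back(std::make_pair(K, N->second));
      R = S.equal_range(J);
      for (MoveSet::iterator N = R.first; N != R.second; ++N)
        NewMembers.push_back(std::make_pair(K, N->second));
      S.insert(NewMembers.begin(), NewMembers.end());
    }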
+  // This function fuses the chosen instruction pairs into vector
+  // instructions, taking care to preserve any needed scalar outputs, and
+  // then reorders the remaining instructions as needed (users of the first
+  // member of the pair need to be moved to after the location of the second
+  // member of the pair because the vector instruction is inserted in the
+  // location of the pair's second member).
+  void BBVectorize::fuseChosenPairs(BasicBlock &BB,
+                     std::vector<Value *> &PairableInsts,
+                     DenseMap<Value *, Value *> &ChosenPairs) {
+    LLVMContext& Context = BB.getContext();
+
+    // During the vectorization process, the order of the pairs to be fused
+    // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
+    // list. After a pair is fused, the flipped pair is removed from the list.
+    std::vector<ValuePair> FlippedPairs;
+    FlippedPairs.reserve(ChosenPairs.size());
+    for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
+         E = ChosenPairs.end(); P != E; ++P)
+      FlippedPairs.push_back(ValuePair(P->second, P->first));
+    for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(),
+         E = FlippedPairs.end(); P != E; ++P)
+      ChosenPairs.insert(*P);
+
+    std::multimap<Value *, Value *> LoadMoveSet;
+    collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
+
+    DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
+
+    for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
+      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
+      if (P == ChosenPairs.end()) {
+        ++PI;
+        continue;
+      }
+
+      if (getDepthFactor(P->first) == 0) {
+        // These instructions are not really fused, but are tracked as though
+        // they are. Any case in which it would be interesting to fuse them
+        // will be taken care of by InstCombine.
+        --NumFusedOps;
+        ++PI;
+        continue;
+      }
+
+      Instruction *I = cast<Instruction>(P->first),
+        *J = cast<Instruction>(P->second);
+
+      DEBUG(dbgs() << "BBV: fusing: " << *I <<
+             " <-> " << *J << "\n");
+
+      // Remove the pair and flipped pair from the list.
+      DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
+      assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
+      ChosenPairs.erase(FP);
+      ChosenPairs.erase(P);
+
+      if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) {
+        DEBUG(dbgs() << "BBV: fusion of: " << *I <<
+               " <-> " << *J <<
+               " aborted because of non-trivial dependency cycle\n");
+        --NumFusedOps;
+        ++PI;
+        continue;
+      }
+
+      bool FlipMemInputs;
+      unsigned NumOperands = I->getNumOperands();
+      SmallVector<Value *, 3> ReplacedOperands(NumOperands);
+      getReplacementInputsForPair(Context, I, J, ReplacedOperands,
+        FlipMemInputs);
+
+      // Make a copy of the original operation, change its type to the vector
+      // type and replace its operands with the vector operands.
+      Instruction *K = I->clone();
+      if (I->hasName()) K->takeName(I);
+
+      if (!isa<StoreInst>(K))
+        K->mutateType(getVecTypeForPair(I->getType()));
+
+      for (unsigned o = 0; o < NumOperands; ++o)
+        K->setOperand(o, ReplacedOperands[o]);
+
+      // If we've flipped the memory inputs, make sure that we take the correct
+      // alignment.
+      if (FlipMemInputs) {
+        if (isa<StoreInst>(K))
+          cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment());
+        else
+          cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment());
+      }
+
+      K->insertAfter(J);
+
+      // Instruction insertion point:
+      Instruction *InsertionPt = K;
+      Instruction *K1 = 0, *K2 = 0;
+      replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2,
+        FlipMemInputs);
+
+      // The use tree of the first original instruction must be moved to after
+      // the location of the second instruction. The entire use tree of the
+      // first instruction is disjoint from the input tree of the second
+      // (by definition), and so commutes with it.
+
+      moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
+
+      if (!isa<StoreInst>(I)) {
+        I->replaceAllUsesWith(K1);
+        J->replaceAllUsesWith(K2);
+        AA->replaceWithNewValue(I, K1);
+        AA->replaceWithNewValue(J, K2);
+      }
+
+      // Instructions that may read from memory may be in the load move set.
+      // Once an instruction is fused, we no longer need its move set, and so
+      // the values of the map never need to be updated. However, when a load
+      // is fused, we need to merge the entries from both instructions in the
+      // pair in case those instructions were in the move set of some other
+      // yet-to-be-fused pair. The loads in question are the keys of the map.
+      if (I->mayReadFromMemory()) {
+        std::vector<ValuePair> NewSetMembers;
+        VPIteratorPair IPairRange = LoadMoveSet.equal_range(I);
+        VPIteratorPair JPairRange = LoadMoveSet.equal_range(J);
+        for (std::multimap<Value *, Value *>::iterator N = IPairRange.first;
+             N != IPairRange.second; ++N)
+          NewSetMembers.push_back(ValuePair(K, N->second));
+        for (std::multimap<Value *, Value *>::iterator N = JPairRange.first;
+             N != JPairRange.second; ++N)
+          NewSetMembers.push_back(ValuePair(K, N->second));
+        for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
+             AE = NewSetMembers.end(); A != AE; ++A)
+          LoadMoveSet.insert(*A);
+      }
+
+      // Before removing I, set the iterator to the next instruction.
+      PI = llvm::next(BasicBlock::iterator(I));
+      if (cast<Instruction>(PI) == J)
+        ++PI;
+
+      SE->forgetValue(I);
+      SE->forgetValue(J);
+      I->eraseFromParent();
+      J->eraseFromParent();
+    }
+
+    DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
+  }
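The FlippedPairs bookkeeping at the top of fuseChosenPairs can be seen in isolation below; a hypothetical sketch using std::map over ints rather than the pass's DenseMap over Value*:

    #include <map>
    #include <utility>

    // Insert each chosen pair (A,B) a second time as (B,A), so the fusion
    // loop can find a pair from whichever member it reaches first; once a
    // pair is fused, both the original and the flipped entry are erased.
    static void addFlippedPairs(std::map<int, int> &ChosenPairs) {
      std::map<int, int> Flipped;
      for (std::map<int, int>::iterator P = ChosenPairs.begin(),
           E = ChosenPairs.end(); P != E; ++P)
        Flipped.insert(std::make_pair(P->second, P->first));
      ChosenPairs.insert(Flipped.begin(), Flipped.end());
    }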
+}
+
+char BBVectorize::ID = 0;
+static const char bb_vectorize_name[] = "Basic-Block Vectorization";
+INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
+
+BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
+  return new BBVectorize(C);
+}
+
+bool
+llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
+  BBVectorize BBVectorizer(P, C);
+  return BBVectorizer.vectorizeBB(BB);
+}
+
+//===----------------------------------------------------------------------===//
+VectorizeConfig::VectorizeConfig() {
+  VectorBits = ::VectorBits;
+  VectorizeInts = !::NoInts;
+  VectorizeFloats = !::NoFloats;
+  VectorizeCasts = !::NoCasts;
+  VectorizeMath = !::NoMath;
+  VectorizeFMA = !::NoFMA;
+  VectorizeMemOps = !::NoMemOps;
+  AlignedOnly = ::AlignedOnly;
+  ReqChainDepth = ::ReqChainDepth;
+  SearchLimit = ::SearchLimit;
+  MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
+  SplatBreaksChain = ::SplatBreaksChain;
+  MaxInsts = ::MaxInsts;
+  MaxIter = ::MaxIter;
+  NoMemOpBoost = ::NoMemOpBoost;
+  FastDep = ::FastDep;
+}
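Since every VectorizeConfig field is seeded from the pass's command-line option defaults, an embedder can override individual knobs in code before constructing the pass. A hypothetical usage sketch (the overridden field values are illustrative, not recommendations):

    #include "llvm/PassManager.h"
    #include "llvm/Transforms/Vectorize.h"

    using namespace llvm;

    // Build a BBVectorize pass with a couple of knobs changed from their
    // command-line defaults; all other fields keep the option defaults.
    static void addTunedBBVectorize(PassManager &PM) {
      VectorizeConfig C;
      C.AlignedOnly = true;   // only vectorize aligned memory operations
      C.MaxIter = 1;          // illustrative: a single fusion iteration
      PM.add(createBBVectorizePass(C));
    }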
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
new file mode 100644
index 0000000..4b66930
--- /dev/null
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_llvm_library(LLVMVectorize
+  BBVectorize.cpp
+  Vectorize.cpp
+  )
diff --git a/lib/Transforms/Vectorize/LLVMBuild.txt b/lib/Transforms/Vectorize/LLVMBuild.txt
new file mode 100644
index 0000000..7167d27
--- /dev/null
+++ b/lib/Transforms/Vectorize/LLVMBuild.txt
@@ -0,0 +1,24 @@
+;===- ./lib/Transforms/Vectorize/LLVMBuild.txt -----------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = Vectorize
+parent = Transforms
+library_name = Vectorize
+required_libraries = Analysis Core InstCombine Support Target TransformUtils
+
diff --git a/lib/Transforms/Vectorize/Makefile b/lib/Transforms/Vectorize/Makefile
new file mode 100644
index 0000000..86c3658
--- /dev/null
+++ b/lib/Transforms/Vectorize/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Transforms/Vectorize/Makefile -----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMVectorize
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
new file mode 100644
index 0000000..1ef6002
--- /dev/null
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -0,0 +1,39 @@
+//===-- Vectorize.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMVectorize.a, which
+// implements several vectorization transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+
+/// initializeVectorization - Initialize all passes linked into the
+/// Vectorization library.
+void llvm::initializeVectorization(PassRegistry &Registry) {
+  initializeBBVectorizePass(Registry);
+}
+
+void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
+  initializeVectorization(*unwrap(R));
+}
+
+void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createBBVectorizePass());
+}
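A short usage sketch for the new C binding, assuming a module already exists; the pass-manager calls are the standard llvm-c entry points from llvm-c/Core.h, and runBBVectorize is an illustrative name:

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/Vectorize.h"

    // Run basic-block vectorization over the module M via the C API.
    static void runBBVectorize(LLVMModuleRef M) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddBBVectorizePass(PM);
      LLVMRunPassManager(PM, M);
      LLVMDisposePassManager(PM);
    }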