Diffstat (limited to 'lib/Transforms')
81 files changed, 5370 insertions, 3831 deletions
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index fa007cf..e160f63 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -155,12 +155,12 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
   for (unsigned i = 0; i != PointerArgs.size(); ++i) {
     bool isByVal = F->paramHasAttr(PointerArgs[i].second+1, Attribute::ByVal);
     Argument *PtrArg = PointerArgs[i].first;
-    const Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+    Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
 
     // If this is a byval argument, and if the aggregate type is small, just
     // pass the elements, which is always safe.
     if (isByVal) {
-      if (const StructType *STy = dyn_cast<StructType>(AgTy)) {
+      if (StructType *STy = dyn_cast<StructType>(AgTy)) {
         if (maxElements > 0 && STy->getNumElements() > maxElements) {
           DEBUG(dbgs() << "argpromotion disable promoting argument '"
                 << PtrArg->getName() << "' because it would require adding more"
@@ -190,7 +190,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
     // If the argument is a recursive type and we're in a recursive
     // function, we could end up infinitely peeling the function argument.
     if (isSelfRecursive) {
-      if (const StructType *STy = dyn_cast<StructType>(AgTy)) {
+      if (StructType *STy = dyn_cast<StructType>(AgTy)) {
         bool RecursiveType = false;
         for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
           if (STy->getElementType(i) == PtrArg->getType()) {
@@ -382,7 +382,8 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
     User *U = *UI;
     Operands.clear();
     if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
-      if (LI->isVolatile()) return false;  // Don't hack volatile loads
+      // Don't hack volatile/atomic loads
+      if (!LI->isSimple()) return false;
       Loads.push_back(LI);
       // Direct loads are equivalent to a GEP with a zero index and then a load.
       Operands.push_back(0);
@@ -410,7 +411,8 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
       for (Value::use_iterator UI = GEP->use_begin(), E = GEP->use_end();
            UI != E; ++UI)
         if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
-          if (LI->isVolatile()) return false;  // Don't hack volatile loads
+          // Don't hack volatile/atomic loads
+          if (!LI->isSimple()) return false;
           Loads.push_back(LI);
         } else {
           // Other uses than load?
@@ -492,7 +494,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
 
   // Start by computing a new prototype for the function, which is the same as
   // the old function, but has modified arguments.
-  const FunctionType *FTy = F->getFunctionType();
+  FunctionType *FTy = F->getFunctionType();
   std::vector<Type*> Params;
 
   typedef std::set<IndicesVector> ScalarizeTable;
@@ -527,8 +529,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
        ++I, ++ArgIndex) {
     if (ByValArgsToTransform.count(I)) {
       // Simple byval argument? Just add all the struct element types.
-      const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
-      const StructType *STy = cast<StructType>(AgTy);
+      Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+      StructType *STy = cast<StructType>(AgTy);
       for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
         Params.push_back(STy->getElementType(i));
       ++NumByValArgsPromoted;
@@ -576,9 +578,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
       for (ScalarizeTable::iterator SI = ArgIndices.begin(),
              E = ArgIndices.end(); SI != E; ++SI) {
         // not allowed to dereference ->begin() if size() is 0
-        Params.push_back(GetElementPtrInst::getIndexedType(I->getType(),
-                                                           SI->begin(),
-                                                           SI->end()));
+        Params.push_back(GetElementPtrInst::getIndexedType(I->getType(), *SI));
         assert(Params.back());
       }
 
@@ -593,7 +593,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   if (Attributes attrs = PAL.getFnAttributes())
     AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
 
-  const Type *RetTy = FTy->getReturnType();
+  Type *RetTy = FTy->getReturnType();
 
   // Work around LLVM bug PR56: the CWriter cannot emit varargs functions which
   // have zero fixed arguments.
@@ -662,13 +662,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
       } else if (ByValArgsToTransform.count(I)) {
         // Emit a GEP and load for each element of the struct.
-        const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
-        const StructType *STy = cast<StructType>(AgTy);
+        Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+        StructType *STy = cast<StructType>(AgTy);
         Value *Idxs[2] = { ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 };
         for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
           Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
-          Value *Idx = GetElementPtrInst::Create(*AI, Idxs, Idxs+2,
+          Value *Idx = GetElementPtrInst::Create(*AI, Idxs,
                                                  (*AI)->getName()+"."+utostr(i),
                                                  Call);
           // TODO: Tell AA about the new values?
@@ -686,12 +686,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
           LoadInst *OrigLoad = OriginalLoads[*SI];
           if (!SI->empty()) {
             Ops.reserve(SI->size());
-            const Type *ElTy = V->getType();
+            Type *ElTy = V->getType();
             for (IndicesVector::const_iterator II = SI->begin(), IE = SI->end();
                  II != IE; ++II) {
               // Use i32 to index structs, and i64 for others (pointers/arrays).
               // This satisfies GEP constraints.
-              const Type *IdxTy = (ElTy->isStructTy() ?
+              Type *IdxTy = (ElTy->isStructTy() ?
                     Type::getInt32Ty(F->getContext()) :
                     Type::getInt64Ty(F->getContext()));
               Ops.push_back(ConstantInt::get(IdxTy, *II));
@@ -699,8 +699,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
               ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(*II);
             }
             // And create a GEP to extract those indices.
-            V = GetElementPtrInst::Create(V, Ops.begin(), Ops.end(),
-                                          V->getName()+".idx", Call);
+            V = GetElementPtrInst::Create(V, Ops, V->getName()+".idx", Call);
             Ops.clear();
             AA.copyValue(OrigLoad->getOperand(0), V);
           }
@@ -792,16 +791,16 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
       Instruction *InsertPt = NF->begin()->begin();
 
       // Just add all the struct element types.
-      const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+      Type *AgTy = cast<PointerType>(I->getType())->getElementType();
       Value *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt);
-      const StructType *STy = cast<StructType>(AgTy);
+      StructType *STy = cast<StructType>(AgTy);
       Value *Idxs[2] = { ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 };
 
       for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
         Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
         Value *Idx =
-          GetElementPtrInst::Create(TheAlloca, Idxs, Idxs+2,
+          GetElementPtrInst::Create(TheAlloca, Idxs,
                                     TheAlloca->getName()+"."+Twine(i),
                                     InsertPt);
         I2->setName(I->getName()+"."+Twine(i));
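A pattern that recurs throughout this commit is the replacement of isVolatile() guards with !isSimple(). With the new atomics support, a load or store can be atomic without being volatile, and isSimple() is true only when the access is neither. A minimal sketch of the predicate semantics (the helper name is hypothetical, not part of the patch):

    #include "llvm/Instructions.h"
    using namespace llvm;

    // A load may be rewritten by ArgumentPromotion only if it is "simple",
    // i.e. neither volatile nor atomic.
    static bool isRewritableLoad(const LoadInst *LI) {
      // isSimple() is equivalent to !LI->isVolatile() && !LI->isAtomic().
      return LI->isSimple();
    }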
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 3de7bfc..4d8dbc2 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -13,10 +13,20 @@ add_llvm_library(LLVMipo
   Inliner.cpp
   Internalize.cpp
   LoopExtractor.cpp
-  LowerSetJmp.cpp
   MergeFunctions.cpp
   PartialInlining.cpp
+  PassManagerBuilder.cpp
   PruneEH.cpp
   StripDeadPrototypes.cpp
   StripSymbols.cpp
   )
+
+add_llvm_library_dependencies(LLVMipo
+  LLVMAnalysis
+  LLVMCore
+  LLVMScalarOpts
+  LLVMSupport
+  LLVMTarget
+  LLVMTransformUtils
+  LLVMipa
+  )
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index a21efce..c3ecb7a 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -23,7 +23,9 @@
 #include "llvm/DerivedTypes.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
+#include "llvm/Target/TargetData.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 using namespace llvm;
@@ -37,10 +39,18 @@ namespace {
       initializeConstantMergePass(*PassRegistry::getPassRegistry());
     }
 
-    // run - For this pass, process all of the globals in the module,
-    // eliminating duplicate constants.
-    //
+    // For this pass, process all of the globals in the module, eliminating
+    // duplicate constants.
     bool runOnModule(Module &M);
+
+    // Return true iff we can determine the alignment of this global variable.
+    bool hasKnownAlignment(GlobalVariable *GV) const;
+
+    // Return the alignment of the global, including converting the default
+    // alignment to a concrete value.
+    unsigned getAlignment(GlobalVariable *GV) const;
+
+    const TargetData *TD;
   };
 }
@@ -77,15 +87,28 @@ static bool IsBetterCannonical(const GlobalVariable &A,
   return A.hasUnnamedAddr();
 }
 
+bool ConstantMerge::hasKnownAlignment(GlobalVariable *GV) const {
+  return TD || GV->getAlignment() != 0;
+}
+
+unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const {
+  if (TD)
+    return TD->getPreferredAlignment(GV);
+  return GV->getAlignment();
+}
+
 bool ConstantMerge::runOnModule(Module &M) {
+  TD = getAnalysisIfAvailable<TargetData>();
+
   // Find all the globals that are marked "used".  These cannot be merged.
   SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
   FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals);
   FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals);
 
-  // Map unique constant/section pairs to globals.  We don't want to merge
-  // globals in different sections.
-  DenseMap<Constant*, GlobalVariable*> CMap;
+  // Map unique <constants, has-unknown-alignment> pairs to globals.  We don't
+  // want to merge globals of unknown alignment with those of explicit
+  // alignment.  If we have TargetData, we always know the alignment.
+  DenseMap<PointerIntPair<Constant*, 1, bool>, GlobalVariable*> CMap;
 
   // Replacements - This vector contains a list of replacements to perform.
   SmallVector<std::pair<GlobalVariable*, GlobalVariable*>, 32> Replacements;
@@ -120,7 +143,8 @@ bool ConstantMerge::runOnModule(Module &M) {
       Constant *Init = GV->getInitializer();
 
       // Check to see if the initializer is already known.
-      GlobalVariable *&Slot = CMap[Init];
+      PointerIntPair<Constant*, 1, bool> Pair(Init, hasKnownAlignment(GV));
+      GlobalVariable *&Slot = CMap[Pair];
 
       // If this is the first constant we find or if the old on is local,
       // replace with the current one. It the current is externally visible
@@ -152,7 +176,8 @@ bool ConstantMerge::runOnModule(Module &M) {
       Constant *Init = GV->getInitializer();
 
       // Check to see if the initializer is already known.
-      GlobalVariable *Slot = CMap[Init];
+      PointerIntPair<Constant*, 1, bool> Pair(Init, hasKnownAlignment(GV));
+      GlobalVariable *Slot = CMap[Pair];
 
       if (!Slot || Slot == GV)
         continue;
@@ -175,6 +200,14 @@ bool ConstantMerge::runOnModule(Module &M) {
     // now.  This avoid invalidating the pointers in CMap, which are unneeded
     // now.
     for (unsigned i = 0, e = Replacements.size(); i != e; ++i) {
+      // Bump the alignment if necessary.
+      if (Replacements[i].first->getAlignment() ||
+          Replacements[i].second->getAlignment()) {
+        Replacements[i].second->setAlignment(std::max(
+            Replacements[i].first->getAlignment(),
+            Replacements[i].second->getAlignment()));
+      }
+
       // Eliminate any uses of the dead global.
       Replacements[i].first->replaceAllUsesWith(Replacements[i].second);
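The new map key above packs the initializer pointer together with one spare low bit recording whether the alignment is known. A sketch of the data structure in isolation (typedef names are illustrative only):

    #include "llvm/Constants.h"
    #include "llvm/GlobalVariable.h"
    #include "llvm/ADT/DenseMap.h"
    #include "llvm/ADT/PointerIntPair.h"
    using namespace llvm;

    // PointerIntPair stores the bool in an unused low bit of the Constant*,
    // so the whole key still fits in one word and hashes cheaply. Two globals
    // with identical initializers but different "alignment known" states get
    // distinct slots and are therefore never merged with each other.
    typedef PointerIntPair<Constant*, 1, bool> InitAndAlignKey;
    typedef DenseMap<InitAndAlignKey, GlobalVariable*> ConstantSlotMap;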
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 1517765..4bb6f7a 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -206,7 +206,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
 
   // Start by computing a new prototype for the function, which is the same as
   // the old function, but doesn't have isVarArg set.
-  const FunctionType *FTy = Fn.getFunctionType();
+  FunctionType *FTy = Fn.getFunctionType();
 
   std::vector<Type*> Params(FTy->param_begin(), FTy->param_end());
   FunctionType *NFTy = FunctionType::get(FTy->getReturnType(),
@@ -344,7 +344,7 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
 static unsigned NumRetVals(const Function *F) {
   if (F->getReturnType()->isVoidTy())
     return 0;
-  else if (const StructType *STy = dyn_cast<StructType>(F->getReturnType()))
+  else if (StructType *STy = dyn_cast<StructType>(F->getReturnType()))
     return STy->getNumElements();
   else
     return 1;
@@ -491,7 +491,7 @@ void DAE::SurveyFunction(const Function &F) {
   // Keep track of the number of live retvals, so we can skip checks once all
   // of them turn out to be live.
   unsigned NumLiveRetVals = 0;
-  const Type *STy = dyn_cast<StructType>(F.getReturnType());
+  Type *STy = dyn_cast<StructType>(F.getReturnType());
   // Loop all uses of the function.
   for (Value::const_use_iterator I = F.use_begin(), E = F.use_end();
        I != E; ++I) {
@@ -646,7 +646,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
 
   // Start by computing a new prototype for the function, which is the same as
   // the old function, but has fewer arguments and a different return type.
-  const FunctionType *FTy = F->getFunctionType();
+  FunctionType *FTy = F->getFunctionType();
 
   std::vector<Type*> Params;
 
   // Set up to build a new list of parameter attributes.
@@ -660,7 +660,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
 
   // Find out the new return value.
   Type *RetTy = FTy->getReturnType();
-  const Type *NRetTy = NULL;
+  Type *NRetTy = NULL;
   unsigned RetCount = NumRetVals(F);
 
   // -1 means unused, other numbers are the new index
@@ -669,7 +669,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   if (RetTy->isVoidTy()) {
     NRetTy = RetTy;
   } else {
-    const StructType *STy = dyn_cast<StructType>(RetTy);
+    StructType *STy = dyn_cast<StructType>(RetTy);
     if (STy)
       // Look at each of the original return values individually.
       for (unsigned i = 0; i != RetCount; ++i) {
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 95decec..0edf342 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -163,14 +163,14 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
         ReadsMemory = true;
         continue;
       } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-        // Ignore non-volatile loads from local memory.
+        // Ignore non-volatile loads from local memory. (Atomic is okay here.)
         if (!LI->isVolatile()) {
           AliasAnalysis::Location Loc = AA->getLocation(LI);
           if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))
             continue;
         }
       } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
-        // Ignore non-volatile stores to local memory.
+        // Ignore non-volatile stores to local memory. (Atomic is okay here.)
         if (!SI->isVolatile()) {
           AliasAnalysis::Location Loc = AA->getLocation(SI);
           if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))
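Note that FunctionAttrs deliberately keeps the weaker isVolatile() test: it only proves the absence of visible reads and writes, so an atomic but non-volatile access to provably local memory can still be ignored. Distilled into a hypothetical predicate (the helper name is illustrative):

    #include "llvm/Instructions.h"
    #include "llvm/Analysis/AliasAnalysis.h"
    using namespace llvm;

    // Mirrors the condition in AddReadAttrs: a non-volatile load whose
    // location points to constant or function-local memory cannot be
    // observed by other threads, atomic or not, so it does not cost the
    // function its readnone/readonly attribute.
    static bool ignorableLoad(AliasAnalysis &AA, LoadInst *LI) {
      if (LI->isVolatile())
        return false;
      AliasAnalysis::Location Loc = AA.getLocation(LI);
      return AA.pointsToConstantMemory(Loc, /*OrLocal=*/true);
    }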
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 4ac721d..3552d03 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -195,12 +195,14 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
     }
     if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
       GS.isLoaded = true;
-      if (LI->isVolatile()) return true;  // Don't hack on volatile loads.
+      // Don't hack on volatile/atomic loads.
+      if (!LI->isSimple()) return true;
     } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
       // Don't allow a store OF the address, only stores TO the address.
       if (SI->getOperand(0) == V) return true;
 
-      if (SI->isVolatile()) return true;  // Don't hack on volatile stores.
+      // Don't hack on volatile/atomic stores.
+      if (!SI->isSimple()) return true;
 
       // If this is a direct store to the global (i.e., the global is a scalar
       // value, not an aggregate), keep more specific information about
@@ -281,18 +283,18 @@ static Constant *getAggregateConstantElement(Constant *Agg, Constant *Idx) {
   } else if (ConstantVector *CP = dyn_cast<ConstantVector>(Agg)) {
     if (IdxV < CP->getNumOperands())
       return CP->getOperand(IdxV);
   } else if (isa<ConstantAggregateZero>(Agg)) {
-    if (const StructType *STy = dyn_cast<StructType>(Agg->getType())) {
+    if (StructType *STy = dyn_cast<StructType>(Agg->getType())) {
       if (IdxV < STy->getNumElements())
         return Constant::getNullValue(STy->getElementType(IdxV));
-    } else if (const SequentialType *STy =
+    } else if (SequentialType *STy =
               dyn_cast<SequentialType>(Agg->getType())) {
       return Constant::getNullValue(STy->getElementType());
     }
   } else if (isa<UndefValue>(Agg)) {
-    if (const StructType *STy = dyn_cast<StructType>(Agg->getType())) {
+    if (StructType *STy = dyn_cast<StructType>(Agg->getType())) {
       if (IdxV < STy->getNumElements())
         return UndefValue::get(STy->getElementType(IdxV));
-    } else if (const SequentialType *STy =
+    } else if (SequentialType *STy =
               dyn_cast<SequentialType>(Agg->getType())) {
       return UndefValue::get(STy->getElementType());
     }
@@ -430,7 +432,7 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
     ++GEPI;  // Skip over the pointer index.
 
     // If this is a use of an array allocation, do a bit more checking for sanity.
-    if (const ArrayType *AT = dyn_cast<ArrayType>(*GEPI)) {
+    if (ArrayType *AT = dyn_cast<ArrayType>(*GEPI)) {
       uint64_t NumElements = AT->getNumElements();
       ConstantInt *Idx = cast<ConstantInt>(U->getOperand(2));
 
@@ -451,9 +453,9 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
          GEPI != E; ++GEPI) {
       uint64_t NumElements;
-      if (const ArrayType *SubArrayTy = dyn_cast<ArrayType>(*GEPI))
+      if (ArrayType *SubArrayTy = dyn_cast<ArrayType>(*GEPI))
         NumElements = SubArrayTy->getNumElements();
-      else if (const VectorType *SubVectorTy = dyn_cast<VectorType>(*GEPI))
+      else if (VectorType *SubVectorTy = dyn_cast<VectorType>(*GEPI))
         NumElements = SubVectorTy->getNumElements();
       else {
         assert((*GEPI)->isStructTy() &&
@@ -498,7 +500,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
   assert(GV->hasLocalLinkage() && !GV->isConstant());
   Constant *Init = GV->getInitializer();
-  const Type *Ty = Init->getType();
+  Type *Ty = Init->getType();
 
   std::vector<GlobalVariable*> NewGlobals;
   Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
@@ -508,7 +510,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
   if (StartAlignment == 0)
     StartAlignment = TD.getABITypeAlignment(GV->getType());
 
-  if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+  if (StructType *STy = dyn_cast<StructType>(Ty)) {
     NewGlobals.reserve(STy->getNumElements());
     const StructLayout &Layout = *TD.getStructLayout(STy);
     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
@@ -531,9 +533,9 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
       if (NewAlign > TD.getABITypeAlignment(STy->getElementType(i)))
         NGV->setAlignment(NewAlign);
     }
-  } else if (const SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
+  } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
     unsigned NumElements = 0;
-    if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
+    if (ArrayType *ATy = dyn_cast<ArrayType>(STy))
       NumElements = ATy->getNumElements();
     else
       NumElements = cast<VectorType>(STy)->getNumElements();
@@ -596,15 +598,14 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
         Idxs.push_back(NullInt);
         for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
           Idxs.push_back(CE->getOperand(i));
-        NewPtr = ConstantExpr::getGetElementPtr(cast<Constant>(NewPtr),
-                                                &Idxs[0], Idxs.size());
+        NewPtr = ConstantExpr::getGetElementPtr(cast<Constant>(NewPtr), Idxs);
       } else {
         GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
         SmallVector<Value*, 8> Idxs;
         Idxs.push_back(NullInt);
         for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
           Idxs.push_back(GEPI->getOperand(i));
-        NewPtr = GetElementPtrInst::Create(NewPtr, Idxs.begin(), Idxs.end(),
+        NewPtr = GetElementPtrInst::Create(NewPtr, Idxs,
                                            GEPI->getName()+"."+Twine(Val),GEPI);
       }
     }
@@ -753,8 +754,7 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
           break;
       if (Idxs.size() == GEPI->getNumOperands()-1)
         Changed |= OptimizeAwayTrappingUsesOfValue(GEPI,
-                          ConstantExpr::getGetElementPtr(NewV, &Idxs[0],
-                                                         Idxs.size()));
+                          ConstantExpr::getGetElementPtr(NewV, Idxs));
       if (GEPI->use_empty()) {
         Changed = true;
         GEPI->eraseFromParent();
@@ -846,12 +846,12 @@ static void ConstantPropUsersOf(Value *V) {
 /// malloc into a global, and any loads of GV as uses of the new global.
 static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
                                                      CallInst *CI,
-                                                     const Type *AllocTy,
+                                                     Type *AllocTy,
                                                      ConstantInt *NElements,
                                                      TargetData* TD) {
   DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << "  CALL = " << *CI << '\n');
 
-  const Type *GlobalType;
+  Type *GlobalType;
   if (NElements->getZExtValue() == 1)
     GlobalType = AllocTy;
   else
@@ -1192,7 +1192,7 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
   } else if (PHINode *PN = dyn_cast<PHINode>(V)) {
     // PN's type is pointer to struct.  Make a new PHI of pointer to struct
     // field.
-    const StructType *ST =
+    StructType *ST =
       cast<StructType>(cast<PointerType>(PN->getType())->getElementType());
 
     PHINode *NewPN =
@@ -1245,8 +1245,7 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,
       GEPIdx.push_back(GEPI->getOperand(1));
       GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
 
-      Value *NGEPI = GetElementPtrInst::Create(NewPtr,
-                                               GEPIdx.begin(), GEPIdx.end(),
+      Value *NGEPI = GetElementPtrInst::Create(NewPtr, GEPIdx,
                                                GEPI->getName(), GEPI);
       GEPI->replaceAllUsesWith(NGEPI);
       GEPI->eraseFromParent();
@@ -1260,11 +1259,9 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,
   // already been seen first by another load, so its uses have already been
   // processed.
   PHINode *PN = cast<PHINode>(LoadUser);
-  bool Inserted;
-  DenseMap<Value*, std::vector<Value*> >::iterator InsertPos;
-  tie(InsertPos, Inserted) =
-    InsertedScalarizedValues.insert(std::make_pair(PN, std::vector<Value*>()));
-  if (!Inserted) return;
+  if (!InsertedScalarizedValues.insert(std::make_pair(PN,
+                                              std::vector<Value*>())).second)
+    return;
 
   // If this is the first time we've seen this PHI, recursively process all
   // users.
@@ -1298,8 +1295,8 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
 static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
                                             Value* NElems, TargetData *TD) {
   DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << "  MALLOC = " << *CI << '\n');
-  const Type* MAT = getMallocAllocatedType(CI);
-  const StructType *STy = cast<StructType>(MAT);
+  Type* MAT = getMallocAllocatedType(CI);
+  StructType *STy = cast<StructType>(MAT);
 
   // There is guaranteed to be at least one use of the malloc (storing
   // it into GV).  If there are other uses, change them to be uses of
@@ -1313,8 +1310,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
   std::vector<Value*> FieldMallocs;
 
   for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
-    const Type *FieldTy = STy->getElementType(FieldNo);
-    const PointerType *PFieldTy = PointerType::getUnqual(FieldTy);
+    Type *FieldTy = STy->getElementType(FieldNo);
+    PointerType *PFieldTy = PointerType::getUnqual(FieldTy);
 
     GlobalVariable *NGV =
       new GlobalVariable(*GV->getParent(),
@@ -1325,9 +1322,9 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
     FieldGlobals.push_back(NGV);
 
     unsigned TypeSize = TD->getTypeAllocSize(FieldTy);
-    if (const StructType *ST = dyn_cast<StructType>(FieldTy))
+    if (StructType *ST = dyn_cast<StructType>(FieldTy))
       TypeSize = TD->getStructLayout(ST)->getSizeInBytes();
-    const Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+    Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
     Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
                                         ConstantInt::get(IntPtrTy, TypeSize),
                                         NElems, 0,
@@ -1379,8 +1376,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
   for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
     Value *GVVal = new LoadInst(FieldGlobals[i], "tmp", NullPtrBlock);
     Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal,
-                              Constant::getNullValue(GVVal->getType()),
-                              "tmp");
+                              Constant::getNullValue(GVVal->getType()));
     BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it",
                                                OrigBB->getParent());
     BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next",
@@ -1428,7 +1424,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
 
   // Insert a store of null into each global.
   for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
-    const PointerType *PT = cast<PointerType>(FieldGlobals[i]->getType());
+    PointerType *PT = cast<PointerType>(FieldGlobals[i]->getType());
     Constant *Null = Constant::getNullValue(PT->getElementType());
     new StoreInst(Null, FieldGlobals[i], SI);
   }
@@ -1485,7 +1481,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
 /// cast of malloc.
 static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
                                                CallInst *CI,
-                                               const Type *AllocTy,
+                                               Type *AllocTy,
                                                Module::global_iterator &GVI,
                                                TargetData *TD) {
   if (!TD)
@@ -1538,10 +1534,10 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
   // If this is an allocation of a fixed size array of structs, analyze as a
   // variable size array.  malloc [100 x struct],1 -> malloc struct, 100
   if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
-    if (const ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
+    if (ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
       AllocTy = AT->getElementType();
 
-  const StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
+  StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
   if (!AllocSTy)
     return false;
 
@@ -1552,8 +1548,8 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
 
     // If this is a fixed size array, transform the Malloc to be an alloc of
     // structs.  malloc [100 x struct],1 -> malloc struct, 100
-    if (const ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI))) {
-      const Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+    if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI))) {
+      Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
       unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes();
       Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
       Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
@@ -1596,7 +1592,7 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
     if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC))
       return true;
   } else if (CallInst *CI = extractMallocCall(StoredOnceVal)) {
-    const Type* MallocType = getMallocAllocatedType(CI);
+    Type* MallocType = getMallocAllocatedType(CI);
     if (MallocType && TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
                                                          GVI, TD))
       return true;
@@ -1611,7 +1607,7 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
 /// can shrink the global into a boolean and select between the two values
 /// whenever it is used.  This exposes the values to other scalar optimizations.
 static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
-  const Type *GVElType = GV->getType()->getElementType();
+  Type *GVElType = GV->getType()->getElementType();
 
   // If GVElType is already i1, it is already shrunk.  If the type of the GV is
   // an FP value, pointer or vector, don't do this optimization because a select
@@ -1761,7 +1757,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
     DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV);
     Instruction& FirstI = const_cast<Instruction&>(*GS.AccessingFunction
                                                    ->getEntryBlock().begin());
-    const Type* ElemTy = GV->getType()->getElementType();
+    Type* ElemTy = GV->getType()->getElementType();
     // FIXME: Pass Global's alignment when globals have alignment
     AllocaInst* Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), &FirstI);
     if (!isa<UndefValue>(GV->getInitializer()))
@@ -2003,7 +1999,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
   CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()), 65535);
   CSVals[1] = 0;
 
-  const StructType *StructTy =
+  StructType *StructTy =
     cast <StructType>(
     cast<ArrayType>(GCL->getType()->getElementType())->getElementType());
 
@@ -2013,9 +2009,9 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
     if (Ctors[i]) {
       CSVals[1] = Ctors[i];
     } else {
-      const Type *FTy = FunctionType::get(Type::getVoidTy(GCL->getContext()),
+      Type *FTy = FunctionType::get(Type::getVoidTy(GCL->getContext()),
                                           false);
-      const PointerType *PFTy = PointerType::getUnqual(FTy);
+      PointerType *PFTy = PointerType::getUnqual(FTy);
       CSVals[1] = Constant::getNullValue(PFTy);
       CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()),
                                    0x7fffffff);
@@ -2196,7 +2192,7 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
   }
 
   std::vector<Constant*> Elts;
-  if (const StructType *STy = dyn_cast<StructType>(Init->getType())) {
+  if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
 
     // Break up the constant into its elements.
     if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Init)) {
@@ -2224,10 +2220,10 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
   }
 
   ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
-  const SequentialType *InitTy = cast<SequentialType>(Init->getType());
+  SequentialType *InitTy = cast<SequentialType>(Init->getType());
 
   uint64_t NumElts;
-  if (const ArrayType *ATy = dyn_cast<ArrayType>(InitTy))
+  if (ArrayType *ATy = dyn_cast<ArrayType>(InitTy))
     NumElts = ATy->getNumElements();
   else
     NumElts = cast<VectorType>(InitTy)->getNumElements();
@@ -2338,7 +2334,7 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
 
     Constant *InstResult = 0;
     if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
-      if (SI->isVolatile()) return false;  // no volatile accesses.
+      if (!SI->isSimple()) return false;  // no volatile/atomic accesses.
       Constant *Ptr = getVal(Values, SI->getOperand(1));
       if (!isSimpleEnoughPointerToCommit(Ptr))
         // If this is too complex for us to commit, reject it.
@@ -2358,7 +2354,7 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
           // stored value.
           Ptr = CE->getOperand(0);
 
-          const Type *NewTy=cast<PointerType>(Ptr->getType())->getElementType();
+          Type *NewTy=cast<PointerType>(Ptr->getType())->getElementType();
 
           // In order to push the bitcast onto the stored value, a bitcast
           // from NewTy to Val's type must be legal.  If it's not, we can try
@@ -2367,14 +2363,14 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
           // If NewTy is a struct, we can convert the pointer to the struct
          // into a pointer to its first member.
           // FIXME: This could be extended to support arrays as well.
-          if (const StructType *STy = dyn_cast<StructType>(NewTy)) {
+          if (StructType *STy = dyn_cast<StructType>(NewTy)) {
             NewTy = STy->getTypeAtIndex(0U);
 
-            const IntegerType *IdxTy =IntegerType::get(NewTy->getContext(), 32);
+            IntegerType *IdxTy =IntegerType::get(NewTy->getContext(), 32);
             Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
             Constant * const IdxList[] = {IdxZero, IdxZero};
 
-            Ptr = ConstantExpr::getGetElementPtr(Ptr, IdxList, 2);
+            Ptr = ConstantExpr::getGetElementPtr(Ptr, IdxList);
 
             // If we can't improve the situation by introspecting NewTy,
             // we have to give up.
@@ -2411,17 +2407,17 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
       for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
            i != e; ++i)
         GEPOps.push_back(getVal(Values, *i));
-      InstResult = cast<GEPOperator>(GEP)->isInBounds() ?
-        ConstantExpr::getInBoundsGetElementPtr(P, &GEPOps[0], GEPOps.size()) :
-        ConstantExpr::getGetElementPtr(P, &GEPOps[0], GEPOps.size());
+      InstResult =
+        ConstantExpr::getGetElementPtr(P, GEPOps,
+                                       cast<GEPOperator>(GEP)->isInBounds());
     } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
-      if (LI->isVolatile()) return false;  // no volatile accesses.
+      if (!LI->isSimple()) return false;  // no volatile/atomic accesses.
       InstResult = ComputeLoadResult(getVal(Values, LI->getOperand(0)),
                                      MutatedMemory);
       if (InstResult == 0) return false; // Could not evaluate load.
     } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
       if (AI->isArrayAllocation()) return false;  // Cannot handle array allocs.
-      const Type *Ty = AI->getType()->getElementType();
+      Type *Ty = AI->getType()->getElementType();
       AllocaTmps.push_back(new GlobalVariable(Ty, false,
                                               GlobalValue::InternalLinkage,
                                               UndefValue::get(Ty),
@@ -2465,8 +2461,7 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
 
       if (Callee->isDeclaration()) {
         // If this is a function we can constant fold, do it.
-        if (Constant *C = ConstantFoldCall(Callee, Formals.data(),
-                                           Formals.size())) {
+        if (Constant *C = ConstantFoldCall(Callee, Formals)) {
           InstResult = C;
         } else {
           return false;
@@ -2512,7 +2507,7 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
         CallStack.pop_back();  // return from fn.
         return true;  // We succeeded at evaluating this ctor!
       } else {
-        // invoke, unwind, unreachable.
+        // invoke, unwind, resume, unreachable.
         return false;  // Cannot handle this terminator.
       }
 
@@ -2711,7 +2706,7 @@ static Function *FindCXAAtExit(Module &M) {
   if (!Fn)
     return 0;
 
-  const FunctionType *FTy = Fn->getFunctionType();
+  FunctionType *FTy = Fn->getFunctionType();
 
   // Checking that the function has the right return type, the right number of
   // parameters and that they all have pointer types should be enough.
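Another theme visible all through GlobalOpt (and ArgumentPromotion above) is the migration of the GEP construction APIs from iterator or pointer-plus-size argument pairs to ArrayRef, with the in-bounds variant folded into a flag. A sketch of the new call shapes (the wrapper functions are hypothetical):

    #include "llvm/Constants.h"
    #include "llvm/Instructions.h"
    #include "llvm/ADT/ArrayRef.h"
    using namespace llvm;

    // Instruction form: the index list is passed as one ArrayRef instead of
    // (Idxs, Idxs+2) or (Ops.begin(), Ops.end()).
    static Value *emitGEP(Value *Ptr, ArrayRef<Value*> Idxs,
                          Instruction *InsertPt) {
      return GetElementPtrInst::Create(Ptr, Idxs, "idx", InsertPt);
    }

    // Constant-expression form: getInBoundsGetElementPtr disappears into a
    // trailing bool on getGetElementPtr.
    static Constant *foldGEP(Constant *Ptr, ArrayRef<Constant*> Idxs,
                             bool InBounds) {
      return ConstantExpr::getGetElementPtr(Ptr, Idxs, InBounds);
    }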
diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp
index 25c0134..d757e1f 100644
--- a/lib/Transforms/IPO/IPConstantPropagation.cpp
+++ b/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -167,7 +167,7 @@ bool IPCP::PropagateConstantReturn(Function &F) {
 
   // Check to see if this function returns a constant.
   SmallVector<Value *,4> RetVals;
-  const StructType *STy = dyn_cast<StructType>(F.getReturnType());
+  StructType *STy = dyn_cast<StructType>(F.getReturnType());
   if (STy)
     for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i)
       RetVals.push_back(UndefValue::get(STy->getElementType(i)));
diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp
index 31ce95f..6233922 100644
--- a/lib/Transforms/IPO/IPO.cpp
+++ b/lib/Transforms/IPO/IPO.cpp
@@ -13,6 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm-c/Initialization.h"
 #include "llvm-c/Transforms/IPO.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/PassManager.h"
@@ -35,7 +36,6 @@ void llvm::initializeIPO(PassRegistry &Registry) {
   initializeLoopExtractorPass(Registry);
   initializeBlockExtractorPassPass(Registry);
   initializeSingleLoopExtractorPass(Registry);
-  initializeLowerSetJmpPass(Registry);
   initializeMergeFunctionsPass(Registry);
   initializePartialInlinerPass(Registry);
   initializePruneEHPass(Registry);
@@ -70,6 +70,10 @@ void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createFunctionInliningPass());
 }
 
+void LLVMAddAlwaysInlinerPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(llvm::createAlwaysInlinerPass());
+}
+
 void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createGlobalDCEPass());
 }
@@ -82,10 +86,6 @@ void LLVMAddIPConstantPropagationPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createIPConstantPropagationPass());
 }
 
-void LLVMAddLowerSetJmpPass(LLVMPassManagerRef PM) {
-  unwrap(PM)->add(createLowerSetJmpPass());
-}
-
 void LLVMAddPruneEHPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createPruneEHPass());
 }
@@ -98,11 +98,6 @@ void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) {
   unwrap(PM)->add(createInternalizePass(AllButMain != 0));
 }
 
-
-void LLVMAddRaiseAllocationsPass(LLVMPassManagerRef PM) {
-  // FIXME: Remove in LLVM 3.0.
-}
-
 void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createStripDeadPrototypesPass());
 }
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index ce795b7..c0426da 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Support/CallSite.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Target/TargetData.h"
 #include "llvm/ADT/SmallPtrSet.h"
 using namespace llvm;
 
@@ -32,10 +33,10 @@ namespace {
   // AlwaysInliner only inlines functions that are mark as "always inline".
   class AlwaysInliner : public Inliner {
     // Functions that are never inlined
-    SmallPtrSet<const Function*, 16> NeverInline; 
+    SmallPtrSet<const Function*, 16> NeverInline;
     InlineCostAnalyzer CA;
   public:
-    // Use extremely low threshold. 
+    // Use extremely low threshold.
     AlwaysInliner() : Inliner(ID, -2000000000) {
       initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
     }
@@ -52,8 +53,8 @@ namespace {
     void growCachedCostInfo(Function* Caller, Function* Callee) {
       CA.growCachedCostInfo(Caller, Callee);
     }
-    virtual bool doFinalization(CallGraph &CG) { 
-      return removeDeadFunctions(CG, &NeverInline); 
+    virtual bool doFinalization(CallGraph &CG) {
+      return removeDeadFunctions(CG, &NeverInline);
     }
     virtual bool doInitialization(CallGraph &CG);
     void releaseMemory() {
@@ -71,11 +72,13 @@ INITIALIZE_PASS_END(AlwaysInliner, "always-inline",
 
 Pass *llvm::createAlwaysInlinerPass() { return new AlwaysInliner(); }
 
-// doInitialization - Initializes the vector of functions that have not 
+// doInitialization - Initializes the vector of functions that have not
 // been annotated with the "always inline" attribute.
 bool AlwaysInliner::doInitialization(CallGraph &CG) {
+  CA.setTargetData(getAnalysisIfAvailable<TargetData>());
+
   Module &M = CG.getModule();
-  
+
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
     if (!I->isDeclaration() && 
         !I->hasFnAttr(Attribute::AlwaysInline))
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index 0c5b3be..84dd4fd 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Support/CallSite.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Target/TargetData.h"
 #include "llvm/ADT/SmallPtrSet.h"
 using namespace llvm;
 
@@ -30,7 +31,7 @@ namespace {
 
   class SimpleInliner : public Inliner {
     // Functions that are never inlined
-    SmallPtrSet<const Function*, 16> NeverInline; 
+    SmallPtrSet<const Function*, 16> NeverInline;
    InlineCostAnalyzer CA;
   public:
     SimpleInliner() : Inliner(ID) {
@@ -68,16 +69,17 @@ INITIALIZE_PASS_END(SimpleInliner, "inline",
 
 Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
 
-Pass *llvm::createFunctionInliningPass(int Threshold) { 
+Pass *llvm::createFunctionInliningPass(int Threshold) {
   return new SimpleInliner(Threshold);
 }
 
 // doInitialization - Initializes the vector of functions that have been
 // annotated with the noinline attribute.
 bool SimpleInliner::doInitialization(CallGraph &CG) {
-  
+  CA.setTargetData(getAnalysisIfAvailable<TargetData>());
+
   Module &M = CG.getModule();
-  
+
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
     if (!I->isDeclaration() && I->hasFnAttr(Attribute::NoInline))
@@ -85,34 +87,34 @@ bool SimpleInliner::doInitialization(CallGraph &CG) {
 
   // Get llvm.noinline
   GlobalVariable *GV = M.getNamedGlobal("llvm.noinline");
-  
+
   if (GV == 0)
     return false;
 
   // Don't crash on invalid code
   if (!GV->hasDefinitiveInitializer())
     return false;
-  
+
   const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
-  
+
   if (InitList == 0)
     return false;
 
   // Iterate over each element and add to the NeverInline set
   for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
-        
+
     // Get Source
     const Constant *Elt = InitList->getOperand(i);
-        
+
     if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(Elt)) 
-      if (CE->getOpcode() == Instruction::BitCast) 
+      if (CE->getOpcode() == Instruction::BitCast)
         Elt = CE->getOperand(0);
-    
+
     // Insert into set of functions to never inline
     if (const Function *F = dyn_cast<Function>(Elt))
       NeverInline.insert(F);
   }
-  
+
   return false;
 }
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 57f3e77..f00935b 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -62,7 +62,7 @@ void Inliner::getAnalysisUsage(AnalysisUsage &Info) const {
 }
 
 
-typedef DenseMap<const ArrayType*, std::vector<AllocaInst*> >
+typedef DenseMap<ArrayType*, std::vector<AllocaInst*> >
 InlinedArrayAllocasTy;
 
 /// InlineCallIfPossible - If it is possible to inline the specified call site,
@@ -139,7 +139,7 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
     // Don't bother trying to merge array allocations (they will usually be
     // canonicalized to be an allocation *of* an array), or allocations whose
     // type is not itself an array (because we're afraid of pessimizing SRoA).
-    const ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
+    ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
     if (ATy == 0 || AI->isArrayAllocation())
       continue;
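The Inliner change above drops const from the DenseMap key now that Type is mutable. For context, this is the table the inliner uses to share stack slots between inlined callees; a sketch of a lookup under the same preconditions the pass checks (the helper name is hypothetical):

    #include "llvm/Instructions.h"
    #include "llvm/DerivedTypes.h"
    #include "llvm/ADT/DenseMap.h"
    #include <vector>
    using namespace llvm;

    typedef DenseMap<ArrayType*, std::vector<AllocaInst*> > InlinedArrayAllocasTy;

    // Only fixed-size allocas of array type participate in merging, mirroring
    // the guards in InlineCallIfPossible; anything else would risk
    // pessimizing SRoA.
    static AllocaInst *findReusableAlloca(InlinedArrayAllocasTy &Allocas,
                                          AllocaInst *AI) {
      ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
      if (ATy == 0 || AI->isArrayAllocation())
        return 0;
      std::vector<AllocaInst*> &Candidates = Allocas[ATy];
      return Candidates.empty() ? 0 : Candidates.back();
    }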
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
index 848944d..4f96afe4 100644
--- a/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/FunctionUtils.h"
 #include "llvm/ADT/Statistic.h"
 #include <fstream>
@@ -53,12 +54,12 @@ namespace {
 char LoopExtractor::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopExtractor, "loop-extract",
-                      "Extract loops into new functions", false, false) 
+                      "Extract loops into new functions", false, false)
 INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
 INITIALIZE_PASS_END(LoopExtractor, "loop-extract",
-                    "Extract loops into new functions", false, false) 
+                    "Extract loops into new functions", false, false)
 
 namespace {
   /// SingleLoopExtractor - For bugpoint.
@@ -100,9 +101,9 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
     L->getHeader()->getParent()->getEntryBlock().getTerminator();
   if (!isa<BranchInst>(EntryTI) ||
       !cast<BranchInst>(EntryTI)->isUnconditional() ||
-      EntryTI->getSuccessor(0) != L->getHeader())
+      EntryTI->getSuccessor(0) != L->getHeader()) {
     ShouldExtractLoop = true;
-  else {
+  } else {
     // Check to see if any exits from the loop are more than just return
     // blocks.
     SmallVector<BasicBlock*, 8> ExitBlocks;
@@ -113,6 +114,21 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
         break;
       }
   }
+
+  if (ShouldExtractLoop) {
+    // We must omit landing pads. Landing pads must accompany the invoke
+    // instruction. But this would result in a loop in the extracted
+    // function. An infinite cycle occurs when it tries to extract that loop as
+    // well.
+    SmallVector<BasicBlock*, 8> ExitBlocks;
+    L->getExitBlocks(ExitBlocks);
+    for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+      if (ExitBlocks[i]->isLandingPad()) {
+        ShouldExtractLoop = false;
+        break;
+      }
+  }
+
   if (ShouldExtractLoop) {
     if (NumLoops == 0) return Changed;
     --NumLoops;
@@ -149,6 +165,7 @@ namespace {
   /// BlocksToNotExtract list.
   class BlockExtractorPass : public ModulePass {
     void LoadFile(const char *Filename);
+    void SplitLandingPadPreds(Function *F);
 
     std::vector<BasicBlock*> BlocksToNotExtract;
     std::vector<std::pair<std::string, std::string> > BlocksToNotExtractByName;
@@ -171,8 +188,7 @@ INITIALIZE_PASS(BlockExtractorPass, "extract-blocks",
 // createBlockExtractorPass - This pass extracts all blocks (except those
 // specified in the argument list) from the functions in the module.
 //
-ModulePass *llvm::createBlockExtractorPass()
-{
+ModulePass *llvm::createBlockExtractorPass() {
   return new BlockExtractorPass();
 }
 
@@ -194,6 +210,37 @@ void BlockExtractorPass::LoadFile(const char *Filename) {
   }
 }
 
+/// SplitLandingPadPreds - The landing pad needs to be extracted with the invoke
+/// instruction. The critical edge breaker will refuse to break critical edges
+/// to a landing pad. So do them here. After this method runs, all landing pads
+/// should have only one predecessor.
+void BlockExtractorPass::SplitLandingPadPreds(Function *F) {
+  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+    InvokeInst *II = dyn_cast<InvokeInst>(I);
+    if (!II) continue;
+    BasicBlock *Parent = II->getParent();
+    BasicBlock *LPad = II->getUnwindDest();
+
+    // Look through the landing pad's predecessors. If one of them ends in an
+    // 'invoke', then we want to split the landing pad.
+    bool Split = false;
+    for (pred_iterator
+           PI = pred_begin(LPad), PE = pred_end(LPad); PI != PE; ++PI) {
+      BasicBlock *BB = *PI;
+      if (BB->isLandingPad() && BB != Parent &&
+          isa<InvokeInst>(Parent->getTerminator())) {
+        Split = true;
+        break;
+      }
+    }
+
+    if (!Split) continue;
+
+    SmallVector<BasicBlock*, 2> NewBBs;
+    SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", 0, NewBBs);
+  }
+}
+
 bool BlockExtractorPass::runOnModule(Module &M) {
   std::set<BasicBlock*> TranslatedBlocksToNotExtract;
   for (unsigned i = 0, e = BlocksToNotExtract.size(); i != e; ++i) {
@@ -236,13 +283,21 @@ bool BlockExtractorPass::runOnModule(Module &M) {
 
   // Now that we know which blocks to not extract, figure out which ones we WANT
   // to extract.
   std::vector<BasicBlock*> BlocksToExtract;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    SplitLandingPadPreds(&*F);
     for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
       if (!TranslatedBlocksToNotExtract.count(BB))
         BlocksToExtract.push_back(BB);
+  }
 
-  for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i)
-    ExtractBasicBlock(BlocksToExtract[i]);
+  for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i) {
+    SmallVector<BasicBlock*, 2> BlocksToExtractVec;
+    BlocksToExtractVec.push_back(BlocksToExtract[i]);
+    if (const InvokeInst *II =
+        dyn_cast<InvokeInst>(BlocksToExtract[i]->getTerminator()))
+      BlocksToExtractVec.push_back(II->getUnwindDest());
+    ExtractBasicBlock(BlocksToExtractVec);
+  }
 
   return !BlocksToExtract.empty();
 }
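The BlockExtractor changes above enforce an EH invariant: a landing pad can only be entered by unwinding from an invoke, so a block that ends in an invoke must be extracted together with its unwind destination. A condensed sketch of the grouping step (the helper is hypothetical):

    #include "llvm/BasicBlock.h"
    #include "llvm/Instructions.h"
    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    // Keep an invoke's landing pad attached to the block being extracted;
    // otherwise the extracted function would contain an invoke whose unwind
    // edge leaves the function, which is not expressible.
    static void collectExtractionGroup(BasicBlock *BB,
                                       SmallVectorImpl<BasicBlock*> &Group) {
      Group.push_back(BB);
      if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
        Group.push_back(II->getUnwindDest());
    }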
-//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "lowersetjmp" -#include "llvm/Transforms/IPO.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/InstVisitor.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/Statistic.h" -#include <map> -using namespace llvm; - -STATISTIC(LongJmpsTransformed, "Number of longjmps transformed"); -STATISTIC(SetJmpsTransformed , "Number of setjmps transformed"); -STATISTIC(CallsTransformed , "Number of calls invokified"); -STATISTIC(InvokesTransformed , "Number of invokes modified"); - -namespace { - //===--------------------------------------------------------------------===// - // LowerSetJmp pass implementation. - class LowerSetJmp : public ModulePass, public InstVisitor<LowerSetJmp> { - // LLVM library functions... - Constant *InitSJMap; // __llvm_sjljeh_init_setjmpmap - Constant *DestroySJMap; // __llvm_sjljeh_destroy_setjmpmap - Constant *AddSJToMap; // __llvm_sjljeh_add_setjmp_to_map - Constant *ThrowLongJmp; // __llvm_sjljeh_throw_longjmp - Constant *TryCatchLJ; // __llvm_sjljeh_try_catching_longjmp_exception - Constant *IsLJException; // __llvm_sjljeh_is_longjmp_exception - Constant *GetLJValue; // __llvm_sjljeh_get_longjmp_value - - typedef std::pair<SwitchInst*, CallInst*> SwitchValuePair; - - // Keep track of those basic blocks reachable via a depth-first search of - // the CFG from a setjmp call. We only need to transform those "call" and - // "invoke" instructions that are reachable from the setjmp call site. - std::set<BasicBlock*> DFSBlocks; - - // The setjmp map is going to hold information about which setjmps - // were called (each setjmp gets its own number) and with which - // buffer it was called. - std::map<Function*, AllocaInst*> SJMap; - - // The rethrow basic block map holds the basic block to branch to if - // the exception isn't handled in the current function and needs to - // be rethrown. - std::map<const Function*, BasicBlock*> RethrowBBMap; - - // The preliminary basic block map holds a basic block that grabs the - // exception and determines if it's handled by the current function. - std::map<const Function*, BasicBlock*> PrelimBBMap; - - // The switch/value map holds a switch inst/call inst pair. The - // switch inst controls which handler (if any) gets called and the - // value is the value returned to that handler by the call to - // __llvm_sjljeh_get_longjmp_value. - std::map<const Function*, SwitchValuePair> SwitchValMap; - - // A map of which setjmps we've seen so far in a function. 
- std::map<const Function*, unsigned> SetJmpIDMap; - - AllocaInst* GetSetJmpMap(Function* Func); - BasicBlock* GetRethrowBB(Function* Func); - SwitchValuePair GetSJSwitch(Function* Func, BasicBlock* Rethrow); - - void TransformLongJmpCall(CallInst* Inst); - void TransformSetJmpCall(CallInst* Inst); - - bool IsTransformableFunction(StringRef Name); - public: - static char ID; // Pass identification, replacement for typeid - LowerSetJmp() : ModulePass(ID) { - initializeLowerSetJmpPass(*PassRegistry::getPassRegistry()); - } - - void visitCallInst(CallInst& CI); - void visitInvokeInst(InvokeInst& II); - void visitReturnInst(ReturnInst& RI); - void visitUnwindInst(UnwindInst& UI); - - bool runOnModule(Module& M); - bool doInitialization(Module& M); - }; -} // end anonymous namespace - -char LowerSetJmp::ID = 0; -INITIALIZE_PASS(LowerSetJmp, "lowersetjmp", "Lower Set Jump", false, false) - -// run - Run the transformation on the program. We grab the function -// prototypes for longjmp and setjmp. If they are used in the program, -// then we can go directly to the places they're at and transform them. -bool LowerSetJmp::runOnModule(Module& M) { - bool Changed = false; - - // These are what the functions are called. - Function* SetJmp = M.getFunction("llvm.setjmp"); - Function* LongJmp = M.getFunction("llvm.longjmp"); - - // This program doesn't have longjmp and setjmp calls. - if ((!LongJmp || LongJmp->use_empty()) && - (!SetJmp || SetJmp->use_empty())) return false; - - // Initialize some values and functions we'll need to transform the - // setjmp/longjmp functions. - doInitialization(M); - - if (SetJmp) { - for (Value::use_iterator B = SetJmp->use_begin(), E = SetJmp->use_end(); - B != E; ++B) { - BasicBlock* BB = cast<Instruction>(*B)->getParent(); - for (df_ext_iterator<BasicBlock*> I = df_ext_begin(BB, DFSBlocks), - E = df_ext_end(BB, DFSBlocks); I != E; ++I) - /* empty */; - } - - while (!SetJmp->use_empty()) { - assert(isa<CallInst>(SetJmp->use_back()) && - "User of setjmp intrinsic not a call?"); - TransformSetJmpCall(cast<CallInst>(SetJmp->use_back())); - Changed = true; - } - } - - if (LongJmp) - while (!LongJmp->use_empty()) { - assert(isa<CallInst>(LongJmp->use_back()) && - "User of longjmp intrinsic not a call?"); - TransformLongJmpCall(cast<CallInst>(LongJmp->use_back())); - Changed = true; - } - - // Now go through the affected functions and convert calls and invokes - // to new invokes... - for (std::map<Function*, AllocaInst*>::iterator - B = SJMap.begin(), E = SJMap.end(); B != E; ++B) { - Function* F = B->first; - for (Function::iterator BB = F->begin(), BE = F->end(); BB != BE; ++BB) - for (BasicBlock::iterator IB = BB->begin(), IE = BB->end(); IB != IE; ) { - visit(*IB++); - if (IB != BB->end() && IB->getParent() != BB) - break; // The next instruction got moved to a different block! - } - } - - DFSBlocks.clear(); - SJMap.clear(); - RethrowBBMap.clear(); - PrelimBBMap.clear(); - SwitchValMap.clear(); - SetJmpIDMap.clear(); - - return Changed; -} - -// doInitialization - For the lower long/setjmp pass, this ensures that a -// module contains a declaration for the intrisic functions we are going -// to call to convert longjmp and setjmp calls. -// -// This function is always successful, unless it isn't. -bool LowerSetJmp::doInitialization(Module& M) -{ - const Type *SBPTy = Type::getInt8PtrTy(M.getContext()); - const Type *SBPPTy = PointerType::getUnqual(SBPTy); - - // N.B. 
See llvm/runtime/GCCLibraries/libexception/SJLJ-Exception.h for - // a description of the following library functions. - - // void __llvm_sjljeh_init_setjmpmap(void**) - InitSJMap = M.getOrInsertFunction("__llvm_sjljeh_init_setjmpmap", - Type::getVoidTy(M.getContext()), - SBPPTy, (Type *)0); - // void __llvm_sjljeh_destroy_setjmpmap(void**) - DestroySJMap = M.getOrInsertFunction("__llvm_sjljeh_destroy_setjmpmap", - Type::getVoidTy(M.getContext()), - SBPPTy, (Type *)0); - - // void __llvm_sjljeh_add_setjmp_to_map(void**, void*, unsigned) - AddSJToMap = M.getOrInsertFunction("__llvm_sjljeh_add_setjmp_to_map", - Type::getVoidTy(M.getContext()), - SBPPTy, SBPTy, - Type::getInt32Ty(M.getContext()), - (Type *)0); - - // void __llvm_sjljeh_throw_longjmp(int*, int) - ThrowLongJmp = M.getOrInsertFunction("__llvm_sjljeh_throw_longjmp", - Type::getVoidTy(M.getContext()), SBPTy, - Type::getInt32Ty(M.getContext()), - (Type *)0); - - // unsigned __llvm_sjljeh_try_catching_longjmp_exception(void **) - TryCatchLJ = - M.getOrInsertFunction("__llvm_sjljeh_try_catching_longjmp_exception", - Type::getInt32Ty(M.getContext()), SBPPTy, (Type *)0); - - // bool __llvm_sjljeh_is_longjmp_exception() - IsLJException = M.getOrInsertFunction("__llvm_sjljeh_is_longjmp_exception", - Type::getInt1Ty(M.getContext()), - (Type *)0); - - // int __llvm_sjljeh_get_longjmp_value() - GetLJValue = M.getOrInsertFunction("__llvm_sjljeh_get_longjmp_value", - Type::getInt32Ty(M.getContext()), - (Type *)0); - return true; -} - -// IsTransformableFunction - Return true if the function name isn't one -// of the ones we don't want transformed. Currently, don't transform any -// "llvm.{setjmp,longjmp}" functions and none of the setjmp/longjmp error -// handling functions (beginning with __llvm_sjljeh_...they don't throw -// exceptions). -bool LowerSetJmp::IsTransformableFunction(StringRef Name) { - return !Name.startswith("__llvm_sjljeh_"); -} - -// TransformLongJmpCall - Transform a longjmp call into a call to the -// internal __llvm_sjljeh_throw_longjmp function. It then takes care of -// throwing the exception for us. -void LowerSetJmp::TransformLongJmpCall(CallInst* Inst) -{ - const Type* SBPTy = Type::getInt8PtrTy(Inst->getContext()); - - // Create the call to "__llvm_sjljeh_throw_longjmp". This takes the - // same parameters as "longjmp", except that the buffer is cast to a - // char*. It returns "void", so it doesn't need to replace any of - // Inst's uses and doesn't get a name. - CastInst* CI = - new BitCastInst(Inst->getArgOperand(0), SBPTy, "LJBuf", Inst); - Value *Args[] = { CI, Inst->getArgOperand(1) }; - CallInst::Create(ThrowLongJmp, Args, "", Inst); - - SwitchValuePair& SVP = SwitchValMap[Inst->getParent()->getParent()]; - - // If the function has a setjmp call in it (they are transformed first) - // we should branch to the basic block that determines if this longjmp - // is applicable here. Otherwise, issue an unwind. - if (SVP.first) - BranchInst::Create(SVP.first->getParent(), Inst); - else - new UnwindInst(Inst->getContext(), Inst); - - // Remove all insts after the branch/unwind inst. Go from back to front to - // avoid replaceAllUsesWith if possible. - BasicBlock *BB = Inst->getParent(); - Instruction *Removed; - do { - Removed = &BB->back(); - // If the removed instructions have any users, replace them now. 
- if (!Removed->use_empty()) - Removed->replaceAllUsesWith(UndefValue::get(Removed->getType())); - Removed->eraseFromParent(); - } while (Removed != Inst); - - ++LongJmpsTransformed; -} - -// GetSetJmpMap - Retrieve (create and initialize, if necessary) the -// setjmp map. This map is going to hold information about which setjmps -// were called (each setjmp gets its own number) and with which buffer it -// was called. There can be only one! -AllocaInst* LowerSetJmp::GetSetJmpMap(Function* Func) -{ - if (SJMap[Func]) return SJMap[Func]; - - // Insert the setjmp map initialization before the first instruction in - // the function. - Instruction* Inst = Func->getEntryBlock().begin(); - assert(Inst && "Couldn't find even ONE instruction in entry block!"); - - // Fill in the alloca and call to initialize the SJ map. - const Type *SBPTy = - Type::getInt8PtrTy(Func->getContext()); - AllocaInst* Map = new AllocaInst(SBPTy, 0, "SJMap", Inst); - CallInst::Create(InitSJMap, Map, "", Inst); - return SJMap[Func] = Map; -} - -// GetRethrowBB - Only one rethrow basic block is needed per function. -// If this is a longjmp exception but not handled in this block, this BB -// performs the rethrow. -BasicBlock* LowerSetJmp::GetRethrowBB(Function* Func) -{ - if (RethrowBBMap[Func]) return RethrowBBMap[Func]; - - // The basic block we're going to jump to if we need to rethrow the - // exception. - BasicBlock* Rethrow = - BasicBlock::Create(Func->getContext(), "RethrowExcept", Func); - - // Fill in the "Rethrow" BB with a call to rethrow the exception. This - // is the last instruction in the BB since at this point the runtime - // should exit this function and go to the next function. - new UnwindInst(Func->getContext(), Rethrow); - return RethrowBBMap[Func] = Rethrow; -} - -// GetSJSwitch - Return the switch statement that controls which handler -// (if any) gets called and the value returned to that handler. -LowerSetJmp::SwitchValuePair LowerSetJmp::GetSJSwitch(Function* Func, - BasicBlock* Rethrow) -{ - if (SwitchValMap[Func].first) return SwitchValMap[Func]; - - BasicBlock* LongJmpPre = - BasicBlock::Create(Func->getContext(), "LongJmpBlkPre", Func); - - // Keep track of the preliminary basic block for some of the other - // transformations. - PrelimBBMap[Func] = LongJmpPre; - - // Grab the exception. - CallInst* Cond = CallInst::Create(IsLJException, "IsLJExcept", LongJmpPre); - - // The "decision basic block" gets the number associated with the - // setjmp call returning to switch on and the value returned by - // longjmp. - BasicBlock* DecisionBB = - BasicBlock::Create(Func->getContext(), "LJDecisionBB", Func); - - BranchInst::Create(DecisionBB, Rethrow, Cond, LongJmpPre); - - // Fill in the "decision" basic block. - CallInst* LJVal = CallInst::Create(GetLJValue, "LJVal", DecisionBB); - CallInst* SJNum = CallInst::Create(TryCatchLJ, GetSetJmpMap(Func), "SJNum", - DecisionBB); - - SwitchInst* SI = SwitchInst::Create(SJNum, Rethrow, 0, DecisionBB); - return SwitchValMap[Func] = SwitchValuePair(SI, LJVal); -} - -// TransformSetJmpCall - The setjmp call is a bit trickier to transform. -// We're going to convert all setjmp calls to nops. Then all "call" and -// "invoke" instructions in the function are converted to "invoke" where -// the "except" branch is used when returning from a longjmp call. -void LowerSetJmp::TransformSetJmpCall(CallInst* Inst) -{ - BasicBlock* ABlock = Inst->getParent(); - Function* Func = ABlock->getParent(); - - // Add this setjmp to the setjmp map. 
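// (Run-time picture, roughly, in C terms; ids are handed out per function
// by SetJmpIDMap:
//    void *SJMap;                             /* the GetSetJmpMap alloca  */
//    __llvm_sjljeh_init_setjmpmap(&SJMap);    /* once, on function entry  */
//    __llvm_sjljeh_add_setjmp_to_map(&SJMap, (void*)buf, id);
// so the runtime can later match a thrown longjmp against this buffer.)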
- const Type* SBPTy = - Type::getInt8PtrTy(Inst->getContext()); - CastInst* BufPtr = - new BitCastInst(Inst->getArgOperand(0), SBPTy, "SBJmpBuf", Inst); - Value *Args[] = { - GetSetJmpMap(Func), BufPtr, - ConstantInt::get(Type::getInt32Ty(Inst->getContext()), SetJmpIDMap[Func]++) - }; - CallInst::Create(AddSJToMap, Args, "", Inst); - - // We are guaranteed that there are no values live across basic blocks - // (because we are "not in SSA form" yet), but there can still be values live - // in basic blocks. Because of this, splitting the setjmp block can cause - // values above the setjmp to not dominate uses which are after the setjmp - // call. For all of these occasions, we must spill the value to the stack. - // - std::set<Instruction*> InstrsAfterCall; - - // The call is probably very close to the end of the basic block, for the - // common usage pattern of: 'if (setjmp(...))', so keep track of the - // instructions after the call. - for (BasicBlock::iterator I = ++BasicBlock::iterator(Inst), E = ABlock->end(); - I != E; ++I) - InstrsAfterCall.insert(I); - - for (BasicBlock::iterator II = ABlock->begin(); - II != BasicBlock::iterator(Inst); ++II) - // Loop over all of the uses of instruction. If any of them are after the - // call, "spill" the value to the stack. - for (Value::use_iterator UI = II->use_begin(), E = II->use_end(); - UI != E; ++UI) { - User *U = *UI; - if (cast<Instruction>(U)->getParent() != ABlock || - InstrsAfterCall.count(cast<Instruction>(U))) { - DemoteRegToStack(*II); - break; - } - } - InstrsAfterCall.clear(); - - // Change the setjmp call into a branch statement. We'll remove the - // setjmp call in a little bit. No worries. - BasicBlock* SetJmpContBlock = ABlock->splitBasicBlock(Inst); - assert(SetJmpContBlock && "Couldn't split setjmp BB!!"); - - SetJmpContBlock->setName(ABlock->getName()+"SetJmpCont"); - - // Add the SetJmpContBlock to the set of blocks reachable from a setjmp. - DFSBlocks.insert(SetJmpContBlock); - - // This PHI node will be in the new block created from the - // splitBasicBlock call. - PHINode* PHI = PHINode::Create(Type::getInt32Ty(Inst->getContext()), 2, - "SetJmpReturn", Inst); - - // Coming from a call to setjmp, the return is 0. - PHI->addIncoming(Constant::getNullValue(Type::getInt32Ty(Inst->getContext())), - ABlock); - - // Add the case for this setjmp's number... - SwitchValuePair SVP = GetSJSwitch(Func, GetRethrowBB(Func)); - SVP.first->addCase(ConstantInt::get(Type::getInt32Ty(Inst->getContext()), - SetJmpIDMap[Func] - 1), - SetJmpContBlock); - - // Value coming from the handling of the exception. - PHI->addIncoming(SVP.second, SVP.second->getParent()); - - // Replace all uses of this instruction with the PHI node created by - // the eradication of setjmp. - Inst->replaceAllUsesWith(PHI); - Inst->eraseFromParent(); - - ++SetJmpsTransformed; -} - -// visitCallInst - This converts all LLVM call instructions into invoke -// instructions. The except part of the invoke goes to the "LongJmpBlkPre" -// that grabs the exception and proceeds to determine if it's a longjmp -// exception or not. -void LowerSetJmp::visitCallInst(CallInst& CI) -{ - if (CI.getCalledFunction()) - if (!IsTransformableFunction(CI.getCalledFunction()->getName()) || - CI.getCalledFunction()->isIntrinsic()) return; - - BasicBlock* OldBB = CI.getParent(); - - // If not reachable from a setjmp call, don't transform. 
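// (DFSBlocks is populated in runOnModule by the deliberately empty walk
//    for (df_ext_iterator<BasicBlock*> I = df_ext_begin(BB, DFSBlocks),
//           E = df_ext_end(BB, DFSBlocks); I != E; ++I)
//      /* empty */;
// started at every block containing a setjmp; df_ext_iterator inserts each
// visited block into the set as a side effect, so the count() test below
// asks "could a longjmp return through this block?".)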
- if (!DFSBlocks.count(OldBB)) return; - - BasicBlock* NewBB = OldBB->splitBasicBlock(CI); - assert(NewBB && "Couldn't split BB of \"call\" instruction!!"); - DFSBlocks.insert(NewBB); - NewBB->setName("Call2Invoke"); - - Function* Func = OldBB->getParent(); - - // Construct the new "invoke" instruction. - TerminatorInst* Term = OldBB->getTerminator(); - CallSite CS(&CI); - std::vector<Value*> Params(CS.arg_begin(), CS.arg_end()); - InvokeInst* II = - InvokeInst::Create(CI.getCalledValue(), NewBB, PrelimBBMap[Func], - Params, CI.getName(), Term); - II->setCallingConv(CI.getCallingConv()); - II->setAttributes(CI.getAttributes()); - - // Replace the old call inst with the invoke inst and remove the call. - CI.replaceAllUsesWith(II); - CI.eraseFromParent(); - - // The old terminator is useless now that we have the invoke inst. - Term->eraseFromParent(); - ++CallsTransformed; -} - -// visitInvokeInst - Converting the "invoke" instruction is fairly -// straight-forward. The old exception part is replaced by a query asking -// if this is a longjmp exception. If it is, then it goes to the longjmp -// exception blocks. Otherwise, control is passed the old exception. -void LowerSetJmp::visitInvokeInst(InvokeInst& II) -{ - if (II.getCalledFunction()) - if (!IsTransformableFunction(II.getCalledFunction()->getName()) || - II.getCalledFunction()->isIntrinsic()) return; - - BasicBlock* BB = II.getParent(); - - // If not reachable from a setjmp call, don't transform. - if (!DFSBlocks.count(BB)) return; - - BasicBlock* ExceptBB = II.getUnwindDest(); - - Function* Func = BB->getParent(); - BasicBlock* NewExceptBB = BasicBlock::Create(II.getContext(), - "InvokeExcept", Func); - - // If this is a longjmp exception, then branch to the preliminary BB of - // the longjmp exception handling. Otherwise, go to the old exception. - CallInst* IsLJExcept = CallInst::Create(IsLJException, "IsLJExcept", - NewExceptBB); - - BranchInst::Create(PrelimBBMap[Func], ExceptBB, IsLJExcept, NewExceptBB); - - II.setUnwindDest(NewExceptBB); - ++InvokesTransformed; -} - -// visitReturnInst - We want to destroy the setjmp map upon exit from the -// function. -void LowerSetJmp::visitReturnInst(ReturnInst &RI) { - Function* Func = RI.getParent()->getParent(); - CallInst::Create(DestroySJMap, GetSetJmpMap(Func), "", &RI); -} - -// visitUnwindInst - We want to destroy the setjmp map upon exit from the -// function. -void LowerSetJmp::visitUnwindInst(UnwindInst &UI) { - Function* Func = UI.getParent()->getParent(); - CallInst::Create(DestroySJMap, GetSetJmpMap(Func), "", &UI); -} - -ModulePass *llvm::createLowerSetJmpPass() { - return new LowerSetJmp(); -} - diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 7796d05..0b01c38 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -76,7 +76,7 @@ STATISTIC(NumDoubleWeak, "Number of new functions created"); /// functions that will compare equal, without looking at the instructions /// inside the function. static unsigned profileFunction(const Function *F) { - const FunctionType *FTy = F->getFunctionType(); + FunctionType *FTy = F->getFunctionType(); FoldingSetNodeID ID; ID.AddInteger(F->size()); @@ -185,7 +185,7 @@ private: } /// Compare two Types, treating all pointer types as equal. - bool isEquivalentType(const Type *Ty1, const Type *Ty2) const; + bool isEquivalentType(Type *Ty1, Type *Ty2) const; // The two functions undergoing comparison. 
const Function *F1, *F2; @@ -200,8 +200,8 @@ private: // Any two pointers in the same address space are equivalent, intptr_t and // pointers are equivalent. Otherwise, standard type equivalence rules apply. -bool FunctionComparator::isEquivalentType(const Type *Ty1, - const Type *Ty2) const { +bool FunctionComparator::isEquivalentType(Type *Ty1, + Type *Ty2) const { if (Ty1 == Ty2) return true; if (Ty1->getTypeID() != Ty2->getTypeID()) { @@ -233,14 +233,14 @@ bool FunctionComparator::isEquivalentType(const Type *Ty1, return true; case Type::PointerTyID: { - const PointerType *PTy1 = cast<PointerType>(Ty1); - const PointerType *PTy2 = cast<PointerType>(Ty2); + PointerType *PTy1 = cast<PointerType>(Ty1); + PointerType *PTy2 = cast<PointerType>(Ty2); return PTy1->getAddressSpace() == PTy2->getAddressSpace(); } case Type::StructTyID: { - const StructType *STy1 = cast<StructType>(Ty1); - const StructType *STy2 = cast<StructType>(Ty2); + StructType *STy1 = cast<StructType>(Ty1); + StructType *STy2 = cast<StructType>(Ty2); if (STy1->getNumElements() != STy2->getNumElements()) return false; @@ -255,8 +255,8 @@ bool FunctionComparator::isEquivalentType(const Type *Ty1, } case Type::FunctionTyID: { - const FunctionType *FTy1 = cast<FunctionType>(Ty1); - const FunctionType *FTy2 = cast<FunctionType>(Ty2); + FunctionType *FTy1 = cast<FunctionType>(Ty1); + FunctionType *FTy2 = cast<FunctionType>(Ty2); if (FTy1->getNumParams() != FTy2->getNumParams() || FTy1->isVarArg() != FTy2->isVarArg()) return false; @@ -272,8 +272,8 @@ bool FunctionComparator::isEquivalentType(const Type *Ty1, } case Type::ArrayTyID: { - const ArrayType *ATy1 = cast<ArrayType>(Ty1); - const ArrayType *ATy2 = cast<ArrayType>(Ty2); + ArrayType *ATy1 = cast<ArrayType>(Ty1); + ArrayType *ATy2 = cast<ArrayType>(Ty2); return ATy1->getNumElements() == ATy2->getNumElements() && isEquivalentType(ATy1->getElementType(), ATy2->getElementType()); } @@ -305,10 +305,14 @@ bool FunctionComparator::isEquivalentOperation(const Instruction *I1, // Check special state that is a part of some instructions. 
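// (The ordering/scope comparisons added below are what keep the pass safe
// on atomics: in C++11 terms, the bodies of
//    int a(std::atomic<int> &X) { return X.load(std::memory_order_acquire); }
//    int b(std::atomic<int> &X) { return X.load(std::memory_order_relaxed); }
// differ only in ordering, and merging them would silently change the
// synchronization callers observe.)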
if (const LoadInst *LI = dyn_cast<LoadInst>(I1)) return LI->isVolatile() == cast<LoadInst>(I2)->isVolatile() && - LI->getAlignment() == cast<LoadInst>(I2)->getAlignment(); + LI->getAlignment() == cast<LoadInst>(I2)->getAlignment() && + LI->getOrdering() == cast<LoadInst>(I2)->getOrdering() && + LI->getSynchScope() == cast<LoadInst>(I2)->getSynchScope(); if (const StoreInst *SI = dyn_cast<StoreInst>(I1)) return SI->isVolatile() == cast<StoreInst>(I2)->isVolatile() && - SI->getAlignment() == cast<StoreInst>(I2)->getAlignment(); + SI->getAlignment() == cast<StoreInst>(I2)->getAlignment() && + SI->getOrdering() == cast<StoreInst>(I2)->getOrdering() && + SI->getSynchScope() == cast<StoreInst>(I2)->getSynchScope(); if (const CmpInst *CI = dyn_cast<CmpInst>(I1)) return CI->getPredicate() == cast<CmpInst>(I2)->getPredicate(); if (const CallInst *CI = dyn_cast<CallInst>(I1)) @@ -317,22 +321,22 @@ bool FunctionComparator::isEquivalentOperation(const Instruction *I1, if (const InvokeInst *CI = dyn_cast<InvokeInst>(I1)) return CI->getCallingConv() == cast<InvokeInst>(I2)->getCallingConv() && CI->getAttributes() == cast<InvokeInst>(I2)->getAttributes(); - if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(I1)) { - if (IVI->getNumIndices() != cast<InsertValueInst>(I2)->getNumIndices()) - return false; - for (unsigned i = 0, e = IVI->getNumIndices(); i != e; ++i) - if (IVI->idx_begin()[i] != cast<InsertValueInst>(I2)->idx_begin()[i]) - return false; - return true; - } - if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I1)) { - if (EVI->getNumIndices() != cast<ExtractValueInst>(I2)->getNumIndices()) - return false; - for (unsigned i = 0, e = EVI->getNumIndices(); i != e; ++i) - if (EVI->idx_begin()[i] != cast<ExtractValueInst>(I2)->idx_begin()[i]) - return false; - return true; - } + if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(I1)) + return IVI->getIndices() == cast<InsertValueInst>(I2)->getIndices(); + if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I1)) + return EVI->getIndices() == cast<ExtractValueInst>(I2)->getIndices(); + if (const FenceInst *FI = dyn_cast<FenceInst>(I1)) + return FI->getOrdering() == cast<FenceInst>(I2)->getOrdering() && + FI->getSynchScope() == cast<FenceInst>(I2)->getSynchScope(); + if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I1)) + return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I2)->isVolatile() && + CXI->getOrdering() == cast<AtomicCmpXchgInst>(I2)->getOrdering() && + CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I2)->getSynchScope(); + if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I1)) + return RMWI->getOperation() == cast<AtomicRMWInst>(I2)->getOperation() && + RMWI->isVolatile() == cast<AtomicRMWInst>(I2)->isVolatile() && + RMWI->getOrdering() == cast<AtomicRMWInst>(I2)->getOrdering() && + RMWI->getSynchScope() == cast<AtomicRMWInst>(I2)->getSynchScope(); return true; } @@ -346,9 +350,9 @@ bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1, SmallVector<Value *, 8> Indices1(GEP1->idx_begin(), GEP1->idx_end()); SmallVector<Value *, 8> Indices2(GEP2->idx_begin(), GEP2->idx_end()); uint64_t Offset1 = TD->getIndexedOffset(GEP1->getPointerOperandType(), - Indices1.data(), Indices1.size()); + Indices1); uint64_t Offset2 = TD->getIndexedOffset(GEP2->getPointerOperandType(), - Indices2.data(), Indices2.size()); + Indices2); return Offset1 == Offset2; } @@ -725,7 +729,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { SmallVector<Value *, 16> Args; unsigned i = 0; - const 
FunctionType *FFTy = F->getFunctionType(); + FunctionType *FFTy = F->getFunctionType(); for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end(); AI != AE; ++AI) { Args.push_back(Builder.CreateBitCast(AI, FFTy->getParamType(i))); diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp new file mode 100644 index 0000000..8fdfd72 --- /dev/null +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -0,0 +1,343 @@ +//===- PassManagerBuilder.cpp - Build Standard Pass -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the PassManagerBuilder class, which is used to set up a +// "standard" optimization sequence suitable for languages like C and C++. +// +//===----------------------------------------------------------------------===// + + +#include "llvm/Transforms/IPO/PassManagerBuilder.h" + +#include "llvm-c/Transforms/PassManagerBuilder.h" + +#include "llvm/PassManager.h" +#include "llvm/DefaultPasses.h" +#include "llvm/PassManager.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/ManagedStatic.h" + +using namespace llvm; + +PassManagerBuilder::PassManagerBuilder() { + OptLevel = 2; + SizeLevel = 0; + LibraryInfo = 0; + Inliner = 0; + DisableSimplifyLibCalls = false; + DisableUnitAtATime = false; + DisableUnrollLoops = false; +} + +PassManagerBuilder::~PassManagerBuilder() { + delete LibraryInfo; + delete Inliner; +} + +/// Set of global extensions, automatically added as part of the standard set. +static ManagedStatic<SmallVector<std::pair<PassManagerBuilder::ExtensionPointTy, + PassManagerBuilder::ExtensionFn>, 8> > GlobalExtensions; + +void PassManagerBuilder::addGlobalExtension( + PassManagerBuilder::ExtensionPointTy Ty, + PassManagerBuilder::ExtensionFn Fn) { + GlobalExtensions->push_back(std::make_pair(Ty, Fn)); +} + +void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) { + Extensions.push_back(std::make_pair(Ty, Fn)); +} + +void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy, + PassManagerBase &PM) const { + for (unsigned i = 0, e = GlobalExtensions->size(); i != e; ++i) + if ((*GlobalExtensions)[i].first == ETy) + (*GlobalExtensions)[i].second(*this, PM); + for (unsigned i = 0, e = Extensions.size(); i != e; ++i) + if (Extensions[i].first == ETy) + Extensions[i].second(*this, PM); +} + +void +PassManagerBuilder::addInitialAliasAnalysisPasses(PassManagerBase &PM) const { + // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that + // BasicAliasAnalysis wins if they disagree. This is intended to help + // support "obvious" type-punning idioms. + PM.add(createTypeBasedAliasAnalysisPass()); + PM.add(createBasicAliasAnalysisPass()); +} + +void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) { + addExtensionsToPM(EP_EarlyAsPossible, FPM); + + // Add LibraryInfo if we have some. 
+ if (LibraryInfo) FPM.add(new TargetLibraryInfo(*LibraryInfo)); + + if (OptLevel == 0) return; + + addInitialAliasAnalysisPasses(FPM); + + FPM.add(createCFGSimplificationPass()); + FPM.add(createScalarReplAggregatesPass()); + FPM.add(createEarlyCSEPass()); + FPM.add(createLowerExpectIntrinsicPass()); +} + +void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { + // If all optimizations are disabled, just run the always-inline pass. + if (OptLevel == 0) { + if (Inliner) { + MPM.add(Inliner); + Inliner = 0; + } + return; + } + + // Add LibraryInfo if we have some. + if (LibraryInfo) MPM.add(new TargetLibraryInfo(*LibraryInfo)); + + addInitialAliasAnalysisPasses(MPM); + + if (!DisableUnitAtATime) { + MPM.add(createGlobalOptimizerPass()); // Optimize out global vars + + MPM.add(createIPSCCPPass()); // IP SCCP + MPM.add(createDeadArgEliminationPass()); // Dead argument elimination + + MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE + MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE + } + + // Start of CallGraph SCC passes. + if (!DisableUnitAtATime) + MPM.add(createPruneEHPass()); // Remove dead EH info + if (Inliner) { + MPM.add(Inliner); + Inliner = 0; + } + if (!DisableUnitAtATime) + MPM.add(createFunctionAttrsPass()); // Set readonly/readnone attrs + if (OptLevel > 2) + MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args + + // Start of function pass. + // Break up aggregate allocas, using SSAUpdater. + MPM.add(createScalarReplAggregatesPass(-1, false)); + MPM.add(createEarlyCSEPass()); // Catch trivial redundancies + if (!DisableSimplifyLibCalls) + MPM.add(createSimplifyLibCallsPass()); // Library Call Optimizations + MPM.add(createJumpThreadingPass()); // Thread jumps. + MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals + MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add(createInstructionCombiningPass()); // Combine silly seq's + + MPM.add(createTailCallEliminationPass()); // Eliminate tail calls + MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add(createReassociatePass()); // Reassociate expressions + MPM.add(createLoopRotatePass()); // Rotate Loop + MPM.add(createLICMPass()); // Hoist loop invariants + MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); + MPM.add(createInstructionCombiningPass()); + MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars + MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. + MPM.add(createLoopDeletionPass()); // Delete dead loops + if (!DisableUnrollLoops) + MPM.add(createLoopUnrollPass()); // Unroll small loops + addExtensionsToPM(EP_LoopOptimizerEnd, MPM); + + if (OptLevel > 1) + MPM.add(createGVNPass()); // Remove redundancies + MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset + MPM.add(createSCCPPass()); // Constant prop with SCCP + + // Run instcombine after redundancy elimination to exploit opportunities + // opened up by them. + MPM.add(createInstructionCombiningPass()); + MPM.add(createJumpThreadingPass()); // Thread jumps + MPM.add(createCorrelatedValuePropagationPass()); + MPM.add(createDeadStoreEliminationPass()); // Delete dead stores + + addExtensionsToPM(EP_ScalarOptimizerLate, MPM); + + MPM.add(createAggressiveDCEPass()); // Delete dead instructions + MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add(createInstructionCombiningPass()); // Clean up after everything. 
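// (Client-side shape, for reference; the names and the inliner threshold
// here are illustrative:
//    PassManagerBuilder Builder;
//    Builder.OptLevel = 2;                          // roughly -O2
//    Builder.Inliner = createFunctionInliningPass(225);
//    Builder.addExtension(PassManagerBuilder::EP_ScalarOptimizerLate, AddFn);
//    Builder.populateFunctionPassManager(FPM);
//    Builder.populateModulePassManager(MPM);        // consumes Inliner
// where AddFn is any void(const PassManagerBuilder &, PassManagerBase &).)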
+ + if (!DisableUnitAtATime) { + // FIXME: We shouldn't bother with this anymore. + MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes + + // GlobalOpt already deletes dead functions and globals, at -O3 try a + // late pass of GlobalDCE. It is capable of deleting dead cycles. + if (OptLevel > 2) + MPM.add(createGlobalDCEPass()); // Remove dead fns and globals. + + if (OptLevel > 1) + MPM.add(createConstantMergePass()); // Merge dup global constants + } +} + +void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, + bool Internalize, + bool RunInliner) { + // Provide AliasAnalysis services for optimizations. + addInitialAliasAnalysisPasses(PM); + + // Now that composite has been compiled, scan through the module, looking + // for a main function. If main is defined, mark all other functions + // internal. + if (Internalize) + PM.add(createInternalizePass(true)); + + // Propagate constants at call sites into the functions they call. This + // opens opportunities for globalopt (and inlining) by substituting function + // pointers passed as arguments to direct uses of functions. + PM.add(createIPSCCPPass()); + + // Now that we internalized some globals, see if we can hack on them! + PM.add(createGlobalOptimizerPass()); + + // Linking modules together can lead to duplicated global constants, only + // keep one copy of each constant. + PM.add(createConstantMergePass()); + + // Remove unused arguments from functions. + PM.add(createDeadArgEliminationPass()); + + // Reduce the code after globalopt and ipsccp. Both can open up significant + // simplification opportunities, and both can propagate functions through + // function pointers. When this happens, we often have to resolve varargs + // calls, etc, so let instcombine do this. + PM.add(createInstructionCombiningPass()); + + // Inline small functions + if (RunInliner) + PM.add(createFunctionInliningPass()); + + PM.add(createPruneEHPass()); // Remove dead EH info. + + // Optimize globals again if we ran the inliner. + if (RunInliner) + PM.add(createGlobalOptimizerPass()); + PM.add(createGlobalDCEPass()); // Remove dead functions. + + // If we didn't decide to inline a function, check to see if we can + // transform it to pass arguments by value instead of by reference. + PM.add(createArgumentPromotionPass()); + + // The IPO passes may leave cruft around. Clean up after them. + PM.add(createInstructionCombiningPass()); + PM.add(createJumpThreadingPass()); + // Break up allocas + PM.add(createScalarReplAggregatesPass()); + + // Run a few AA driven optimizations here and now, to cleanup the code. + PM.add(createFunctionAttrsPass()); // Add nocapture. + PM.add(createGlobalsModRefPass()); // IP alias analysis. + + PM.add(createLICMPass()); // Hoist loop invariants. + PM.add(createGVNPass()); // Remove redundancies. + PM.add(createMemCpyOptPass()); // Remove dead memcpys. + // Nuke dead stores. + PM.add(createDeadStoreEliminationPass()); + + // Cleanup and simplify the code after the scalar optimizations. + PM.add(createInstructionCombiningPass()); + + PM.add(createJumpThreadingPass()); + + // Delete basic blocks, which optimization passes may have killed. + PM.add(createCFGSimplificationPass()); + + // Now that we have optimized the program, discard unreachable functions. 
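// (Corresponding client code, schematically:
//    PassManager PM;
//    PassManagerBuilder Builder;
//    Builder.populateLTOPassManager(PM, /*Internalize=*/true,
//                                   /*RunInliner=*/true);
//    PM.run(M);)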
+ PM.add(createGlobalDCEPass()); +} + +LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate(void) { + PassManagerBuilder *PMB = new PassManagerBuilder(); + return wrap(PMB); +} + +void LLVMPassManagerBuilderDispose(LLVMPassManagerBuilderRef PMB) { + PassManagerBuilder *Builder = unwrap(PMB); + delete Builder; +} + +void +LLVMPassManagerBuilderSetOptLevel(LLVMPassManagerBuilderRef PMB, + unsigned OptLevel) { + PassManagerBuilder *Builder = unwrap(PMB); + Builder->OptLevel = OptLevel; +} + +void +LLVMPassManagerBuilderSetSizeLevel(LLVMPassManagerBuilderRef PMB, + unsigned SizeLevel) { + PassManagerBuilder *Builder = unwrap(PMB); + Builder->SizeLevel = SizeLevel; +} + +void +LLVMPassManagerBuilderSetDisableUnitAtATime(LLVMPassManagerBuilderRef PMB, + LLVMBool Value) { + PassManagerBuilder *Builder = unwrap(PMB); + Builder->DisableUnitAtATime = Value; +} + +void +LLVMPassManagerBuilderSetDisableUnrollLoops(LLVMPassManagerBuilderRef PMB, + LLVMBool Value) { + PassManagerBuilder *Builder = unwrap(PMB); + Builder->DisableUnrollLoops = Value; +} + +void +LLVMPassManagerBuilderSetDisableSimplifyLibCalls(LLVMPassManagerBuilderRef PMB, + LLVMBool Value) { + PassManagerBuilder *Builder = unwrap(PMB); + Builder->DisableSimplifyLibCalls = Value; +} + +void +LLVMPassManagerBuilderUseInlinerWithThreshold(LLVMPassManagerBuilderRef PMB, + unsigned Threshold) { + PassManagerBuilder *Builder = unwrap(PMB); + Builder->Inliner = createFunctionInliningPass(Threshold); +} + +void +LLVMPassManagerBuilderPopulateFunctionPassManager(LLVMPassManagerBuilderRef PMB, + LLVMPassManagerRef PM) { + PassManagerBuilder *Builder = unwrap(PMB); + FunctionPassManager *FPM = unwrap<FunctionPassManager>(PM); + Builder->populateFunctionPassManager(*FPM); +} + +void +LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB, + LLVMPassManagerRef PM) { + PassManagerBuilder *Builder = unwrap(PMB); + PassManagerBase *MPM = unwrap(PM); + Builder->populateModulePassManager(*MPM); +} + +void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB, + LLVMPassManagerRef PM, + bool Internalize, + bool RunInliner) { + PassManagerBuilder *Builder = unwrap(PMB); + PassManagerBase *LPM = unwrap(PM); + Builder->populateLTOPassManager(*LPM, Internalize, RunInliner); +} + diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index b7e63dc..cbb80f0 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -101,8 +101,9 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) { // Check to see if this function performs an unwind or calls an // unwinding function. for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - if (CheckUnwind && isa<UnwindInst>(BB->getTerminator())) { - // Uses unwind! + if (CheckUnwind && (isa<UnwindInst>(BB->getTerminator()) || + isa<ResumeInst>(BB->getTerminator()))) { + // Uses unwind / resume! 
SCCMightUnwind = true; } else if (CheckReturn && isa<ReturnInst>(BB->getTerminator())) { SCCMightReturn = true; diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 0fbaff1..b5caa9a 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -180,7 +180,7 @@ static void StripTypeNames(Module &M, bool PreserveDbgInfo) { for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { StructType *STy = StructTypes[i]; - if (STy->isAnonymous() || STy->getName().empty()) continue; + if (STy->isLiteral() || STy->getName().empty()) continue; if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg")) continue; diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt index d070ccc..a46d5ad 100644 --- a/lib/Transforms/InstCombine/CMakeLists.txt +++ b/lib/Transforms/InstCombine/CMakeLists.txt @@ -13,3 +13,11 @@ add_llvm_library(LLVMInstCombine InstCombineSimplifyDemanded.cpp InstCombineVectorOps.cpp ) + +add_llvm_library_dependencies(LLVMInstCombine + LLVMAnalysis + LLVMCore + LLVMSupport + LLVMTarget + LLVMTransformUtils + ) diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 8257d6b..3808278 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -11,6 +11,7 @@ #define INSTCOMBINE_INSTCOMBINE_H #include "InstCombineWorklist.h" +#include "llvm/IntrinsicInst.h" #include "llvm/Operator.h" #include "llvm/Pass.h" #include "llvm/Analysis/ValueTracking.h" @@ -103,7 +104,7 @@ public: // Instruction *visitAdd(BinaryOperator &I); Instruction *visitFAdd(BinaryOperator &I); - Value *OptimizePointerDifference(Value *LHS, Value *RHS, const Type *Ty); + Value *OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty); Instruction *visitSub(BinaryOperator &I); Instruction *visitFSub(BinaryOperator &I); Instruction *visitMul(BinaryOperator &I); @@ -192,15 +193,16 @@ public: Instruction *visitExtractElementInst(ExtractElementInst &EI); Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI); Instruction *visitExtractValueInst(ExtractValueInst &EV); + Instruction *visitLandingPadInst(LandingPadInst &LI); // visitInstruction - Specify what to return for unhandled instructions... Instruction *visitInstruction(Instruction &I) { return 0; } private: - bool ShouldChangeType(const Type *From, const Type *To) const; + bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; Value *dyn_castFNegVal(Value *V) const; - const Type *FindElementAtOffset(const Type *Ty, int64_t Offset, + Type *FindElementAtOffset(Type *Ty, int64_t Offset, SmallVectorImpl<Value*> &NewIndices); Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); @@ -209,12 +211,13 @@ private: /// the cast can be eliminated by some other simple transformation, we prefer /// to do the simplification first. 
bool ShouldOptimizeCast(Instruction::CastOps opcode,const Value *V, - const Type *Ty); + Type *Ty); Instruction *visitCallSite(CallSite CS); Instruction *tryOptimizeCall(CallInst *CI, const TargetData *TD); bool transformConstExprCastCall(CallSite CS); - Instruction *transformCallThroughTrampoline(CallSite CS); + Instruction *transformCallThroughTrampoline(CallSite CS, + IntrinsicInst *Tramp); Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI, bool DoXform = true); Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI); @@ -357,7 +360,7 @@ private: Instruction *SimplifyMemSet(MemSetInst *MI); - Value *EvaluateInDifferentType(Value *V, const Type *Ty, bool isSigned); + Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned); }; diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index c36a955..d10046c 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -188,7 +188,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return BinaryOperator::CreateMul(LHS, AddOne(C2)); // A+B --> A|B iff A and B have no bits set in common. - if (const IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { + if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { APInt Mask = APInt::getAllOnesValue(IT->getBitWidth()); APInt LHSKnownOne(IT->getBitWidth(), 0); APInt LHSKnownZero(IT->getBitWidth(), 0); @@ -401,7 +401,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { Value *InstCombiner::EmitGEPOffset(User *GEP) { TargetData &TD = *getTargetData(); gep_type_iterator GTI = gep_type_begin(GEP); - const Type *IntPtrTy = TD.getIntPtrType(GEP->getContext()); + Type *IntPtrTy = TD.getIntPtrType(GEP->getContext()); Value *Result = Constant::getNullValue(IntPtrTy); // If the GEP is inbounds, we know that none of the addressing operations will @@ -420,7 +420,7 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) { if (OpC->isZero()) continue; // Handle a struct index, which adds its field offset to the pointer. - if (const StructType *STy = dyn_cast<StructType>(*GTI)) { + if (StructType *STy = dyn_cast<StructType>(*GTI)) { Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); if (Size) @@ -460,7 +460,7 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) { /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. 
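/// (Concrete payoff, e.g. for
///      ptrdiff_t indexOf(int *Base, size_t I) { return &Base[I] - Base; }
/// the sub of the two ptrtoints folds to the gep offset I*sizeof(int), and
/// the front end's 'sdiv exact' by sizeof(int) then cancels it to plain I.)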
/// Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, - const Type *Ty) { + Type *Ty) { assert(TD && "Must have target data info for this"); // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 64ea36f..5e0bfe8 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1174,30 +1174,31 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { ((A == C && B == D) || (A == D && B == C))) return BinaryOperator::CreateXor(A, B); - if (Op0->hasOneUse() && - match(Op0, m_Xor(m_Value(A), m_Value(B)))) { - if (A == Op1) { // (A^B)&A -> A&(A^B) - I.swapOperands(); // Simplify below - std::swap(Op0, Op1); - } else if (B == Op1) { // (A^B)&B -> B&(B^A) - cast<BinaryOperator>(Op0)->swapOperands(); - I.swapOperands(); // Simplify below - std::swap(Op0, Op1); + // A&(A^B) => A & ~B + { + Value *tmpOp0 = Op0; + Value *tmpOp1 = Op1; + if (Op0->hasOneUse() && + match(Op0, m_Xor(m_Value(A), m_Value(B)))) { + if (A == Op1 || B == Op1 ) { + tmpOp1 = Op0; + tmpOp0 = Op1; + // Simplify below + } } - } - if (Op1->hasOneUse() && - match(Op1, m_Xor(m_Value(A), m_Value(B)))) { - if (B == Op0) { // B&(A^B) -> B&(B^A) - cast<BinaryOperator>(Op1)->swapOperands(); - std::swap(A, B); + if (tmpOp1->hasOneUse() && + match(tmpOp1, m_Xor(m_Value(A), m_Value(B)))) { + if (B == tmpOp0) { + std::swap(A, B); + } + // Notice that the patten (A&(~B)) is actually (A&(-1^B)), so if + // A is originally -1 (or a vector of -1 and undefs), then we enter + // an endless loop. By checking that A is non-constant we ensure that + // we will never get to the loop. + if (A == tmpOp0 && !isa<Constant>(A)) // A&(A^B) -> A & ~B + return BinaryOperator::CreateAnd(A, Builder->CreateNot(B)); } - // Notice that the patten (A&(~B)) is actually (A&(-1^B)), so if - // A is originally -1 (or a vector of -1 and undefs), then we enter - // an endless loop. By checking that A is non-constant we ensure that - // we will never get to the loop. - if (A == Op0 && !isa<Constant>(A)) // A&(A^B) -> A & ~B - return BinaryOperator::CreateAnd(A, Builder->CreateNot(B, "tmp")); } // (A&((~A)|B)) -> A&B @@ -1224,7 +1225,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { // fold (and (cast A), (cast B)) -> (cast (and A, B)) if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) { - const Type *SrcTy = Op0C->getOperand(0)->getType(); + Type *SrcTy = Op0C->getOperand(0)->getType(); if (Op0C->getOpcode() == Op1C->getOpcode() && // same cast kind ? SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntOrIntVectorTy()) { @@ -2008,7 +2009,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { CastInst *Op1C = dyn_cast<CastInst>(Op1); if (Op1C && Op0C->getOpcode() == Op1C->getOpcode()) {// same cast kind ? 
- const Type *SrcTy = Op0C->getOperand(0)->getType(); + Type *SrcTy = Op0C->getOperand(0)->getType(); if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntOrIntVectorTy()) { Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0); @@ -2227,14 +2228,14 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (A == Op1) // (B|A)^B == (A|B)^B std::swap(A, B); if (B == Op1) // (A|B)^B == A & ~B - return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1, "tmp")); + return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1)); } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) && Op0I->hasOneUse()){ if (A == Op1) // (A&B)^A -> (B&A)^A std::swap(A, B); if (B == Op1 && // (B&A)^A == ~B & A !isa<ConstantInt>(Op1)) { // Canonical form is (B&C)^C - return BinaryOperator::CreateAnd(Builder->CreateNot(A, "tmp"), Op1); + return BinaryOperator::CreateAnd(Builder->CreateNot(A), Op1); } } } @@ -2288,7 +2289,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind? - const Type *SrcTy = Op0C->getOperand(0)->getType(); + Type *SrcTy = Op0C->getOperand(0)->getType(); if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntegerTy() && // Only do this if the casts both really cause code to be generated. ShouldOptimizeCast(Op0C->getOpcode(), Op0C->getOperand(0), diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 537f2b3..c7b3ff8 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" -#include "llvm/IntrinsicInst.h" #include "llvm/Support/CallSite.h" #include "llvm/Target/TargetData.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -22,8 +21,8 @@ using namespace llvm; /// getPromotedType - Return the specified type promoted as it would be to pass /// though a va_arg area. -static const Type *getPromotedType(const Type *Ty) { - if (const IntegerType* ITy = dyn_cast<IntegerType>(Ty)) { +static Type *getPromotedType(Type *Ty) { + if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) { if (ITy->getBitWidth() < 32) return Type::getInt32Ty(Ty->getContext()); } @@ -64,7 +63,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { unsigned DstAddrSp = cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace(); - const IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3); + IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3); Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp); Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp); @@ -76,18 +75,18 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { // integer datatype. Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts(); if (StrippedDest != MI->getArgOperand(0)) { - const Type *SrcETy = cast<PointerType>(StrippedDest->getType()) + Type *SrcETy = cast<PointerType>(StrippedDest->getType()) ->getElementType(); if (TD && SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) { // The SrcETy might be something like {{{double}}} or [1 x double]. Rip // down through these levels if so. 
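// (Source-level picture: with 'struct S { double D; };', a
//    memcpy(Dst, Src, sizeof(S))
// becomes a single 64-bit integer load and store once this loop has peeled
// the {{{double}}} / [1 x double] wrappers down to the scalar.)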
while (!SrcETy->isSingleValueType()) { - if (const StructType *STy = dyn_cast<StructType>(SrcETy)) { + if (StructType *STy = dyn_cast<StructType>(SrcETy)) { if (STy->getNumElements() == 1) SrcETy = STy->getElementType(0); else break; - } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcETy)) { + } else if (ArrayType *ATy = dyn_cast<ArrayType>(SrcETy)) { if (ATy->getNumElements() == 1) SrcETy = ATy->getElementType(); else @@ -142,7 +141,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { // memset(s,c,n) -> store s, c (for n=1,2,4,8) if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) { - const Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8. + Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8. Value *Dest = MI->getDest(); unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace(); @@ -250,7 +249,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // We need target data for just about everything so depend on it. if (!TD) break; - const Type *ReturnTy = CI.getType(); + Type *ReturnTy = CI.getType(); uint64_t DontKnow = II->getArgOperand(1) == Builder->getTrue() ? 0 : -1ULL; // Get to the real allocated thing and offset as fast as possible. @@ -266,8 +265,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Get the current byte offset into the thing. Use the original // operand in case we're looking through a bitcast. SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end()); - Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), - Ops.data(), Ops.size()); + Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops); Op1 = GEP->getPointerOperand()->stripPointerCasts(); @@ -300,7 +298,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } } else if (CallInst *MI = extractMallocCall(Op1)) { // Get allocation size. - const Type* MallocType = getMallocAllocatedType(MI); + Type* MallocType = getMallocAllocatedType(MI); if (MallocType && MallocType->isSized()) if (Value *NElems = getMallocArraySize(MI, TD, true)) if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems)) @@ -355,7 +353,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::cttz: { // If all bits below the first known one are known zero, // this value is constant. - const IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType()); + IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType()); // FIXME: Try to simplify vectors of integers. if (!IT) break; uint32_t BitWidth = IT->getBitWidth(); @@ -374,7 +372,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ctlz: { // If all bits above the first known one are known zero, // this value is constant. - const IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType()); + IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType()); // FIXME: Try to simplify vectors of integers. 
if (!IT) break; uint32_t BitWidth = IT->getBitWidth(); @@ -392,7 +390,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; case Intrinsic::uadd_with_overflow: { Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); - const IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); + IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); uint32_t BitWidth = IT->getBitWidth(); APInt Mask = APInt::getSignBit(BitWidth); APInt LHSKnownZero(BitWidth, 0); @@ -416,7 +414,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { UndefValue::get(LHS->getType()), ConstantInt::getTrue(II->getContext()) }; - const StructType *ST = cast<StructType>(II->getType()); + StructType *ST = cast<StructType>(II->getType()); Constant *Struct = ConstantStruct::get(ST, V); return InsertValueInst::Create(Struct, Add, 0); } @@ -430,7 +428,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { UndefValue::get(LHS->getType()), ConstantInt::getFalse(II->getContext()) }; - const StructType *ST = cast<StructType>(II->getType()); + StructType *ST = cast<StructType>(II->getType()); Constant *Struct = ConstantStruct::get(ST, V); return InsertValueInst::Create(Struct, Add, 0); } @@ -559,7 +557,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_stvxl: // Turn stvx -> store if the pointer is known aligned. if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, TD) >= 16) { - const Type *OpPtrTy = + Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); return new StoreInst(II->getArgOperand(0), Ptr); @@ -570,7 +568,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_storeu_dq: // Turn X86 storeu -> store if the pointer is known aligned. if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) { - const Type *OpPtrTy = + Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(1)->getType()); Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy); return new StoreInst(II->getArgOperand(1), Ptr); @@ -656,15 +654,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (ExtractedElts[Idx] == 0) { ExtractedElts[Idx] = - Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1, - ConstantInt::get(Type::getInt32Ty(II->getContext()), - Idx&15, false), "tmp"); + Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1, + Builder->getInt32(Idx&15)); } // Insert this value into the result vector. Result = Builder->CreateInsertElement(Result, ExtractedElts[Idx], - ConstantInt::get(Type::getInt32Ty(II->getContext()), - i, false), "tmp"); + Builder->getInt32(i)); } return CastInst::Create(Instruction::BitCast, Result, CI.getType()); } @@ -733,9 +729,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } } - // If the stack restore is in a return/unwind block and if there are no - // allocas or calls between the restore and the return, nuke the restore. - if (!CannotRemove && (isa<ReturnInst>(TI) || isa<UnwindInst>(TI))) + // If the stack restore is in a return, resume, or unwind block and if there + // are no allocas or calls between the restore and the return, nuke the + // restore. 
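// (Such restores usually bracket variable-length array scopes, e.g.
//    void f(unsigned N) { char Buf[N]; use(Buf, N); }
// where the stackrestore for Buf's scope lands directly in front of the
// 'ret'; leaving the function restores the stack anyway, so it is dead.)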
+ if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI) || + isa<UnwindInst>(TI))) return EraseInstFromFunction(CI); break; } @@ -765,9 +763,9 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS, if (!CS.paramHasAttr(ix, Attribute::ByVal)) return true; - const Type* SrcTy = + Type* SrcTy = cast<PointerType>(CI->getOperand(0)->getType())->getElementType(); - const Type* DstTy = cast<PointerType>(CI->getType())->getElementType(); + Type* DstTy = cast<PointerType>(CI->getType())->getElementType(); if (!SrcTy->isSized() || !DstTy->isSized()) return false; if (!TD || TD->getTypeAllocSize(SrcTy) != TD->getTypeAllocSize(DstTy)) @@ -820,6 +818,83 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const TargetData *TD) { return Simplifier.NewInstruction; } +static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) { + // Strip off at most one level of pointer casts, looking for an alloca. This + // is good enough in practice and simpler than handling any number of casts. + Value *Underlying = TrampMem->stripPointerCasts(); + if (Underlying != TrampMem && + (!Underlying->hasOneUse() || *Underlying->use_begin() != TrampMem)) + return 0; + if (!isa<AllocaInst>(Underlying)) + return 0; + + IntrinsicInst *InitTrampoline = 0; + for (Value::use_iterator I = TrampMem->use_begin(), E = TrampMem->use_end(); + I != E; I++) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(*I); + if (!II) + return 0; + if (II->getIntrinsicID() == Intrinsic::init_trampoline) { + if (InitTrampoline) + // More than one init_trampoline writes to this value. Give up. + return 0; + InitTrampoline = II; + continue; + } + if (II->getIntrinsicID() == Intrinsic::adjust_trampoline) + // Allow any number of calls to adjust.trampoline. + continue; + return 0; + } + + // No call to init.trampoline found. + if (!InitTrampoline) + return 0; + + // Check that the alloca is being used in the expected way. + if (InitTrampoline->getOperand(0) != TrampMem) + return 0; + + return InitTrampoline; +} + +static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp, + Value *TrampMem) { + // Visit all the previous instructions in the basic block, and try to find a + // init.trampoline which has a direct path to the adjust.trampoline. + for (BasicBlock::iterator I = AdjustTramp, + E = AdjustTramp->getParent()->begin(); I != E; ) { + Instruction *Inst = --I; + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + if (II->getIntrinsicID() == Intrinsic::init_trampoline && + II->getOperand(0) == TrampMem) + return II; + if (Inst->mayWriteToMemory()) + return 0; + } + return 0; +} + +// Given a call to llvm.adjust.trampoline, find and return the corresponding +// call to llvm.init.trampoline if the call to the trampoline can be optimized +// to a direct call to a function. Otherwise return NULL. +// +static IntrinsicInst *FindInitTrampoline(Value *Callee) { + Callee = Callee->stripPointerCasts(); + IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee); + if (!AdjustTramp || + AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline) + return 0; + + Value *TrampMem = AdjustTramp->getOperand(0); + + if (IntrinsicInst *IT = FindInitTrampolineFromAlloca(TrampMem)) + return IT; + if (IntrinsicInst *IT = FindInitTrampolineFromBB(AdjustTramp, TrampMem)) + return IT; + return 0; +} + // visitCallSite - Improvements for call and invoke instructions. 
// Instruction *InstCombiner::visitCallSite(CallSite CS) { @@ -879,13 +954,11 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { return EraseInstFromFunction(*CS.getInstruction()); } - if (BitCastInst *BC = dyn_cast<BitCastInst>(Callee)) - if (IntrinsicInst *In = dyn_cast<IntrinsicInst>(BC->getOperand(0))) - if (In->getIntrinsicID() == Intrinsic::init_trampoline) - return transformCallThroughTrampoline(CS); + if (IntrinsicInst *II = FindInitTrampoline(Callee)) + return transformCallThroughTrampoline(CS, II); - const PointerType *PTy = cast<PointerType>(Callee->getType()); - const FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); + PointerType *PTy = cast<PointerType>(Callee->getType()); + FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); if (FTy->isVarArg()) { int ix = FTy->getNumParams() + (isa<InvokeInst>(Callee) ? 3 : 1); // See if we can optimize any arguments passed through the varargs area of @@ -934,9 +1007,9 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // would cause a type conversion of one of our arguments, change this call to // be a direct call with arguments casted to the appropriate types. // - const FunctionType *FT = Callee->getFunctionType(); - const Type *OldRetTy = Caller->getType(); - const Type *NewRetTy = FT->getReturnType(); + FunctionType *FT = Callee->getFunctionType(); + Type *OldRetTy = Caller->getType(); + Type *NewRetTy = FT->getReturnType(); if (NewRetTy->isStructTy()) return false; // TODO: Handle multiple return values. @@ -982,8 +1055,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { CallSite::arg_iterator AI = CS.arg_begin(); for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) { - const Type *ParamTy = FT->getParamType(i); - const Type *ActTy = (*AI)->getType(); + Type *ParamTy = FT->getParamType(i); + Type *ActTy = (*AI)->getType(); if (!CastInst::isCastable(ActTy, ParamTy)) return false; // Cannot transform this parameter value. @@ -995,11 +1068,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // If the parameter is passed as a byval argument, then we have to have a // sized type and the sized type has to have the same size as the old type. if (ParamTy != ActTy && (Attrs & Attribute::ByVal)) { - const PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); + PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0) return false; - const Type *CurElTy = cast<PointerType>(ActTy)->getElementType(); + Type *CurElTy = cast<PointerType>(ActTy)->getElementType(); if (TD->getTypeAllocSize(CurElTy) != TD->getTypeAllocSize(ParamPTy->getElementType())) return false; @@ -1023,7 +1096,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // If the callee is just a declaration, don't change the varargsness of the // call. We don't want to introduce a varargs call where one doesn't // already exist. 
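// (The classic producer is unprototyped C, where the call site disagrees
// with the callee's real type and goes through a constant bitcast:
//    call i32 bitcast (i32 (i64)* @callee to i32 (i32)*)(i32 %x)
// is rewritten below into 'zext i32 %x to i64' plus a direct call to
// @callee; the casts here are built with unsigned semantics.)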
- const PointerType *APTy = cast<PointerType>(CS.getCalledValue()->getType()); + PointerType *APTy = cast<PointerType>(CS.getCalledValue()->getType()); if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg()) return false; } @@ -1062,13 +1135,13 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { AI = CS.arg_begin(); for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) { - const Type *ParamTy = FT->getParamType(i); + Type *ParamTy = FT->getParamType(i); if ((*AI)->getType() == ParamTy) { Args.push_back(*AI); } else { Instruction::CastOps opcode = CastInst::getCastOpcode(*AI, false, ParamTy, false); - Args.push_back(Builder->CreateCast(opcode, *AI, ParamTy, "tmp")); + Args.push_back(Builder->CreateCast(opcode, *AI, ParamTy)); } // Add any parameter attributes. @@ -1089,12 +1162,12 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { } else { // Add all of the arguments in their promoted form to the arg list. for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) { - const Type *PTy = getPromotedType((*AI)->getType()); + Type *PTy = getPromotedType((*AI)->getType()); if (PTy != (*AI)->getType()) { // Must promote to pass through va_arg area! Instruction::CastOps opcode = CastInst::getCastOpcode(*AI, false, PTy, false); - Args.push_back(Builder->CreateCast(opcode, *AI, PTy, "tmp")); + Args.push_back(Builder->CreateCast(opcode, *AI, PTy)); } else { Args.push_back(*AI); } @@ -1138,13 +1211,13 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (!NV->getType()->isVoidTy()) { Instruction::CastOps opcode = CastInst::getCastOpcode(NC, false, OldRetTy, false); - NV = NC = CastInst::Create(opcode, NC, OldRetTy, "tmp"); + NV = NC = CastInst::Create(opcode, NC, OldRetTy); NC->setDebugLoc(Caller->getDebugLoc()); // If this is an invoke instruction, we should insert it after the first // non-phi, instruction in the normal successor block. if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { - BasicBlock::iterator I = II->getNormalDest()->getFirstNonPHI(); + BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt(); InsertNewInstBefore(NC, *I); } else { // Otherwise, it's a call, just insert cast right after the call. @@ -1163,13 +1236,16 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { return true; } -// transformCallThroughTrampoline - Turn a call to a function created by the -// init_trampoline intrinsic into a direct call to the underlying function. +// transformCallThroughTrampoline - Turn a call to a function created by +// init_trampoline / adjust_trampoline intrinsic pair into a direct call to the +// underlying function. 
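// (Schematically, the IR being matched after the intrinsic split:
//    call void @llvm.init.trampoline(i8* %tramp,
//                                    i8* bitcast (... @f to i8*), i8* %chain)
//    %p  = call i8* @llvm.adjust.trampoline(i8* %tramp)
//    %fp = bitcast i8* %p to i32 (i32)*
//    call i32 %fp(i32 42)
// which becomes 'call i32 @f(i8* nest %chain, i32 42)'. GCC-style nested
// functions are the usual source of this pattern.)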
// -Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) { +Instruction * +InstCombiner::transformCallThroughTrampoline(CallSite CS, + IntrinsicInst *Tramp) { Value *Callee = CS.getCalledValue(); - const PointerType *PTy = cast<PointerType>(Callee->getType()); - const FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); + PointerType *PTy = cast<PointerType>(Callee->getType()); + FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); const AttrListPtr &Attrs = CS.getAttributes(); // If the call already has the 'nest' attribute somewhere then give up - @@ -1177,12 +1253,12 @@ Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) { if (Attrs.hasAttrSomewhere(Attribute::Nest)) return 0; - IntrinsicInst *Tramp = - cast<IntrinsicInst>(cast<BitCastInst>(Callee)->getOperand(0)); + assert(Tramp && + "transformCallThroughTrampoline called with incorrect CallSite."); Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts()); - const PointerType *NestFPTy = cast<PointerType>(NestF->getType()); - const FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType()); + PointerType *NestFPTy = cast<PointerType>(NestF->getType()); + FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType()); const AttrListPtr &NestAttrs = NestF->getAttributes(); if (!NestAttrs.isEmpty()) { diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 82c734e..f10e48a 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Target/TargetData.h" #include "llvm/Support/PatternMatch.h" using namespace llvm; @@ -79,14 +80,14 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // This requires TargetData to get the alloca alignment and size information. if (!TD) return 0; - const PointerType *PTy = cast<PointerType>(CI.getType()); + PointerType *PTy = cast<PointerType>(CI.getType()); BuilderTy AllocaBuilder(*Builder); AllocaBuilder.SetInsertPoint(AI.getParent(), &AI); // Get the type really allocated and the type casted to. - const Type *AllocElTy = AI.getAllocatedType(); - const Type *CastElTy = PTy->getElementType(); + Type *AllocElTy = AI.getAllocatedType(); + Type *CastElTy = PTy->getElementType(); if (!AllocElTy->isSized() || !CastElTy->isSized()) return 0; unsigned AllocElTyAlign = TD->getABITypeAlignment(AllocElTy); @@ -121,13 +122,13 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, } else { Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale); // Insert before the alloca, not before the cast. - Amt = AllocaBuilder.CreateMul(Amt, NumElements, "tmp"); + Amt = AllocaBuilder.CreateMul(Amt, NumElements); } if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) { Value *Off = ConstantInt::get(AI.getArraySize()->getType(), Offset, true); - Amt = AllocaBuilder.CreateAdd(Amt, Off, "tmp"); + Amt = AllocaBuilder.CreateAdd(Amt, Off); } AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt); @@ -151,7 +152,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, /// EvaluateInDifferentType - Given an expression that /// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually /// insert the code to evaluate the expression. 
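/// (Canonical case: for 'int f(long A, long B) { return (int)(A + B); }',
/// CanEvaluateTruncated accepts the add, and this routine rebuilds it as
/// an i32 add of the truncated operands, eliminating the wide arithmetic.)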
-Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty, +Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned) { if (Constant *C = dyn_cast<Constant>(V)) { C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/); @@ -229,12 +230,12 @@ static Instruction::CastOps isEliminableCastPair( const CastInst *CI, ///< The first cast instruction unsigned opcode, ///< The opcode of the second cast instruction - const Type *DstTy, ///< The target type for the second cast instruction + Type *DstTy, ///< The target type for the second cast instruction TargetData *TD ///< The target data for pointer size ) { - const Type *SrcTy = CI->getOperand(0)->getType(); // A from above - const Type *MidTy = CI->getType(); // B from above + Type *SrcTy = CI->getOperand(0)->getType(); // A from above + Type *MidTy = CI->getType(); // B from above // Get the opcodes of the two Cast instructions Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode()); @@ -260,7 +261,7 @@ isEliminableCastPair( /// the cast can be eliminated by some other simple transformation, we prefer /// to do the simplification first. bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V, - const Type *Ty) { + Type *Ty) { // Noop casts and casts of constants should be eliminated trivially. if (V->getType() == Ty || isa<Constant>(V)) return false; @@ -324,7 +325,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { /// /// This function works on both vectors and scalars. /// -static bool CanEvaluateTruncated(Value *V, const Type *Ty) { +static bool CanEvaluateTruncated(Value *V, Type *Ty) { // We can always evaluate constants in another type. if (isa<Constant>(V)) return true; @@ -332,7 +333,7 @@ static bool CanEvaluateTruncated(Value *V, const Type *Ty) { Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - const Type *OrigTy = V->getType(); + Type *OrigTy = V->getType(); // If this is an extension from the dest type, we can eliminate it, even if it // has multiple uses. @@ -435,7 +436,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { return &CI; Value *Src = CI.getOperand(0); - const Type *DestTy = CI.getType(), *SrcTy = Src->getType(); + Type *DestTy = CI.getType(), *SrcTy = Src->getType(); // Attempt to truncate the entire input expression tree to the destination // type. Only do this if the dest type is a simple type, don't convert the @@ -456,7 +457,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector. if (DestTy->getScalarSizeInBits() == 1) { Constant *One = ConstantInt::get(Src->getType(), 1); - Src = Builder->CreateAnd(Src, One, "tmp"); + Src = Builder->CreateAnd(Src, One); Value *Zero = Constant::getNullValue(Src->getType()); return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); } @@ -518,7 +519,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, In->getType()->getScalarSizeInBits()-1); In = Builder->CreateLShr(In, Sh, In->getName()+".lobit"); if (In->getType() != CI.getType()) - In = Builder->CreateIntCast(In, CI.getType(), false/*ZExt*/, "tmp"); + In = Builder->CreateIntCast(In, CI.getType(), false/*ZExt*/); if (ICI->getPredicate() == ICmpInst::ICMP_SGT) { Constant *One = ConstantInt::get(In->getType(), 1); @@ -572,7 +573,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, if ((Op1CV != 0) == isNE) { // Toggle the low bit. 
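The ".lobit" pattern built above materializes a sign test as a logical shift right by bitwidth-1. A standalone sketch of the scalar identity, assuming 32-bit integers:

#include <cassert>
#include <cstdint>

// zext(icmp slt x, 0) is the same as lshr x, 31.
int main() {
  for (int32_t i = -5; i <= 5; ++i) {
    uint32_t x = (uint32_t)i;
    uint32_t lobit  = x >> 31;             // lshr x, 31
    uint32_t viaCmp = (i < 0) ? 1u : 0u;   // zext(icmp slt x, 0)
    assert(lobit == viaCmp);
  }
  return 0;
}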
Constant *One = ConstantInt::get(In->getType(), 1); - In = Builder->CreateXor(In, One, "tmp"); + In = Builder->CreateXor(In, One); } if (CI.getType() == In->getType()) @@ -586,7 +587,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // It is also profitable to transform icmp eq into not(xor(A, B)) because that // may lead to additional simplifications. if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) { - if (const IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) { + if (IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) { uint32_t BitWidth = ITy->getBitWidth(); Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); @@ -644,7 +645,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, /// clear the top bits anyway, doing this has no extra cost. /// /// This function works on both vectors and scalars. -static bool CanEvaluateZExtd(Value *V, const Type *Ty, unsigned &BitsToClear) { +static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { BitsToClear = 0; if (isa<Constant>(V)) return true; @@ -758,7 +759,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { return &CI; Value *Src = CI.getOperand(0); - const Type *SrcTy = Src->getType(), *DestTy = CI.getType(); + Type *SrcTy = Src->getType(), *DestTy = CI.getType(); // Attempt to extend the entire input expression tree to the destination // type. Only do this if the dest type is a simple type, don't convert the @@ -820,7 +821,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { AndValue)); } if (SrcSize > DstSize) { - Value *Trunc = Builder->CreateTrunc(A, CI.getType(), "tmp"); + Value *Trunc = Builder->CreateTrunc(A, CI.getType()); APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize)); return BinaryOperator::CreateAnd(Trunc, ConstantInt::get(Trunc->getType(), @@ -867,7 +868,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { Value *TI0 = TI->getOperand(0); if (TI0->getType() == CI.getType()) { Constant *ZC = ConstantExpr::getZExt(C, CI.getType()); - Value *NewAnd = Builder->CreateAnd(TI0, ZC, "tmp"); + Value *NewAnd = Builder->CreateAnd(TI0, ZC); return BinaryOperator::CreateXor(NewAnd, ZC); } } @@ -900,7 +901,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { Op0->getType()->getScalarSizeInBits()-1); Value *In = Builder->CreateAShr(Op0, Sh, Op0->getName()+".lobit"); if (In->getType() != CI.getType()) - In = Builder->CreateIntCast(In, CI.getType(), true/*SExt*/, "tmp"); + In = Builder->CreateIntCast(In, CI.getType(), true/*SExt*/); if (Pred == ICmpInst::ICMP_SGT) In = Builder->CreateNot(In, In->getName()+".not"); @@ -965,10 +966,10 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { } // vector (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if signed. - if (const VectorType *VTy = dyn_cast<VectorType>(CI.getType())) { + if (VectorType *VTy = dyn_cast<VectorType>(CI.getType())) { if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_Zero()) && Op0->getType() == CI.getType()) { - const Type *EltTy = VTy->getElementType(); + Type *EltTy = VTy->getElementType(); // splat the shift constant to a constant vector. Constant *VSh = ConstantInt::get(VTy, EltTy->getScalarSizeInBits()-1); @@ -988,7 +989,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { /// /// This function works on both vectors and scalars. 
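The "(x <s 0) ? -1 : 0 --> ashr x, 31" fold above relies on the arithmetic shift smearing the sign bit across the whole word. A standalone sketch, written with fully defined unsigned operations (0 minus the sign bit) instead of a signed shift, to stay portable:

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t i = -3; i <= 3; ++i) {
    uint32_t x = (uint32_t)i;
    uint32_t ashr = 0u - (x >> 31);                      // 0 or all-ones
    uint32_t sel  = (i < 0) ? 0xFFFFFFFFu : 0u;          // the select form
    assert(ashr == sel);
  }
  return 0;
}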
/// -static bool CanEvaluateSExtd(Value *V, const Type *Ty) { +static bool CanEvaluateSExtd(Value *V, Type *Ty) { assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() && "Can't sign extend type to a smaller type"); // If this is a constant, it can be trivially promoted. @@ -1063,7 +1064,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { return &CI; Value *Src = CI.getOperand(0); - const Type *SrcTy = Src->getType(), *DestTy = CI.getType(); + Type *SrcTy = Src->getType(), *DestTy = CI.getType(); // Attempt to extend the entire input expression tree to the destination // type. Only do this if the dest type is a simple type, don't convert the @@ -1192,7 +1193,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { case Instruction::FMul: case Instruction::FDiv: case Instruction::FRem: - const Type *SrcTy = OpI->getType(); + Type *SrcTy = OpI->getType(); Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0)); Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1)); if (LHSTrunc->getType() != SrcTy && @@ -1306,13 +1307,13 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { if (CI.getOperand(0)->getType()->getScalarSizeInBits() > TD->getPointerSizeInBits()) { Value *P = Builder->CreateTrunc(CI.getOperand(0), - TD->getIntPtrType(CI.getContext()), "tmp"); + TD->getIntPtrType(CI.getContext())); return new IntToPtrInst(P, CI.getType()); } if (CI.getOperand(0)->getType()->getScalarSizeInBits() < TD->getPointerSizeInBits()) { Value *P = Builder->CreateZExt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext()), "tmp"); + TD->getIntPtrType(CI.getContext())); return new IntToPtrInst(P, CI.getType()); } } @@ -1351,7 +1352,7 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { // Get the base pointer input of the bitcast, and the type it points to. Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0); - const Type *GEPIdxTy = + Type *GEPIdxTy = cast<PointerType>(OrigBase->getType())->getElementType(); SmallVector<Value*, 8> NewIndices; if (FindElementAtOffset(GEPIdxTy, Offset, NewIndices)) { @@ -1359,9 +1360,8 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { // and bitcast the result. This eliminates one bitcast, potentially // two. Value *NGEP = cast<GEPOperator>(GEP)->isInBounds() ? - Builder->CreateInBoundsGEP(OrigBase, - NewIndices.begin(), NewIndices.end()) : - Builder->CreateGEP(OrigBase, NewIndices.begin(), NewIndices.end()); + Builder->CreateInBoundsGEP(OrigBase, NewIndices) : + Builder->CreateGEP(OrigBase, NewIndices); NGEP->takeName(GEP); if (isa<BitCastInst>(CI)) @@ -1382,14 +1382,12 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { if (TD) { if (CI.getType()->getScalarSizeInBits() < TD->getPointerSizeInBits()) { Value *P = Builder->CreatePtrToInt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext()), - "tmp"); + TD->getIntPtrType(CI.getContext())); return new TruncInst(P, CI.getType()); } if (CI.getType()->getScalarSizeInBits() > TD->getPointerSizeInBits()) { Value *P = Builder->CreatePtrToInt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext()), - "tmp"); + TD->getIntPtrType(CI.getContext())); return new ZExtInst(P, CI.getType()); } } @@ -1402,12 +1400,12 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { /// replace it with a shuffle (and vector/vector bitcast) if possible. /// /// The source and destination vector types may have different element types. 
-static Instruction *OptimizeVectorResize(Value *InVal, const VectorType *DestTy, +static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, InstCombiner &IC) { // We can only do this optimization if the output is a multiple of the input // element size, or the input is a multiple of the output element size. // Convert the input type to have the same element type as the output. - const VectorType *SrcTy = cast<VectorType>(InVal->getType()); + VectorType *SrcTy = cast<VectorType>(InVal->getType()); if (SrcTy->getElementType() != DestTy->getElementType()) { // The input types don't need to be identical, but for now they must be the @@ -1427,7 +1425,7 @@ static Instruction *OptimizeVectorResize(Value *InVal, const VectorType *DestTy, // size of the input. SmallVector<Constant*, 16> ShuffleMask; Value *V2; - const IntegerType *Int32Ty = Type::getInt32Ty(SrcTy->getContext()); + IntegerType *Int32Ty = Type::getInt32Ty(SrcTy->getContext()); if (SrcTy->getNumElements() > DestTy->getNumElements()) { // If we're shrinking the number of elements, just shuffle in the low @@ -1453,11 +1451,11 @@ static Instruction *OptimizeVectorResize(Value *InVal, const VectorType *DestTy, return new ShuffleVectorInst(InVal, V2, ConstantVector::get(ShuffleMask)); } -static bool isMultipleOfTypeSize(unsigned Value, const Type *Ty) { +static bool isMultipleOfTypeSize(unsigned Value, Type *Ty) { return Value % Ty->getPrimitiveSizeInBits() == 0; } -static unsigned getTypeSizeIndex(unsigned Value, const Type *Ty) { +static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { return Value / Ty->getPrimitiveSizeInBits(); } @@ -1471,7 +1469,7 @@ static unsigned getTypeSizeIndex(unsigned Value, const Type *Ty) { /// filling in Elements with the elements found here. static bool CollectInsertionElements(Value *V, unsigned ElementIndex, SmallVectorImpl<Value*> &Elements, - const Type *VecEltTy) { + Type *VecEltTy) { // Undef values never contribute useful bits to the result. if (isa<UndefValue>(V)) return true; @@ -1508,7 +1506,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(), C->getType()->getPrimitiveSizeInBits())); unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits(); - const Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize); + Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize); for (unsigned i = 0; i != NumElts; ++i) { Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), @@ -1572,7 +1570,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, /// Into two insertelements that do "buildvector{%inc, %inc5}". static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, InstCombiner &IC) { - const VectorType *DestVecTy = cast<VectorType>(CI.getType()); + VectorType *DestVecTy = cast<VectorType>(CI.getType()); Value *IntInput = CI.getOperand(0); SmallVector<Value*, 8> Elements(DestVecTy->getNumElements()); @@ -1599,7 +1597,7 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, /// bitcast. The various long double bitcasts can't get in here. static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ Value *Src = CI.getOperand(0); - const Type *DestTy = CI.getType(); + Type *DestTy = CI.getType(); // If this is a bitcast from int to float, check to see if the int is an // extraction from a vector. 
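CollectInsertionElements above decomposes a wide integer constant into element-sized pieces with repeated lshr and trunc. A standalone sketch of that decomposition (the constant and piece width are illustrative):

#include <cassert>
#include <cstdint>

// Piece i is (C >> i*ElementSize) truncated; element 0 is the least
// significant piece.
int main() {
  uint64_t C = 0x1122334455667788ULL;
  uint16_t Piece[4];
  for (unsigned i = 0; i != 4; ++i)
    Piece[i] = (uint16_t)(C >> (i * 16));   // lshr + trunc
  assert(Piece[0] == 0x7788 && Piece[3] == 0x1122);
  return 0;
}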
@@ -1607,7 +1605,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ // bitcast(trunc(bitcast(somevector))) if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) && isa<VectorType>(VecInput->getType())) { - const VectorType *VecTy = cast<VectorType>(VecInput->getType()); + VectorType *VecTy = cast<VectorType>(VecInput->getType()); unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0) { @@ -1628,7 +1626,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)), m_ConstantInt(ShAmt)))) && isa<VectorType>(VecInput->getType())) { - const VectorType *VecTy = cast<VectorType>(VecInput->getType()); + VectorType *VecTy = cast<VectorType>(VecInput->getType()); unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0 && ShAmt->getZExtValue() % DestWidth == 0) { @@ -1651,18 +1649,18 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If the operands are integer typed then apply the integer transforms, // otherwise just apply the common ones. Value *Src = CI.getOperand(0); - const Type *SrcTy = Src->getType(); - const Type *DestTy = CI.getType(); + Type *SrcTy = Src->getType(); + Type *DestTy = CI.getType(); // Get rid of casts from one type to the same type. These are useless and can // be replaced by the operand. if (DestTy == Src->getType()) return ReplaceInstUsesWith(CI, Src); - if (const PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) { - const PointerType *SrcPTy = cast<PointerType>(SrcTy); - const Type *DstElTy = DstPTy->getElementType(); - const Type *SrcElTy = SrcPTy->getElementType(); + if (PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) { + PointerType *SrcPTy = cast<PointerType>(SrcTy); + Type *DstElTy = DstPTy->getElementType(); + Type *SrcElTy = SrcPTy->getElementType(); // If the address spaces don't match, don't eliminate the bitcast, which is // required for changing types. @@ -1693,7 +1691,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If we found a path from the src to dest, create the getelementptr now. 
if (SrcElTy == DstElTy) { SmallVector<Value*, 8> Idxs(NumZeros+1, ZeroUInt); - return GetElementPtrInst::CreateInBounds(Src, Idxs.begin(), Idxs.end()); + return GetElementPtrInst::CreateInBounds(Src, Idxs); } } @@ -1702,7 +1700,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this)) return I; - if (const VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) { + if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) { if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) { Value *Elem = Builder->CreateBitCast(Src, DestVTy->getElementType()); return InsertElementInst::Create(UndefValue::get(DestTy), Elem, @@ -1731,7 +1729,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } - if (const VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) { + if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) { if (SrcVTy->getNumElements() == 1 && !DestTy->isVectorTy()) { Value *Elem = Builder->CreateExtractElement(Src, diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index c78760b..bb1cbfa 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -13,6 +13,7 @@ #include "InstCombine.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Target/TargetData.h" @@ -56,7 +57,7 @@ static bool AddWithOverflow(Constant *&Result, Constant *In1, Constant *In2, bool IsSigned = false) { Result = ConstantExpr::getAdd(In1, In2); - if (const VectorType *VTy = dyn_cast<VectorType>(In1->getType())) { + if (VectorType *VTy = dyn_cast<VectorType>(In1->getType())) { for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) { Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i); if (HasAddOverflow(ExtractElement(Result, Idx), @@ -78,7 +79,7 @@ static bool HasSubOverflow(ConstantInt *Result, bool IsSigned) { if (!IsSigned) return Result->getValue().ugt(In1->getValue()); - + if (In2->isNegative()) return Result->getValue().slt(In1->getValue()); @@ -91,7 +92,7 @@ static bool SubWithOverflow(Constant *&Result, Constant *In1, Constant *In2, bool IsSigned = false) { Result = ConstantExpr::getSub(In1, In2); - if (const VectorType *VTy = dyn_cast<VectorType>(In1->getType())) { + if (VectorType *VTy = dyn_cast<VectorType>(In1->getType())) { for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) { Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i); if (HasSubOverflow(ExtractElement(Result, Idx), @@ -128,7 +129,7 @@ static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS, // True if LHS u> RHS and RHS == high-bit-mask - 1 TrueIfSigned = true; return RHS->isMaxValue(true); - case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_UGE: // True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc) TrueIfSigned = true; return RHS->getValue().isSignBit(); @@ -143,7 +144,7 @@ static bool isHighOnes(const ConstantInt *CI) { return (~CI->getValue() + 1).isPowerOf2(); } -/// ComputeSignedMinMaxValuesFromKnownBits - Given a signed integer type and a +/// ComputeSignedMinMaxValuesFromKnownBits - Given a signed integer type and a /// set of known zero and one bits, compute the maximum and minimum values that /// could have the specified known zero and known one bits, returning them in /// min/max. 
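The min/max computation described above sets every unknown bit to 0 for the minimum and to 1 for the maximum. A standalone sketch of the unsigned case, with hypothetical known-bit masks:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t KnownZero = 0xFFFF0000u;  // high half known to be 0
  uint32_t KnownOne  = 0x00000008u;  // bit 3 known to be 1
  uint32_t Unknown   = ~(KnownZero | KnownOne);
  uint32_t Min = KnownOne;           // unknown bits -> 0
  uint32_t Max = KnownOne | Unknown; // unknown bits -> 1
  assert(Min == 0x8u && Max == 0xFFFFu);
  return 0;
}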
@@ -160,7 +161,7 @@ static void ComputeSignedMinMaxValuesFromKnownBits(const APInt& KnownZero, // bit if it is unknown. Min = KnownOne; Max = KnownOne|UnknownBits; - + if (UnknownBits.isNegative()) { // Sign bit is unknown Min.setBit(Min.getBitWidth()-1); Max.clearBit(Max.getBitWidth()-1); @@ -179,7 +180,7 @@ static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero, KnownZero.getBitWidth() == Max.getBitWidth() && "Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth."); APInt UnknownBits = ~(KnownZero|KnownOne); - + // The minimum value is when the unknown bits are all zeros. Min = KnownOne; // The maximum value is when the unknown bits are all ones. @@ -201,10 +202,10 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, ConstantInt *AndCst) { // We need TD information to know the pointer size unless this is inbounds. if (!GEP->isInBounds() && TD == 0) return 0; - + ConstantArray *Init = dyn_cast<ConstantArray>(GV->getInitializer()); if (Init == 0 || Init->getNumOperands() > 1024) return 0; - + // There are many forms of this optimization we can handle, for now, just do // the simple index into a single-dimensional array. // @@ -219,31 +220,31 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // type they index. Collect the indices. This is typically for arrays of // structs. SmallVector<unsigned, 4> LaterIndices; - - const Type *EltTy = cast<ArrayType>(Init->getType())->getElementType(); + + Type *EltTy = cast<ArrayType>(Init->getType())->getElementType(); for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) { ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i)); if (Idx == 0) return 0; // Variable index. - + uint64_t IdxVal = Idx->getZExtValue(); if ((unsigned)IdxVal != IdxVal) return 0; // Too large array index. - - if (const StructType *STy = dyn_cast<StructType>(EltTy)) + + if (StructType *STy = dyn_cast<StructType>(EltTy)) EltTy = STy->getElementType(IdxVal); - else if (const ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) { + else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) { if (IdxVal >= ATy->getNumElements()) return 0; EltTy = ATy->getElementType(); } else { return 0; // Unknown type. } - + LaterIndices.push_back(IdxVal); } - + enum { Overdefined = -3, Undefined = -2 }; // Variables for our state machines. - + // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form // "i == 47 | i == 87", where 47 is the first index the condition is true for, // and 87 is the second (and last) index. FirstTrueElement is -2 when @@ -254,7 +255,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the // form "i != 47 & i != 87". Same state transitions as for true elements. int FirstFalseElement = Undefined, SecondFalseElement = Undefined; - + /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these /// define a state machine that triggers for ranges of values that the index /// is true or false for. This triggers on things like "abbbbc"[i] == 'b'. @@ -262,25 +263,25 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, /// index in the range (inclusive). We use -2 for undefined here because we /// use relative comparisons and don't want 0-1 to match -1. int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined; - + // MagicBitvector - This is a magic bitvector where we set a bit if the // comparison is true for element 'i'. 
If there are 64 elements or less in // the array, this will fully represent all the comparison results. uint64_t MagicBitvector = 0; - - + + // Scan the array and see if one of our patterns matches. Constant *CompareRHS = cast<Constant>(ICI.getOperand(1)); for (unsigned i = 0, e = Init->getNumOperands(); i != e; ++i) { Constant *Elt = Init->getOperand(i); - + // If this is indexing an array of structures, get the structure element. if (!LaterIndices.empty()) Elt = ConstantExpr::getExtractValue(Elt, LaterIndices); - + // If the element is masked, handle it. if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst); - + // Find out if the comparison would be true or false for the i'th element. Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt, CompareRHS, TD); @@ -294,15 +295,15 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, FalseRangeEnd = i; continue; } - + // If we can't compute the result for any of the elements, we have to give // up evaluating the entire conditional. if (!isa<ConstantInt>(C)) return 0; - + // Otherwise, we know if the comparison is true or false for this element, // update our state machines. bool IsTrueForElt = !cast<ConstantInt>(C)->isZero(); - + // State machine for single/double/range index comparison. if (IsTrueForElt) { // Update the TrueElement state machine. @@ -314,7 +315,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, SecondTrueElement = i; else SecondTrueElement = Overdefined; - + // Update range state machine. if (TrueRangeEnd == (int)i-1) TrueRangeEnd = i; @@ -331,7 +332,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, SecondFalseElement = i; else SecondFalseElement = Overdefined; - + // Update range state machine. if (FalseRangeEnd == (int)i-1) FalseRangeEnd = i; @@ -339,12 +340,12 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, FalseRangeEnd = Overdefined; } } - - + + // If this element is in range, update our magic bitvector. if (i < 64 && IsTrueForElt) MagicBitvector |= 1ULL << i; - + // If all of our states become overdefined, bail out early. Since the // predicate is expensive, only check it every 8 elements. This is only // really useful for really huge arrays. @@ -364,20 +365,20 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, if (!GEP->isInBounds() && Idx->getType()->getPrimitiveSizeInBits() > TD->getPointerSizeInBits()) Idx = Builder->CreateTrunc(Idx, TD->getIntPtrType(Idx->getContext())); - + // If the comparison is only true for one or two elements, emit direct // comparisons. if (SecondTrueElement != Overdefined) { // None true -> false. if (FirstTrueElement == Undefined) return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(GEP->getContext())); - + Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement); - + // True for one element -> 'i == 47'. if (SecondTrueElement == Undefined) return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx); - + // True for two elements -> 'i == 47 | i == 72'. Value *C1 = Builder->CreateICmpEQ(Idx, FirstTrueIdx); Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement); @@ -391,36 +392,36 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // None false -> true. if (FirstFalseElement == Undefined) return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(GEP->getContext())); - + Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement); // False for one element -> 'i != 47'. 
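A standalone sketch of the one/two-true-element case above, where a load-and-compare against a constant table collapses to comparisons on the index itself (the table contents are made up for illustration):

#include <cassert>

// Tbl[i] == 9 is true only at indices 1 and 3, so the load folds to
// "i == 1 | i == 3".
int main() {
  static const int Tbl[] = {4, 9, 7, 9, 2};
  for (unsigned i = 0; i != 5; ++i) {
    bool viaLoad = (Tbl[i] == 9);
    bool viaIdx  = (i == 1) || (i == 3);
    assert(viaLoad == viaIdx);
  }
  return 0;
}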
if (SecondFalseElement == Undefined) return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx); - + // False for two elements -> 'i != 47 & i != 72'. Value *C1 = Builder->CreateICmpNE(Idx, FirstFalseIdx); Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement); Value *C2 = Builder->CreateICmpNE(Idx, SecondFalseIdx); return BinaryOperator::CreateAnd(C1, C2); } - + // If the comparison can be replaced with a range comparison for the elements // where it is true, emit the range check. if (TrueRangeEnd != Overdefined) { assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare"); - + // Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1). if (FirstTrueElement) { Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement); Idx = Builder->CreateAdd(Idx, Offs); } - + Value *End = ConstantInt::get(Idx->getType(), TrueRangeEnd-FirstTrueElement+1); return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End); } - + // False range check. if (FalseRangeEnd != Overdefined) { assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare"); @@ -429,19 +430,19 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement); Idx = Builder->CreateAdd(Idx, Offs); } - + Value *End = ConstantInt::get(Idx->getType(), FalseRangeEnd-FirstFalseElement); return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End); } - - + + // If a 32-bit or 64-bit magic bitvector captures the entire comparison state // of this load, replace it with computation that does: // ((magic_cst >> i) & 1) != 0 if (Init->getNumOperands() <= 32 || (TD && Init->getNumOperands() <= 64 && TD->isLegalInteger(64))) { - const Type *Ty; + Type *Ty; if (Init->getNumOperands() <= 32) Ty = Type::getInt32Ty(Init->getContext()); else @@ -451,7 +452,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V); return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0)); } - + return 0; } @@ -465,11 +466,11 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, /// to generate the first by knowing that pointer arithmetic doesn't overflow. /// /// If we can't emit an optimized form for this expression, this returns null. -/// +/// static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { TargetData &TD = *IC.getTargetData(); gep_type_iterator GTI = gep_type_begin(GEP); - + // Check to see if this gep only has a single variable index. If so, and if // any constant indices are a multiple of its scale, then we can compute this // in terms of the scale of the variable index. For example, if the GEP @@ -481,9 +482,9 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) { // Compute the aggregate offset of constant indices. if (CI->isZero()) continue; - + // Handle a struct index, which adds its field offset to the pointer. - if (const StructType *STy = dyn_cast<StructType>(*GTI)) { + if (StructType *STy = dyn_cast<StructType>(*GTI)) { Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue()); } else { uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); @@ -494,33 +495,33 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { break; } } - + // If there are no variable indices, we must have a constant offset, just // evaluate it the general way. 
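A standalone sketch of the magic-bitvector replacement above: bit i of the constant records the comparison result for element i, so the load disappears entirely (the table contents are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  static const int Tbl[] = {4, 9, 7, 9, 2, 9};
  uint64_t Magic = 0;                        // built at compile time
  for (unsigned i = 0; i != 6; ++i)
    if (Tbl[i] == 9) Magic |= 1ULL << i;
  // The replacement form: ((magic_cst >> i) & 1) != 0.
  for (unsigned i = 0; i != 6; ++i)
    assert(((Magic >> i) & 1) == (uint64_t)(Tbl[i] == 9));
  return 0;
}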
if (i == e) return 0; - + Value *VariableIdx = GEP->getOperand(i); // Determine the scale factor of the variable element. For example, this is // 4 if the variable index is into an array of i32. uint64_t VariableScale = TD.getTypeAllocSize(GTI.getIndexedType()); - + // Verify that there are no other variable indices. If so, emit the hard way. for (++i, ++GTI; i != e; ++i, ++GTI) { ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i)); if (!CI) return 0; - + // Compute the aggregate offset of constant indices. if (CI->isZero()) continue; - + // Handle a struct index, which adds its field offset to the pointer. - if (const StructType *STy = dyn_cast<StructType>(*GTI)) { + if (StructType *STy = dyn_cast<StructType>(*GTI)) { Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue()); } else { uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); Offset += Size*CI->getSExtValue(); } } - + // Okay, we know we have a single variable index, which must be a // pointer/array/vector index. If there is no offset, life is simple, return // the index. @@ -530,19 +531,19 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { // we don't need to bother extending: the extension won't affect where the // computation crosses zero. if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) { - const Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext()); + Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext()); VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy); } return VariableIdx; } - + // Otherwise, there is an index. The computation we will do will be modulo // the pointer size, so get it. uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth); - + Offset &= PtrSizeMask; VariableScale &= PtrSizeMask; - + // To do this transformation, any constant index must be a multiple of the // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i", // but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a @@ -550,9 +551,9 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { int64_t NewOffs = Offset / (int64_t)VariableScale; if (Offset != NewOffs*(int64_t)VariableScale) return 0; - + // Okay, we can do this evaluation. Start by converting the index to intptr. - const Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext()); + Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext()); if (VariableIdx->getType() != IntPtrTy) VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy, true /*Signed*/); @@ -576,7 +577,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // know pointers can't overflow since the gep is inbounds. See if we can // output an optimized form. Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this); - + // If not, synthesize the offset the hard way. if (Offset == 0) Offset = EmitGEPOffset(GEPLHS); @@ -686,7 +687,7 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, bool isTrue = ICmpInst::isTrueWhenEqual(Pred); return ReplaceInstUsesWith(ICI, ConstantInt::get(ICI.getType(), isTrue)); } - + // (X+4) == X -> false. if (Pred == ICmpInst::ICMP_EQ) return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(X->getContext())); @@ -698,22 +699,22 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0, // so the values can never be equal. Similarly for all other "or equals" // operators. 
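The divisibility check above is what lets "12 + 4*i" be evaluated as "3 + i" in units of the scale. A standalone sketch of that rewrite and its precondition:

#include <cassert>

int main() {
  const long Offset = 12, Scale = 4;
  long NewOffs = Offset / Scale;
  assert(Offset == NewOffs * Scale);   // 12 is a multiple of 4: fold is legal
  for (long i = 0; i != 8; ++i)
    assert(Offset + Scale * i == Scale * (NewOffs + i));
  // "10 + 3*i" would fail the multiple-of check (10 % 3 != 0) and bail out.
  return 0;
}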
- + // (X+1) <u X --> X >u (MAXUINT-1) --> X == 255 // (X+2) <u X --> X >u (MAXUINT-2) --> X > 253 // (X+MAXUINT) <u X --> X >u (MAXUINT-MAXUINT) --> X != 0 if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) { - Value *R = + Value *R = ConstantExpr::getSub(ConstantInt::getAllOnesValue(CI->getType()), CI); return new ICmpInst(ICmpInst::ICMP_UGT, X, R); } - + // (X+1) >u X --> X <u (0-1) --> X != 255 // (X+2) >u X --> X <u (0-2) --> X <u 254 // (X+MAXUINT) >u X --> X <u (0-MAXUINT) --> X <u 1 --> X == 0 if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantExpr::getNeg(CI)); - + unsigned BitWidth = CI->getType()->getPrimitiveSizeInBits(); ConstantInt *SMax = ConstantInt::get(X->getContext(), APInt::getSignedMaxValue(BitWidth)); @@ -726,14 +727,14 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, // (X+ -1) <s X --> X >s (MAXSINT- -1) --> X != 127 if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) return new ICmpInst(ICmpInst::ICMP_SGT, X, ConstantExpr::getSub(SMax, CI)); - + // (X+ 1) >s X --> X <s (MAXSINT-(1-1)) --> X != 127 // (X+ 2) >s X --> X <s (MAXSINT-(2-1)) --> X <s 126 // (X+MAXSINT) >s X --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1 // (X+MINSINT) >s X --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2 // (X+ -2) >s X --> X <s (MAXSINT-(-2-1)) --> X <s -126 // (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128 - + assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE); Constant *C = ConstantInt::get(X->getContext(), CI->getValue()-1); return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C)); @@ -745,14 +746,14 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, ConstantInt *DivRHS) { ConstantInt *CmpRHS = cast<ConstantInt>(ICI.getOperand(1)); const APInt &CmpRHSV = CmpRHS->getValue(); - - // FIXME: If the operand types don't match the type of the divide + + // FIXME: If the operand types don't match the type of the divide // then don't attempt this transform. The code below doesn't have the // logic to deal with a signed divide and an unsigned compare (and - // vice versa). This is because (x /s C1) <s C2 produces different + // vice versa). This is because (x /s C1) <s C2 produces different // results than (x /s C1) <u C2 or (x /u C1) <s C2 or even - // (x /u C1) <u C2. Simply casting the operands and result won't - // work. :( The if statement below tests that condition and bails + // (x /u C1) <u C2. Simply casting the operands and result won't + // work. :( The if statement below tests that condition and bails // if it finds it. bool DivIsSigned = DivI->getOpcode() == Instruction::SDiv; if (!ICI.isEquality() && DivIsSigned != ICI.isSigned()) @@ -768,14 +769,14 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, } // Compute Prod = CI * DivRHS. We are essentially solving an equation - // of form X/C1=C2. We solve for X by multiplying C1 (DivRHS) and - // C2 (CI). By solving for X we can turn this into a range check - // instead of computing a divide. + // of form X/C1=C2. We solve for X by multiplying C1 (DivRHS) and + // C2 (CI). By solving for X we can turn this into a range check + // instead of computing a divide. Constant *Prod = ConstantExpr::getMul(CmpRHS, DivRHS); // Determine if the product overflows by seeing if the product is // not equal to the divide. Make sure we do the same kind of divide - // as in the LHS instruction that we're folding. + // as in the LHS instruction that we're folding. 
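A standalone sketch of the "(X+1) <u X" family above, shown at 8 bits where MAXUINT is 255:

#include <cassert>
#include <cstdint>

// The sum wraps exactly when X is the maximum value, so the add folds
// away: (X+1) <u X  <=>  X == 255.
int main() {
  uint8_t samples[] = {0, 1, 127, 254, 255};
  for (uint8_t x : samples) {
    bool lhs = (uint8_t)(x + 1) < x;
    bool rhs = (x == 255);              // X >u (MAXUINT-1)
    assert(lhs == rhs);
  }
  return 0;
}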
bool ProdOV = (DivIsSigned ? ConstantExpr::getSDiv(Prod, DivRHS) : ConstantExpr::getUDiv(Prod, DivRHS)) != CmpRHS; @@ -785,9 +786,9 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, /// If the division is known to be exact, then there is no remainder from the /// divide, so the covered range size is unit, otherwise it is the divisor. ConstantInt *RangeSize = DivI->isExact() ? getOne(Prod) : DivRHS; - + // Figure out the interval that is being checked. For example, a comparison - // like "X /u 5 == 0" is really checking that X is in the interval [0, 5). + // like "X /u 5 == 0" is really checking that X is in the interval [0, 5). // Compute this interval based on the constants involved and the signedness of // the compare/divide. This computes a half-open interval, keeping track of // whether either value in the interval overflows. After analysis each @@ -805,7 +806,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, // to the same result value. HiOverflow = AddWithOverflow(HiBound, LoBound, RangeSize, false); } - + } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0. if (CmpRHSV == 0) { // (X / pos) op 0 // Can't overflow. e.g. X/2 op 0 --> [-1, 2) @@ -848,7 +849,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, if (!HiOverflow) HiOverflow = SubWithOverflow(HiBound, Prod, RangeSize, true); } - + // Dividing by a negative swaps the condition. LT <-> GT Pred = ICmpInst::getSwappedPredicate(Pred); } @@ -901,7 +902,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, ConstantInt *ShAmt) { const APInt &CmpRHSV = cast<ConstantInt>(ICI.getOperand(1))->getValue(); - + // Check that the shift amount is in range. If not, don't perform // undefined shifts. When the shift is visited it will be // simplified. @@ -909,48 +910,48 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits); if (ShAmtVal >= TypeBits || ShAmtVal == 0) return 0; - + if (!ICI.isEquality()) { // If we have an unsigned comparison and an ashr, we can't simplify this. // Similarly for signed comparisons with lshr. if (ICI.isSigned() != (Shr->getOpcode() == Instruction::AShr)) return 0; - + // Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv // by a power of 2. Since we already have logic to simplify these, // transform to div and then simplify the resultant comparison. if (Shr->getOpcode() == Instruction::AShr && (!Shr->isExact() || ShAmtVal == TypeBits - 1)) return 0; - + // Revisit the shift (to delete it). Worklist.Add(Shr); - + Constant *DivCst = ConstantInt::get(Shr->getType(), APInt::getOneBitSet(TypeBits, ShAmtVal)); - + Value *Tmp = Shr->getOpcode() == Instruction::AShr ? Builder->CreateSDiv(Shr->getOperand(0), DivCst, "", Shr->isExact()) : Builder->CreateUDiv(Shr->getOperand(0), DivCst, "", Shr->isExact()); - + ICI.setOperand(0, Tmp); - + // If the builder folded the binop, just return it. BinaryOperator *TheDiv = dyn_cast<BinaryOperator>(Tmp); if (TheDiv == 0) return &ICI; - + // Otherwise, fold this div/compare. 
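A standalone sketch of the divide-to-range-check idea above, using "X /u 5 == 2" so the covered half-open interval is [10, 15):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x != 40; ++x) {
    bool viaDiv   = (x / 5u == 2u);
    bool viaRange = (x - 10u) < 5u;   // (X - Lo) <u RangeSize
    assert(viaDiv == viaRange);
  }
  return 0;
}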
assert(TheDiv->getOpcode() == Instruction::SDiv || TheDiv->getOpcode() == Instruction::UDiv); - + Instruction *Res = FoldICmpDivCst(ICI, TheDiv, cast<ConstantInt>(DivCst)); assert(Res && "This div/cst should have folded!"); return Res; } - - + + // If we are comparing against bits always shifted out, the // comparison cannot succeed. APInt Comp = CmpRHSV << ShAmtVal; @@ -959,25 +960,25 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, Comp = Comp.lshr(ShAmtVal); else Comp = Comp.ashr(ShAmtVal); - + if (Comp != CmpRHSV) { // Comparing against a bit that we know is zero. bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; Constant *Cst = ConstantInt::get(Type::getInt1Ty(ICI.getContext()), IsICMP_NE); return ReplaceInstUsesWith(ICI, Cst); } - + // Otherwise, check to see if the bits shifted out are known to be zero. // If so, we can compare against the unshifted value: // (X & 4) >> 1 == 2 --> (X & 4) == 4. if (Shr->hasOneUse() && Shr->isExact()) return new ICmpInst(ICI.getPredicate(), Shr->getOperand(0), ShiftedCmpRHS); - + if (Shr->hasOneUse()) { // Otherwise strength reduce the shift into an and. APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal)); Constant *Mask = ConstantInt::get(ICI.getContext(), Val); - + Value *And = Builder->CreateAnd(Shr->getOperand(0), Mask, Shr->getName()+".mask"); return new ICmpInst(ICI.getPredicate(), And, ShiftedCmpRHS); @@ -992,7 +993,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, Instruction *LHSI, ConstantInt *RHS) { const APInt &RHSV = RHS->getValue(); - + switch (LHSI->getOpcode()) { case Instruction::Trunc: if (ICI.isEquality() && LHSI->hasOneUse()) { @@ -1003,7 +1004,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, APInt Mask(APInt::getHighBitsSet(SrcBits, SrcBits-DstBits)); APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0); ComputeMaskedBits(LHSI->getOperand(0), Mask, KnownZero, KnownOne); - + // If all the high bits are known, we can do this xform. if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) { // Pull in the high bits from known-ones set. @@ -1014,7 +1015,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } } break; - + case Instruction::Xor: // (icmp pred (xor X, XorCST), CI) if (ConstantInt *XorCST = dyn_cast<ConstantInt>(LHSI->getOperand(1))) { // If this is a comparison that tests the signbit (X < 0) or (x > -1), @@ -1022,7 +1023,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if ((ICI.getPredicate() == ICmpInst::ICMP_SLT && RHSV == 0) || (ICI.getPredicate() == ICmpInst::ICMP_SGT && RHSV.isAllOnesValue())) { Value *CompareVal = LHSI->getOperand(0); - + // If the sign bit of the XorCST is not set, there is no change to // the operation, just stop using the Xor. if (!XorCST->isNegative()) { @@ -1030,13 +1031,13 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, Worklist.Add(LHSI); return &ICI; } - + // Was the old condition true if the operand is positive? bool isTrueIfPositive = ICI.getPredicate() == ICmpInst::ICMP_SGT; - + // If so, the new one isn't. 
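A standalone sketch of strength-reducing a shift in an equality, as done above with the high-bits mask (the shift amount and constant are illustrative):

#include <cassert>
#include <cstdint>

// Comparing X >>u 3 against 5 only constrains the high bits, so it is
// the same as masking off the low 3 bits and comparing against 5 << 3.
int main() {
  for (uint32_t x = 0; x != 128; ++x) {
    bool viaShr = ((x >> 3) == 5u);
    bool viaAnd = ((x & ~7u) == (5u << 3));
    assert(viaShr == viaAnd);
  }
  return 0;
}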
isTrueIfPositive ^= true; - + if (isTrueIfPositive) return new ICmpInst(ICmpInst::ICMP_SGT, CompareVal, SubOne(RHS)); @@ -1075,13 +1076,13 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (LHSI->hasOneUse() && isa<ConstantInt>(LHSI->getOperand(1)) && LHSI->getOperand(0)->hasOneUse()) { ConstantInt *AndCST = cast<ConstantInt>(LHSI->getOperand(1)); - + // If the LHS is an AND of a truncating cast, we can widen the // and/compare to be the input width without changing the value // produced, eliminating a cast. if (TruncInst *Cast = dyn_cast<TruncInst>(LHSI->getOperand(0))) { // We can do this transformation if either the AND constant does not - // have its sign bit set or if it is an equality comparison. + // have its sign bit set or if it is an equality comparison. // Extending a relational comparison when we're checking the sign // bit would not work. if (ICI.isEquality() || @@ -1098,7 +1099,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // If the LHS is an AND of a zext, and we have an equality compare, we can // shrink the and/compare to the smaller type, eliminating the cast. if (ZExtInst *Cast = dyn_cast<ZExtInst>(LHSI->getOperand(0))) { - const IntegerType *Ty = cast<IntegerType>(Cast->getSrcTy()); + IntegerType *Ty = cast<IntegerType>(Cast->getSrcTy()); // Make sure we don't compare the upper bits, SimplifyDemandedBits // should fold the icmp to true/false in that case. if (ICI.isEquality() && RHSV.getActiveBits() <= Ty->getBitWidth()) { @@ -1118,12 +1119,12 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, BinaryOperator *Shift = dyn_cast<BinaryOperator>(LHSI->getOperand(0)); if (Shift && !Shift->isShift()) Shift = 0; - + ConstantInt *ShAmt; ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : 0; - const Type *Ty = Shift ? Shift->getType() : 0; // Type of the shift. - const Type *AndTy = AndCST->getType(); // Type of the and. - + Type *Ty = Shift ? Shift->getType() : 0; // Type of the shift. + Type *AndTy = AndCST->getType(); // Type of the and. + // We can fold this as long as we can't shift unknown bits // into the mask. This can only happen with signed shift // rights, as they sign-extend. @@ -1134,20 +1135,20 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // of the bits shifted in could be tested after the mask. uint32_t TyBits = Ty->getPrimitiveSizeInBits(); int ShAmtVal = TyBits - ShAmt->getLimitedValue(TyBits); - + uint32_t BitWidth = AndTy->getPrimitiveSizeInBits(); - if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) & + if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) & AndCST->getValue()) == 0) CanFold = true; } - + if (CanFold) { Constant *NewCst; if (Shift->getOpcode() == Instruction::Shl) NewCst = ConstantExpr::getLShr(RHS, ShAmt); else NewCst = ConstantExpr::getShl(RHS, ShAmt); - + // Check to see if we are shifting out any of the bits being // compared. if (ConstantExpr::get(Shift->getOpcode(), @@ -1175,7 +1176,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } } } - + // Turn ((X >> Y) & C) == 0 into (X & (C << Y)) == 0. The later is // preferable because it allows the C<<Y expression to be hoisted out // of a loop if Y is invariant and X is not. @@ -1185,21 +1186,21 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // Compute C << Y. 
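A standalone sketch of the "((X >> Y) & C) == 0 --> (X & (C << Y)) == 0" rewrite computed above, which is what makes C << Y hoistable out of a loop when Y is invariant:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 42;   // illustrative mask
  for (uint32_t y = 0; y != 8; ++y)
    for (uint32_t x = 0; x != 1024; ++x) {
      bool before = ((x >> y) & C) == 0;
      bool after  = (x & (C << y)) == 0;
      assert(before == after);
    }
  return 0;
}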
Value *NS; if (Shift->getOpcode() == Instruction::LShr) { - NS = Builder->CreateShl(AndCST, Shift->getOperand(1), "tmp"); + NS = Builder->CreateShl(AndCST, Shift->getOperand(1)); } else { // Insert a logical shift. - NS = Builder->CreateLShr(AndCST, Shift->getOperand(1), "tmp"); + NS = Builder->CreateLShr(AndCST, Shift->getOperand(1)); } - + // Compute X & (C << Y). - Value *NewAnd = + Value *NewAnd = Builder->CreateAnd(Shift->getOperand(0), NS, LHSI->getName()); - + ICI.setOperand(0, NewAnd); return &ICI; } } - + // Try to optimize things like "A[i]&42 == 0" to index computations. if (LoadInst *LI = dyn_cast<LoadInst>(LHSI->getOperand(0))) { if (GetElementPtrInst *GEP = @@ -1234,19 +1235,19 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } break; } - + case Instruction::Shl: { // (icmp pred (shl X, ShAmt), CI) ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1)); if (!ShAmt) break; - + uint32_t TypeBits = RHSV.getBitWidth(); - + // Check that the shift amount is in range. If not, don't perform // undefined shifts. When the shift is visited it will be // simplified. if (ShAmt->uge(TypeBits)) break; - + if (ICI.isEquality()) { // If we are comparing against bits always shifted out, the // comparison cannot succeed. @@ -1259,34 +1260,34 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ConstantInt::get(Type::getInt1Ty(ICI.getContext()), IsICMP_NE); return ReplaceInstUsesWith(ICI, Cst); } - + // If the shift is NUW, then it is just shifting out zeros, no need for an // AND. if (cast<BinaryOperator>(LHSI)->hasNoUnsignedWrap()) return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), ConstantExpr::getLShr(RHS, ShAmt)); - + if (LHSI->hasOneUse()) { // Otherwise strength reduce the shift into an and. uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits); Constant *Mask = - ConstantInt::get(ICI.getContext(), APInt::getLowBitsSet(TypeBits, + ConstantInt::get(ICI.getContext(), APInt::getLowBitsSet(TypeBits, TypeBits-ShAmtVal)); - + Value *And = Builder->CreateAnd(LHSI->getOperand(0),Mask, LHSI->getName()+".mask"); return new ICmpInst(ICI.getPredicate(), And, ConstantExpr::getLShr(RHS, ShAmt)); } } - + // Otherwise, if this is a comparison of the sign bit, simplify to and/test. bool TrueIfSigned = false; if (LHSI->hasOneUse() && isSignBitCheck(ICI.getPredicate(), RHS, TrueIfSigned)) { // (X << 31) <s 0 --> (X&1) != 0 Constant *Mask = ConstantInt::get(LHSI->getOperand(0)->getType(), - APInt::getOneBitSet(TypeBits, + APInt::getOneBitSet(TypeBits, TypeBits-ShAmt->getZExtValue()-1)); Value *And = Builder->CreateAnd(LHSI->getOperand(0), Mask, LHSI->getName()+".mask"); @@ -1295,7 +1296,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } break; } - + case Instruction::LShr: // (icmp pred (shr X, ShAmt), CI) case Instruction::AShr: { // Handle equality comparisons of shift-by-constant. @@ -1312,13 +1313,13 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } break; } - + case Instruction::SDiv: case Instruction::UDiv: // Fold: icmp pred ([us]div X, C1), C2 -> range test - // Fold this div into the comparison, producing a range check. - // Determine, based on the divide type, what the range is being - // checked. If there is an overflow on the low or high side, remember + // Fold this div into the comparison, producing a range check. + // Determine, based on the divide type, what the range is being + // checked. 
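A standalone sketch of the sign-bit fold above, "(X << 31) <s 0 --> (X & 1) != 0", with the sign test written as an explicit mask to stay portable:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x != 16; ++x) {
    bool viaShl = ((x << 31) & 0x80000000u) != 0;  // sign bit of X << 31
    bool viaAnd = (x & 1u) != 0;
    assert(viaShl == viaAnd);
  }
  return 0;
}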
If there is an overflow on the low or high side, remember // it, otherwise compute the range [low, hi) bounding the new value. // See: InsertRangeTest above for the kinds of replacements possible. if (ConstantInt *DivRHS = dyn_cast<ConstantInt>(LHSI->getOperand(1))) @@ -1357,12 +1358,12 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } break; } - + // Simplify icmp_eq and icmp_ne instructions with integer constant RHS. if (ICI.isEquality()) { bool isICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; - - // If the first operand is (add|sub|and|or|xor|rem) with a constant, and + + // If the first operand is (add|sub|and|or|xor|rem) with a constant, and // the second operand is a constant, simplify a bit. if (BinaryOperator *BO = dyn_cast<BinaryOperator>(LHSI)) { switch (BO->getOpcode()) { @@ -1389,7 +1390,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // Replace ((add A, B) != 0) with (A != -B) if A or B is // efficiently invertible, or if the add has just this one use. Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1); - + if (Value *NegVal = dyn_castNegVal(BOp1)) return new ICmpInst(ICI.getPredicate(), BOp0, NegVal); if (Value *NegVal = dyn_castNegVal(BOp0)) @@ -1432,11 +1433,11 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, Constant *NotCI = ConstantExpr::getNot(RHS); if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue()) return ReplaceInstUsesWith(ICI, - ConstantInt::get(Type::getInt1Ty(ICI.getContext()), + ConstantInt::get(Type::getInt1Ty(ICI.getContext()), isICMP_NE)); } break; - + case Instruction::And: if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) { // If bits are being compared against that are and'd out, then the @@ -1445,7 +1446,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, return ReplaceInstUsesWith(ICI, ConstantInt::get(Type::getInt1Ty(ICI.getContext()), isICMP_NE)); - + // If we have ((X & C) == C), turn it into ((X & C) != 0). if (RHS == BOC && RHSV.isPowerOf2()) return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : @@ -1460,16 +1461,16 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (BOC->getValue().isSignBit()) { Value *X = BO->getOperand(0); Constant *Zero = Constant::getNullValue(X->getType()); - ICmpInst::Predicate pred = isICMP_NE ? + ICmpInst::Predicate pred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE; return new ICmpInst(pred, X, Zero); } - + // ((X & ~7) == 0) --> X < 8 if (RHSV == 0 && isHighOnes(BOC)) { Value *X = BO->getOperand(0); Constant *NegX = ConstantExpr::getNeg(BOC); - ICmpInst::Predicate pred = isICMP_NE ? + ICmpInst::Predicate pred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT; return new ICmpInst(pred, X, NegX); } @@ -1517,11 +1518,11 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { const CastInst *LHSCI = cast<CastInst>(ICI.getOperand(0)); Value *LHSCIOp = LHSCI->getOperand(0); - const Type *SrcTy = LHSCIOp->getType(); - const Type *DestTy = LHSCI->getType(); + Type *SrcTy = LHSCIOp->getType(); + Type *DestTy = LHSCI->getType(); Value *RHSCIOp; - // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the + // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the // integer type is the same size as the pointer type. 
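A standalone sketch of the isHighOnes fold above, "(X & ~7) == 0 --> X <u 8": clearing the mask means every bit above the low three is zero.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x != 64; ++x) {
    bool viaAnd = ((x & ~7u) == 0);
    bool viaCmp = (x < 8u);
    assert(viaAnd == viaCmp);
  }
  return 0;
}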
if (TD && LHSCI->getOpcode() == Instruction::PtrToInt && TD->getPointerSizeInBits() == @@ -1539,7 +1540,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { if (RHSOp) return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSOp); } - + // The code below only handles extension cast instructions, so far. // Enforce this. if (LHSCI->getOpcode() != Instruction::ZExt && @@ -1552,9 +1553,9 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { if (CastInst *CI = dyn_cast<CastInst>(ICI.getOperand(1))) { // Not an extension from the same type? RHSCIOp = CI->getOperand(0); - if (RHSCIOp->getType() != LHSCIOp->getType()) + if (RHSCIOp->getType() != LHSCIOp->getType()) return 0; - + // If the signedness of the two casts doesn't agree (i.e. one is a sext // and the other is a zext), then we can't handle this. if (CI->getOpcode() != LHSCI->getOpcode()) @@ -1599,7 +1600,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { return new ICmpInst(ICI.getUnsignedPredicate(), LHSCIOp, Res1); } - // The re-extended constant changed so the constant cannot be represented + // The re-extended constant changed so the constant cannot be represented // in the shorter type. Consequently, we cannot emit a simple comparison. // All the cases that fold to true or false will have already been handled // by SimplifyICmpInst, so only deal with the tricky case. @@ -1637,26 +1638,26 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // llvm.sadd.with.overflow. To do this, we have to replace the original add // with a narrower add, and discard the add-with-constant that is part of the // range check (if we can't eliminate it, this isn't profitable). - + // In order to eliminate the add-with-constant, the compare can be its only // use. Instruction *AddWithCst = cast<Instruction>(I.getOperand(0)); if (!AddWithCst->hasOneUse()) return 0; - + // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow. if (!CI2->getValue().isPowerOf2()) return 0; unsigned NewWidth = CI2->getValue().countTrailingZeros(); if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31) return 0; - + // The width of the new add formed is 1 more than the bias. ++NewWidth; - + // Check to see that CI1 is an all-ones value with NewWidth bits. if (CI1->getBitWidth() == NewWidth || CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth)) return 0; - - // In order to replace the original add with a narrower + + // In order to replace the original add with a narrower // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant // and truncates that discard the high bits of the add. Verify that this is // the case. @@ -1664,7 +1665,7 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, for (Value::use_iterator UI = OrigAdd->use_begin(), E = OrigAdd->use_end(); UI != E; ++UI) { if (*UI == AddWithCst) continue; - + // Only accept truncates for now. We would really like a nice recursive // predicate like SimplifyDemandedBits, but which goes downwards the use-def // chain to see which bits of a value are actually demanded. If the @@ -1674,32 +1675,32 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, if (TI == 0 || TI->getType()->getPrimitiveSizeInBits() > NewWidth) return 0; } - + // If the pattern matches, truncate the inputs to the narrower type and // use the sadd_with_overflow intrinsic to efficiently compute both the // result and the overflow bit. 
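The narrowing above recognizes the biased-compare overflow idiom. A standalone sketch of why "(sum + 128) >u 255" detects signed 8-bit overflow of an addition carried out in 32 bits (the sample values are illustrative):

#include <cassert>
#include <cstdint>

// The bias of 128 maps the legal result range [-128, 127] onto
// [0, 255]; anything outside that window overflowed.
int main() {
  int32_t samples[] = {-128, -1, 0, 1, 100, 127};
  for (int32_t a : samples)
    for (int32_t b : samples) {
      int32_t sum = a + b;
      bool viaBias   = (uint32_t)(sum + 128) > 255u;
      bool overflows = (sum < -128) || (sum > 127);
      assert(viaBias == overflows);
    }
  return 0;
}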
Module *M = I.getParent()->getParent()->getParent(); - + Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth); Value *F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow, NewType); InstCombiner::BuilderTy *Builder = IC.Builder; - + // Put the new code above the original add, in case there are any uses of the // add between the add and the compare. Builder->SetInsertPoint(OrigAdd); - + Value *TruncA = Builder->CreateTrunc(A, NewType, A->getName()+".trunc"); Value *TruncB = Builder->CreateTrunc(B, NewType, B->getName()+".trunc"); CallInst *Call = Builder->CreateCall2(F, TruncA, TruncB, "sadd"); Value *Add = Builder->CreateExtractValue(Call, 0, "sadd.result"); Value *ZExt = Builder->CreateZExt(Add, OrigAdd->getType()); - + // The inner add was the result of the narrow add, zero extended to the // wider type. Replace it with the result computed by the intrinsic. IC.ReplaceInstUsesWith(*OrigAdd, ZExt); - + // The original icmp gets replaced with the overflow value. return ExtractValueInst::Create(Call, 1, "sadd.overflow"); } @@ -1709,13 +1710,13 @@ static Instruction *ProcessUAddIdiom(Instruction &I, Value *OrigAddV, // Don't bother doing this transformation for pointers, don't do it for // vectors. if (!isa<IntegerType>(OrigAddV->getType())) return 0; - + // If the add is a constant expr, then we don't bother transforming it. Instruction *OrigAdd = dyn_cast<Instruction>(OrigAddV); if (OrigAdd == 0) return 0; - + Value *LHS = OrigAdd->getOperand(0), *RHS = OrigAdd->getOperand(1); - + // Put the new code above the original add, in case there are any uses of the // add between the add and the compare. InstCombiner::BuilderTy *Builder = IC.Builder; @@ -1740,13 +1741,13 @@ static APInt DemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth, bool isSignCheck) { if (isSignCheck) return APInt::getSignBit(BitWidth); - + ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(1)); if (!CI) return APInt::getAllOnesValue(BitWidth); const APInt &RHS = CI->getValue(); - + switch (I.getPredicate()) { - // For a UGT comparison, we don't care about any bits that + // For a UGT comparison, we don't care about any bits that // correspond to the trailing ones of the comparand. The value of these // bits doesn't impact the outcome of the comparison, because any value // greater than the RHS must differ in a bit higher than these due to carry. @@ -1755,7 +1756,7 @@ static APInt DemandedBitsLHSMask(ICmpInst &I, APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingOnes); return ~lowBitsSet; } - + // Similarly, for a ULT comparison, we don't care about the trailing zeros. // Any value less than the RHS must differ in a higher bit because of carries. case ICmpInst::ICMP_ULT: { @@ -1763,17 +1764,17 @@ static APInt DemandedBitsLHSMask(ICmpInst &I, APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingZeros); return ~lowBitsSet; } - + default: return APInt::getAllOnesValue(BitWidth); } - + } Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { bool Changed = false; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - + /// Orders the operands of the compare so that they are listed from most /// complex to least complex. This puts constants before unary operators, /// before binary operators. 
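[Editorial illustration of the idiom ProcessUGT_ADDCST_ADD just finished matching: for an i8-sized result computed in a wider type, the unsigned range check "(sum + 128) >u 255" fires exactly when the narrow signed add would overflow. A standalone exhaustive check, not LLVM code:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int a = -128; a <= 127; ++a)
      for (int b = -128; b <= 127; ++b) {
        int sum = a + b;  // the add performed in the wider type
        bool idiom = (uint32_t)(sum + 128) > 255u;  // (sum+128) >u 255
        bool overflow = sum < -128 || sum > 127;    // i8 signed overflow
        assert(idiom == overflow);
      }
    return 0;
  }
]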
@@ -1782,11 +1783,11 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { std::swap(Op0, Op1); Changed = true; } - + if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, TD)) return ReplaceInstUsesWith(I, V); - - const Type *Ty = Op0->getType(); + + Type *Ty = Op0->getType(); // icmp's with boolean values can always be turned into bitwise operations if (Ty->isIntegerTy(1)) { @@ -1835,13 +1836,13 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { BitWidth = Ty->getScalarSizeInBits(); else if (TD) // Pointers require TD info to get their size. BitWidth = TD->getTypeSizeInBits(Ty->getScalarType()); - + bool isSignBit = false; // See if we are doing a comparison with a constant. if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { Value *A = 0, *B = 0; - + // Match the following pattern, which is a common idiom when writing // overflow-safe integer arithmetic function. The source performs an // addition in wider type, and explicitly checks for overflow using @@ -1849,9 +1850,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // sadd_with_overflow intrinsic. // // TODO: This could probably be generalized to handle other overflow-safe - // operations if we worked out the formulas to compute the appropriate + // operations if we worked out the formulas to compute the appropriate // magic constants. - // + // // sum = a + b // if (sum+128 >u 255) ... -> llvm.sadd.with.overflow.i8 { @@ -1861,14 +1862,14 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = ProcessUGT_ADDCST_ADD(I, A, B, CI2, CI, *this)) return Res; } - + // (icmp ne/eq (sub A B) 0) -> (icmp ne/eq A, B) if (I.isEquality() && CI->isZero() && match(Op0, m_Sub(m_Value(A), m_Value(B)))) { // (icmp cond A B) if cond is equality return new ICmpInst(I.getPredicate(), A, B); } - + // If we have an icmp le or icmp ge instruction, turn it into the // appropriate icmp lt or icmp gt instruction. This allows us to rely on // them being folded in the code below. The SimplifyICmpInst code has @@ -1892,7 +1893,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { return new ICmpInst(ICmpInst::ICMP_SGT, Op0, ConstantInt::get(CI->getContext(), CI->getValue()-1)); } - + // If this comparison is a normal comparison, it demands all // bits, if it is a sign bit comparison, it only demands the sign bit. bool UnusedBit; @@ -1948,7 +1949,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { case ICmpInst::ICMP_EQ: { if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max)) return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType())); - + // If all bits are known zero except for one, then we know at most one // bit is set. If the comparison is against zero, then this is a check // to see if *that* bit is set. @@ -1960,7 +1961,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) || LHSC->getValue() != Op0KnownZeroInverted) LHS = Op0; - + // If the LHS is 1 << x, and we know the result is a power of 2 like 8, // then turn "((1 << x)&8) == 0" into "x != 3". Value *X = 0; @@ -1969,7 +1970,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { return new ICmpInst(ICmpInst::ICMP_NE, X, ConstantInt::get(X->getType(), CmpVal)); } - + // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1, // then turn "((8 >>u x)&1) == 0" into "x != 3". 
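[Editorial check of the fold quoted above: (1 << x) has exactly one set bit, so masking it with 8 is non-zero only when x == 3. Standalone, not LLVM code:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t x = 0; x < 32; ++x)
      // "((1 << x) & 8) == 0" is the same test as "x != 3".
      assert((((1u << x) & 8u) == 0) == (x != 3));
    return 0;
  }
]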
const APInt *CI; @@ -1979,13 +1980,13 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt::get(X->getType(), CI->countTrailingZeros())); } - + break; } case ICmpInst::ICMP_NE: { if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max)) return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType())); - + // If all bits are known zero except for one, then we know at most one // bit is set. If the comparison is against zero, then this is a check // to see if *that* bit is set. @@ -1997,7 +1998,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) || LHSC->getValue() != Op0KnownZeroInverted) LHS = Op0; - + // If the LHS is 1 << x, and we know the result is a power of 2 like 8, // then turn "((1 << x)&8) != 0" into "x == 3". Value *X = 0; @@ -2006,7 +2007,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { return new ICmpInst(ICmpInst::ICMP_EQ, X, ConstantInt::get(X->getType(), CmpVal)); } - + // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1, // then turn "((8 >>u x)&1) != 0" into "x == 3". const APInt *CI; @@ -2016,7 +2017,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt::get(X->getType(), CI->countTrailingZeros())); } - + break; } case ICmpInst::ICMP_ULT: @@ -2137,9 +2138,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // See if we are doing a comparison between a constant and an instruction that // can be folded into the comparison. if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { - // Since the RHS is a ConstantInt (CI), if the left hand side is an - // instruction, see if that instruction also has constants so that the - // instruction can be folded into the icmp + // Since the RHS is a ConstantInt (CI), if the left hand side is an + // instruction, see if that instruction also has constants so that the + // instruction can be folded into the icmp if (Instruction *LHSI = dyn_cast<Instruction>(Op0)) if (Instruction *Res = visitICmpInstWithInstAndIntCst(I, LHSI, CI)) return Res; @@ -2194,7 +2195,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { case Instruction::IntToPtr: // icmp pred inttoptr(X), null -> icmp pred X, 0 if (RHSC->isNullValue() && TD && - TD->getIntPtrType(RHSC->getContext()) == + TD->getIntPtrType(RHSC->getContext()) == LHSI->getOperand(0)->getType()) return new ICmpInst(I.getPredicate(), LHSI->getOperand(0), Constant::getNullValue(LHSI->getOperand(0)->getType())); @@ -2227,8 +2228,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // values. If the ptr->ptr cast can be stripped off both arguments, we do so // now. if (BitCastInst *CI = dyn_cast<BitCastInst>(Op0)) { - if (Op0->getType()->isPointerTy() && - (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) { + if (Op0->getType()->isPointerTy() && + (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) { // We keep moving the cast from the left operand over to the right // operand, where it can often be eliminated completely. Op0 = CI->getOperand(0); @@ -2250,7 +2251,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { return new ICmpInst(I.getPredicate(), Op0, Op1); } } - + if (isa<CastInst>(Op0)) { // Handle the special case of: icmp (cast bool to X), <cst> // This comes up when you have code like @@ -2384,7 +2385,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); } - + if (CI->isMaxValue(true)) { ICmpInst::Predicate Pred = I.isSigned() ? 
I.getUnsignedPredicate() @@ -2404,7 +2405,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // Mask = -1 >> count-trailing-zeros(Cst). if (!CI->isZero() && !CI->isOne()) { const APInt &AP = CI->getValue(); - ConstantInt *Mask = ConstantInt::get(I.getContext(), + ConstantInt *Mask = ConstantInt::get(I.getContext(), APInt::getLowBitsSet(AP.getBitWidth(), AP.getBitWidth() - AP.countTrailingZeros())); @@ -2438,7 +2439,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } } } - + { Value *A, *B; // ~x < ~y --> y < x // ~x < cst --> ~cst < x @@ -2452,11 +2453,11 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // (a+b) <u a --> llvm.uadd.with.overflow. // (a+b) <u b --> llvm.uadd.with.overflow. if (I.getPredicate() == ICmpInst::ICMP_ULT && - match(Op0, m_Add(m_Value(A), m_Value(B))) && + match(Op0, m_Add(m_Value(A), m_Value(B))) && (Op1 == A || Op1 == B)) if (Instruction *R = ProcessUAddIdiom(I, Op0, *this)) return R; - + // a >u (a+b) --> llvm.uadd.with.overflow. // b >u (a+b) --> llvm.uadd.with.overflow. if (I.getPredicate() == ICmpInst::ICMP_UGT && @@ -2465,7 +2466,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *R = ProcessUAddIdiom(I, Op1, *this)) return R; } - + if (I.isEquality()) { Value *A, *B, *C, *D; @@ -2483,10 +2484,10 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) { Constant *NC = ConstantInt::get(I.getContext(), C1->getValue() ^ C2->getValue()); - Value *Xor = Builder->CreateXor(C, NC, "tmp"); + Value *Xor = Builder->CreateXor(C, NC); return new ICmpInst(I.getPredicate(), A, Xor); } - + // A^B == A^D -> B == D if (A == C) return new ICmpInst(I.getPredicate(), B, D); if (A == D) return new ICmpInst(I.getPredicate(), B, C); @@ -2494,7 +2495,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (B == D) return new ICmpInst(I.getPredicate(), A, C); } } - + if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) { // A == (A^B) -> B == 0 @@ -2504,10 +2505,10 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } // (X&Z) == (Y&Z) -> (X^Y) & Z == 0 - if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) && + if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) && match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) { Value *X = 0, *Y = 0, *Z = 0; - + if (A == C) { X = B; Y = D; Z = A; } else if (A == D) { @@ -2517,16 +2518,16 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } else if (B == D) { X = A; Y = C; Z = B; } - + if (X) { // Build (X^Y) & Z - Op1 = Builder->CreateXor(X, Y, "tmp"); - Op1 = Builder->CreateAnd(Op1, Z, "tmp"); + Op1 = Builder->CreateXor(X, Y); + Op1 = Builder->CreateAnd(Op1, Z); I.setOperand(0, Op1); I.setOperand(1, Constant::getNullValue(Op1->getType())); return &I; } } - + // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to // "icmp (and X, mask), cst" uint64_t ShAmt = 0; @@ -2539,21 +2540,21 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // when it exposes other optimizations. 
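[Editorial illustration of the "(a+b) <u a" idiom matched above: with wrapping unsigned arithmetic, the sum compares below an operand exactly when the addition carried out of the type, which is why it can become llvm.uadd.with.overflow. Exhaustive at 8 bits, not LLVM code:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned a = 0; a <= 0xFF; ++a)
      for (unsigned b = 0; b <= 0xFF; ++b) {
        uint8_t sum = (uint8_t)(a + b);  // wrapping 8-bit add
        bool idiom = sum < (uint8_t)a;   // (a+b) <u a
        bool carry = a + b > 0xFF;       // the add really overflowed
        assert(idiom == carry);
      }
    return 0;
  }
]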
!A->hasOneUse()) { unsigned ASize =cast<IntegerType>(A->getType())->getPrimitiveSizeInBits(); - + if (ShAmt < ASize) { APInt MaskV = APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits()); MaskV <<= ShAmt; - + APInt CmpV = Cst1->getValue().zext(ASize); CmpV <<= ShAmt; - + Value *Mask = Builder->CreateAnd(A, Builder->getInt(MaskV)); return new ICmpInst(I.getPredicate(), Mask, Builder->getInt(CmpV)); } } } - + { Value *X; ConstantInt *Cst; // icmp X+Cst, X @@ -2579,31 +2580,31 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, Constant *RHSC) { if (!isa<ConstantFP>(RHSC)) return 0; const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF(); - + // Get the width of the mantissa. We don't want to hack on conversions that // might lose information from the integer, e.g. "i64 -> float" int MantissaWidth = LHSI->getType()->getFPMantissaWidth(); if (MantissaWidth == -1) return 0; // Unknown. - + // Check to see that the input is converted from an integer type that is small // enough that preserves all bits. TODO: check here for "known" sign bits. // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. unsigned InputSize = LHSI->getOperand(0)->getType()->getScalarSizeInBits(); - + // If this is a uitofp instruction, we need an extra bit to hold the sign. bool LHSUnsigned = isa<UIToFPInst>(LHSI); if (LHSUnsigned) ++InputSize; - + // If the conversion would lose info, don't hack on this. if ((int)InputSize > MantissaWidth) return 0; - + // Otherwise, we can potentially simplify the comparison. We know that it // will always come through as an integer value and we know the constant is // not a NAN (it would have been previously simplified). assert(!RHS.isNaN() && "NaN comparison not already folded!"); - + ICmpInst::Predicate Pred; switch (I.getPredicate()) { default: llvm_unreachable("Unexpected predicate!"); @@ -2636,15 +2637,15 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, case FCmpInst::FCMP_UNO: return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); } - - const IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); - + + IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); + // Now we know that the APFloat is a normal number, zero or inf. - + // See if the FP constant is too large for the integer. For example, // comparing an i8 to 300.0. unsigned IntWidth = IntTy->getScalarSizeInBits(); - + if (!LHSUnsigned) { // If the RHS value is > SignedMax, fold the comparison. This handles +INF // and large values. @@ -2670,7 +2671,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); } } - + if (!LHSUnsigned) { // See if the RHS value is < SignedMin. APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false); @@ -2766,7 +2767,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { bool Changed = false; - + /// Orders the operands of the compare so that they are listed from most /// complex to least complex. This puts constants before unary operators, /// before binary operators. 
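[Editorial note on the guard in FoldFCmp_IntToFP_Cst above: the fold only proceeds when the integer operand fits in the mantissa of the FP type. A standalone demonstration with IEEE-754 single precision (24 significand bits), not part of the patch:

  #include <cassert>
  #include <cstdint>
  #include <limits>

  int main() {
    static_assert(std::numeric_limits<float>::digits == 24,
                  "assumes IEEE-754 single precision");
    // Any 16-bit value survives the int -> float -> int round trip ...
    for (uint32_t i = 0; i <= 0xFFFF; ++i)
      assert((uint32_t)(float)i == i);
    // ... but a full 32-bit value generally does not (this one rounds to
    // 2^32), which is why the fold bails out when InputSize exceeds
    // MantissaWidth.
    uint64_t big = 0xFFFFFFFFull;
    assert((uint64_t)(float)big != big);
    return 0;
  }
]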
@@ -2776,7 +2777,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { } Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - + if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, TD)) return ReplaceInstUsesWith(I, V); @@ -2792,7 +2793,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { I.setPredicate(FCmpInst::FCMP_UNO); I.setOperand(1, Constant::getNullValue(Op0->getType())); return &I; - + case FCmpInst::FCMP_ORD: // True if ordered (no nans) case FCmpInst::FCMP_OEQ: // True if ordered and equal case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal @@ -2803,7 +2804,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { return &I; } } - + // Handle fcmp with constant RHS if (Constant *RHSC = dyn_cast<Constant>(Op1)) { if (Instruction *LHSI = dyn_cast<Instruction>(Op0)) @@ -2836,10 +2837,14 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { APFloat F = RHSF->getValueAPF(); F.convert(*Sem, APFloat::rmNearestTiesToEven, &Lossy); - // Avoid lossy conversions and denormals. + // Avoid lossy conversions and denormals. Zero is a special case + // that's OK to convert. + APFloat Fabs = F; + Fabs.clearSign(); if (!Lossy && - F.compare(APFloat::getSmallestNormalized(*Sem)) != - APFloat::cmpLessThan) + ((Fabs.compare(APFloat::getSmallestNormalized(*Sem)) != + APFloat::cmpLessThan) || Fabs.isZero())) + return new FCmpInst(I.getPredicate(), LHSExt->getOperand(0), ConstantFP::get(RHSC->getContext(), F)); break; diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index f499290..7446a51 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -26,7 +26,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Ensure that the alloca array size argument has type intptr_t, so that // any casting is exposed early. 
if (TD) { - const Type *IntPtrTy = TD->getIntPtrType(AI.getContext()); + Type *IntPtrTy = TD->getIntPtrType(AI.getContext()); if (AI.getArraySize()->getType() != IntPtrTy) { Value *V = Builder->CreateIntCast(AI.getArraySize(), IntPtrTy, false); @@ -38,7 +38,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1 if (AI.isArrayAllocation()) { // Check C != 1 if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { - const Type *NewTy = + Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!"); AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName()); @@ -58,8 +58,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { Idx[0] = NullIdx; Idx[1] = NullIdx; Instruction *GEP = - GetElementPtrInst::CreateInBounds(New, Idx, Idx + 2, - New->getName()+".sub"); + GetElementPtrInst::CreateInBounds(New, Idx, New->getName()+".sub"); InsertNewInstBefore(GEP, *It); // Now make everything use the getelementptr instead of the original @@ -92,28 +91,28 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, User *CI = cast<User>(LI.getOperand(0)); Value *CastOp = CI->getOperand(0); - const PointerType *DestTy = cast<PointerType>(CI->getType()); - const Type *DestPTy = DestTy->getElementType(); - if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) { + PointerType *DestTy = cast<PointerType>(CI->getType()); + Type *DestPTy = DestTy->getElementType(); + if (PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) { // If the address spaces don't match, don't eliminate the cast. if (DestTy->getAddressSpace() != SrcTy->getAddressSpace()) return 0; - const Type *SrcPTy = SrcTy->getElementType(); + Type *SrcPTy = SrcTy->getElementType(); if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() || DestPTy->isVectorTy()) { // If the source is an array, the code below will not succeed. Check to // see if a trivial 'gep P, 0, 0' will help matters. Only do this for // constants. - if (const ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy)) + if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy)) if (Constant *CSrc = dyn_cast<Constant>(CastOp)) if (ASrcTy->getNumElements() != 0) { Value *Idxs[2]; Idxs[0] = Constant::getNullValue(Type::getInt32Ty(LI.getContext())); Idxs[1] = Idxs[0]; - CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs, 2); + CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs); SrcTy = cast<PointerType>(CastOp->getType()); SrcPTy = SrcTy->getElementType(); } @@ -133,6 +132,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, LoadInst *NewLoad = IC.Builder->CreateLoad(CastOp, LI.isVolatile(), CI->getName()); NewLoad->setAlignment(LI.getAlignment()); + NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope()); // Now cast the result of the load. return new BitCastInst(NewLoad, LI.getType()); } @@ -163,8 +163,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { if (Instruction *Res = InstCombineLoadCast(*this, LI, TD)) return Res; - // None of the following transforms are legal for volatile loads. - if (LI.isVolatile()) return 0; + // None of the following transforms are legal for volatile/atomic loads. + // FIXME: Some of it is okay for atomic loads; needs refactoring. 
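[Editorial gloss, not patch content: the isSimple() predicate this change switches to subsumes the old volatile-only check. In this revision of the API a load or store is "simple" when it is neither volatile nor atomic; a sketch of the predicate's meaning, with illustrative field names that are not LLVM's:

  // Only "simple" accesses may be forwarded, merged, or deleted by the
  // transforms that follow.
  struct MemAccessFlags {
    bool IsVolatile;
    bool IsAtomic;  // any ordering other than non-atomic
    bool isSimple() const { return !IsVolatile && !IsAtomic; }
  };
]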
+ if (!LI.isSimple()) return 0; // Do really simple store-to-load forwarding and load CSE, to catch cases // where there are several consecutive memory accesses to the same location, @@ -256,11 +257,11 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { User *CI = cast<User>(SI.getOperand(1)); Value *CastOp = CI->getOperand(0); - const Type *DestPTy = cast<PointerType>(CI->getType())->getElementType(); - const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType()); + Type *DestPTy = cast<PointerType>(CI->getType())->getElementType(); + PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType()); if (SrcTy == 0) return 0; - const Type *SrcPTy = SrcTy->getElementType(); + Type *SrcPTy = SrcTy->getElementType(); if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy()) return 0; @@ -280,12 +281,12 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { NewGEPIndices.push_back(Zero); while (1) { - if (const StructType *STy = dyn_cast<StructType>(SrcPTy)) { + if (StructType *STy = dyn_cast<StructType>(SrcPTy)) { if (!STy->getNumElements()) /* Struct can be empty {} */ break; NewGEPIndices.push_back(Zero); SrcPTy = STy->getElementType(0); - } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) { + } else if (ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) { NewGEPIndices.push_back(Zero); SrcPTy = ATy->getElementType(); } else { @@ -314,8 +315,8 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { Value *NewCast; Value *SIOp0 = SI.getOperand(0); Instruction::CastOps opcode = Instruction::BitCast; - const Type* CastSrcTy = SIOp0->getType(); - const Type* CastDstTy = SrcPTy; + Type* CastSrcTy = SIOp0->getType(); + Type* CastDstTy = SrcPTy; if (CastDstTy->isPointerTy()) { if (CastSrcTy->isIntegerTy()) opcode = Instruction::IntToPtr; @@ -327,8 +328,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { // SIOp0 is a pointer to aggregate and this is a store to the first field, // emit a GEP to index into its first field. if (!NewGEPIndices.empty()) - CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices.begin(), - NewGEPIndices.end()); + CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices); NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy, SIOp0->getName()+".c"); @@ -370,21 +370,6 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { Value *Val = SI.getOperand(0); Value *Ptr = SI.getOperand(1); - // If the RHS is an alloca with a single use, zapify the store, making the - // alloca dead. - if (!SI.isVolatile()) { - if (Ptr->hasOneUse()) { - if (isa<AllocaInst>(Ptr)) - return EraseInstFromFunction(SI); - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { - if (isa<AllocaInst>(GEP->getOperand(0))) { - if (GEP->getOperand(0)->hasOneUse()) - return EraseInstFromFunction(SI); - } - } - } - } - // Attempt to improve the alignment. if (TD) { unsigned KnownAlign = @@ -400,6 +385,23 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { SI.setAlignment(EffectiveStoreAlign); } + // Don't hack volatile/atomic stores. + // FIXME: Some bits are legal for atomic stores; needs refactoring. + if (!SI.isSimple()) return 0; + + // If the RHS is an alloca with a single use, zapify the store, making the + // alloca dead. 
+ if (Ptr->hasOneUse()) { + if (isa<AllocaInst>(Ptr)) + return EraseInstFromFunction(SI); + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { + if (isa<AllocaInst>(GEP->getOperand(0))) { + if (GEP->getOperand(0)->hasOneUse()) + return EraseInstFromFunction(SI); + } + } + } + // Do really simple DSE, to catch cases where there are several consecutive // stores to the same location, separated by a few arithmetic operations. This // situation often occurs with bitfield accesses. @@ -417,8 +419,8 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) { // Prev store isn't volatile, and stores to the same location? - if (!PrevSI->isVolatile() &&equivalentAddressValues(PrevSI->getOperand(1), - SI.getOperand(1))) { + if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1), + SI.getOperand(1))) { ++NumDeadStore; ++BBI; EraseInstFromFunction(*PrevSI); @@ -432,7 +434,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // then *this* store is dead (X = load P; store X -> P). if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) { if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) && - !SI.isVolatile()) + LI->isSimple()) return EraseInstFromFunction(SI); // Otherwise, this is a load from some other location. Stores before it @@ -444,9 +446,6 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory()) break; } - - - if (SI.isVolatile()) return 0; // Don't hack volatile stores. // store X, null -> turns into 'unreachable' in SimplifyCFG if (isa<ConstantPointerNull>(Ptr) && SI.getPointerAddressSpace() == 0) { @@ -549,11 +548,11 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { return false; --BBI; } - // If this isn't a store, isn't a store to the same location, or if the - // alignments differ, bail out. + // If this isn't a store, isn't a store to the same location, or is not the + // right kind of store, bail out. OtherStore = dyn_cast<StoreInst>(BBI); if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) || - OtherStore->getAlignment() != SI.getAlignment()) + !SI.isSameOperationAs(OtherStore)) return false; } else { // Otherwise, the other block ended with a conditional branch. If one of the @@ -569,7 +568,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { // Check to see if we find the matching store. if ((OtherStore = dyn_cast<StoreInst>(BBI))) { if (OtherStore->getOperand(1) != SI.getOperand(1) || - OtherStore->getAlignment() != SI.getAlignment()) + !SI.isSameOperationAs(OtherStore)) return false; break; } @@ -601,10 +600,12 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { // Advance to a place where it is safe to insert the new store and // insert it. 
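[A source-level picture of what SimplifyStoreAtEndOfBlock is doing here — editorial illustration only; the pass works on IR, and the isSameOperationAs change above additionally requires the two stores to agree on volatility, alignment, and atomic ordering rather than on alignment alone:

  void before(bool c, int *p, int a, int b) {
    if (c) *p = a;   // store in the 'then' block
    else   *p = b;   // store in the 'else' block
  }

  void after(bool c, int *p, int a, int b) {
    *p = c ? a : b;  // the two stores merged into one in the join block
  }
]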
- BBI = DestBB->getFirstNonPHI(); + BBI = DestBB->getFirstInsertionPt(); StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1), - OtherStore->isVolatile(), - SI.getAlignment()); + SI.isVolatile(), + SI.getAlignment(), + SI.getOrdering(), + SI.getSynchScope()); InsertNewInstBefore(NewSI, *BBI); NewSI->setDebugLoc(OtherStore->getDebugLoc()); diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 630a6fe..7f48125 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -38,7 +38,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { m_Value(B))) && // The "1" can be any value known to be a power of 2. isPowerOfTwo(PowerOf2, IC.getTargetData())) { - A = IC.Builder->CreateSub(A, B, "tmp"); + A = IC.Builder->CreateSub(A, B); return IC.Builder->CreateShl(PowerOf2, A); } @@ -131,7 +131,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { { Value *X; ConstantInt *C1; if (Op0->hasOneUse() && match(Op0, m_Add(m_Value(X), m_ConstantInt(C1)))) { - Value *Add = Builder->CreateMul(X, CI, "tmp"); + Value *Add = Builder->CreateMul(X, CI); return BinaryOperator::CreateAdd(Add, Builder->CreateMul(C1, CI)); } } @@ -244,7 +244,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (BoolCast) { Value *V = Builder->CreateSub(Constant::getNullValue(I.getType()), - BoolCast, "tmp"); + BoolCast); return BinaryOperator::CreateAnd(V, OtherOp); } } @@ -421,7 +421,7 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { /// dyn_castZExtVal - Checks if V is a zext or constant that can /// be truncated to Ty without losing bits. -static Value *dyn_castZExtVal(Value *V, const Type *Ty) { +static Value *dyn_castZExtVal(Value *V, Type *Ty) { if (ZExtInst *Z = dyn_cast<ZExtInst>(V)) { if (Z->getSrcTy() == Ty) return Z->getOperand(0); @@ -466,8 +466,7 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { { const APInt *CI; Value *N; if (match(Op1, m_Shl(m_Power2(CI), m_Value(N)))) { if (*CI != 1) - N = Builder->CreateAdd(N, ConstantInt::get(I.getType(), CI->logBase2()), - "tmp"); + N = Builder->CreateAdd(N, ConstantInt::get(I.getType(),CI->logBase2())); if (I.isExact()) return BinaryOperator::CreateExactLShr(Op0, N); return BinaryOperator::CreateLShr(Op0, N); @@ -630,7 +629,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1) if (match(Op1, m_Shl(m_Power2(), m_Value()))) { Constant *N1 = Constant::getAllOnesValue(I.getType()); - Value *Add = Builder->CreateAdd(Op1, N1, "tmp"); + Value *Add = Builder->CreateAdd(Op1, N1); return BinaryOperator::CreateAnd(Op0, Add); } diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index 3777340..664546c 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -28,8 +28,8 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { Value *LHSVal = FirstInst->getOperand(0); Value *RHSVal = FirstInst->getOperand(1); - const Type *LHSType = LHSVal->getType(); - const Type *RHSType = RHSVal->getType(); + Type *LHSType = LHSVal->getType(); + Type *RHSType = RHSVal->getType(); bool isNUW = false, isNSW = false, isExact = false; if (OverflowingBinaryOperator *BO = @@ -229,8 +229,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { Value *Base = FixedOperands[0]; GetElementPtrInst *NewGEP = - 
GetElementPtrInst::Create(Base, FixedOperands.begin()+1, - FixedOperands.end()); + GetElementPtrInst::Create(Base, makeArrayRef(FixedOperands).slice(1)); if (AllInBounds) NewGEP->setIsInBounds(); NewGEP->setDebugLoc(FirstInst->getDebugLoc()); return NewGEP; @@ -287,7 +286,12 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0)); - + + // FIXME: This is overconservative; this transform is allowed in some cases + // for atomic operations. + if (FirstLI->isAtomic()) + return 0; + // When processing loads, we need to propagate two bits of information to the // sunk load: whether it is volatile, and what its alignment is. We currently // don't sink loads when some have their alignment specified and some don't. @@ -397,7 +401,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { // the same type or "+42") we can pull the operation through the PHI, reducing // code size and simplifying code. Constant *ConstantOp = 0; - const Type *CastSrcTy = 0; + Type *CastSrcTy = 0; bool isNUW = false, isNSW = false, isExact = false; if (isa<CastInst>(FirstInst)) { @@ -572,7 +576,7 @@ struct LoweredPHIRecord { unsigned Shift; // The amount shifted. unsigned Width; // The width extracted. - LoweredPHIRecord(PHINode *pn, unsigned Sh, const Type *Ty) + LoweredPHIRecord(PHINode *pn, unsigned Sh, Type *Ty) : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {} // Ctor form used by DenseMap. @@ -701,7 +705,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { unsigned PHIId = PHIUsers[UserI].PHIId; PHINode *PN = PHIsToSlice[PHIId]; unsigned Offset = PHIUsers[UserI].Shift; - const Type *Ty = PHIUsers[UserI].Inst->getType(); + Type *Ty = PHIUsers[UserI].Inst->getType(); PHINode *EltPHI; diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 5733c20..91e60a4 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -13,6 +13,7 @@ #include "InstCombine.h" #include "llvm/Support/PatternMatch.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" using namespace llvm; using namespace PatternMatch; @@ -323,9 +324,14 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, } // All operands were constants, fold it. - if (ConstOps.size() == I->getNumOperands()) + if (ConstOps.size() == I->getNumOperands()) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + if (!LI->isVolatile()) + return ConstantFoldLoadFromConstPtr(ConstOps[0], TD); + return ConstantFoldInstOperands(I->getOpcode(), I->getType(), - ConstOps.data(), ConstOps.size(), TD); + ConstOps, TD); + } } return 0; @@ -363,7 +369,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_SGT: { // These transformations only work for selects over integers. - const IntegerType *SelectTy = dyn_cast<IntegerType>(SI.getType()); + IntegerType *SelectTy = dyn_cast<IntegerType>(SI.getType()); if (!SelectTy) break; @@ -443,7 +449,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, // FIXME: Type and constness constraints could be lifted, but we have to // watch code size carefully. We should consider xor instead of // sub/add when we decide to do that. 
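[Editorial illustration of the select folds built on SimplifyWithOpReplaced, which the hunk above extends to constant-fold loads from constant pointers: substituting the compared operands into one arm and simplifying can prove the two arms equal, so the select disappears.

  // Both selects fold away: substituting x -> 0 into one arm under the
  // guard of the comparison makes the arms provably equal.
  int fold_eq(int x) { return x == 0 ? 0 : x; }  // simplifies to: return x;
  int fold_ne(int x) { return x != 0 ? x : 0; }  // simplifies to: return x;
]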
- if (const IntegerType *Ty = dyn_cast<IntegerType>(CmpLHS->getType())) { + if (IntegerType *Ty = dyn_cast<IntegerType>(CmpLHS->getType())) { if (TrueVal->getType() == Ty) { if (ConstantInt *Cmp = dyn_cast<ConstantInt>(CmpRHS)) { ConstantInt *C1 = NULL, *C2 = NULL; @@ -476,10 +482,16 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD) == TrueVal || SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD) == TrueVal) return ReplaceInstUsesWith(SI, FalseVal); + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD) == FalseVal || + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD) == FalseVal) + return ReplaceInstUsesWith(SI, FalseVal); } else if (Pred == ICmpInst::ICMP_NE) { if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD) == FalseVal || SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD) == FalseVal) return ReplaceInstUsesWith(SI, TrueVal); + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD) == TrueVal || + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD) == TrueVal) + return ReplaceInstUsesWith(SI, TrueVal); } // NOTE: if we wanted to, this is where to detect integer MIN/MAX diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 811f949..6d85add 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -13,6 +13,7 @@ #include "InstCombine.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Support/PatternMatch.h" using namespace llvm; @@ -207,11 +208,12 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, return I; case Instruction::Shl: { - unsigned TypeWidth = I->getType()->getScalarSizeInBits(); + BinaryOperator *BO = cast<BinaryOperator>(I); + unsigned TypeWidth = BO->getType()->getScalarSizeInBits(); // We only accept shifts-by-a-constant in CanEvaluateShifted. - ConstantInt *CI = cast<ConstantInt>(I->getOperand(1)); - + ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1)); + // We can always fold shl(c1)+shl(c2) -> shl(c1+c2). if (isLeftShift) { // If this is oversized composite shift, then unsigned shifts get 0. @@ -219,7 +221,9 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, if (NewShAmt >= TypeWidth) return Constant::getNullValue(I->getType()); - I->setOperand(1, ConstantInt::get(I->getType(), NewShAmt)); + BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt)); + BO->setHasNoUnsignedWrap(false); + BO->setHasNoSignedWrap(false); return I; } @@ -227,11 +231,11 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, // zeros. if (CI->getValue() == NumBits) { APInt Mask(APInt::getLowBitsSet(TypeWidth, TypeWidth - NumBits)); - V = IC.Builder->CreateAnd(I->getOperand(0), - ConstantInt::get(I->getContext(), Mask)); + V = IC.Builder->CreateAnd(BO->getOperand(0), + ConstantInt::get(BO->getContext(), Mask)); if (Instruction *VI = dyn_cast<Instruction>(V)) { - VI->moveBefore(I); - VI->takeName(I); + VI->moveBefore(BO); + VI->takeName(BO); } return V; } @@ -239,23 +243,27 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, // We turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but only when we know that // the and won't be needed. 
assert(CI->getZExtValue() > NumBits); - I->setOperand(1, ConstantInt::get(I->getType(), - CI->getZExtValue() - NumBits)); - return I; + BO->setOperand(1, ConstantInt::get(BO->getType(), + CI->getZExtValue() - NumBits)); + BO->setHasNoUnsignedWrap(false); + BO->setHasNoSignedWrap(false); + return BO; } case Instruction::LShr: { - unsigned TypeWidth = I->getType()->getScalarSizeInBits(); + BinaryOperator *BO = cast<BinaryOperator>(I); + unsigned TypeWidth = BO->getType()->getScalarSizeInBits(); // We only accept shifts-by-a-constant in CanEvaluateShifted. - ConstantInt *CI = cast<ConstantInt>(I->getOperand(1)); + ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1)); // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2). if (!isLeftShift) { // If this is oversized composite shift, then unsigned shifts get 0. unsigned NewShAmt = NumBits+CI->getZExtValue(); if (NewShAmt >= TypeWidth) - return Constant::getNullValue(I->getType()); + return Constant::getNullValue(BO->getType()); - I->setOperand(1, ConstantInt::get(I->getType(), NewShAmt)); + BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt)); + BO->setIsExact(false); return I; } @@ -264,7 +272,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, if (CI->getValue() == NumBits) { APInt Mask(APInt::getHighBitsSet(TypeWidth, TypeWidth - NumBits)); V = IC.Builder->CreateAnd(I->getOperand(0), - ConstantInt::get(I->getContext(), Mask)); + ConstantInt::get(BO->getContext(), Mask)); if (Instruction *VI = dyn_cast<Instruction>(V)) { VI->moveBefore(I); VI->takeName(I); @@ -275,9 +283,10 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, // We turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but only when we know that // the and won't be needed. assert(CI->getZExtValue() > NumBits); - I->setOperand(1, ConstantInt::get(I->getType(), - CI->getZExtValue() - NumBits)); - return I; + BO->setOperand(1, ConstantInt::get(BO->getType(), + CI->getZExtValue() - NumBits)); + BO->setIsExact(false); + return BO; } case Instruction::Select: @@ -528,7 +537,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, uint32_t AmtSum = ShiftAmt1+ShiftAmt2; // Fold into one big shift. 
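[The shl-of-shl case in GetShiftedValue above folds two constant shifts into one; the diff also clears the nuw/nsw flags because they were proven for the original shift amount, not the combined one. The underlying identity, checked exhaustively at 8 bits — standalone illustration, not LLVM code:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned v = 0; v <= 0xFF; ++v)
      for (unsigned c1 = 0; c1 < 8; ++c1)
        for (unsigned c2 = 0; c1 + c2 < 8; ++c2)
          // shl(c1) followed by shl(c2) equals one shl(c1+c2), with bits
          // truncated to the 8-bit type width at each step.
          assert((uint8_t)((uint8_t)(v << c1) << c2) ==
                 (uint8_t)(v << (c1 + c2)));
    return 0;
  }
]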
- const IntegerType *Ty = cast<IntegerType>(I.getType()); + IntegerType *Ty = cast<IntegerType>(I.getType()); // Check for (X << c1) << c2 and (X >> c1) >> c2 if (I.getOpcode() == ShiftOp->getOpcode()) { diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 8fea8eb..5cd9a4b 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -103,7 +103,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, assert(V != 0 && "Null pointer of Value???"); assert(Depth <= 6 && "Limit Search Depth"); uint32_t BitWidth = DemandedMask.getBitWidth(); - const Type *VTy = V->getType(); + Type *VTy = V->getType(); assert((TD || !VTy->isPointerTy()) && "SimplifyDemandedBits needs to know bit widths!"); assert((!TD || TD->getTypeSizeInBits(VTy->getScalarType()) == BitWidth) && @@ -325,8 +325,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if ((RHSKnownOne & LHSKnownOne) == RHSKnownOne) { Constant *AndC = Constant::getIntegerValue(VTy, ~RHSKnownOne & DemandedMask); - Instruction *And = - BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp"); + Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC); return InsertNewInstWith(And, *I); } } @@ -351,14 +350,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Constant *AndC = ConstantInt::get(I->getType(), NewMask & AndRHS->getValue()); - Instruction *NewAnd = - BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp"); + Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC); InsertNewInstWith(NewAnd, *I); Constant *XorC = ConstantInt::get(I->getType(), NewMask & XorRHS->getValue()); - Instruction *NewXor = - BinaryOperator::CreateXor(NewAnd, XorC, "tmp"); + Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC); return InsertNewInstWith(NewXor, *I); } @@ -404,8 +401,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (!I->getOperand(0)->getType()->isIntOrIntVectorTy()) return 0; // vector->int or fp->int? - if (const VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) { - if (const VectorType *SrcVTy = + if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) { + if (VectorType *SrcVTy = dyn_cast<VectorType>(I->getOperand(0)->getType())) { if (DstVTy->getNumElements() != SrcVTy->getNumElements()) // Don't touch a bitcast between vectors of different element counts. 
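[Editorial note on the demanded-bits rewrites above: bits of a constant lying outside the demanded mask can never influence the result, so the constant may be narrowed. A standalone exhaustive check of the xor instance, not LLVM code:

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint8_t M = 0x0F;  // the demanded bits
    for (unsigned x = 0; x <= 0xFF; ++x)
      for (unsigned c = 0; c <= 0xFF; ++c)
        // Clearing the non-demanded bits of the xor constant is
        // invisible through the mask M.
        assert(((x ^ c) & M) == ((x ^ (c & M)) & M));
    return 0;
  }
]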
@@ -826,7 +823,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, UndefElts = 0; if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) { - const Type *EltTy = cast<VectorType>(V->getType())->getElementType(); + Type *EltTy = cast<VectorType>(V->getType())->getElementType(); Constant *Undef = UndefValue::get(EltTy); std::vector<Constant*> Elts; @@ -855,7 +852,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (DemandedElts.isAllOnesValue()) return 0; - const Type *EltTy = cast<VectorType>(V->getType())->getElementType(); + Type *EltTy = cast<VectorType>(V->getType())->getElementType(); Constant *Zero = Constant::getNullValue(EltTy); Constant *Undef = UndefValue::get(EltTy); std::vector<Constant*> Elts; @@ -962,6 +959,9 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, unsigned MaskVal = Shuffle->getMaskValue(i); if (MaskVal == -1u) { UndefElts.setBit(i); + } else if (!DemandedElts[i]) { + NewUndefElts = true; + UndefElts.setBit(i); } else if (MaskVal < LHSVWidth) { if (UndefElts4[MaskVal]) { NewUndefElts = true; @@ -992,7 +992,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } case Instruction::BitCast: { // Vector->vector casts only. - const VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType()); + VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType()); if (!VTy) break; unsigned InVWidth = VTy->getNumElements(); APInt InputDemandedElts(InVWidth, 0); diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index ad6a8d0..154267c 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -77,7 +77,7 @@ static std::vector<int> getShuffleMask(const ShuffleVectorInst *SVI) { /// extracted from the vector. static Value *FindScalarElement(Value *V, unsigned EltNo) { assert(V->getType()->isVectorTy() && "Not looking at a vector?"); - const VectorType *PTy = cast<VectorType>(V->getType()); + VectorType *PTy = cast<VectorType>(V->getType()); unsigned Width = PTy->getNumElements(); if (EltNo >= Width) // Out of range access. return UndefValue::get(PTy->getElementType()); @@ -175,7 +175,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // the same number of elements, see if we can find the source element from // it. In this case, we will end up needing to bitcast the scalars. if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) { - if (const VectorType *VT = + if (VectorType *VT = dyn_cast<VectorType>(BCI->getOperand(0)->getType())) if (VT->getNumElements() == VectorWidth) if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal)) @@ -225,7 +225,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { SrcIdx -= LHSWidth; Src = SVI->getOperand(1); } - const Type *Int32Ty = Type::getInt32Ty(EI.getContext()); + Type *Int32Ty = Type::getInt32Ty(EI.getContext()); return ExtractElementInst::Create(Src, ConstantInt::get(Int32Ty, SrcIdx, false)); @@ -555,7 +555,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // shuffle mask, do the replacement. 
if (isSplat || NewMask == LHSMask || NewMask == Mask) { std::vector<Constant*> Elts; - const Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); + Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); for (unsigned i = 0, e = NewMask.size(); i != e; ++i) { if (NewMask[i] < 0) { Elts.push_back(UndefValue::get(Int32Ty)); diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index ab98ef9..92874b9 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -46,8 +46,10 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/PatternMatch.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm-c/Initialization.h" #include <algorithm> #include <climits> @@ -83,7 +85,7 @@ void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const { /// ShouldChangeType - Return true if it is desirable to convert a computation /// from 'From' to 'To'. We don't want to convert from a legal to an illegal /// type for example, or from a smaller to a larger illegal type. -bool InstCombiner::ShouldChangeType(const Type *From, const Type *To) const { +bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { assert(From->isIntegerTy() && To->isIntegerTy()); // If we don't have TD, we don't know if the source/dest are legal. @@ -107,6 +109,43 @@ bool InstCombiner::ShouldChangeType(const Type *From, const Type *To) const { return true; } +// Return true, if No Signed Wrap should be maintained for I. +// The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C", +// where both B and C should be ConstantInts, results in a constant that does +// not overflow. This function only handles the Add and Sub opcodes. For +// all other opcodes, the function conservatively returns false. +static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { + OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(&I); + if (!OBO || !OBO->hasNoSignedWrap()) { + return false; + } + + // We reason about Add and Sub Only. + Instruction::BinaryOps Opcode = I.getOpcode(); + if (Opcode != Instruction::Add && + Opcode != Instruction::Sub) { + return false; + } + + ConstantInt *CB = dyn_cast<ConstantInt>(B); + ConstantInt *CC = dyn_cast<ConstantInt>(C); + + if (!CB || !CC) { + return false; + } + + const APInt &BVal = CB->getValue(); + const APInt &CVal = CC->getValue(); + bool Overflow = false; + + if (Opcode == Instruction::Add) { + BVal.sadd_ov(CVal, Overflow); + } else { + BVal.ssub_ov(CVal, Overflow); + } + + return !Overflow; +} /// SimplifyAssociativeOrCommutative - This performs a few simplifications for /// operators which are associative or commutative: @@ -158,7 +197,16 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { I.setOperand(1, V); // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. - I.clearSubclassOptionalData(); + if (MaintainNoSignedWrap(I, B, C) && + (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) { + // Note: this is only valid because SimplifyBinOp doesn't look at + // the operands to Op0. 
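[Editorial gloss on the new MaintainNoSignedWrap helper above: nsw survives the reassociation only when the folded constant operation itself cannot overflow, which is what the sadd_ov/ssub_ov probes decide. The same test written with a checked-arithmetic builtin — assumes gcc/clang, illustration only:

  #include <cstdint>

  // Mirrors the Add case of MaintainNoSignedWrap: keep nsw only if B + C
  // is representable, i.e. the constant fold cannot signed-overflow.
  static bool addKeepsNSW(int32_t B, int32_t C) {
    int32_t R;
    return !__builtin_add_overflow(B, C, &R);
  }
]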
+ I.clearSubclassOptionalData(); + I.setHasNoSignedWrap(true); + } else { + I.clearSubclassOptionalData(); + } + Changed = true; ++NumReassoc; continue; @@ -240,7 +288,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Constant *C2 = cast<Constant>(Op1->getOperand(1)); Constant *Folded = ConstantExpr::get(Opcode, C1, C2); - Instruction *New = BinaryOperator::Create(Opcode, A, B); + BinaryOperator *New = BinaryOperator::Create(Opcode, A, B); InsertNewInstWith(New, I); New->takeName(Op1); I.setOperand(0, New); @@ -248,6 +296,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. I.clearSubclassOptionalData(); + Changed = true; continue; } @@ -516,8 +565,8 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { // If it's a bitcast involving vectors, make sure it has the same number of // elements on both sides. if (BitCastInst *BC = dyn_cast<BitCastInst>(&Op)) { - const VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy()); - const VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy()); + VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy()); + VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy()); // Verify that either both or neither are vectors. if ((SrcTy == NULL) != (DestTy == NULL)) return 0; @@ -654,7 +703,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { } } else { CastInst *CI = cast<CastInst>(&I); - const Type *RetTy = CI->getType(); + Type *RetTy = CI->getType(); for (unsigned i = 0; i != NumPHIValues; ++i) { Value *InV; if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) @@ -680,7 +729,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { /// or not there is a sequence of GEP indices into the type that will land us at /// the specified offset. If so, fill them into NewIndices and return the /// resultant element type, otherwise return null. -const Type *InstCombiner::FindElementAtOffset(const Type *Ty, int64_t Offset, +Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, SmallVectorImpl<Value*> &NewIndices) { if (!TD) return 0; if (!Ty->isSized()) return 0; @@ -688,7 +737,7 @@ const Type *InstCombiner::FindElementAtOffset(const Type *Ty, int64_t Offset, // Start with the index over the outer type. 
Note that the type size // might be zero (even if the offset isn't zero) if the indexed type // is something like [0 x {int, int}] - const Type *IntPtrTy = TD->getIntPtrType(Ty->getContext()); + Type *IntPtrTy = TD->getIntPtrType(Ty->getContext()); int64_t FirstIdx = 0; if (int64_t TySize = TD->getTypeAllocSize(Ty)) { FirstIdx = Offset/TySize; @@ -711,7 +760,7 @@ const Type *InstCombiner::FindElementAtOffset(const Type *Ty, int64_t Offset, if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty)) return 0; - if (const StructType *STy = dyn_cast<StructType>(Ty)) { + if (StructType *STy = dyn_cast<StructType>(Ty)) { const StructLayout *SL = TD->getStructLayout(STy); assert(Offset < (int64_t)SL->getSizeInBytes() && "Offset must stay within the indexed type"); @@ -722,7 +771,7 @@ const Type *InstCombiner::FindElementAtOffset(const Type *Ty, int64_t Offset, Offset -= SL->getElementOffset(Elt); Ty = STy->getElementType(Elt); - } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) { + } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) { uint64_t EltSize = TD->getTypeAllocSize(AT->getElementType()); assert(EltSize && "Cannot index into a zero-sized array"); NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize)); @@ -737,12 +786,20 @@ const Type *InstCombiner::FindElementAtOffset(const Type *Ty, int64_t Offset, return Ty; } - +static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) { + // If this GEP has only 0 indices, it is the same pointer as + // Src. If Src is not a trivial GEP too, don't combine + // the indices. + if (GEP.hasAllZeroIndices() && !Src.hasAllZeroIndices() && + !Src.hasOneUse()) + return false; + return true; +} Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end()); - if (Value *V = SimplifyGEPInst(&Ops[0], Ops.size(), TD)) + if (Value *V = SimplifyGEPInst(Ops, TD)) return ReplaceInstUsesWith(GEP, V); Value *PtrOp = GEP.getOperand(0); @@ -751,13 +808,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // by multiples of a zero size type with zero. if (TD) { bool MadeChange = false; - const Type *IntPtrTy = TD->getIntPtrType(GEP.getContext()); + Type *IntPtrTy = TD->getIntPtrType(GEP.getContext()); gep_type_iterator GTI = gep_type_begin(GEP); for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E; ++I, ++GTI) { // Skip indices into struct types. - const SequentialType *SeqTy = dyn_cast<SequentialType>(*GTI); + SequentialType *SeqTy = dyn_cast<SequentialType>(*GTI); if (!SeqTy) continue; // If the element type has zero size then any index over it is equivalent @@ -785,21 +842,15 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // getelementptr instructions into a single instruction. // if (GEPOperator *Src = dyn_cast<GEPOperator>(PtrOp)) { - - // If this GEP has only 0 indices, it is the same pointer as - // Src. If Src is not a trivial GEP too, don't combine - // the indices. - if (GEP.hasAllZeroIndices() && !Src->hasAllZeroIndices() && - !Src->hasOneUse()) + if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src)) return 0; // Note that if our source is a gep chain itself that we wait for that // chain to be resolved before we perform this transformation. This // avoids us creating a TON of code in some cases. 
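[The gep-of-gep merge being set up here is ordinary pointer-arithmetic reassociation. In C terms, as an editorial illustration:

  #include <cassert>

  int main() {
    int buf[16] = {0};
    int *a = buf + 3;      // gep buf, 3
    int *b = a + 5;        // gep (gep buf, 3), 5
    assert(b == buf + 8);  // the merged form: gep buf, 8
    return 0;
  }
]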
- // - if (GetElementPtrInst *SrcGEP = - dyn_cast<GetElementPtrInst>(Src->getOperand(0))) - if (SrcGEP->getNumOperands() == 2) + if (GEPOperator *SrcGEP = + dyn_cast<GEPOperator>(Src->getOperand(0))) + if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP)) return 0; // Wait until our source is folded to completion. SmallVector<Value*, 8> Indices; @@ -851,15 +902,14 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (!Indices.empty()) return (GEP.isInBounds() && Src->isInBounds()) ? - GetElementPtrInst::CreateInBounds(Src->getOperand(0), Indices.begin(), - Indices.end(), GEP.getName()) : - GetElementPtrInst::Create(Src->getOperand(0), Indices.begin(), - Indices.end(), GEP.getName()); + GetElementPtrInst::CreateInBounds(Src->getOperand(0), Indices, + GEP.getName()) : + GetElementPtrInst::Create(Src->getOperand(0), Indices, GEP.getName()); } // Handle gep(bitcast x) and gep(gep x, 0, 0, 0). Value *StrippedPtr = PtrOp->stripPointerCasts(); - const PointerType *StrippedPtrTy =cast<PointerType>(StrippedPtr->getType()); + PointerType *StrippedPtrTy =cast<PointerType>(StrippedPtr->getType()); if (StrippedPtr != PtrOp && StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) { @@ -875,21 +925,20 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // // This occurs when the program declares an array extern like "int X[];" if (HasZeroPointerIndex) { - const PointerType *CPTy = cast<PointerType>(PtrOp->getType()); - if (const ArrayType *CATy = + PointerType *CPTy = cast<PointerType>(PtrOp->getType()); + if (ArrayType *CATy = dyn_cast<ArrayType>(CPTy->getElementType())) { // GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ? if (CATy->getElementType() == StrippedPtrTy->getElementType()) { // -> GEP i8* X, ... SmallVector<Value*, 8> Idx(GEP.idx_begin()+1, GEP.idx_end()); GetElementPtrInst *Res = - GetElementPtrInst::Create(StrippedPtr, Idx.begin(), - Idx.end(), GEP.getName()); + GetElementPtrInst::Create(StrippedPtr, Idx, GEP.getName()); Res->setIsInBounds(GEP.isInBounds()); return Res; } - if (const ArrayType *XATy = + if (ArrayType *XATy = dyn_cast<ArrayType>(StrippedPtrTy->getElementType())){ // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ? if (CATy->getElementType() == XATy->getElementType()) { @@ -907,8 +956,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Transform things like: // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V // into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast - const Type *SrcElTy = StrippedPtrTy->getElementType(); - const Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType(); + Type *SrcElTy = StrippedPtrTy->getElementType(); + Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType(); if (TD && SrcElTy->isArrayTy() && TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()) == TD->getTypeAllocSize(ResElTy)) { @@ -916,8 +965,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext())); Idx[1] = GEP.getOperand(1); Value *NewGEP = GEP.isInBounds() ? 
- Builder->CreateInBoundsGEP(StrippedPtr, Idx, Idx + 2, GEP.getName()) : - Builder->CreateGEP(StrippedPtr, Idx, Idx + 2, GEP.getName()); + Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) : + Builder->CreateGEP(StrippedPtr, Idx, GEP.getName()); // V and GEP are both pointer types --> BitCast return new BitCastInst(NewGEP, GEP.getType()); } @@ -975,8 +1024,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext())); Idx[1] = NewIdx; Value *NewGEP = GEP.isInBounds() ? - Builder->CreateInBoundsGEP(StrippedPtr, Idx, Idx + 2,GEP.getName()): - Builder->CreateGEP(StrippedPtr, Idx, Idx + 2, GEP.getName()); + Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()): + Builder->CreateGEP(StrippedPtr, Idx, GEP.getName()); // The NewGEP must be pointer typed, so must the old one -> BitCast return new BitCastInst(NewGEP, GEP.getType()); } @@ -1023,14 +1072,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // field at Offset in 'A's type. If so, we can pull the cast through the // GEP. SmallVector<Value*, 8> NewIndices; - const Type *InTy = + Type *InTy = cast<PointerType>(BCI->getOperand(0)->getType())->getElementType(); if (FindElementAtOffset(InTy, Offset, NewIndices)) { Value *NGEP = GEP.isInBounds() ? - Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices.begin(), - NewIndices.end()) : - Builder->CreateGEP(BCI->getOperand(0), NewIndices.begin(), - NewIndices.end()); + Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) : + Builder->CreateGEP(BCI->getOperand(0), NewIndices); if (NGEP->getType() == GEP.getType()) return ReplaceInstUsesWith(GEP, NGEP); @@ -1045,15 +1092,43 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { -static bool IsOnlyNullComparedAndFreed(const Value &V) { - for (Value::const_use_iterator UI = V.use_begin(), UE = V.use_end(); +static bool IsOnlyNullComparedAndFreed(Value *V, SmallVectorImpl<WeakVH> &Users, + int Depth = 0) { + if (Depth == 8) + return false; + + for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE; ++UI) { - const User *U = *UI; - if (isFreeCall(U)) + User *U = *UI; + if (isFreeCall(U)) { + Users.push_back(U); continue; - if (const ICmpInst *ICI = dyn_cast<ICmpInst>(U)) - if (ICI->isEquality() && isa<ConstantPointerNull>(ICI->getOperand(1))) + } + if (ICmpInst *ICI = dyn_cast<ICmpInst>(U)) { + if (ICI->isEquality() && isa<ConstantPointerNull>(ICI->getOperand(1))) { + Users.push_back(ICI); + continue; + } + } + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { + if (IsOnlyNullComparedAndFreed(BCI, Users, Depth+1)) { + Users.push_back(BCI); + continue; + } + } + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) { + if (IsOnlyNullComparedAndFreed(GEPI, Users, Depth+1)) { + Users.push_back(GEPI); + continue; + } + } + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + Users.push_back(II); continue; + } + } return false; } return true; @@ -1063,25 +1138,20 @@ Instruction *InstCombiner::visitMalloc(Instruction &MI) { // If we have a malloc call which is only used in any amount of comparisons // to null and free calls, delete the calls and replace the comparisons with // true or false as appropriate. 
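Aside: the rewritten IsOnlyNullComparedAndFreed above now also walks through bitcasts, GEPs and lifetime intrinsics while collecting users, and visitMalloc below deletes everything it collected. The source-level effect, roughly (plain C++ analogue, not the pass itself):

    #include <cassert>
    #include <cstdlib>

    // A malloc whose result is only null-checked and freed has no other
    // observable use, so the allocation, the free and the compare can all
    // be deleted; the equality test folds to "not null".
    static int before() {
      void *P = std::malloc(16);   // never loaded from or stored through
      int IsNull = (P == nullptr); // use #1: null comparison
      std::free(P);                // use #2: free
      return IsNull;
    }

    static int after() {
      return 0; // icmp eq folds to false, per C->isFalseWhenEqual()
    }

    int main() {
      assert(before() == after()); // equal whenever malloc(16) succeeds
      return 0;
    }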
- if (IsOnlyNullComparedAndFreed(MI)) { - for (Value::use_iterator UI = MI.use_begin(), UE = MI.use_end(); - UI != UE;) { - // We can assume that every remaining use is a free call or an icmp eq/ne - // to null, so the cast is safe. - Instruction *I = cast<Instruction>(*UI); - - // Early increment here, as we're about to get rid of the user. - ++UI; - - if (isFreeCall(I)) { - EraseInstFromFunction(*cast<CallInst>(I)); - continue; + SmallVector<WeakVH, 64> Users; + if (IsOnlyNullComparedAndFreed(&MI, Users)) { + for (unsigned i = 0, e = Users.size(); i != e; ++i) { + Instruction *I = cast_or_null<Instruction>(&*Users[i]); + if (!I) continue; + + if (ICmpInst *C = dyn_cast<ICmpInst>(I)) { + ReplaceInstUsesWith(*C, + ConstantInt::get(Type::getInt1Ty(C->getContext()), + C->isFalseWhenEqual())); + } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { + ReplaceInstUsesWith(*I, UndefValue::get(I->getType())); } - // Again, the cast is safe. - ICmpInst *C = cast<ICmpInst>(I); - ReplaceInstUsesWith(*C, ConstantInt::get(Type::getInt1Ty(C->getContext()), - C->isFalseWhenEqual())); - EraseInstFromFunction(*C); + EraseInstFromFunction(*I); } return EraseInstFromFunction(MI); } @@ -1120,8 +1190,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { !isa<Constant>(X)) { // Swap Destinations and condition... BI.setCondition(X); - BI.setSuccessor(0, FalseDest); - BI.setSuccessor(1, TrueDest); + BI.swapSuccessors(); return &BI; } @@ -1136,8 +1205,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { Cond->setPredicate(FCmpInst::getInversePredicate(FPred)); // Swap Destinations and condition. - BI.setSuccessor(0, FalseDest); - BI.setSuccessor(1, TrueDest); + BI.swapSuccessors(); Worklist.Add(Cond); return &BI; } @@ -1153,8 +1221,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { ICmpInst *Cond = cast<ICmpInst>(BI.getCondition()); Cond->setPredicate(ICmpInst::getInversePredicate(IPred)); // Swap Destinations and condition. - BI.setSuccessor(0, FalseDest); - BI.setSuccessor(1, TrueDest); + BI.swapSuccessors(); Worklist.Add(Cond); return &BI; } @@ -1168,11 +1235,17 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { if (I->getOpcode() == Instruction::Add) if (ConstantInt *AddRHS = dyn_cast<ConstantInt>(I->getOperand(1))) { // change 'switch (X+4) case 1:' into 'switch (X) case -3' - for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2) - SI.setOperand(i, - ConstantExpr::getSub(cast<Constant>(SI.getOperand(i)), - AddRHS)); - SI.setOperand(0, I->getOperand(0)); + unsigned NumCases = SI.getNumCases(); + // Skip the first item since that's the default case. 
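The case-value rewrite here ('switch (X+4) case 1:' becoming 'switch (X) case -3:') is easy to sanity-check in source form before reading the rewritten loop below:

    #include <cassert>

    static int dispatchBefore(int X) {
      switch (X + 4) { case 1: return 10; default: return 0; }
    }

    // Same dispatch with the add folded into the case value, which is what
    // the loop below does to every case of the SwitchInst.
    static int dispatchAfter(int X) {
      switch (X) { case 1 - 4: return 10; default: return 0; }
    }

    int main() {
      for (int X = -10; X <= 10; ++X)
        assert(dispatchBefore(X) == dispatchAfter(X));
      return 0;
    }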
+ for (unsigned i = 1; i < NumCases; ++i) { + ConstantInt* CaseVal = SI.getCaseValue(i); + Constant* NewCaseVal = ConstantExpr::getSub(cast<Constant>(CaseVal), + AddRHS); + assert(isa<ConstantInt>(NewCaseVal) && + "Result of expression should be constant"); + SI.setSuccessorValue(i, cast<ConstantInt>(NewCaseVal)); + } + SI.setCondition(I->getOperand(0)); Worklist.Add(I); return &SI; } @@ -1242,7 +1315,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { Value *NewEV = Builder->CreateExtractValue(IV->getAggregateOperand(), EV.getIndices()); return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(), - ArrayRef<unsigned>(insi, inse)); + makeArrayRef(insi, inse)); } if (insi == inse) // The insert list is a prefix of the extract list @@ -1254,7 +1327,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // with // %E extractvalue { i32 } { i32 42 }, 0 return ExtractValueInst::Create(IV->getInsertedValueOperand(), - ArrayRef<unsigned>(exti, exte)); + makeArrayRef(exti, exte)); } if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Agg)) { // We're extracting from an intrinsic, see if we're the only user, which @@ -1310,7 +1383,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // load from a GEP. This reduces the size of the load. // FIXME: If a load is used only by extractvalue instructions then this // could be done regardless of having multiple uses. - if (!L->isVolatile() && L->hasOneUse()) { + if (L->isSimple() && L->hasOneUse()) { // extractvalue has integer indices, getelementptr has Value*s. Convert. SmallVector<Value*, 4> Indices; // Prefix an i32 0 since we need the first element. @@ -1322,8 +1395,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // We need to insert these at the location of the old load, not at that of // the extractvalue. Builder->SetInsertPoint(L->getParent(), L); - Value *GEP = Builder->CreateInBoundsGEP(L->getPointerOperand(), - Indices.begin(), Indices.end()); + Value *GEP = Builder->CreateInBoundsGEP(L->getPointerOperand(), Indices); // Returning the load directly will cause the main loop to insert it in // the wrong spot, so use ReplaceInstUsesWith(). return ReplaceInstUsesWith(EV, Builder->CreateLoad(GEP)); @@ -1339,6 +1411,342 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { return 0; } +enum Personality_Type { + Unknown_Personality, + GNU_Ada_Personality, + GNU_CXX_Personality +}; + +/// RecognizePersonality - See if the given exception handling personality +/// function is one that we understand. If so, return a description of it; +/// otherwise return Unknown_Personality. +static Personality_Type RecognizePersonality(Value *Pers) { + Function *F = dyn_cast<Function>(Pers->stripPointerCasts()); + if (!F) + return Unknown_Personality; + return StringSwitch<Personality_Type>(F->getName()) + .Case("__gnat_eh_personality", GNU_Ada_Personality) + .Case("__gxx_personality_v0", GNU_CXX_Personality) + .Default(Unknown_Personality); +} + +/// isCatchAll - Return 'true' if the given typeinfo will match anything. +static bool isCatchAll(Personality_Type Personality, Constant *TypeInfo) { + switch (Personality) { + case Unknown_Personality: + return false; + case GNU_Ada_Personality: + // While __gnat_all_others_value will match any Ada exception, it doesn't + // match foreign exceptions (or didn't, before gcc-4.7). 
+ return false; + case GNU_CXX_Personality: + return TypeInfo->isNullValue(); + } + llvm_unreachable("Unknown personality!"); +} + +static bool shorter_filter(const Value *LHS, const Value *RHS) { + return + cast<ArrayType>(LHS->getType())->getNumElements() + < + cast<ArrayType>(RHS->getType())->getNumElements(); +} + +Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { + // The logic here should be correct for any real-world personality function. + // However if that turns out not to be true, the offending logic can always + // be conditioned on the personality function, like the catch-all logic is. + Personality_Type Personality = RecognizePersonality(LI.getPersonalityFn()); + + // Simplify the list of clauses, eg by removing repeated catch clauses + // (these are often created by inlining). + bool MakeNewInstruction = false; // If true, recreate using the following: + SmallVector<Value *, 16> NewClauses; // - Clauses for the new instruction; + bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup. + + SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already. + for (unsigned i = 0, e = LI.getNumClauses(); i != e; ++i) { + bool isLastClause = i + 1 == e; + if (LI.isCatch(i)) { + // A catch clause. + Value *CatchClause = LI.getClause(i); + Constant *TypeInfo = cast<Constant>(CatchClause->stripPointerCasts()); + + // If we already saw this clause, there is no point in having a second + // copy of it. + if (AlreadyCaught.insert(TypeInfo)) { + // This catch clause was not already seen. + NewClauses.push_back(CatchClause); + } else { + // Repeated catch clause - drop the redundant copy. + MakeNewInstruction = true; + } + + // If this is a catch-all then there is no point in keeping any following + // clauses or marking the landingpad as having a cleanup. + if (isCatchAll(Personality, TypeInfo)) { + if (!isLastClause) + MakeNewInstruction = true; + CleanupFlag = false; + break; + } + } else { + // A filter clause. If any of the filter elements were already caught + // then they can be dropped from the filter. It is tempting to try to + // exploit the filter further by saying that any typeinfo that does not + // occur in the filter can't be caught later (and thus can be dropped). + // However this would be wrong, since typeinfos can match without being + // equal (for example if one represents a C++ class, and the other some + // class derived from it). + assert(LI.isFilter(i) && "Unsupported landingpad clause!"); + Value *FilterClause = LI.getClause(i); + ArrayType *FilterType = cast<ArrayType>(FilterClause->getType()); + unsigned NumTypeInfos = FilterType->getNumElements(); + + // An empty filter catches everything, so there is no point in keeping any + // following clauses or marking the landingpad as having a cleanup. By + // dealing with this case here the following code is made a bit simpler. + if (!NumTypeInfos) { + NewClauses.push_back(FilterClause); + if (!isLastClause) + MakeNewInstruction = true; + CleanupFlag = false; + break; + } + + bool MakeNewFilter = false; // If true, make a new filter. + SmallVector<Constant *, 16> NewFilterElts; // New elements. + if (isa<ConstantAggregateZero>(FilterClause)) { + // Not an empty filter - it contains at least one null typeinfo. + assert(NumTypeInfos > 0 && "Should have handled empty filter already!"); + Constant *TypeInfo = + Constant::getNullValue(FilterType->getElementType()); + // If this typeinfo is a catch-all then the filter can never match. 
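Backing up to the top of this hunk: the AlreadyCaught set is what drops repeated catch clauses, which inlining creates constantly. A compilable model of that step, with std::set standing in for SmallPtrSet and plain pointers for typeinfos; the catch-all handling described in the comment above resumes right after this aside:

    #include <cassert>
    #include <set>
    #include <vector>

    using TypeInfo = const void *; // stand-in for a stripped typeinfo constant

    // Keep the first occurrence of each typeinfo; report whether any
    // duplicate was dropped (what forces rebuilding the landingpad).
    static bool dedupCatches(std::vector<TypeInfo> &Clauses) {
      std::set<TypeInfo> Seen;
      std::vector<TypeInfo> Kept;
      for (TypeInfo TI : Clauses)
        if (Seen.insert(TI).second) // first sighting: keep it
          Kept.push_back(TI);
      bool Changed = Kept.size() != Clauses.size();
      Clauses.swap(Kept);
      return Changed;
    }

    int main() {
      int A, B;
      std::vector<TypeInfo> Clauses = {&A, &B, &A};
      assert(dedupCatches(Clauses) && Clauses.size() == 2);
      return 0;
    }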
+ if (isCatchAll(Personality, TypeInfo)) { + // Throw the filter away. + MakeNewInstruction = true; + continue; + } + + // There is no point in having multiple copies of this typeinfo, so + // discard all but the first copy if there is more than one. + NewFilterElts.push_back(TypeInfo); + if (NumTypeInfos > 1) + MakeNewFilter = true; + } else { + ConstantArray *Filter = cast<ConstantArray>(FilterClause); + SmallPtrSet<Value *, 16> SeenInFilter; // For uniquing the elements. + NewFilterElts.reserve(NumTypeInfos); + + // Remove any filter elements that were already caught or that already + // occurred in the filter. While there, see if any of the elements are + // catch-alls. If so, the filter can be discarded. + bool SawCatchAll = false; + for (unsigned j = 0; j != NumTypeInfos; ++j) { + Value *Elt = Filter->getOperand(j); + Constant *TypeInfo = cast<Constant>(Elt->stripPointerCasts()); + if (isCatchAll(Personality, TypeInfo)) { + // This element is a catch-all. Bail out, noting this fact. + SawCatchAll = true; + break; + } + if (AlreadyCaught.count(TypeInfo)) + // Already caught by an earlier clause, so having it in the filter + // is pointless. + continue; + // There is no point in having multiple copies of the same typeinfo in + // a filter, so only add it if we didn't already. + if (SeenInFilter.insert(TypeInfo)) + NewFilterElts.push_back(cast<Constant>(Elt)); + } + // A filter containing a catch-all cannot match anything by definition. + if (SawCatchAll) { + // Throw the filter away. + MakeNewInstruction = true; + continue; + } + + // If we dropped something from the filter, make a new one. + if (NewFilterElts.size() < NumTypeInfos) + MakeNewFilter = true; + } + if (MakeNewFilter) { + FilterType = ArrayType::get(FilterType->getElementType(), + NewFilterElts.size()); + FilterClause = ConstantArray::get(FilterType, NewFilterElts); + MakeNewInstruction = true; + } + + NewClauses.push_back(FilterClause); + + // If the new filter is empty then it will catch everything so there is + // no point in keeping any following clauses or marking the landingpad + // as having a cleanup. The case of the original filter being empty was + // already handled above. + if (MakeNewFilter && !NewFilterElts.size()) { + assert(MakeNewInstruction && "New filter but not a new instruction!"); + CleanupFlag = false; + break; + } + } + } + + // If several filters occur in a row then reorder them so that the shortest + // filters come first (those with the smallest number of elements). This is + // advantageous because shorter filters are more likely to match, speeding up + // unwinding, but mostly because it increases the effectiveness of the other + // filter optimizations below. + for (unsigned i = 0, e = NewClauses.size(); i + 1 < e; ) { + unsigned j; + // Find the maximal 'j' s.t. the range [i, j) consists entirely of filters. + for (j = i; j != e; ++j) + if (!isa<ArrayType>(NewClauses[j]->getType())) + break; + + // Check whether the filters are already sorted by length. We need to know + // if sorting them is actually going to do anything so that we only make a + // new landingpad instruction if it does. + for (unsigned k = i; k + 1 < j; ++k) + if (shorter_filter(NewClauses[k+1], NewClauses[k])) { + // Not sorted, so sort the filters now. Doing an unstable sort would be + // correct too but reordering filters pointlessly might confuse users. + std::stable_sort(NewClauses.begin() + i, NewClauses.begin() + j, + shorter_filter); + MakeNewInstruction = true; + break; + } + + // Look for the next batch of filters. 
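An aside on the reordering just above: shorter_filter is an ordinary strict-weak-order comparator over filter lengths, and the stable sort is deliberate so that equal-length filters keep their original order. A runnable analogue, with vector sizes standing in for getNumElements():

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      // Element counts stand in for cast<ArrayType>(...)->getNumElements().
      std::vector<std::vector<int>> Filters = {{1, 2, 3}, {4}, {5, 6}};
      std::stable_sort(Filters.begin(), Filters.end(),
                       [](const std::vector<int> &L, const std::vector<int> &R) {
                         return L.size() < R.size(); // shorter_filter analogue
                       });
      assert(Filters.front().size() == 1 && Filters.back().size() == 3);
      return 0;
    }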
+ i = j + 1; + } + + // If typeinfos matched if and only if equal, then the elements of a filter L + // that occurs later than a filter F could be replaced by the intersection of + // the elements of F and L. In reality two typeinfos can match without being + // equal (for example if one represents a C++ class, and the other some class + // derived from it) so it would be wrong to perform this transform in general. + // However the transform is correct and useful if F is a subset of L. In that + // case L can be replaced by F, and thus removed altogether since repeating a + // filter is pointless. So here we look at all pairs of filters F and L where + // L follows F in the list of clauses, and remove L if every element of F is + // an element of L. This can occur when inlining C++ functions with exception + // specifications. + for (unsigned i = 0; i + 1 < NewClauses.size(); ++i) { + // Examine each filter in turn. + Value *Filter = NewClauses[i]; + ArrayType *FTy = dyn_cast<ArrayType>(Filter->getType()); + if (!FTy) + // Not a filter - skip it. + continue; + unsigned FElts = FTy->getNumElements(); + // Examine each filter following this one. Doing this backwards means that + // we don't have to worry about filters disappearing under us when removed. + for (unsigned j = NewClauses.size() - 1; j != i; --j) { + Value *LFilter = NewClauses[j]; + ArrayType *LTy = dyn_cast<ArrayType>(LFilter->getType()); + if (!LTy) + // Not a filter - skip it. + continue; + // If Filter is a subset of LFilter, i.e. every element of Filter is also + // an element of LFilter, then discard LFilter. + SmallVector<Value *, 16>::iterator J = NewClauses.begin() + j; + // If Filter is empty then it is a subset of LFilter. + if (!FElts) { + // Discard LFilter. + NewClauses.erase(J); + MakeNewInstruction = true; + // Move on to the next filter. + continue; + } + unsigned LElts = LTy->getNumElements(); + // If Filter is longer than LFilter then it cannot be a subset of it. + if (FElts > LElts) + // Move on to the next filter. + continue; + // At this point we know that LFilter has at least one element. + if (isa<ConstantAggregateZero>(LFilter)) { // LFilter only contains zeros. + // Filter is a subset of LFilter iff Filter contains only zeros (as we + // already know that Filter is not longer than LFilter). + if (isa<ConstantAggregateZero>(Filter)) { + assert(FElts <= LElts && "Should have handled this case earlier!"); + // Discard LFilter. + NewClauses.erase(J); + MakeNewInstruction = true; + } + // Move on to the next filter. + continue; + } + ConstantArray *LArray = cast<ConstantArray>(LFilter); + if (isa<ConstantAggregateZero>(Filter)) { // Filter only contains zeros. + // Since Filter is non-empty and contains only zeros, it is a subset of + // LFilter iff LFilter contains a zero. + assert(FElts > 0 && "Should have eliminated the empty filter earlier!"); + for (unsigned l = 0; l != LElts; ++l) + if (LArray->getOperand(l)->isNullValue()) { + // LFilter contains a zero - discard it. + NewClauses.erase(J); + MakeNewInstruction = true; + break; + } + // Move on to the next filter. + continue; + } + // At this point we know that both filters are ConstantArrays. Loop over + // operands to see whether every element of Filter is also an element of + // LFilter. Since filters tend to be short this is probably faster than + // using a method that scales nicely. 
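The subset rule in the long comment above is small enough to model directly; a sketch with a hypothetical isSubset helper and plain pointers for typeinfos:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    using TypeInfo = const void *; // stand-in for stripped typeinfo pointers

    // Filter F (an earlier clause) makes a later filter L redundant when
    // every element of F also occurs in L; L can then be erased outright.
    static bool isSubset(const std::vector<TypeInfo> &F,
                         const std::vector<TypeInfo> &L) {
      for (TypeInfo FI : F) // quadratic scan; filters tend to be short
        if (std::find(L.begin(), L.end(), FI) == L.end())
          return false;
      return true;
    }

    int main() {
      int A, B, C;
      std::vector<TypeInfo> F = {&A, &B}, L = {&A, &B, &C};
      assert(isSubset(F, L));  // the later, longer L can be dropped
      assert(!isSubset(L, F)); // a longer filter never subsumes a shorter one
      return 0;
    }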
+ ConstantArray *FArray = cast<ConstantArray>(Filter); + bool AllFound = true; + for (unsigned f = 0; f != FElts; ++f) { + Value *FTypeInfo = FArray->getOperand(f)->stripPointerCasts(); + AllFound = false; + for (unsigned l = 0; l != LElts; ++l) { + Value *LTypeInfo = LArray->getOperand(l)->stripPointerCasts(); + if (LTypeInfo == FTypeInfo) { + AllFound = true; + break; + } + } + if (!AllFound) + break; + } + if (AllFound) { + // Discard LFilter. + NewClauses.erase(J); + MakeNewInstruction = true; + } + // Move on to the next filter. + } + } + + // If we changed any of the clauses, replace the old landingpad instruction + // with a new one. + if (MakeNewInstruction) { + LandingPadInst *NLI = LandingPadInst::Create(LI.getType(), + LI.getPersonalityFn(), + NewClauses.size()); + for (unsigned i = 0, e = NewClauses.size(); i != e; ++i) + NLI->addClause(NewClauses[i]); + // A landing pad with no clauses must have the cleanup flag set. It is + // theoretically possible, though highly unlikely, that we eliminated all + // clauses. If so, force the cleanup flag to true. + if (NewClauses.empty()) + CleanupFlag = true; + NLI->setCleanup(CleanupFlag); + return NLI; + } + + // Even if none of the clauses changed, we may nonetheless have understood + // that the cleanup flag is pointless. Clear it if so. + if (LI.isCleanup() != CleanupFlag) { + assert(!CleanupFlag && "Adding a cleanup, not removing one?!"); + LI.setCleanup(CleanupFlag); + return &LI; + } + + return 0; +} + @@ -1350,7 +1758,8 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { assert(I->hasOneUse() && "Invariants didn't hold!"); // Cannot move control-flow-involving, volatile loads, vaarg, etc. - if (isa<PHINode>(I) || I->mayHaveSideEffects() || isa<TerminatorInst>(I)) + if (isa<PHINode>(I) || isa<LandingPadInst>(I) || I->mayHaveSideEffects() || + isa<TerminatorInst>(I)) return false; // Do not sink alloca instructions out of the entry block. @@ -1367,8 +1776,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { return false; } - BasicBlock::iterator InsertPos = DestBlock->getFirstNonPHI(); - + BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt(); I->moveBefore(InsertPos); ++NumSunkInst; return true; @@ -1503,27 +1911,29 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // Do a quick scan over the function. If we find any blocks that are // unreachable, remove any instructions inside of them. This prevents // the instcombine code from having to deal with some bad special cases. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - if (!Visited.count(BB)) { - Instruction *Term = BB->getTerminator(); - while (Term != BB->begin()) { // Remove instrs bottom-up - BasicBlock::iterator I = Term; --I; - - DEBUG(errs() << "IC: DCE: " << *I << '\n'); - // A debug intrinsic shouldn't force another iteration if we weren't - // going to do one without it. - if (!isa<DbgInfoIntrinsic>(I)) { - ++NumDeadInst; - MadeIRChange = true; - } - - // If I is not void type then replaceAllUsesWith undef. - // This allows ValueHandlers and custom metadata to adjust itself. - if (!I->getType()->isVoidTy()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); - I->eraseFromParent(); + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (Visited.count(BB)) continue; + + // Delete the instructions backwards, as it has a reduced likelihood of + // having to update as many def-use and use-def chains. 
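Why the bottom-up order pays off, as a toy model: in SSA every operand is defined above its users, so erasing from the terminator upward means each instruction is already use-free when its turn comes.

    #include <cassert>
    #include <vector>

    int main() {
      // Toy block: instruction i uses the instruction at Ops[i] (-1: none),
      // so operands always sit at a smaller index than their users.
      std::vector<int> Ops = {-1, 0, 1}; // %2 uses %1, %1 uses %0
      std::vector<int> UseCount = {1, 1, 0};
      for (int I = (int)Ops.size() - 1; I >= 0; --I) {
        assert(UseCount[I] == 0); // no users remain, nothing to patch
        if (Ops[I] >= 0)
          --UseCount[Ops[I]];
      }
      return 0;
    }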
+ Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. + while (EndInst != BB->begin()) { + // Delete the next to last instruction. + BasicBlock::iterator I = EndInst; + Instruction *Inst = --I; + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + if (isa<LandingPadInst>(Inst)) { + EndInst = Inst; + continue; } + if (!isa<DbgInfoIntrinsic>(Inst)) { + ++NumDeadInst; + MadeIRChange = true; + } + Inst->eraseFromParent(); } + } } while (!Worklist.isEmpty()) { @@ -1604,13 +2014,13 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // Everything uses the new instruction now. I->replaceAllUsesWith(Result); + // Move the name to the new instruction first. + Result->takeName(I); + // Push the new instruction and any users onto the worklist. Worklist.Add(Result); Worklist.AddUsersToWorkList(*Result); - // Move the name to the new instruction first. - Result->takeName(I); - // Insert the new instruction into the basic block... BasicBlock *InstParent = I->getParent(); BasicBlock::iterator InsertPos = I; diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 5700ac8..7b3a927a 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -6,3 +6,10 @@ add_llvm_library(LLVMInstrumentation PathProfiling.cpp ProfilingUtils.cpp ) + +add_llvm_library_dependencies(LLVMInstrumentation + LLVMAnalysis + LLVMCore + LLVMSupport + LLVMTransformUtils + ) diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp index 1d31fcc..e8ef265 100644 --- a/lib/Transforms/Instrumentation/EdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/EdgeProfiling.cpp @@ -74,7 +74,7 @@ bool EdgeProfiler::runOnModule(Module &M) { } } - const Type *ATy = ArrayType::get(Type::getInt32Ty(M.getContext()), NumEdges); + Type *ATy = ArrayType::get(Type::getInt32Ty(M.getContext()), NumEdges); GlobalVariable *Counters = new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage, Constant::getNullValue(ATy), "EdgeProfCounters"); diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 3f2c412..ccf7e11 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -60,11 +60,11 @@ namespace { bool runOnModule(Module &M); // Create the GCNO files for the Module based on DebugInfo. - void emitGCNO(DebugInfoFinder &DIF); + void emitGCNO(); // Modify the program to track transitions along edges and call into the // profiling runtime to emit .gcda files when run. - bool emitProfileArcs(DebugInfoFinder &DIF); + bool emitProfileArcs(); // Get pointers to the functions in the runtime library. Constant *getStartFileFunc(); @@ -86,8 +86,7 @@ namespace { // Add the function to write out all our counters to the global destructor // list. 
- void insertCounterWriteout(DebugInfoFinder &, - SmallVector<std::pair<GlobalVariable *, + void insertCounterWriteout(SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &); std::string mangleName(DICompileUnit CU, std::string NewStem); @@ -110,15 +109,6 @@ ModulePass *llvm::createGCOVProfilerPass(bool EmitNotes, bool EmitData, return new GCOVProfiler(EmitNotes, EmitData, Use402Format); } -static DISubprogram findSubprogram(DIScope Scope) { - while (!Scope.isSubprogram()) { - assert(Scope.isLexicalBlock() && - "Debug location not lexical block or subprogram"); - Scope = DILexicalBlock(Scope).getContext(); - } - return DISubprogram(Scope); -} - namespace { class GCOVRecord { protected: @@ -177,18 +167,24 @@ namespace { } uint32_t length() { + // Here 2 = 1 for string length + 1 for '0' id#. return lengthOfGCOVString(Filename) + 2 + Lines.size(); } - private: - friend class GCOVBlock; + void writeOut() { + write(0); + writeGCOVString(Filename); + for (int i = 0, e = Lines.size(); i != e; ++i) + write(Lines[i]); + } - GCOVLines(std::string Filename, raw_ostream *os) - : Filename(Filename) { + GCOVLines(StringRef F, raw_ostream *os) + : Filename(F) { this->os = os; } - std::string Filename; + private: + StringRef Filename; SmallVector<uint32_t, 32> Lines; }; @@ -197,7 +193,7 @@ namespace { // other blocks. class GCOVBlock : public GCOVRecord { public: - GCOVLines &getFile(std::string Filename) { + GCOVLines &getFile(StringRef Filename) { GCOVLines *&Lines = LinesByFile[Filename]; if (!Lines) { Lines = new GCOVLines(Filename, os); @@ -220,13 +216,8 @@ namespace { write(Len); write(Number); for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(), - E = LinesByFile.end(); I != E; ++I) { - write(0); - writeGCOVString(I->second->Filename); - for (int i = 0, e = I->second->Lines.size(); i != e; ++i) { - write(I->second->Lines[i]); - } - } + E = LinesByFile.end(); I != E; ++I) I->second->writeOut(); write(0); write(0); } @@ -353,66 +344,66 @@ bool GCOVProfiler::runOnModule(Module &M) { this->M = &M; Ctx = &M.getContext(); - DebugInfoFinder DIF; - DIF.processModule(M); - - if (EmitNotes) emitGCNO(DIF); - if (EmitData) return emitProfileArcs(DIF); + if (EmitNotes) emitGCNO(); + if (EmitData) return emitProfileArcs(); return false; } -void GCOVProfiler::emitGCNO(DebugInfoFinder &DIF) { +void GCOVProfiler::emitGCNO() { DenseMap<const MDNode *, raw_fd_ostream *> GcnoFiles; - for (DebugInfoFinder::iterator I = DIF.compile_unit_begin(), - E = DIF.compile_unit_end(); I != E; ++I) { - // Each compile unit gets its own .gcno file. This means that whether we run - // this pass over the original .o's as they're produced, or run it after - // LTO, we'll generate the same .gcno files.
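A note on the magic strings written in the hunk below: gcov emits the file one 32-bit word at a time, so (at least on little-endian hosts) each four-character token lands byte-reversed in the stream. The new "204" in the 4.2-format magic is therefore a deliberate fix, not a typo. A quick check:

    #include <algorithm>
    #include <cassert>
    #include <string>

    static std::string reversed(std::string S) {
      std::reverse(S.begin(), S.end());
      return S;
    }

    int main() {
      // "gcno" + version + "LLVM", each token byte-reversed on disk.
      assert(reversed("gcno") + reversed("402*") + reversed("LLVM")
             == "oncg*204MVLL"); // gcov 4.2 format
      assert(reversed("gcno") + reversed("404*") + reversed("LLVM")
             == "oncg*404MVLL"); // gcov 4.4 format
      return 0;
    }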
- - DICompileUnit CU(*I); - raw_fd_ostream *&out = GcnoFiles[CU]; - std::string ErrorInfo; - out = new raw_fd_ostream(mangleName(CU, "gcno").c_str(), ErrorInfo, - raw_fd_ostream::F_Binary); - if (!Use402Format) - out->write("oncg*404MVLL", 12); - else - out->write("oncg*402MVLL", 12); - } - - for (DebugInfoFinder::iterator SPI = DIF.subprogram_begin(), - SPE = DIF.subprogram_end(); SPI != SPE; ++SPI) { - DISubprogram SP(*SPI); - raw_fd_ostream *&os = GcnoFiles[SP.getCompileUnit()]; - - Function *F = SP.getFunction(); - if (!F) continue; - GCOVFunction Func(SP, os, Use402Format); - - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - GCOVBlock &Block = Func.getBlock(BB); - TerminatorInst *TI = BB->getTerminator(); - if (int successors = TI->getNumSuccessors()) { - for (int i = 0; i != successors; ++i) { - Block.addEdge(Func.getBlock(TI->getSuccessor(i))); + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); + if (CU_Nodes) { + for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { + // Each compile unit gets its own .gcno file. This means that whether we run + // this pass over the original .o's as they're produced, or run it after + // LTO, we'll generate the same .gcno files. + + DICompileUnit CU(CU_Nodes->getOperand(i)); + raw_fd_ostream *&out = GcnoFiles[CU]; + std::string ErrorInfo; + out = new raw_fd_ostream(mangleName(CU, "gcno").c_str(), ErrorInfo, + raw_fd_ostream::F_Binary); + if (!Use402Format) + out->write("oncg*404MVLL", 12); + else + out->write("oncg*204MVLL", 12); + + DIArray SPs = CU.getSubprograms(); + for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { + DISubprogram SP(SPs.getElement(i)); + if (!SP.Verify()) continue; + raw_fd_ostream *&os = GcnoFiles[CU]; + + Function *F = SP.getFunction(); + if (!F) continue; + GCOVFunction Func(SP, os, Use402Format); + + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + GCOVBlock &Block = Func.getBlock(BB); + TerminatorInst *TI = BB->getTerminator(); + if (int successors = TI->getNumSuccessors()) { + for (int i = 0; i != successors; ++i) { + Block.addEdge(Func.getBlock(TI->getSuccessor(i))); + } + } else if (isa<ReturnInst>(TI)) { + Block.addEdge(Func.getReturnBlock()); + } + + uint32_t Line = 0; + for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) { + const DebugLoc &Loc = I->getDebugLoc(); + if (Loc.isUnknown()) continue; + if (Line == Loc.getLine()) continue; + Line = Loc.getLine(); + if (SP != getDISubprogram(Loc.getScope(*Ctx))) continue; + + GCOVLines &Lines = Block.getFile(SP.getFilename()); + Lines.addLine(Loc.getLine()); + } } - } else if (isa<ReturnInst>(TI)) { - Block.addEdge(Func.getReturnBlock()); - } - - uint32_t Line = 0; - for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) { - const DebugLoc &Loc = I->getDebugLoc(); - if (Loc.isUnknown()) continue; - if (Line == Loc.getLine()) continue; - Line = Loc.getLine(); - if (SP != findSubprogram(DIScope(Loc.getScope(*Ctx)))) continue; - - GCOVLines &Lines = Block.getFile(SP.getFilename()); - Lines.addLine(Loc.getLine()); + Func.writeOut(); } } - Func.writeOut(); } for (DenseMap<const MDNode *, raw_fd_ostream *>::iterator @@ -424,103 +415,107 @@ void GCOVProfiler::emitGCNO(DebugInfoFinder &DIF) { } } -bool GCOVProfiler::emitProfileArcs(DebugInfoFinder &DIF) { - if (DIF.subprogram_begin() == DIF.subprogram_end()) - return false; - - SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP; - for (DebugInfoFinder::iterator SPI = 
DIF.subprogram_begin(), - SPE = DIF.subprogram_end(); SPI != SPE; ++SPI) { - DISubprogram SP(*SPI); - Function *F = SP.getFunction(); - if (!F) continue; - - unsigned Edges = 0; - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - TerminatorInst *TI = BB->getTerminator(); - if (isa<ReturnInst>(TI)) - ++Edges; - else - Edges += TI->getNumSuccessors(); - } - - const ArrayType *CounterTy = +bool GCOVProfiler::emitProfileArcs() { + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); + if (!CU_Nodes) return false; + + bool Result = false; + for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { + DICompileUnit CU(CU_Nodes->getOperand(i)); + DIArray SPs = CU.getSubprograms(); + SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP; + for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { + DISubprogram SP(SPs.getElement(i)); + if (!SP.Verify()) continue; + Function *F = SP.getFunction(); + if (!F) continue; + if (!Result) Result = true; + unsigned Edges = 0; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + TerminatorInst *TI = BB->getTerminator(); + if (isa<ReturnInst>(TI)) + ++Edges; + else + Edges += TI->getNumSuccessors(); + } + + ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(*Ctx), Edges); - GlobalVariable *Counters = + GlobalVariable *Counters = new GlobalVariable(*M, CounterTy, false, GlobalValue::InternalLinkage, Constant::getNullValue(CounterTy), "__llvm_gcov_ctr", 0, false, 0); - CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP)); - - UniqueVector<BasicBlock *> ComplexEdgePreds; - UniqueVector<BasicBlock *> ComplexEdgeSuccs; - - unsigned Edge = 0; - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - TerminatorInst *TI = BB->getTerminator(); - int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors(); - if (Successors) { - IRBuilder<> Builder(TI); - - if (Successors == 1) { - Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, - Edge); - Value *Count = Builder.CreateLoad(Counter); - Count = Builder.CreateAdd(Count, - ConstantInt::get(Type::getInt64Ty(*Ctx),1)); - Builder.CreateStore(Count, Counter); - } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { - Value *Sel = Builder.CreateSelect( + CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP)); + + UniqueVector<BasicBlock *> ComplexEdgePreds; + UniqueVector<BasicBlock *> ComplexEdgeSuccs; + + unsigned Edge = 0; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + TerminatorInst *TI = BB->getTerminator(); + int Successors = isa<ReturnInst>(TI) ? 
1 : TI->getNumSuccessors(); + if (Successors) { + IRBuilder<> Builder(TI); + + if (Successors == 1) { + Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, + Edge); + Value *Count = Builder.CreateLoad(Counter); + Count = Builder.CreateAdd(Count, + ConstantInt::get(Type::getInt64Ty(*Ctx),1)); + Builder.CreateStore(Count, Counter); + } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + Value *Sel = Builder.CreateSelect( BI->getCondition(), ConstantInt::get(Type::getInt64Ty(*Ctx), Edge), ConstantInt::get(Type::getInt64Ty(*Ctx), Edge + 1)); - SmallVector<Value *, 2> Idx; - Idx.push_back(Constant::getNullValue(Type::getInt64Ty(*Ctx))); - Idx.push_back(Sel); - Value *Counter = Builder.CreateInBoundsGEP(Counters, - Idx.begin(), Idx.end()); - Value *Count = Builder.CreateLoad(Counter); - Count = Builder.CreateAdd(Count, - ConstantInt::get(Type::getInt64Ty(*Ctx),1)); - Builder.CreateStore(Count, Counter); - } else { - ComplexEdgePreds.insert(BB); - for (int i = 0; i != Successors; ++i) - ComplexEdgeSuccs.insert(TI->getSuccessor(i)); + SmallVector<Value *, 2> Idx; + Idx.push_back(Constant::getNullValue(Type::getInt64Ty(*Ctx))); + Idx.push_back(Sel); + Value *Counter = Builder.CreateInBoundsGEP(Counters, Idx); + Value *Count = Builder.CreateLoad(Counter); + Count = Builder.CreateAdd(Count, + ConstantInt::get(Type::getInt64Ty(*Ctx),1)); + Builder.CreateStore(Count, Counter); + } else { + ComplexEdgePreds.insert(BB); + for (int i = 0; i != Successors; ++i) + ComplexEdgeSuccs.insert(TI->getSuccessor(i)); + } + Edge += Successors; } - Edge += Successors; } - } - - if (!ComplexEdgePreds.empty()) { - GlobalVariable *EdgeTable = + + if (!ComplexEdgePreds.empty()) { + GlobalVariable *EdgeTable = buildEdgeLookupTable(F, Counters, ComplexEdgePreds, ComplexEdgeSuccs); - GlobalVariable *EdgeState = getEdgeStateValue(); - - const Type *Int32Ty = Type::getInt32Ty(*Ctx); - for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) { - IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator()); - Builder.CreateStore(ConstantInt::get(Int32Ty, i), EdgeState); - } - for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) { - // call runtime to perform increment - IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstNonPHI()); - Value *CounterPtrArray = + GlobalVariable *EdgeState = getEdgeStateValue(); + + Type *Int32Ty = Type::getInt32Ty(*Ctx); + for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) { + IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator()); + Builder.CreateStore(ConstantInt::get(Int32Ty, i), EdgeState); + } + for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) { + // call runtime to perform increment + BasicBlock::iterator InsertPt = + ComplexEdgeSuccs[i+1]->getFirstInsertionPt(); + IRBuilder<> Builder(InsertPt); + Value *CounterPtrArray = Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0, i * ComplexEdgePreds.size()); - Builder.CreateCall2(getIncrementIndirectCounterFunc(), - EdgeState, CounterPtrArray); - // clear the predecessor number - Builder.CreateStore(ConstantInt::get(Int32Ty, 0xffffffff), EdgeState); + Builder.CreateCall2(getIncrementIndirectCounterFunc(), + EdgeState, CounterPtrArray); + // clear the predecessor number + Builder.CreateStore(ConstantInt::get(Int32Ty, 0xffffffff), EdgeState); + } } } + insertCounterWriteout(CountersBySP); } - - insertCounterWriteout(DIF, CountersBySP); - - return true; + return Result; } // All edges with successors that aren't branches are "complex", because it @@ -535,8 +530,8 @@ GlobalVariable 
*GCOVProfiler::buildEdgeLookupTable( // read it. Threads and invoke make this untrue. // emit [(succs * preds) x i64*], logically [succ x [pred x i64*]]. - const Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx); - const ArrayType *EdgeTableTy = ArrayType::get( + Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx); + ArrayType *EdgeTableTy = ArrayType::get( Int64PtrTy, Succs.size() * Preds.size()); Constant **EdgeTable = new Constant*[Succs.size() * Preds.size()]; @@ -572,7 +567,7 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable( } Constant *GCOVProfiler::getStartFileFunc() { - const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), false); return M->getOrInsertFunction("llvm_gcda_start_file", FTy); } @@ -582,7 +577,7 @@ Constant *GCOVProfiler::getIncrementIndirectCounterFunc() { Type::getInt32PtrTy(*Ctx), // uint32_t *predecessor Type::getInt64PtrTy(*Ctx)->getPointerTo(), // uint64_t **state_table_row }; - const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); return M->getOrInsertFunction("llvm_gcda_increment_indirect_counter", FTy); } @@ -592,7 +587,7 @@ Constant *GCOVProfiler::getEmitFunctionFunc() { Type::getInt32Ty(*Ctx), // uint32_t ident Type::getInt8PtrTy(*Ctx), // const char *function_name }; - const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); return M->getOrInsertFunction("llvm_gcda_emit_function", FTy); } @@ -602,13 +597,13 @@ Constant *GCOVProfiler::getEmitArcsFunc() { Type::getInt32Ty(*Ctx), // uint32_t num_counters Type::getInt64PtrTy(*Ctx), // uint64_t *counters }; - const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy); } Constant *GCOVProfiler::getEndFileFunc() { - const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); return M->getOrInsertFunction("llvm_gcda_end_file", FTy); } @@ -626,9 +621,8 @@ GlobalVariable *GCOVProfiler::getEdgeStateValue() { } void GCOVProfiler::insertCounterWriteout( - DebugInfoFinder &DIF, SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &CountersBySP) { - const FunctionType *WriteoutFTy = + FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false); Function *WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage, @@ -642,29 +636,31 @@ void GCOVProfiler::insertCounterWriteout( Constant *EmitArcs = getEmitArcsFunc(); Constant *EndFile = getEndFileFunc(); - for (DebugInfoFinder::iterator CUI = DIF.compile_unit_begin(), - CUE = DIF.compile_unit_end(); CUI != CUE; ++CUI) { - DICompileUnit compile_unit(*CUI); - std::string FilenameGcda = mangleName(compile_unit, "gcda"); - Builder.CreateCall(StartFile, - Builder.CreateGlobalStringPtr(FilenameGcda)); - for (SmallVector<std::pair<GlobalVariable *, MDNode *>, 8>::iterator + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); + if (CU_Nodes) { + for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { + DICompileUnit compile_unit(CU_Nodes->getOperand(i)); + std::string FilenameGcda = mangleName(compile_unit, "gcda"); + Builder.CreateCall(StartFile, + Builder.CreateGlobalStringPtr(FilenameGcda)); + for (SmallVector<std::pair<GlobalVariable *, 
MDNode *>, 8>::iterator I = CountersBySP.begin(), E = CountersBySP.end(); - I != E; ++I) { - DISubprogram SP(I->second); - intptr_t ident = reinterpret_cast<intptr_t>(I->second); - Builder.CreateCall2(EmitFunction, - ConstantInt::get(Type::getInt32Ty(*Ctx), ident), - Builder.CreateGlobalStringPtr(SP.getName())); - - GlobalVariable *GV = I->first; - unsigned Arcs = + I != E; ++I) { + DISubprogram SP(I->second); + intptr_t ident = reinterpret_cast<intptr_t>(I->second); + Builder.CreateCall2(EmitFunction, + ConstantInt::get(Type::getInt32Ty(*Ctx), ident), + Builder.CreateGlobalStringPtr(SP.getName())); + + GlobalVariable *GV = I->first; + unsigned Arcs = cast<ArrayType>(GV->getType()->getElementType())->getNumElements(); - Builder.CreateCall2(EmitArcs, - ConstantInt::get(Type::getInt32Ty(*Ctx), Arcs), - Builder.CreateConstGEP2_64(GV, 0, 0)); + Builder.CreateCall2(EmitArcs, + ConstantInt::get(Type::getInt32Ty(*Ctx), Arcs), + Builder.CreateConstGEP2_64(GV, 0, 0)); + } + Builder.CreateCall(EndFile); } - Builder.CreateCall(EndFile); } Builder.CreateRetVoid(); diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp index e09f882..62c21b8 100644 --- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp @@ -112,8 +112,8 @@ bool OptimalEdgeProfiler::runOnModule(Module &M) { // be calculated from other edge counters on reading the profile info back // in. - const Type *Int32 = Type::getInt32Ty(M.getContext()); - const ArrayType *ATy = ArrayType::get(Int32, NumEdges); + Type *Int32 = Type::getInt32Ty(M.getContext()); + ArrayType *ATy = ArrayType::get(Int32, NumEdges); GlobalVariable *Counters = new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage, Constant::getNullValue(ATy), "OptEdgeProfCounters"); diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp index 7541663..23915d3 100644 --- a/lib/Transforms/Instrumentation/PathProfiling.cpp +++ b/lib/Transforms/Instrumentation/PathProfiling.cpp @@ -374,7 +374,7 @@ namespace llvm { template<bool xcompile> class TypeBuilder<PathProfilingFunctionTable, xcompile> { public: - static const StructType *get(LLVMContext& C) { + static StructType *get(LLVMContext& C) { return( StructType::get( TypeBuilder<types::i<32>, xcompile>::get(C), // type TypeBuilder<types::i<32>, xcompile>::get(C), // array size @@ -909,7 +909,7 @@ BasicBlock::iterator PathProfiler::getInsertionPoint(BasicBlock* block, Value* pathNumber) { if(pathNumber == NULL || isa<ConstantInt>(pathNumber) || (((Instruction*)(pathNumber))->getParent()) != block) { - return(block->getFirstNonPHI()); + return(block->getFirstInsertionPt()); } else { Instruction* pathNumberInst = (Instruction*) (pathNumber); BasicBlock::iterator insertPoint; @@ -930,7 +930,7 @@ BasicBlock::iterator PathProfiler::getInsertionPoint(BasicBlock* block, Value* // A PHINode is created in the node, and its values initialized to -1U. 
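On the getFirstNonPHI() to getFirstInsertionPt() swaps running through these hunks: the new call also steps over a landingpad, which must stay the first non-PHI instruction of its block, so instrumentation can never be inserted in front of it. A toy model (strings standing in for opcodes, not the real iterator API):

    #include <cassert>
    #include <string>
    #include <vector>

    static size_t firstInsertionPt(const std::vector<std::string> &BB) {
      size_t I = 0;
      while (I < BB.size() && (BB[I] == "phi" || BB[I] == "landingpad"))
        ++I; // skip PHIs *and* the landingpad
      return I;
    }

    int main() {
      std::vector<std::string> BB = {"phi", "landingpad", "add", "ret"};
      assert(firstInsertionPt(BB) == 2); // getFirstNonPHI would stop at 1
      return 0;
    }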
void PathProfiler::preparePHI(BLInstrumentationNode* node) { BasicBlock* block = node->getBlock(); - BasicBlock::iterator insertPoint = block->getFirstNonPHI(); + BasicBlock::iterator insertPoint = block->getFirstInsertionPt(); pred_iterator PB = pred_begin(node->getBlock()), PE = pred_end(node->getBlock()); PHINode* phi = PHINode::Create(Type::getInt32Ty(*Context), @@ -999,7 +999,7 @@ void PathProfiler::insertNumberIncrement(BLInstrumentationNode* node, BasicBlock::iterator insertPoint; if( atBeginning ) - insertPoint = block->getFirstNonPHI(); + insertPoint = block->getFirstInsertionPt(); else insertPoint = block->getTerminator(); @@ -1029,8 +1029,7 @@ void PathProfiler::insertCounterIncrement(Value* incValue, gepIndices[1] = incValue; GetElementPtrInst* pcPointer = - GetElementPtrInst::Create(dag->getCounterArray(), - gepIndices.begin(), gepIndices.end(), + GetElementPtrInst::Create(dag->getCounterArray(), gepIndices, "counterInc", insertPoint); // Load from the array - call it oldPC @@ -1140,7 +1139,7 @@ void PathProfiler::insertInstrumentationStartingAt(BLInstrumentationEdge* edge, } BasicBlock::iterator insertPoint = atBeginning ? - instrumentNode->getBlock()->getFirstNonPHI() : + instrumentNode->getBlock()->getFirstInsertionPt() : instrumentNode->getBlock()->getTerminator(); // add information from the bottom edge, if it exists @@ -1172,7 +1171,7 @@ void PathProfiler::insertInstrumentationStartingAt(BLInstrumentationEdge* edge, // Insert instrumentation if this is a normal edge else { BasicBlock::iterator insertPoint = atBeginning ? - instrumentNode->getBlock()->getFirstNonPHI() : + instrumentNode->getBlock()->getFirstInsertionPt() : instrumentNode->getBlock()->getTerminator(); if( edge->isInitialization() ) { // initialize path number @@ -1233,7 +1232,7 @@ void PathProfiler::insertInstrumentation( end = callEdges.end(); edge != end; edge++ ) { BLInstrumentationNode* node = (BLInstrumentationNode*)(*edge)->getSource(); - BasicBlock::iterator insertPoint = node->getBlock()->getFirstNonPHI(); + BasicBlock::iterator insertPoint = node->getBlock()->getFirstInsertionPt(); // Find the first function call while( ((Instruction&)(*insertPoint)).getOpcode() != Instruction::Call ) @@ -1289,7 +1288,7 @@ void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit, // Should we store the information in an array or hash if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) { - const Type* t = ArrayType::get(Type::getInt32Ty(*Context), + Type* t = ArrayType::get(Type::getInt32Ty(*Context), dag.getNumberOfPaths()); dag.setCounterArray(new GlobalVariable(M, t, false, @@ -1301,7 +1300,7 @@ void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit, // Add to global function reference table unsigned type; - const Type* voidPtr = TypeBuilder<types::i<8>*, true>::get(*Context); + Type* voidPtr = TypeBuilder<types::i<8>*, true>::get(*Context); if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) type = ProfilingArray; @@ -1315,7 +1314,7 @@ void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit, ConstantExpr::getBitCast(dag.getCounterArray(), voidPtr) : Constant::getNullValue(voidPtr); - const StructType* at = ftEntryTypeBuilder::get(*Context); + StructType* at = ftEntryTypeBuilder::get(*Context); ConstantStruct* functionEntry = (ConstantStruct*)ConstantStruct::get(at, entryArray); ftInit.push_back(functionEntry); @@ -1379,8 +1378,8 @@ bool PathProfiler::runOnModule(Module &M) { runOnFunction(ftInit, *F, M); } - const Type *t = ftEntryTypeBuilder::get(*Context); - const ArrayType* ftArrayType = 
ArrayType::get(t, ftInit.size()); + Type *t = ftEntryTypeBuilder::get(*Context); + ArrayType* ftArrayType = ArrayType::get(t, ftInit.size()); Constant* ftInitConstant = ConstantArray::get(ftArrayType, ftInit); DEBUG(dbgs() << " ftArrayType:" << *ftArrayType << "\n"); @@ -1388,7 +1387,7 @@ bool PathProfiler::runOnModule(Module &M) { GlobalVariable* functionTable = new GlobalVariable(M, ftArrayType, false, GlobalValue::InternalLinkage, ftInitConstant, "functionPathTable"); - const Type *eltType = ftArrayType->getTypeAtIndex((unsigned)0); + Type *eltType = ftArrayType->getTypeAtIndex((unsigned)0); InsertProfilingInitCall(Main, "llvm_start_path_profiling", functionTable, PointerType::getUnqual(eltType)); diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp index 445a5b6..de57cd1 100644 --- a/lib/Transforms/Instrumentation/ProfilingUtils.cpp +++ b/lib/Transforms/Instrumentation/ProfilingUtils.cpp @@ -25,9 +25,9 @@ void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName, GlobalValue *Array, PointerType *arrayType) { LLVMContext &Context = MainFn->getContext(); - const Type *ArgVTy = + Type *ArgVTy = PointerType::getUnqual(Type::getInt8PtrTy(Context)); - const PointerType *UIntPtr = arrayType ? arrayType : + PointerType *UIntPtr = arrayType ? arrayType : Type::getInt32PtrTy(Context); Module &M = *MainFn->getParent(); Constant *InitFn = M.getOrInsertFunction(FnName, Type::getInt32Ty(Context), @@ -51,8 +51,7 @@ void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName, Constant::getNullValue(Type::getInt32Ty(Context))); unsigned NumElements = 0; if (Array) { - Args[2] = ConstantExpr::getGetElementPtr(Array, &GEPIndices[0], - GEPIndices.size()); + Args[2] = ConstantExpr::getGetElementPtr(Array, GEPIndices); NumElements = cast<ArrayType>(Array->getType()->getElementType())->getNumElements(); } else { @@ -108,7 +107,7 @@ void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName, void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum, GlobalValue *CounterArray, bool beginning) { // Insert the increment after any alloca or PHI instructions... - BasicBlock::iterator InsertPos = beginning ? BB->getFirstNonPHI() : + BasicBlock::iterator InsertPos = beginning ? BB->getFirstInsertionPt() : BB->getTerminator(); while (isa<AllocaInst>(InsertPos)) ++InsertPos; @@ -120,7 +119,7 @@ void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum, Indices[0] = Constant::getNullValue(Type::getInt32Ty(Context)); Indices[1] = ConstantInt::get(Type::getInt32Ty(Context), CounterNum); Constant *ElementPtr = - ConstantExpr::getGetElementPtr(CounterArray, &Indices[0], Indices.size()); + ConstantExpr::getGetElementPtr(CounterArray, Indices); // Load, increment and store the value back. Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos); @@ -137,7 +136,7 @@ void llvm::InsertProfilingShutdownCall(Function *Callee, Module *Mod) { Type::getInt32Ty(Mod->getContext()), FunctionType::get(Type::getVoidTy(Mod->getContext()), false)->getPointerTo() }; - const StructType *GlobalDtorElemTy = + StructType *GlobalDtorElemTy = StructType::get(Mod->getContext(), GlobalDtorElems, false); // Construct the new element we'll be adding. 
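The profiling passes above all emit the same counter bump as IR: getelementptr into a global array, load, add one, store back. In source form (hypothetical fixed-size array standing in for the counter global):

    #include <cassert>
    #include <cstdint>

    static uint64_t Counters[8]; // stand-in for e.g. EdgeProfCounters

    static void incrementCounter(unsigned Idx) {
      uint64_t Old = Counters[Idx]; // LoadInst
      Counters[Idx] = Old + 1;      // Add, then StoreInst
    }

    int main() {
      incrementCounter(3);
      incrementCounter(3);
      assert(Counters[3] == 2);
      return 0;
    }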
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index a5adb5e..ba214d1 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -57,6 +57,7 @@ bool ADCE::runOnFunction(Function& F) { for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) if (isa<TerminatorInst>(I.getInstructionIterator()) || isa<DbgInfoIntrinsic>(I.getInstructionIterator()) || + isa<LandingPadInst>(I.getInstructionIterator()) || I->mayHaveSideEffects()) { alive.insert(I.getInstructionIterator()); worklist.push_back(I.getInstructionIterator()); @@ -65,7 +66,6 @@ bool ADCE::runOnFunction(Function& F) { // Propagate liveness backwards to operands. while (!worklist.empty()) { Instruction* curr = worklist.pop_back_val(); - for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end(); OI != OE; ++OI) if (Instruction* Inst = dyn_cast<Instruction>(OI)) diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index c223da6..79bcae5 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -29,6 +29,14 @@ add_llvm_library(LLVMScalarOpts SimplifyCFGPass.cpp SimplifyLibCalls.cpp Sink.cpp - TailDuplication.cpp TailRecursionElimination.cpp ) + +add_llvm_library_dependencies(LLVMScalarOpts + LLVMAnalysis + LLVMCore + LLVMInstCombine + LLVMSupport + LLVMTarget + LLVMTransformUtils + ) diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 0af14ed..f8f18b2 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -58,6 +58,7 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); +STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), @@ -104,12 +105,13 @@ namespace { void EliminateMostlyEmptyBlock(BasicBlock *BB); bool OptimizeBlock(BasicBlock &BB); bool OptimizeInst(Instruction *I); - bool OptimizeMemoryInst(Instruction *I, Value *Addr, const Type *AccessTy); + bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy); bool OptimizeInlineAsmInst(CallInst *CS); bool OptimizeCallInst(CallInst *CI); bool MoveExtToFormExtLoad(Instruction *I); bool OptimizeExtUses(Instruction *I); bool DupRetToEnableTailCallOpts(ReturnInst *RI); + bool PlaceDbgValues(Function &F); }; } @@ -132,6 +134,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // unconditional branch. EverMadeChange |= EliminateMostlyEmptyBlocks(F); + // If llvm.dbg.value is far away from the value, iSel may not be able to // handle it properly. iSel will drop llvm.dbg.value if it cannot // find a node corresponding to the value.
+ EverMadeChange |= PlaceDbgValues(F); + bool MadeChange = true; while (MadeChange) { MadeChange = false; @@ -410,8 +417,7 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ CastInst *&InsertedCast = InsertedCasts[UserBB]; if (!InsertedCast) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", InsertPt); @@ -467,8 +473,7 @@ static bool OptimizeCmpExpression(CmpInst *CI) { CmpInst *&InsertedCmp = InsertedCmps[UserBB]; if (!InsertedCmp) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedCmp = CmpInst::Create(CI->getOpcode(), CI->getPredicate(), CI->getOperand(0), @@ -528,7 +533,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); if (II && II->getIntrinsicID() == Intrinsic::objectsize) { bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); - const Type *ReturnTy = CI->getType(); + Type *ReturnTy = CI->getType(); Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); // Substituting this can cause recursive simplifications, which can @@ -551,22 +556,6 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // From here on out we're working with named functions. if (CI->getCalledFunction() == 0) return false; - // llvm.dbg.value is far away from the value then iSel may not be able - // handle it properly. iSel will drop llvm.dbg.value if it can not - // find a node corresponding to the value. - if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(CI)) - if (Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue())) - if (!VI->isTerminator() && - (DVI->getParent() != VI->getParent() || DT->dominates(DVI, VI))) { - DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); - DVI->removeFromParent(); - if (isa<PHINode>(VI)) - DVI->insertBefore(VI->getParent()->getFirstNonPHI()); - else - DVI->insertAfter(VI); - return true; - } - // We'll need TargetData from here on out. const TargetData *TD = TLI ? TLI->getTargetData() : 0; if (!TD) return false; @@ -724,7 +713,7 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) { /// This method is used to optimize both load/store and inline asms with memory /// operands. bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, - const Type *AccessTy) { + Type *AccessTy) { Value *Repl = Addr; // Try to collapse single-value PHI nodes. This is necessary to undo @@ -746,13 +735,11 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, worklist.pop_back(); // Break use-def graph loops. - if (Visited.count(V)) { + if (!Visited.insert(V)) { Consensus = 0; break; } - Visited.insert(V); - // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast<PHINode>(V)) { for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) @@ -763,7 +750,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // For non-PHIs, determine the addressing mode being computed. 
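The Visited hunk above folds a count()-then-insert() pair into a single insert(); SmallPtrSet's insert() reports whether the element was actually new, so the use-def cycle check costs one lookup instead of two. The same pattern with std::set as a stand-in:

    #include <cassert>
    #include <set>

    int main() {
      std::set<const void *> Visited;
      int V = 0;
      assert(Visited.insert(&V).second);  // first visit: keep walking
      assert(!Visited.insert(&V).second); // revisit: cycle, bail out
      return 0;
    }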
SmallVector<Instruction*, 16> NewAddrModeInsts; ExtAddrMode NewAddrMode = - AddressingModeMatcher::Match(V, AccessTy,MemoryInst, + AddressingModeMatcher::Match(V, AccessTy, MemoryInst, NewAddrModeInsts, *TLI); // This check is broken into two cases with very similar code to avoid using @@ -822,7 +809,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Insert this computation right after this user. Since our caller is // scanning from the top of the BB to the bottom, reuses of the expr are // guaranteed to happen later. - BasicBlock::iterator InsertPt = MemoryInst; + IRBuilder<> Builder(MemoryInst); // Now that we determined the addressing expression we want to use and know // that we have to sink it into this block. Check to see if we have already @@ -833,11 +820,11 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); if (SunkAddr->getType() != Addr->getType()) - SunkAddr = new BitCastInst(SunkAddr, Addr->getType(), "tmp", InsertPt); + SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); - const Type *IntPtrTy = + Type *IntPtrTy = TLI->getTargetData()->getIntPtrType(AccessTy->getContext()); Value *Result = 0; @@ -850,10 +837,9 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (AddrMode.BaseReg) { Value *V = AddrMode.BaseReg; if (V->getType()->isPointerTy()) - V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); if (V->getType() != IntPtrTy) - V = CastInst::CreateIntegerCast(V, IntPtrTy, /*isSigned=*/true, - "sunkaddr", InsertPt); + V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); Result = V; } @@ -863,29 +849,27 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (V->getType() == IntPtrTy) { // done. } else if (V->getType()->isPointerTy()) { - V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() < cast<IntegerType>(V->getType())->getBitWidth()) { - V = new TruncInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); } else { - V = new SExtInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreateSExt(V, IntPtrTy, "sunkaddr"); } if (AddrMode.Scale != 1) - V = BinaryOperator::CreateMul(V, ConstantInt::get(IntPtrTy, - AddrMode.Scale), - "sunkaddr", InsertPt); + V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), + "sunkaddr"); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } // Add in the BaseGV if present. 
if (AddrMode.BaseGV) { - Value *V = new PtrToIntInst(AddrMode.BaseGV, IntPtrTy, "sunkaddr", - InsertPt); + Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } @@ -894,7 +878,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (AddrMode.BaseOffs) { Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } @@ -902,7 +886,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (Result == 0) SunkAddr = Constant::getNullValue(Addr->getType()); else - SunkAddr = new IntToPtrInst(Result, Addr->getType(), "sunkaddr",InsertPt); + SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); } MemoryInst->replaceUsesOfWith(Repl, SunkAddr); @@ -1059,8 +1043,7 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { Instruction *&InsertedTrunc = InsertedTruncs[UserBB]; if (!InsertedTrunc) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt); } @@ -1159,3 +1142,34 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { return MadeChange; } + +// If llvm.dbg.value is far away from the value then iSel may not be able to +// handle it properly. iSel will drop llvm.dbg.value if it cannot +// find a node corresponding to the value. +bool CodeGenPrepare::PlaceDbgValues(Function &F) { + bool MadeChange = false; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + Instruction *PrevNonDbgInst = NULL; + for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { + Instruction *Insn = BI; ++BI; + DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn); + if (!DVI) { + PrevNonDbgInst = Insn; + continue; + } + + Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue()); + if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) { + DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); + DVI->removeFromParent(); + if (isa<PHINode>(VI)) + DVI->insertBefore(VI->getParent()->getFirstInsertionPt()); + else + DVI->insertAfter(VI); + MadeChange = true; + ++NumDbgValueMoved; + } + } + } + return MadeChange; +} diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index cb9b5be..a593d0f 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -52,18 +52,18 @@ namespace { AA = &getAnalysis<AliasAnalysis>(); MD = &getAnalysis<MemoryDependenceAnalysis>(); DominatorTree &DT = getAnalysis<DominatorTree>(); - + bool Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) // Only check non-dead blocks. Dead blocks may have strange pointer // cycles that will confuse alias analysis. 
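  // (Unreachable blocks may legally contain instructions that use their own
  // results, e.g. a GEP whose pointer operand is itself, because dominance
  // holds vacuously there; chasing such a pointer cycle would send alias
  // analysis in circles, hence the reachability check.)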
if (DT.isReachableFromEntry(I)) Changed |= runOnBasicBlock(*I); - + AA = 0; MD = 0; return Changed; } - + bool runOnBasicBlock(BasicBlock &BB); bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); @@ -105,34 +105,34 @@ static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, SmallPtrSet<Value*, 16> *ValueSet = 0) { SmallVector<Instruction*, 32> NowDeadInsts; - + NowDeadInsts.push_back(I); --NumFastOther; - + // Before we touch this instruction, remove it from memdep! do { Instruction *DeadInst = NowDeadInsts.pop_back_val(); ++NumFastOther; - + // This instruction is dead, zap it, in stages. Start by removing it from // MemDep, which needs to know the operands and needs it to be in the // function. MD.removeInstruction(DeadInst); - + for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { Value *Op = DeadInst->getOperand(op); DeadInst->setOperand(op, 0); - + // If this operand just became dead, add it to the NowDeadInsts list. if (!Op->use_empty()) continue; - + if (Instruction *OpI = dyn_cast<Instruction>(Op)) if (isInstructionTriviallyDead(OpI)) NowDeadInsts.push_back(OpI); } - + DeadInst->eraseFromParent(); - + if (ValueSet) ValueSet->erase(DeadInst); } while (!NowDeadInsts.empty()); } @@ -159,11 +159,13 @@ static bool hasMemoryWrite(Instruction *I) { } /// getLocForWrite - Return a Location stored to by the specified instruction. +/// If isRemovable returns true, this function and getLocForRead completely +/// describe the memory operations for this instruction. static AliasAnalysis::Location getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) return AA.getLocation(SI); - + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) { // memcpy/memmove/memset. AliasAnalysis::Location Loc = AA.getLocationForDest(MI); @@ -174,10 +176,10 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { return AliasAnalysis::Location(); return Loc; } - + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); if (II == 0) return AliasAnalysis::Location(); - + switch (II->getIntrinsicID()) { default: return AliasAnalysis::Location(); // Unhandled intrinsic. case Intrinsic::init_trampoline: @@ -185,7 +187,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // that we should use the size of the pointee type. This isn't valid for // init.trampoline, which writes more than an i8. if (AA.getTargetData() == 0) return AliasAnalysis::Location(); - + // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. return AliasAnalysis::Location(II->getArgOperand(0)); @@ -198,10 +200,10 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { /// getLocForRead - Return the location read by the specified "hasMemoryWrite" /// instruction if any. -static AliasAnalysis::Location +static AliasAnalysis::Location getLocForRead(Instruction *Inst, AliasAnalysis &AA) { assert(hasMemoryWrite(Inst) && "Unknown instruction case"); - + // The only instructions that both read and write are the mem transfer // instructions (memcpy/memmove). if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) @@ -213,10 +215,10 @@ getLocForRead(Instruction *Inst, AliasAnalysis &AA) { /// isRemovable - If the value of this instruction and the memory it writes to /// is unused, may we delete this instruction? static bool isRemovable(Instruction *I) { - // Don't remove volatile stores. + // Don't remove volatile/atomic stores. 
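+ // (isUnordered() is strictly stronger than the old !isVolatile() test: it
+ // also rejects any atomic store with an ordering above Unordered, keeping
+ // DSE conservative in the presence of atomics.)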
if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return !SI->isVolatile(); - + return SI->isUnordered(); + IntrinsicInst *II = cast<IntrinsicInst>(I); switch (II->getIntrinsicID()) { default: assert(0 && "doesn't pass 'hasMemoryWrite' predicate"); @@ -227,7 +229,7 @@ static bool isRemovable(Instruction *I) { case Intrinsic::init_trampoline: // Always safe to remove init_trampoline. return true; - + case Intrinsic::memset: case Intrinsic::memmove: case Intrinsic::memcpy: @@ -255,16 +257,16 @@ static uint64_t getPointerSize(Value *V, AliasAnalysis &AA) { const TargetData *TD = AA.getTargetData(); if (TD == 0) return AliasAnalysis::UnknownSize; - + if (AllocaInst *A = dyn_cast<AllocaInst>(V)) { // Get size information for the alloca if (ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); return AliasAnalysis::UnknownSize; } - + assert(isa<Argument>(V) && "Expected AllocaInst or Argument!"); - const PointerType *PT = cast<PointerType>(V->getType()); + PointerType *PT = cast<PointerType>(V->getType()); return TD->getTypeAllocSize(PT->getElementType()); } @@ -287,7 +289,7 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, AliasAnalysis &AA) { const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); - + // If the start pointers are the same, we just have to compare sizes to see if // the later store was larger than the earlier store. if (P1 == P2) { @@ -302,33 +304,33 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, return Later.Ptr->getType() == Earlier.Ptr->getType(); return false; } - + // Make sure that the Later size is >= the Earlier size. if (Later.Size < Earlier.Size) return false; return true; } - + // Otherwise, we have to have size information, and the later store has to be // larger than the earlier one. if (Later.Size == AliasAnalysis::UnknownSize || Earlier.Size == AliasAnalysis::UnknownSize || Later.Size <= Earlier.Size || AA.getTargetData() == 0) return false; - + // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval argument). If so, then it clearly overwrites any // other store to the same object. const TargetData &TD = *AA.getTargetData(); - + const Value *UO1 = GetUnderlyingObject(P1, &TD), *UO2 = GetUnderlyingObject(P2, &TD); - + // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. if (UO1 != UO2) return false; - + // If the "Later" store is to a recognizable object, get its size. if (isObjectPointerWithTrustworthySize(UO2)) { uint64_t ObjectSize = @@ -336,26 +338,26 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, if (ObjectSize == Later.Size) return true; } - + // Okay, we have stores to two completely different pointers. Try to // decompose the pointer into a "base + constant_offset" form. If the base // pointers are equal, then we can reason about the two stores. int64_t EarlierOff = 0, LaterOff = 0; const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, TD); const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, TD); - + // If the base pointers still differ, we have two completely different stores. if (BP1 != BP2) return false; // The later store completely overlaps the earlier store if: - // + // // 1. Both start at the same offset and the later one's size is greater than // or equal to the earlier one's, or // // |--earlier--| // |-- later --| - // + // // 2. 
The earlier store has an offset greater than the later offset, but which // still lies completely within the later store. // @@ -373,7 +375,7 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, /// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a /// memory region into an identical pointer) then it doesn't actually make its -/// input dead in the traditional sense. Consider this case: +/// input dead in the traditional sense. Consider this case: /// /// memcpy(A <- B) /// memcpy(A <- A) @@ -391,10 +393,10 @@ static bool isPossibleSelfRead(Instruction *Inst, // location read. AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA); if (InstReadLoc.Ptr == 0) return false; // Not a reading instruction. - + // If the read and written loc obviously don't alias, it isn't a read. if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; - + // Okay, 'Inst' may copy over itself. However, we can still remove the // DepWrite instruction if we can prove that it reads from the same location // as Inst. This handles useful cases like: @@ -404,10 +406,10 @@ static bool isPossibleSelfRead(Instruction *Inst, // aliases, so removing the first memcpy is safe (assuming it writes <= # // bytes as the second one). AliasAnalysis::Location DepReadLoc = getLocForRead(DepWrite, AA); - + if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) return false; - + // If DepWrite doesn't read memory or if we can't prove it is a must alias, // then it can't be considered dead. return true; @@ -420,43 +422,43 @@ static bool isPossibleSelfRead(Instruction *Inst, bool DSE::runOnBasicBlock(BasicBlock &BB) { bool MadeChange = false; - + // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { Instruction *Inst = BBI++; - + // Handle 'free' calls specially. if (CallInst *F = isFreeCall(Inst)) { MadeChange |= HandleFree(F); continue; } - + // If we find something that writes memory, get its memory dependence. if (!hasMemoryWrite(Inst)) continue; MemDepResult InstDep = MD->getDependency(Inst); - + // Ignore any store where we can't find a local dependence. // FIXME: cross-block DSE would be fun. :) - if (InstDep.isNonLocal() || InstDep.isUnknown()) + if (!InstDep.isDef() && !InstDep.isClobber()) continue; - + // If we're storing the same value back to a pointer that we just // loaded from, then the store can be removed. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { if (SI->getPointerOperand() == DepLoad->getPointerOperand() && - SI->getOperand(0) == DepLoad && !SI->isVolatile()) { + SI->getOperand(0) == DepLoad && isRemovable(SI)) { DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - + // DeleteDeadInstruction can delete the current instruction. Save BBI // in case we need it. WeakVH NextInst(BBI); - + DeleteDeadInstruction(SI, *MD); - + if (NextInst == 0) // Next instruction deleted. BBI = BB.begin(); else if (BBI != BB.begin()) // Revisit this instruction if possible. @@ -467,15 +469,15 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { } } } - + // Figure out what location is being stored to. AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); // If we didn't get a useful location, fail. 
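  // (The loop below walks the local dependence chain backwards: while the
  // dependence is still a def or clobber of Loc, any earlier must-aliased
  // store that Inst completely overwrites is deleted, and the walk may step
  // past a may-aliased store that cannot read Loc to look for more
  // candidates.)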
if (Loc.Ptr == 0) continue; - - while (!InstDep.isNonLocal() && !InstDep.isUnknown()) { + + while (InstDep.isDef() || InstDep.isClobber()) { // Get the memory clobbered by the instruction we depend on. MemDep will // skip any instructions that 'Loc' clearly doesn't interact with. If we // end up depending on a may- or must-aliased load, then we can't optimize @@ -496,12 +498,12 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); - + // Delete the store and now-dead instructions that feed it. DeleteDeadInstruction(DepWrite, *MD); ++NumFastStores; MadeChange = true; - + // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. BBI = Inst; @@ -509,7 +511,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { --BBI; break; } - + // If this is a may-aliased store that is clobbering the store value, we // can keep searching past it for another must-aliased pointer that stores // to the same location. For example, in: @@ -519,20 +521,20 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // we can remove the first store to P even though we don't know if P and Q // alias. if (DepWrite == &BB.front()) break; - + // Can't look past this instruction if it might read 'Loc'. if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) break; - + InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); } } - + // If this block ends in a return, unwind, or unreachable, all allocas are // dead at its end, which means stores to them are also dead. if (BB.getTerminator()->getNumSuccessors() == 0) MadeChange |= handleEndBlock(BB); - + return MadeChange; } @@ -543,18 +545,18 @@ bool DSE::HandleFree(CallInst *F) { MemDepResult Dep = MD->getDependency(F); - while (!Dep.isNonLocal() && !Dep.isUnknown()) { + while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) return MadeChange; - + Value *DepPointer = GetUnderlyingObject(getStoredPointerOperand(Dependency)); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) return MadeChange; - + // DCE instructions only used to calculate that store DeleteDeadInstruction(Dependency, *MD); ++NumFastStores; @@ -567,7 +569,7 @@ bool DSE::HandleFree(CallInst *F) { // free(s); Dep = MD->getDependency(F); }; - + return MadeChange; } @@ -579,28 +581,28 @@ bool DSE::HandleFree(CallInst *F) { /// ret void bool DSE::handleEndBlock(BasicBlock &BB) { bool MadeChange = false; - + // Keep track of all of the stack objects that are dead at the end of the // function. SmallPtrSet<Value*, 16> DeadStackObjects; - + // Find all of the alloca'd pointers in the entry block. BasicBlock *Entry = BB.getParent()->begin(); for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) DeadStackObjects.insert(AI); - + // Treat byval arguments the same, stores to them are dead at the end of the // function. for (Function::arg_iterator AI = BB.getParent()->arg_begin(), AE = BB.getParent()->arg_end(); AI != AE; ++AI) if (AI->hasByValAttr()) DeadStackObjects.insert(AI); - + // Scan the basic block backwards for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){ --BBI; - + // If we find a store, check to see if it points into a dead stack value. 
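  // (DeadStackObjects starts out holding every entry-block alloca and byval
  // argument, i.e. everything trivially dead once the function returns;
  // scanning bottom-up, a read of an object removes it from the set, so only
  // stores into still-dead objects get deleted.)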
if (hasMemoryWrite(BBI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts @@ -609,10 +611,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Stores to stack values are valid candidates for removal. if (DeadStackObjects.count(Pointer)) { Instruction *Dead = BBI++; - + DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " << *Dead << "\n Object: " << *Pointer << '\n'); - + // DCE instructions only used to calculate that store. DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); ++NumFastStores; @@ -620,7 +622,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { continue; } } - + // Remove any dead non-memory-mutating instructions. if (isInstructionTriviallyDead(BBI)) { Instruction *Inst = BBI++; @@ -629,55 +631,61 @@ bool DSE::handleEndBlock(BasicBlock &BB) { MadeChange = true; continue; } - + if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) { DeadStackObjects.erase(A); continue; } - + if (CallSite CS = cast<Value>(BBI)) { // If this call does not access memory, it can't be loading any of our // pointers. if (AA->doesNotAccessMemory(CS)) continue; - + // If the call might load from any of our allocas, then any store above // the call is live. SmallVector<Value*, 8> LiveAllocas; for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { // See if the call site touches it. - AliasAnalysis::ModRefResult A = + AliasAnalysis::ModRefResult A = AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA)); - + if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref) LiveAllocas.push_back(*I); } - + for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), E = LiveAllocas.end(); I != E; ++I) DeadStackObjects.erase(*I); - + // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. if (DeadStackObjects.empty()) return MadeChange; - + continue; } - + AliasAnalysis::Location LoadedLoc; - + // If we encounter a use of the pointer, it is no longer considered dead if (LoadInst *L = dyn_cast<LoadInst>(BBI)) { + if (!L->isUnordered()) // Be conservative with atomic/volatile load + break; LoadedLoc = AA->getLocation(L); } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) { LoadedLoc = AA->getLocation(V); } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) { LoadedLoc = AA->getLocationForSource(MTI); - } else { - // Not a loading instruction. + } else if (!BBI->mayReadFromMemory()) { + // Instruction doesn't read memory. Note that stores that weren't removed + // above will hit this case. continue; + } else { + // Unknown inst; assume it clobbers everything. + break; } // Remove any allocas from the DeadPointer set that are loaded, as this @@ -689,7 +697,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (DeadStackObjects.empty()) break; } - + return MadeChange; } @@ -703,14 +711,14 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, // A constant can't be in the dead pointer set. if (isa<Constant>(UnderlyingPointer)) return; - + // If the kill pointer can be easily reduced to an alloca, don't bother doing // extraneous AA queries. 
if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { DeadStackObjects.erase(const_cast<Value*>(UnderlyingPointer)); return; } - + SmallVector<Value*, 16> NowLive; for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 3d3f17b..c0223d2 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -92,7 +92,7 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { // Hash in all of the operands as pointers. unsigned Res = 0; for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) - Res ^= getHash(Inst->getOperand(i)) << i; + Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); if (CastInst *CI = dyn_cast<CastInst>(Inst)) Res ^= getHash(CI->getType()); @@ -185,7 +185,7 @@ unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { assert(!Inst->getOperand(i)->getType()->isMetadataTy() && "Cannot value number calls with metadata operands"); - Res ^= getHash(Inst->getOperand(i)) << i; + Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); } // Mix in the opcode. @@ -357,7 +357,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If this is a non-volatile load, process it. if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { // Ignore volatile loads. - if (LI->isVolatile()) { + if (!LI->isSimple()) { LastStore = 0; continue; } @@ -437,7 +437,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); // Remember that this was the last store we saw for DSE. - if (!SI->isVolatile()) + if (SI->isSimple()) LastStore = SI; } } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 87b7317..cbfdbcd 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -41,12 +41,16 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/IRBuilder.h" +#include "llvm/Support/PatternMatch.h" using namespace llvm; +using namespace PatternMatch; STATISTIC(NumGVNInstr, "Number of instructions deleted"); STATISTIC(NumGVNLoad, "Number of loads deleted"); STATISTIC(NumGVNPRE, "Number of instructions PRE'd"); STATISTIC(NumGVNBlocks, "Number of blocks merged"); +STATISTIC(NumGVNSimpl, "Number of instructions simplified"); +STATISTIC(NumGVNEqProp, "Number of equalities propagated"); STATISTIC(NumPRELoad, "Number of loads PRE'd"); static cl::opt<bool> EnablePRE("enable-pre", @@ -63,7 +67,7 @@ static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true)); namespace { struct Expression { uint32_t opcode; - const Type *type; + Type *type; SmallVector<uint32_t, 4> varargs; Expression(uint32_t o = ~2U) : opcode(o) { } @@ -548,6 +552,9 @@ namespace { void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); + unsigned replaceAllDominatedUsesWith(Value *From, Value *To, + BasicBlock *Root); + bool propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root); }; char GVN::ID = 0; @@ -655,7 +662,7 @@ SpeculationFailure: /// CanCoerceMustAliasedValueToLoad - Return true if /// CoerceAvailableValueToLoadType will succeed. static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, - const Type *LoadTy, + Type *LoadTy, const TargetData &TD) { // If the loaded or stored value is a first class array or struct, don't try // to transform them. 
We need to be able to bitcast to integer. @@ -680,17 +687,17 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, /// /// If we can't do it, return null. static Value *CoerceAvailableValueToLoadType(Value *StoredVal, - const Type *LoadedTy, + Type *LoadedTy, Instruction *InsertPt, const TargetData &TD) { if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) return 0; // If this is already the right type, just return it. - const Type *StoredValTy = StoredVal->getType(); + Type *StoredValTy = StoredVal->getType(); - uint64_t StoreSize = TD.getTypeStoreSizeInBits(StoredValTy); - uint64_t LoadSize = TD.getTypeStoreSizeInBits(LoadedTy); + uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy); + uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy); // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { @@ -704,7 +711,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } - const Type *TypeToCastTo = LoadedTy; + Type *TypeToCastTo = LoadedTy; if (TypeToCastTo->isPointerTy()) TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext()); @@ -743,7 +750,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, } // Truncate the integer to the right size now. - const Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize); + Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize); StoredVal = new TruncInst(StoredVal, NewIntTy, "trunc", InsertPt); if (LoadedTy == NewIntTy) @@ -765,7 +772,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, /// Check this case to see if there is anything more we can do before we give /// up. This returns -1 if we have to give up, or a byte number in the stored /// value of the piece that feeds the load. -static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr, +static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, Value *WritePtr, uint64_t WriteSizeInBits, const TargetData &TD) { @@ -839,7 +846,7 @@ static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr, /// AnalyzeLoadFromClobberingStore - This function is called when we have a /// memdep query of a load that ends up being a clobbering store. -static int AnalyzeLoadFromClobberingStore(const Type *LoadTy, Value *LoadPtr, +static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StoreInst *DepSI, const TargetData &TD) { // Cannot handle reading from store of first-class aggregate yet. @@ -856,7 +863,7 @@ static int AnalyzeLoadFromClobberingStore(const Type *LoadTy, Value *LoadPtr, /// AnalyzeLoadFromClobberingLoad - This function is called when we have a /// memdep query of a load that ends up being clobbered by another load. See if /// the other load can feed into the second load. -static int AnalyzeLoadFromClobberingLoad(const Type *LoadTy, Value *LoadPtr, +static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI, const TargetData &TD){ // Cannot handle reading from store of first-class aggregate yet. 
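  // (Worked example with illustrative values, little-endian: for a clobbering
  // "store i32 %v" at %p and a "load i8" from %p+2, these analyses return
  // offset 2, and GetStoreValueForLoad below recovers the byte by shifting
  // %v right by 16 bits and truncating to i8.)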
if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy()) @@ -883,7 +890,7 @@ static int AnalyzeLoadFromClobberingLoad(const Type *LoadTy, Value *LoadPtr, -static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr, +static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, MemIntrinsic *MI, const TargetData &TD) { // If the mem operation is a non-constant size, we can't handle it. @@ -920,7 +927,7 @@ static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr, llvm::Type::getInt8PtrTy(Src->getContext())); Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1); + Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); if (ConstantFoldLoadFromConstPtr(Src, &TD)) return Offset; @@ -934,7 +941,7 @@ static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr, /// mustalias. Check this case to see if there is anything more we can do /// before we give up. static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, - const Type *LoadTy, + Type *LoadTy, Instruction *InsertPt, const TargetData &TD){ LLVMContext &Ctx = SrcVal->getType()->getContext(); @@ -946,10 +953,9 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. if (SrcVal->getType()->isPointerTy()) - SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx), "tmp"); + SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx)); if (!SrcVal->getType()->isIntegerTy()) - SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8), - "tmp"); + SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); // Shift the bits to the least significant depending on endianness. unsigned ShiftAmt; @@ -959,11 +965,10 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, ShiftAmt = (StoreSize-LoadSize-Offset)*8; if (ShiftAmt) - SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt, "tmp"); + SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt); if (LoadSize != StoreSize) - SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8), - "tmp"); + SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8)); return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD); } @@ -974,7 +979,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, /// because the pointers don't mustalias. Check this case to see if there is /// anything more we can do before we give up. 
static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, - const Type *LoadTy, Instruction *InsertPt, + Type *LoadTy, Instruction *InsertPt, GVN &gvn) { const TargetData &TD = *gvn.getTargetData(); // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to @@ -982,8 +987,8 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType()); unsigned LoadSize = TD.getTypeStoreSize(LoadTy); if (Offset+LoadSize > SrcValSize) { - assert(!SrcVal->isVolatile() && "Cannot widen volatile load!"); - assert(isa<IntegerType>(SrcVal->getType())&&"Can't widen non-integer load"); + assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!"); + assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load"); // If we have a load/load clobber and DepLI can be widened to cover this // load, then we should widen it to the next power of 2 size big enough! unsigned NewLoadSize = Offset+LoadSize; @@ -996,7 +1001,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, // memdep queries will find the new load. We can't easily remove the old // load completely because it is already in the value numbering table. IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal)); - const Type *DestPTy = + Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize*8); DestPTy = PointerType::get(DestPTy, cast<PointerType>(PtrVal->getType())->getAddressSpace()); @@ -1034,7 +1039,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, /// GetMemInstValueForLoad - This function is called when we have a /// memdep query of a load that ends up being a clobbering mem intrinsic. static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, - const Type *LoadTy, Instruction *InsertPt, + Type *LoadTy, Instruction *InsertPt, const TargetData &TD){ LLVMContext &Ctx = LoadTy->getContext(); uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8; @@ -1081,7 +1086,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, llvm::Type::getInt8PtrTy(Src->getContext())); Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1); + Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); return ConstantFoldLoadFromConstPtr(Src, &TD); } @@ -1154,7 +1159,7 @@ struct AvailableValueInBlock { /// MaterializeAdjustedValue - Emit code into this block to adjust the value /// defined here to the specified type. This handles various coercion cases. - Value *MaterializeAdjustedValue(const Type *LoadTy, GVN &gvn) const { + Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { Value *Res; if (isSimpleValue()) { Res = getSimpleValue(); @@ -1213,7 +1218,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI->getType(), LI->getName()); - const Type *LoadTy = LI->getType(); + Type *LoadTy = LI->getType(); for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { const AvailableValueInBlock &AV = ValuesPerBlock[i]; @@ -1274,7 +1279,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // If we had a phi translation failure, we'll have a single entry which is a // clobber in the current block. Reject this early. 
- if (Deps.size() == 1 && Deps[0].getResult().isUnknown()) { + if (Deps.size() == 1 + && !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) + { DEBUG( dbgs() << "GVN: non-local load "; WriteAsOperand(dbgs(), LI); @@ -1294,7 +1301,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); - if (DepInfo.isUnknown()) { + if (!DepInfo.isDef() && !DepInfo.isClobber()) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1359,7 +1366,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { continue; } - assert(DepInfo.isDef() && "Expecting def here"); + // DepInfo.isDef() here Instruction *DepInst = DepInfo.getInst(); @@ -1446,8 +1453,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) Blockers.insert(UnavailableBlocks[i]); - // Lets find first basic block with more than one predecessor. Walk backwards - // through predecessors if needed. + // Let's find the first basic block with more than one predecessor. Walk + // backwards through predecessors if needed. BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; @@ -1519,10 +1526,19 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { << Pred->getName() << "': " << *LI << '\n'); return false; } + + if (LoadBB->isLandingPad()) { + DEBUG(dbgs() + << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '" + << Pred->getName() << "': " << *LI << '\n'); + return false; + } + unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB); NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum)); } } + if (!NeedToSplit.empty()) { toSplit.append(NeedToSplit.begin(), NeedToSplit.end()); return false; @@ -1660,7 +1676,7 @@ bool GVN::processLoad(LoadInst *L) { if (!MD) return false; - if (L->isVolatile()) + if (!L->isSimple()) return false; if (L->use_empty()) { @@ -1747,7 +1763,11 @@ bool GVN::processLoad(LoadInst *L) { return false; } - if (Dep.isUnknown()) { + // If it is defined in another block, try harder. + if (Dep.isNonLocal()) + return processNonLocalLoad(L); + + if (!Dep.isDef()) { DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; @@ -1757,12 +1777,6 @@ bool GVN::processLoad(LoadInst *L) { return false; } - // If it is defined in another block, try harder. - if (Dep.isNonLocal()) - return processNonLocalLoad(L); - - assert(Dep.isDef() && "Expecting def here"); - Instruction *DepInst = Dep.getInst(); if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); @@ -1874,6 +1888,133 @@ Value *GVN::findLeader(BasicBlock *BB, uint32_t num) { return Val; } +/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the +/// use is dominated by the given basic block. Returns the number of uses that +/// were replaced. +unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, + BasicBlock *Root) { + unsigned Count = 0; + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE; ) { + Instruction *User = cast<Instruction>(*UI); + unsigned OpNum = UI.getOperandNo(); + ++UI; + + if (DT->dominates(Root, User->getParent())) { + User->setOperand(OpNum, To); + ++Count; + } + } + return Count; +} + +/// propagateEquality - The given values are known to be equal in every block +/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with +/// 'RHS' everywhere in the scope. Returns whether a change was made. 
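+/// For example, given "br i1 %cmp, label %t, label %f" where %cmp is
+/// "icmp eq i32 %a, 42", every use of %a dominated by %t can be rewritten to
+/// the constant 42; and if "and i1 %x, %y" is known true, then %x and %y are
+/// each known true recursively. (The IR names are illustrative.)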
+bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { + if (LHS == RHS) return false; + assert(LHS->getType() == RHS->getType() && "Equal but types differ!"); + + // Don't try to propagate equalities between constants. + if (isa<Constant>(LHS) && isa<Constant>(RHS)) + return false; + + // Make sure that any constants are on the right-hand side. In general the + // best results are obtained by placing the longest lived value on the RHS. + if (isa<Constant>(LHS)) + std::swap(LHS, RHS); + + // If neither term is constant then bail out. This is not for correctness, + // it's just that the non-constant case is much less useful: it occurs just + // as often as the constant case but handling it hardly ever results in an + // improvement. + if (!isa<Constant>(RHS)) + return false; + + // If value numbering later deduces that an instruction in the scope is equal + // to 'LHS' then ensure it will be turned into 'RHS'. + addToLeaderTable(VN.lookup_or_add(LHS), RHS, Root); + + // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. + unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); + bool Changed = NumReplacements > 0; + NumGVNEqProp += NumReplacements; + + // Now try to deduce additional equalities from this one. For example, if the + // known equality was "(A != B)" == "false" then it follows that A and B are + // equal in the scope. Only boolean equalities with an explicit true or false + // RHS are currently supported. + if (!RHS->getType()->isIntegerTy(1)) + // Not a boolean equality - bail out. + return Changed; + ConstantInt *CI = dyn_cast<ConstantInt>(RHS); + if (!CI) + // RHS neither 'true' nor 'false' - bail out. + return Changed; + // Whether RHS equals 'true'. Otherwise it equals 'false'. + bool isKnownTrue = CI->isAllOnesValue(); + bool isKnownFalse = !isKnownTrue; + + // If "A && B" is known true then both A and B are known true. If "A || B" + // is known false then both A and B are known false. + Value *A, *B; + if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) || + (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) { + Changed |= propagateEquality(A, RHS, Root); + Changed |= propagateEquality(B, RHS, Root); + return Changed; + } + + // If we are propagating an equality like "(A == B)" == "true" then also + // propagate the equality A == B. + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { + // Only equality comparisons are supported. + if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || + (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) { + Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); + Changed |= propagateEquality(Op0, Op1, Root); + } + return Changed; + } + + return Changed; +} + +/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'. Return +/// true if every path from the entry block to 'Dst' passes via this edge. In +/// particular 'Dst' must not be reachable via another edge from 'Src'. +static bool isOnlyReachableViaThisEdge(BasicBlock *Src, BasicBlock *Dst, + DominatorTree *DT) { + // First off, there must not be more than one edge from Src to Dst, there + // should be exactly one. So keep track of the number of times Src occurs + // as a predecessor of Dst and fail if it's more than once. Secondly, any + // other predecessors of Dst should be dominated by Dst (see logic below). 
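+  // (E.g. in a diamond A->B, A->C, B->D, C->D, the edge B->D fails the test:
+  // D is reachable from A through C, and C is not dominated by D. For a
+  // conditional branch whose true successor T has A as its only predecessor,
+  // the edge A->T succeeds.)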
+ bool SawEdgeFromSrc = false; + for (pred_iterator PI = pred_begin(Dst), PE = pred_end(Dst); PI != PE; ++PI) { + BasicBlock *Pred = *PI; + if (Pred == Src) { + // An edge from Src to Dst. + if (SawEdgeFromSrc) + // There are multiple edges from Src to Dst - fail. + return false; + SawEdgeFromSrc = true; + continue; + } + // If the predecessor is not dominated by Dst, then it must be possible to + // reach it either without passing through Src (and thus not via the edge) + // or by passing through Src but taking a different edge out of Src. Either + // way it is possible to reach Dst without passing via the edge, so fail. + if (!DT->dominates(Dst, *PI)) + return false; + } + assert(SawEdgeFromSrc && "No edge between these basic blocks!"); + + // Every path from the entry block to Dst must at some point pass to Dst from + // a predecessor that is not dominated by Dst. This predecessor can only be + // Src, since all others are dominated by Dst. As there is only one edge from + // Src to Dst, the path passes by this edge. + return true; +} /// processInstruction - When calculating availability, handle an instruction /// by inserting it into the appropriate sets @@ -1891,6 +2032,7 @@ bool GVN::processInstruction(Instruction *I) { if (MD && V->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(I); + ++NumGVNSimpl; return true; } @@ -1903,30 +2045,45 @@ bool GVN::processInstruction(Instruction *I) { return false; } - // For conditions branches, we can perform simple conditional propagation on + // For conditional branches, we can perform simple conditional propagation on // the condition value itself. if (BranchInst *BI = dyn_cast<BranchInst>(I)) { if (!BI->isConditional() || isa<Constant>(BI->getCondition())) return false; - + Value *BranchCond = BI->getCondition(); - uint32_t CondVN = VN.lookup_or_add(BranchCond); - + BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); - - if (TrueSucc->getSinglePredecessor()) - addToLeaderTable(CondVN, - ConstantInt::getTrue(TrueSucc->getContext()), - TrueSucc); - if (FalseSucc->getSinglePredecessor()) - addToLeaderTable(CondVN, - ConstantInt::getFalse(TrueSucc->getContext()), - FalseSucc); - - return false; + BasicBlock *Parent = BI->getParent(); + bool Changed = false; + + if (isOnlyReachableViaThisEdge(Parent, TrueSucc, DT)) + Changed |= propagateEquality(BranchCond, + ConstantInt::getTrue(TrueSucc->getContext()), + TrueSucc); + + if (isOnlyReachableViaThisEdge(Parent, FalseSucc, DT)) + Changed |= propagateEquality(BranchCond, + ConstantInt::getFalse(FalseSucc->getContext()), + FalseSucc); + + return Changed; } - + + // For switches, propagate the case values into the case destinations. + if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + Value *SwitchCond = SI->getCondition(); + BasicBlock *Parent = SI->getParent(); + bool Changed = false; + for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) { + BasicBlock *Dst = SI->getSuccessor(i); + if (isOnlyReachableViaThisEdge(Parent, Dst, DT)) + Changed |= propagateEquality(SwitchCond, SI->getCaseValue(i), Dst); + } + return Changed; + } + // Instructions with void type don't return a value, so there's // no point in trying to find redundancies in them. if (I->getType()->isVoidTy()) return false; @@ -2071,6 +2228,9 @@ bool GVN::performPRE(Function &F) { // Nothing to PRE in the entry block. if (CurrentBlock == &F.getEntryBlock()) continue; + // Don't perform PRE on a landing pad. 
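+    // (The landingpad instruction must stay the first non-PHI in its block,
+    // and the block is reached only over invoke unwind edges, which cannot
+    // be split, so there is no safe spot to place PRE'd computations for it.)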
+ if (CurrentBlock->isLandingPad()) continue; + for (BasicBlock::iterator BI = CurrentBlock->begin(), BE = CurrentBlock->end(); BI != BE; ) { Instruction *CurInst = BI++; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index dee3d38..75fa011 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -11,17 +11,6 @@ // computations derived from them) into simpler forms suitable for subsequent // analysis and transformation. // -// This transformation makes the following changes to each loop with an -// identifiable induction variable: -// 1. All loops are transformed to have a SINGLE canonical induction variable -// which starts at zero and steps by one. -// 2. The canonical induction variable is guaranteed to be the first PHI node -// in the loop header block. -// 3. The canonical induction variable is guaranteed to be in a wide enough -// type so that IV expressions need not be (directly) zero-extended or -// sign-extended. -// 4. Any pointer arithmetic recurrences are raised to use array subscripts. -// // If the trip count of a loop is computable, this pass also makes the following // changes: // 1. The exit condition for the loop is canonicalized to compare the @@ -33,9 +22,6 @@ // purpose of the loop is to compute the exit value of some derived // expression, this transformation will make the loop dead. // -// This transformation should be followed by strength reduction after all of the -// desired loop transformations have been performed. -// //===----------------------------------------------------------------------===// #define DEBUG_TYPE "indvars" @@ -57,11 +43,11 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Target/TargetData.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" using namespace llvm; STATISTIC(NumRemoved , "Number of aux indvars removed"); @@ -69,15 +55,21 @@ STATISTIC(NumWidened , "Number of indvars widened"); STATISTIC(NumInserted , "Number of canonical indvars added"); STATISTIC(NumReplaced , "Number of exit values replaced"); STATISTIC(NumLFTR , "Number of loop exit tests replaced"); -STATISTIC(NumElimIdentity, "Number of IV identities eliminated"); STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated"); -STATISTIC(NumElimRem , "Number of IV remainder operations eliminated"); -STATISTIC(NumElimCmp , "Number of IV comparisons eliminated"); STATISTIC(NumElimIV , "Number of congruent IVs eliminated"); -static cl::opt<bool> DisableIVRewrite( - "disable-iv-rewrite", cl::Hidden, - cl::desc("Disable canonical induction variable rewriting")); +namespace llvm { + cl::opt<bool> EnableIVRewrite( + "enable-iv-rewrite", cl::Hidden, + cl::desc("Enable canonical induction variable rewriting")); + + // Trip count verification can be enabled by default under NDEBUG if we + // implement a strong expression equivalence checker in SCEV. Until then, we + // use the verify-indvars flag, which may assert in some cases. 
+ cl::opt<bool> VerifyIndvars( + "verify-indvars", cl::Hidden, + cl::desc("Verify the ScalarEvolution result after running indvars")); +} namespace { class IndVarSimplify : public LoopPass { @@ -105,12 +97,12 @@ namespace { AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - if (!DisableIVRewrite) + if (EnableIVRewrite) AU.addRequired<IVUsers>(); AU.addPreserved<ScalarEvolution>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - if (!DisableIVRewrite) + if (EnableIVRewrite) AU.addPreserved<IVUsers>(); AU.setPreservesCFG(); } @@ -125,24 +117,14 @@ namespace { void HandleFloatingPointIV(Loop *L, PHINode *PH); void RewriteNonIntegerIVs(Loop *L); - void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); - - void SimplifyIVUsers(SCEVExpander &Rewriter); - void SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter); + void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM); - bool EliminateIVUser(Instruction *UseInst, Instruction *IVOperand); - void EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); - void EliminateIVRemainder(BinaryOperator *Rem, - Value *IVOperand, - bool IsSigned); - - void SimplifyCongruentIVs(Loop *L); + void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter); - ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, - PHINode *IndVar, - SCEVExpander &Rewriter); + Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, + PHINode *IndVar, SCEVExpander &Rewriter); void SinkUnusedInvariants(Loop *L); }; @@ -211,6 +193,36 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { return true; } +/// Determine the insertion point for this user. By default, insert immediately +/// before the user. SCEVExpander or LICM will hoist loop invariants out of the +/// loop. For PHI nodes, there may be multiple uses, so compute the nearest +/// common dominator for the incoming blocks. +static Instruction *getInsertPointForUses(Instruction *User, Value *Def, + DominatorTree *DT) { + PHINode *PHI = dyn_cast<PHINode>(User); + if (!PHI) + return User; + + Instruction *InsertPt = 0; + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { + if (PHI->getIncomingValue(i) != Def) + continue; + + BasicBlock *InsertBB = PHI->getIncomingBlock(i); + if (!InsertPt) { + InsertPt = InsertBB->getTerminator(); + continue; + } + InsertBB = DT->findNearestCommonDominator(InsertPt->getParent(), InsertBB); + InsertPt = InsertBB->getTerminator(); + } + assert(InsertPt && "Missing phi operand"); + assert((!isa<Instruction>(Def) || + DT->dominates(cast<Instruction>(Def), InsertPt)) && + "def does not dominate all uses"); + return InsertPt; +} + //===----------------------------------------------------------------------===// // RewriteNonIntegerIVs and helpers. Prefer integer IVs. //===----------------------------------------------------------------------===// @@ -337,14 +349,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Positive and negative strides have different safety conditions. if (IncValue > 0) { // If we have a positive stride, we require the init to be less than the - // exit value and an equality or less than comparison. - if (InitValue >= ExitValue || - NewPred == CmpInst::ICMP_SGT || NewPred == CmpInst::ICMP_SGE) + // exit value. 
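+      // (E.g. "for (float i = Init; i <= Exit; ++i)" takes Exit-Init+1 steps;
+      // the "++Range == 0" check below rejects the case where that trip count
+      // does not fit in 32 bits, since the replacement IV is an i32.)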
+ if (InitValue >= ExitValue) return; uint32_t Range = uint32_t(ExitValue-InitValue); - if (NewPred == CmpInst::ICMP_SLE) { - // Normalize SLE -> SLT, check for infinite loop. + // Check for infinite loop, either: + // while (i <= Exit) or until (i > Exit) + if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) { if (++Range == 0) return; // Range overflows. } @@ -364,14 +376,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { } else { // If we have a negative stride, we require the init to be greater than the - // exit value and an equality or greater than comparison. - if (InitValue >= ExitValue || - NewPred == CmpInst::ICMP_SLT || NewPred == CmpInst::ICMP_SLE) + // exit value. + if (InitValue <= ExitValue) return; uint32_t Range = uint32_t(InitValue-ExitValue); - if (NewPred == CmpInst::ICMP_SGE) { - // Normalize SGE -> SGT, check for infinite loop. + // Check for infinite loop, either: + // while (i >= Exit) or until (i < Exit) + if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) { if (++Range == 0) return; // Range overflows. } @@ -390,7 +402,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { return; } - const IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext()); + IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext()); // Insert new integer induction variable. PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN); @@ -429,7 +441,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // platforms. if (WeakPH) { Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", - PN->getParent()->getFirstNonPHI()); + PN->getParent()->getFirstInsertionPt()); PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN); } @@ -437,6 +449,8 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Add a new IVUsers entry for the newly-created integer PHI. if (IU) IU->AddUsersIfInteresting(NewPHI); + + Changed = true; } void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { @@ -582,45 +596,15 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { //===----------------------------------------------------------------------===// // Rewrite IV users based on a canonical IV. -// To be replaced by -disable-iv-rewrite. +// Only for use with -enable-iv-rewrite. //===----------------------------------------------------------------------===// -/// SimplifyIVUsers - Iteratively perform simplification on IVUsers within this -/// loop. IVUsers is treated as a worklist. Each successive simplification may -/// push more users which may themselves be candidates for simplification. -/// -/// This is the old approach to IV simplification to be replaced by -/// SimplifyIVUsersNoRewrite. -/// -void IndVarSimplify::SimplifyIVUsers(SCEVExpander &Rewriter) { - // Each round of simplification involves a round of eliminating operations - // followed by a round of widening IVs. A single IVUsers worklist is used - // across all rounds. The inner loop advances the user. If widening exposes - // more uses, then another pass through the outer loop is triggered. 
- for (IVUsers::iterator I = IU->begin(); I != IU->end(); ++I) { - Instruction *UseInst = I->getUser(); - Value *IVOperand = I->getOperandValToReplace(); - - if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - EliminateIVComparison(ICmp, IVOperand); - continue; - } - if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { - bool IsSigned = Rem->getOpcode() == Instruction::SRem; - if (IsSigned || Rem->getOpcode() == Instruction::URem) { - EliminateIVRemainder(Rem, IVOperand, IsSigned); - continue; - } - } - } -} - -// FIXME: It is an extremely bad idea to indvar substitute anything more -// complex than affine induction variables. Doing so will put expensive -// polynomial evaluations inside of the loop, and the str reduction pass -// currently can only reduce affine polynomials. For now just disable -// indvar subst on anything more complex than an affine addrec, unless -// it can be expanded to a trivial value. +/// FIXME: It is an extremely bad idea to indvar substitute anything more +/// complex than affine induction variables. Doing so will put expensive +/// polynomial evaluations inside of the loop, and the str reduction pass +/// currently can only reduce affine polynomials. For now just disable +/// indvar subst on anything more complex than an affine addrec, unless +/// it can be expanded to a trivial value. static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { // Loop-invariant values are safe. if (SE->isLoopInvariant(S, L)) return true; @@ -631,7 +615,8 @@ static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { return AR->isAffine(); // An add is safe if all its operands are safe. - if (const SCEVCommutativeExpr *Commutative = dyn_cast<SCEVCommutativeExpr>(S)) { + if (const SCEVCommutativeExpr *Commutative + = dyn_cast<SCEVCommutativeExpr>(S)) { for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(), E = Commutative->op_end(); I != E; ++I) if (!isSafe(*I, L, SE)) return false; @@ -665,7 +650,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { // of different sizes. for (IVUsers::iterator UI = IU->begin(), E = IU->end(); UI != E; ++UI) { Value *Op = UI->getOperandValToReplace(); - const Type *UseTy = Op->getType(); + Type *UseTy = Op->getType(); Instruction *User = UI->getUser(); // Compute the final addrec to expand into code. @@ -692,18 +677,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { // hoist loop invariants out of the loop. For PHI nodes, there may be // multiple uses, so compute the nearest common dominator for the // incoming blocks. - Instruction *InsertPt = User; - if (PHINode *PHI = dyn_cast<PHINode>(InsertPt)) - for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) - if (PHI->getIncomingValue(i) == Op) { - if (InsertPt == User) - InsertPt = PHI->getIncomingBlock(i)->getTerminator(); - else - InsertPt = - DT->findNearestCommonDominator(InsertPt->getParent(), - PHI->getIncomingBlock(i)) - ->getTerminator(); - } + Instruction *InsertPt = getInsertPointForUses(User, Op, DT); // Now expand it into actual Instructions and patch it into place. Value *NewVal = Rewriter.expandCodeFor(AR, UseTy, InsertPt); @@ -747,19 +721,38 @@ namespace { // extend operations. This information is recorded by CollectExtend and // provides the input to WidenIV. struct WideIVInfo { - const Type *WidestNativeType; // Widest integer type created [sz]ext - bool IsSigned; // Was an sext user seen before a zext? 
+ PHINode *NarrowIV; + Type *WidestNativeType; // Widest integer type created [sz]ext + bool IsSigned; // Was an sext user seen before a zext? - WideIVInfo() : WidestNativeType(0), IsSigned(false) {} + WideIVInfo() : NarrowIV(0), WidestNativeType(0), IsSigned(false) {} + }; + + class WideIVVisitor : public IVVisitor { + ScalarEvolution *SE; + const TargetData *TD; + + public: + WideIVInfo WI; + + WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV, + const TargetData *TData) : + SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; } + + // Implement the interface used by simplifyUsersOfIV. + virtual void visitCast(CastInst *Cast); }; } -/// CollectExtend - Update information about the induction variable that is +/// visitCast - Update information about the induction variable that is /// extended by this sign or zero extend operation. This is used to determine /// the final width of the IV before actually widening it. -static void CollectExtend(CastInst *Cast, bool IsSigned, WideIVInfo &WI, - ScalarEvolution *SE, const TargetData *TD) { - const Type *Ty = Cast->getType(); +void WideIVVisitor::visitCast(CastInst *Cast) { + bool IsSigned = Cast->getOpcode() == Instruction::SExt; + if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) + return; + + Type *Ty = Cast->getType(); uint64_t Width = SE->getTypeSizeInBits(Ty); if (TD && !TD->isLegalInteger(Width)) return; @@ -779,6 +772,21 @@ static void CollectExtend(CastInst *Cast, bool IsSigned, WideIVInfo &WI, } namespace { + +/// NarrowIVDefUse - Record a link in the Narrow IV def-use chain along with the +/// WideIV that computes the same value as the Narrow IV def. This avoids +/// caching Use* pointers. +struct NarrowIVDefUse { + Instruction *NarrowDef; + Instruction *NarrowUse; + Instruction *WideDef; + + NarrowIVDefUse(): NarrowDef(0), NarrowUse(0), WideDef(0) {} + + NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD): + NarrowDef(ND), NarrowUse(NU), WideDef(WD) {} +}; + /// WidenIV - The goal of this transform is to remove sign and zero extends /// without creating any new induction variables. 
To do this, it creates a new /// phi of the wider type and redirects all users, either removing extends or @@ -787,7 +795,7 @@ namespace { class WidenIV { // Parameters PHINode *OrigPhi; - const Type *WideType; + Type *WideType; bool IsSigned; // Context @@ -803,13 +811,13 @@ class WidenIV { SmallVectorImpl<WeakVH> &DeadInsts; SmallPtrSet<Instruction*,16> Widened; - SmallVector<std::pair<Use *, Instruction *>, 8> NarrowIVUsers; + SmallVector<NarrowIVDefUse, 8> NarrowIVUsers; public: - WidenIV(PHINode *PN, const WideIVInfo &WI, LoopInfo *LInfo, + WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, ScalarEvolution *SEv, DominatorTree *DTree, SmallVectorImpl<WeakVH> &DI) : - OrigPhi(PN), + OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), IsSigned(WI.IsSigned), LI(LInfo), @@ -826,21 +834,42 @@ public: PHINode *CreateWideIV(SCEVExpander &Rewriter); protected: - Instruction *CloneIVUser(Instruction *NarrowUse, - Instruction *NarrowDef, - Instruction *WideDef); + Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use); + + Instruction *CloneIVUser(NarrowIVDefUse DU); const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse); - Instruction *WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef, - Instruction *WideDef); + const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); + + Instruction *WidenIVUse(NarrowIVDefUse DU); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; } // anonymous namespace -static Value *getExtend( Value *NarrowOper, const Type *WideType, - bool IsSigned, IRBuilder<> &Builder) { +/// isLoopInvariant - Perform a quick domtree based check for loop invariance +/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems +/// gratuitous for this purpose. +static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { + Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst) + return true; + + return DT->properlyDominates(Inst->getParent(), L->getHeader()); +} + +Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use) { + // Set the debug location and conservative insertion point. + IRBuilder<> Builder(Use); + // Hoist the insertion point into loop preheaders as far as possible. + for (const Loop *L = LI->getLoopFor(Use->getParent()); + L && L->getLoopPreheader() && isLoopInvariant(NarrowOper, L, DT); + L = L->getParentLoop()) + Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator()); + return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) : Builder.CreateZExt(NarrowOper, WideType); } @@ -848,10 +877,8 @@ static Value *getExtend( Value *NarrowOper, const Type *WideType, /// CloneIVUser - Instantiate a wide operation to replace a narrow /// operation. This only needs to handle operations that can evaluate to /// SCEVAddRec. It can safely return 0 for any operation we decide not to clone.
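One piece of machinery introduced above is worth restating in isolation. WideIVVisitor::visitCast accumulates the widest legal extend of the IV, letting the first sext or zext seen fix the signedness; a simplified standalone model (plain bit counts instead of llvm::Type, hypothetical names):

struct WideInfoModel {
  unsigned WidestBits = 0; // widest legal [sz]ext seen so far (0 = none)
  bool IsSigned = false;   // was the first extend a sext?
};

// Model of visitCast's bookkeeping: ignore illegal integer widths, fix the
// signedness on the first extend, drop conflicting extends, keep the widest.
void noteExtend(WideInfoModel &WI, unsigned CastBits, bool CastIsSigned,
                bool LegalInteger) {
  if (!LegalInteger)
    return;                       // never create illegal integer types
  if (WI.WidestBits == 0) {
    WI.WidestBits = CastBits;     // first extend fixes the signedness
    WI.IsSigned = CastIsSigned;
    return;
  }
  if (WI.IsSigned != CastIsSigned)
    return;                       // mixed sext/zext users contribute nothing
  if (CastBits > WI.WidestBits)
    WI.WidestBits = CastBits;
}

CloneIVUser, whose updated definition follows, covers the complementary case where a use of the narrow IV must itself be re-instantiated at the wide type.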
-Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse, - Instruction *NarrowDef, - Instruction *WideDef) { - unsigned Opcode = NarrowUse->getOpcode(); +Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { + unsigned Opcode = DU.NarrowUse->getOpcode(); switch (Opcode) { default: return 0; @@ -865,24 +892,23 @@ Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse, case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - DEBUG(dbgs() << "Cloning IVUser: " << *NarrowUse << "\n"); - - IRBuilder<> Builder(NarrowUse); + DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n"); // Replace NarrowDef operands with WideDef. Otherwise, we don't know // anything about the narrow operand yet so must insert a [sz]ext. It is // probably loop invariant and will be folded or hoisted. If it actually // comes from a widened IV, it should be removed during a future call to // WidenIVUse. - Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) ? WideDef : - getExtend(NarrowUse->getOperand(0), WideType, IsSigned, Builder); - Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) ? WideDef : - getExtend(NarrowUse->getOperand(1), WideType, IsSigned, Builder); + Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? DU.WideDef : + getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse); + Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? DU.WideDef : + getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse); - BinaryOperator *NarrowBO = cast<BinaryOperator>(NarrowUse); + BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse); BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, NarrowBO->getName()); + IRBuilder<> Builder(DU.NarrowUse); Builder.Insert(WideBO); if (const OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(NarrowBO)) { @@ -894,45 +920,46 @@ Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse, llvm_unreachable(0); } -/// HoistStep - Attempt to hoist an IV increment above a potential use. -/// -/// To successfully hoist, two criteria must be met: -/// - IncV operands dominate InsertPos and -/// - InsertPos dominates IncV -/// -/// Meeting the second condition means that we don't need to check all of IncV's -/// existing uses (it's moving up in the domtree). -/// -/// This does not yet recursively hoist the operands, although that would -/// not be difficult. -static bool HoistStep(Instruction *IncV, Instruction *InsertPos, - const DominatorTree *DT) -{ - if (DT->dominates(IncV, InsertPos)) - return true; +/// No-wrap operations can transfer sign extension of their result to their +/// operands. Generate the SCEV value for the widened operation without +/// actually modifying the IR yet. If the expression after extending the +/// operands is an AddRec for this loop, return it. +const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { + // Handle the common case of add<nsw/nuw> + if (DU.NarrowUse->getOpcode() != Instruction::Add) + return 0; - if (!DT->dominates(InsertPos->getParent(), IncV->getParent())) - return false; + // One operand (NarrowDef) has already been extended to WideDef. Now determine + // if extending the other will lead to a recurrence. + unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 
1 : 0; + assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU"); + + const SCEV *ExtendOperExpr = 0; + const OverflowingBinaryOperator *OBO = + cast<OverflowingBinaryOperator>(DU.NarrowUse); + if (IsSigned && OBO->hasNoSignedWrap()) + ExtendOperExpr = SE->getSignExtendExpr( + SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); + else if (!IsSigned && OBO->hasNoUnsignedWrap()) + ExtendOperExpr = SE->getZeroExtendExpr( + SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); + else + return 0; - if (IncV->mayHaveSideEffects()) - return false; + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>( + SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr, + IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW)); - // Attempt to hoist IncV - for (User::op_iterator OI = IncV->op_begin(), OE = IncV->op_end(); - OI != OE; ++OI) { - Instruction *OInst = dyn_cast<Instruction>(OI); - if (OInst && !DT->dominates(OInst, InsertPos)) - return false; - } - IncV->moveBefore(InsertPos); - return true; + if (!AddRec || AddRec->getLoop() != L) + return 0; + return AddRec; } -// GetWideRecurrence - Is this instruction potentially interesting from IVUsers' -// perspective after widening it's type? In other words, can the extend be -// safely hoisted out of the loop with SCEV reducing the value to a recurrence -// on the same loop. If so, return the sign or zero extended -// recurrence. Otherwise return NULL. +/// GetWideRecurrence - Is this instruction potentially interesting from +/// IVUsers' perspective after widening its type? In other words, can the +/// extend be safely hoisted out of the loop with SCEV reducing the value to a +/// recurrence on the same loop. If so, return the sign or zero extended +/// recurrence. Otherwise return NULL. const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { if (!SE->isSCEVable(NarrowUse->getType())) return 0; @@ -951,47 +978,45 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr); if (!AddRec || AddRec->getLoop() != L) return 0; - return AddRec; } /// WidenIVUse - Determine whether an individual user of the narrow IV can be /// widened. If so, return the wide clone of the user. -Instruction *WidenIV::WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef, - Instruction *WideDef) { - Instruction *NarrowUse = cast<Instruction>(NarrowDefUse.getUser()); +Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU) { // Stop traversing the def-use chain at inner-loop phis or post-loop phis. - if (isa<PHINode>(NarrowUse) && LI->getLoopFor(NarrowUse->getParent()) != L) + if (isa<PHINode>(DU.NarrowUse) && + LI->getLoopFor(DU.NarrowUse->getParent()) != L) return 0; // Our raison d'etre! Eliminate sign and zero extension. - if (IsSigned ? isa<SExtInst>(NarrowUse) : isa<ZExtInst>(NarrowUse)) { - Value *NewDef = WideDef; - if (NarrowUse->getType() != WideType) { - unsigned CastWidth = SE->getTypeSizeInBits(NarrowUse->getType()); + if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) { + Value *NewDef = DU.WideDef; + if (DU.NarrowUse->getType() != WideType) { + unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType()); unsigned IVWidth = SE->getTypeSizeInBits(WideType); if (CastWidth < IVWidth) { // The cast isn't as wide as the IV, so insert a Trunc.
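GetExtendedOperandRecurrence above leans on a small arithmetic fact that is easy to state independently: a no-wrap add commutes with extension. A checkable C++ model under the stated precondition (illustrative only, not the pass's code):

#include <cstdint>

// If a 32-bit add is known not to wrap (the nsw flag), sign-extending its
// result equals adding the sign-extended operands in 64 bits, so the extend
// may be pushed down onto the operands.
int64_t sextOfAdd(int32_t a, int32_t b) {
  return int64_t(int32_t(a + b));   // sext(add nsw a, b); UB if it wraps
}

int64_t addOfSext(int32_t a, int32_t b) {
  return int64_t(a) + int64_t(b);   // add(sext a, sext b)
}
// Whenever a + b fits in 32 bits, sextOfAdd(a, b) == addOfSext(a, b).

The Trunc insertion picked up below handles the opposite situation, where the wide value must be narrowed back for a cast that is not as wide as the new IV.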
- IRBuilder<> Builder(NarrowDefUse); - NewDef = Builder.CreateTrunc(WideDef, NarrowUse->getType()); + IRBuilder<> Builder(DU.NarrowUse); + NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType()); } else { // A wider extend was hidden behind a narrower one. This may induce // another round of IV widening in which the intermediate IV becomes // dead. It should be very rare. DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi - << " not wide enough to subsume " << *NarrowUse << "\n"); - NarrowUse->replaceUsesOfWith(NarrowDef, WideDef); - NewDef = NarrowUse; + << " not wide enough to subsume " << *DU.NarrowUse << "\n"); + DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef); + NewDef = DU.NarrowUse; } } - if (NewDef != NarrowUse) { - DEBUG(dbgs() << "INDVARS: eliminating " << *NarrowUse - << " replaced by " << *WideDef << "\n"); + if (NewDef != DU.NarrowUse) { + DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse + << " replaced by " << *DU.WideDef << "\n"); ++NumElimExt; - NarrowUse->replaceAllUsesWith(NewDef); - DeadInsts.push_back(NarrowUse); + DU.NarrowUse->replaceAllUsesWith(NewDef); + DeadInsts.push_back(DU.NarrowUse); } // Now that the extend is gone, we want to expose its uses for potential // further simplification. We don't need to directly inform SimplifyIVUsers @@ -1004,29 +1029,32 @@ Instruction *WidenIV::WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef, } // Does this user itself evaluate to a recurrence after widening? - const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(NarrowUse); + const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse); + if (!WideAddRec) { + WideAddRec = GetExtendedOperandRecurrence(DU); + } if (!WideAddRec) { // This user does not evaluate to a recurrence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. - IRBuilder<> Builder(NarrowDefUse); - Value *Trunc = Builder.CreateTrunc(WideDef, NarrowDef->getType()); - NarrowUse->replaceUsesOfWith(NarrowDef, Trunc); + IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); + DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); return 0; } - // We assume that block terminators are not SCEVable. We wouldn't want to + // Assume block terminators cannot evaluate to a recurrence. We can't // insert a Trunc after a terminator if there happens to be a critical edge. - assert(NarrowUse != NarrowUse->getParent()->getTerminator() && + assert(DU.NarrowUse != DU.NarrowUse->getParent()->getTerminator() && "SCEV is not expected to evaluate a block terminator"); // Reuse the IV increment that SCEVExpander created as long as it dominates // NarrowUse. Instruction *WideUse = 0; - if (WideAddRec == WideIncExpr && HoistStep(WideInc, NarrowUse, DT)) { + if (WideAddRec == WideIncExpr + && SCEVExpander::hoistStep(WideInc, DU.NarrowUse, DT)) WideUse = WideInc; - } else { - WideUse = CloneIVUser(NarrowUse, NarrowDef, WideDef); + else { + WideUse = CloneIVUser(DU); if (!WideUse) return 0; } @@ -1051,13 +1079,13 @@ Instruction *WidenIV::WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef, void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { for (Value::use_iterator UI = NarrowDef->use_begin(), UE = NarrowDef->use_end(); UI != UE; ++UI) { - Use &U = UI.getUse(); + Instruction *NarrowUse = cast<Instruction>(*UI); // Handle data flow merges and bizarre phi cycles.
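Before pushNarrowIVUsers finishes below, the traversal it feeds can be summarized: CreateWideIV drains a worklist of (NarrowDef, NarrowUse, WideDef) triples, queuing a use's own users only once it has a wide clone. A reduced sketch with standard containers (stand-in types, not the LLVM API):

#include <functional>
#include <vector>

struct Inst;                               // stand-in for llvm::Instruction
struct DefUse { Inst *NarrowDef, *NarrowUse, *WideDef; };

// Reduced sketch of CreateWideIV's driver loop: widen one def-use edge at a
// time; only after a use has a wide replacement do its own users get queued.
void processNarrowUses(Inst *OrigPhi, Inst *WidePhi,
                       std::function<void(Inst *, Inst *,
                                          std::vector<DefUse> &)> pushUsers,
                       std::function<Inst *(const DefUse &)> widenOne) {
  std::vector<DefUse> Worklist;
  pushUsers(OrigPhi, WidePhi, Worklist);
  while (!Worklist.empty()) {
    DefUse DU = Worklist.back();
    Worklist.pop_back();
    // widenOne may fail (returns null); then the edge was truncated instead.
    if (Inst *WideUse = widenOne(DU))
      pushUsers(DU.NarrowUse, WideUse, Worklist);
  }
}

The Widened set consulted below is what keeps this loop from revisiting data-flow merges and phi cycles.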
- if (!Widened.insert(cast<Instruction>(U.getUser()))) + if (!Widened.insert(NarrowUse)) continue; - NarrowIVUsers.push_back(std::make_pair(&UI.getUse(), WideDef)); + NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUse, WideDef)); } } @@ -1124,23 +1152,19 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { pushNarrowIVUsers(OrigPhi, WidePhi); while (!NarrowIVUsers.empty()) { - Use *UsePtr; - Instruction *WideDef; - tie(UsePtr, WideDef) = NarrowIVUsers.pop_back_val(); - Use &NarrowDefUse = *UsePtr; + NarrowIVDefUse DU = NarrowIVUsers.pop_back_val(); // Process a def-use edge. This may replace the use, so don't hold a // use_iterator across it. - Instruction *NarrowDef = cast<Instruction>(NarrowDefUse.get()); - Instruction *WideUse = WidenIVUse(NarrowDefUse, NarrowDef, WideDef); + Instruction *WideUse = WidenIVUse(DU); // Follow all def-use edges from the previous narrow use. if (WideUse) - pushNarrowIVUsers(cast<Instruction>(NarrowDefUse.getUser()), WideUse); + pushNarrowIVUsers(DU.NarrowUse, WideUse); // WidenIVUse may have removed the def-use edge. - if (NarrowDef->use_empty()) - DeadInsts.push_back(NarrowDef); + if (DU.NarrowDef->use_empty()) + DeadInsts.push_back(DU.NarrowDef); } return WidePhi; } @@ -1149,187 +1173,17 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Simplification of IV users based on SCEV evaluation. //===----------------------------------------------------------------------===// -void IndVarSimplify::EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { - unsigned IVOperIdx = 0; - ICmpInst::Predicate Pred = ICmp->getPredicate(); - if (IVOperand != ICmp->getOperand(0)) { - // Swapped - assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand"); - IVOperIdx = 1; - Pred = ICmpInst::getSwappedPredicate(Pred); - } - - // Get the SCEVs for the ICmp operands. - const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx)); - const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx)); - - // Simplify unnecessary loops away. - const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent()); - S = SE->getSCEVAtScope(S, ICmpLoop); - X = SE->getSCEVAtScope(X, ICmpLoop); - - // If the condition is always true or always false, replace it with - // a constant value. - if (SE->isKnownPredicate(Pred, S, X)) - ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); - else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) - ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); - else - return; - - DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); - ++NumElimCmp; - Changed = true; - DeadInsts.push_back(ICmp); -} - -void IndVarSimplify::EliminateIVRemainder(BinaryOperator *Rem, - Value *IVOperand, - bool IsSigned) { - // We're only interested in the case where we know something about - // the numerator. - if (IVOperand != Rem->getOperand(0)) - return; - - // Get the SCEVs for the ICmp operands. - const SCEV *S = SE->getSCEV(Rem->getOperand(0)); - const SCEV *X = SE->getSCEV(Rem->getOperand(1)); - - // Simplify unnecessary loops away. - const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent()); - S = SE->getSCEVAtScope(S, ICmpLoop); - X = SE->getSCEVAtScope(X, ICmpLoop); - - // i % n --> i if i is in [0,n). - if ((!IsSigned || SE->isKnownNonNegative(S)) && - SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - S, X)) - Rem->replaceAllUsesWith(Rem->getOperand(0)); - else { - // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). 
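The removed remainder rewrites (their logic now reaches this pass through the shared simplifyUsersOfIV entry point used by SimplifyAndExtend) rest on two division-free identities, shown here as a small checkable model:

#include <cassert>
#include <cstdint>

// For a counter known to stay in [0, n), the remainder needs no division.
uint64_t remOfCounter(uint64_t i, uint64_t n) {
  assert(i < n);
  return i;                           // i % n == i        when 0 <= i < n
}

uint64_t remOfIncrementedCounter(uint64_t i, uint64_t n) {
  assert(i < n);
  return (i + 1 == n) ? 0 : i + 1;    // (i+1) % n, as compare-and-select
}

The second form is exactly the icmp-plus-select sequence the removed code below emitted in place of the srem/urem.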
- const SCEV *LessOne = - SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); - if (IsSigned && !SE->isKnownNonNegative(LessOne)) - return; - - if (!SE->isKnownPredicate(IsSigned ? - ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - LessOne, X)) - return; - - ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, - Rem->getOperand(0), Rem->getOperand(1), - "tmp"); - SelectInst *Sel = - SelectInst::Create(ICmp, - ConstantInt::get(Rem->getType(), 0), - Rem->getOperand(0), "tmp", Rem); - Rem->replaceAllUsesWith(Sel); - } - - // Inform IVUsers about the new users. - if (IU) { - if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) - IU->AddUsersIfInteresting(I); - } - DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); - ++NumElimRem; - Changed = true; - DeadInsts.push_back(Rem); -} - -/// EliminateIVUser - Eliminate an operation that consumes a simple IV and has -/// no observable side-effect given the range of IV values. -bool IndVarSimplify::EliminateIVUser(Instruction *UseInst, - Instruction *IVOperand) { - if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - EliminateIVComparison(ICmp, IVOperand); - return true; - } - if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { - bool IsSigned = Rem->getOpcode() == Instruction::SRem; - if (IsSigned || Rem->getOpcode() == Instruction::URem) { - EliminateIVRemainder(Rem, IVOperand, IsSigned); - return true; - } - } - - // Eliminate any operation that SCEV can prove is an identity function. - if (!SE->isSCEVable(UseInst->getType()) || - (UseInst->getType() != IVOperand->getType()) || - (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand))) - return false; - - DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n'); - - UseInst->replaceAllUsesWith(IVOperand); - ++NumElimIdentity; - Changed = true; - DeadInsts.push_back(UseInst); - return true; -} - -/// pushIVUsers - Add all uses of Def to the current IV's worklist. -/// -static void pushIVUsers( - Instruction *Def, - SmallPtrSet<Instruction*,16> &Simplified, - SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) { - - for (Value::use_iterator UI = Def->use_begin(), E = Def->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - - // Avoid infinite or exponential worklist processing. - // Also ensure unique worklist users. - // If Def is a LoopPhi, it may not be in the Simplified set, so check for - // self edges first. - if (User != Def && Simplified.insert(User)) - SimpleIVUsers.push_back(std::make_pair(User, Def)); - } -} - -/// isSimpleIVUser - Return true if this instruction generates a simple SCEV -/// expression in terms of that IV. -/// -/// This is similar to IVUsers' isInsteresting() but processes each instruction -/// non-recursively when the operand is already known to be a simpleIVUser. -/// -static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) { - if (!SE->isSCEVable(I->getType())) - return false; - - // Get the symbolic expression for this instruction. - const SCEV *S = SE->getSCEV(I); - - // We assume that terminators are not SCEVable. - assert((!S || I != I->getParent()->getTerminator()) && - "can't fold terminators"); - - // Only consider affine recurrences. - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S); - if (AR && AR->getLoop() == L) - return true; - - return false; -} -/// SimplifyIVUsersNoRewrite - Iteratively perform simplification on a worklist -/// of IV users. 
Each successive simplification may push more users which may +/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV +/// users. Each successive simplification may push more users which may /// themselves be candidates for simplification. /// -/// The "NoRewrite" algorithm does not require IVUsers analysis. Instead, it -/// simplifies instructions in-place during analysis. Rather than rewriting -/// induction variables bottom-up from their users, it transforms a chain of -/// IVUsers top-down, updating the IR only when it encouters a clear -/// optimization opportunitiy. A SCEVExpander "Rewriter" instance is still -/// needed, but only used to generate a new IV (phi) of wider type for sign/zero -/// extend elimination. +/// Sign/Zero extend elimination is interleaved with IV simplification. /// -/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers. -/// -void IndVarSimplify::SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter) { - std::map<PHINode *, WideIVInfo> WideIVMap; +void IndVarSimplify::SimplifyAndExtend(Loop *L, + SCEVExpander &Rewriter, + LPPassManager &LPM) { + SmallVector<WideIVInfo, 8> WideIVs; SmallVector<PHINode*, 8> LoopPhis; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { @@ -1345,108 +1199,81 @@ void IndVarSimplify::SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter) { // extension. The first time SCEV attempts to normalize sign/zero extension, // the result becomes final. So for the most predictable results, we delay // evaluation of sign/zero extend evaluation until needed, and avoid running - // other SCEV based analysis prior to SimplifyIVUsersNoRewrite. + // other SCEV based analysis prior to SimplifyAndExtend. do { PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. - WideIVInfo WI; - - // Instructions processed by SimplifyIVUsers for CurrIV. - SmallPtrSet<Instruction*,16> Simplified; - - // Use-def pairs if IV users waiting to be processed for CurrIV. - SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers; - - // Push users of the current LoopPhi. In rare cases, pushIVUsers may be - // called multiple times for the same LoopPhi. This is the proper thing to - // do for loop header phis that use each other. - pushIVUsers(CurrIV, Simplified, SimpleIVUsers); + WideIVVisitor WIV(CurrIV, SE, TD); - while (!SimpleIVUsers.empty()) { - Instruction *UseInst, *Operand; - tie(UseInst, Operand) = SimpleIVUsers.pop_back_val(); - // Bypass back edges to avoid extra work. 
- if (UseInst == CurrIV) continue; + Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &WIV); - if (EliminateIVUser(UseInst, Operand)) { - pushIVUsers(Operand, Simplified, SimpleIVUsers); - continue; - } - if (CastInst *Cast = dyn_cast<CastInst>(UseInst)) { - bool IsSigned = Cast->getOpcode() == Instruction::SExt; - if (IsSigned || Cast->getOpcode() == Instruction::ZExt) { - CollectExtend(Cast, IsSigned, WI, SE, TD); - } - continue; - } - if (isSimpleIVUser(UseInst, L, SE)) { - pushIVUsers(UseInst, Simplified, SimpleIVUsers); - } - } - if (WI.WidestNativeType) { - WideIVMap[CurrIV] = WI; + if (WIV.WI.WidestNativeType) { + WideIVs.push_back(WIV.WI); } } while(!LoopPhis.empty()); - for (std::map<PHINode *, WideIVInfo>::const_iterator I = WideIVMap.begin(), - E = WideIVMap.end(); I != E; ++I) { - WidenIV Widener(I->first, I->second, LI, SE, DT, DeadInsts); + for (; !WideIVs.empty(); WideIVs.pop_back()) { + WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts); if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) { Changed = true; LoopPhis.push_back(WidePhi); } } - WideIVMap.clear(); } } -/// SimplifyCongruentIVs - Check for congruent phis in this loop header and -/// populate ExprToIVMap for use later. -/// -void IndVarSimplify::SimplifyCongruentIVs(Loop *L) { - DenseMap<const SCEV *, PHINode *> ExprToIVMap; - for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { - PHINode *Phi = cast<PHINode>(I); - if (!SE->isSCEVable(Phi->getType())) - continue; +//===----------------------------------------------------------------------===// +// LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. +//===----------------------------------------------------------------------===// - const SCEV *S = SE->getSCEV(Phi); - DenseMap<const SCEV *, PHINode *>::const_iterator Pos; - bool Inserted; - tie(Pos, Inserted) = ExprToIVMap.insert(std::make_pair(S, Phi)); - if (Inserted) - continue; - PHINode *OrigPhi = Pos->second; - // Replacing the congruent phi is sufficient because acyclic redundancy - // elimination, CSE/GVN, should handle the rest. However, once SCEV proves - // that a phi is congruent, it's almost certain to be the head of an IV - // user cycle that is isomorphic with the original phi. So it's worth - // eagerly cleaning up the common case of a single IV increment. - if (BasicBlock *LatchBlock = L->getLoopLatch()) { - Instruction *OrigInc = - cast<Instruction>(OrigPhi->getIncomingValueForBlock(LatchBlock)); - Instruction *IsomorphicInc = - cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock)); - if (OrigInc != IsomorphicInc && - SE->getSCEV(OrigInc) == SE->getSCEV(IsomorphicInc) && - HoistStep(OrigInc, IsomorphicInc, DT)) { - DEBUG(dbgs() << "INDVARS: Eliminated congruent iv.inc: " - << *IsomorphicInc << '\n'); - IsomorphicInc->replaceAllUsesWith(OrigInc); - DeadInsts.push_back(IsomorphicInc); - } +/// Check for expressions that ScalarEvolution generates to compute +/// BackedgeTakenInfo. If these expressions have not been reduced, then +/// expanding them may incur additional cost (albeit in the loop preheader). +static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, + ScalarEvolution *SE) { + // If the backedge-taken count is a UDiv, it's very likely a UDiv that + // ScalarEvolution's HowFarToZero or HowManyLessThans produced to compute a + // precise expression, rather than a UDiv from the user's code. If we can't + // find a UDiv in the code with some simple searching, assume the former and + // forego rewriting the loop. 
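To make the UDiv heuristic above concrete: a strided loop carries a division in its trip count that the program never wrote. A small model (hypothetical, unsigned math):

// For "for (unsigned i = 0; i < n; i += 3)" the trip count is ceil(n / 3),
// which SCEV expresses as (n + 2) /u 3, a udiv absent from the source.
// Expanding that SCEV verbatim would plant a real division in the preheader,
// which is why the code below first looks for an equivalent value already
// computed by the loop's own exit compare.
unsigned modelTripCount(unsigned n) {
  return (n + 2) / 3;                 // assumes n + 2 does not wrap
}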
+ if (isa<SCEVUDivExpr>(S)) { + ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!OrigCond) return true; + const SCEV *R = SE->getSCEV(OrigCond->getOperand(1)); + R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1)); + if (R != S) { + const SCEV *L = SE->getSCEV(OrigCond->getOperand(0)); + L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1)); + if (L != S) + return true; } - DEBUG(dbgs() << "INDVARS: Eliminated congruent iv: " << *Phi << '\n'); - ++NumElimIV; - Phi->replaceAllUsesWith(OrigPhi); - DeadInsts.push_back(Phi); } -} -//===----------------------------------------------------------------------===// -// LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. -//===----------------------------------------------------------------------===// + if (EnableIVRewrite) + return false; + + // Recurse past add expressions, which commonly occur in the + // BackedgeTakenCount. They may already exist in program code, and if not, + // they are not too expensive to rematerialize. + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) { + if (isHighCostExpansion(*I, BI, SE)) + return true; + } + return false; + } + + // HowManyLessThans uses a Max expression whenever the loop is not guarded by + // the exit condition. + if (isa<SCEVSMaxExpr>(S) || isa<SCEVUMaxExpr>(S)) + return true; + + // If we haven't recognized an expensive SCEV pattern, assume it's an expression + // produced by program code. + return false; +} /// canExpandBackedgeTakenCount - Return true if this loop's backedge taken /// count expression can be safely and cheaply expanded into an instruction @@ -1465,31 +1292,17 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { if (!BI) return false; - // Special case: If the backedge-taken count is a UDiv, it's very likely a - // UDiv that ScalarEvolution produced in order to compute a precise - // expression, rather than a UDiv from the user's code. If we can't find a - // UDiv in the code with some simple searching, assume the former and forego - // rewriting the loop. - if (isa<SCEVUDivExpr>(BackedgeTakenCount)) { - ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition()); - if (!OrigCond) return false; - const SCEV *R = SE->getSCEV(OrigCond->getOperand(1)); - R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1)); - if (R != BackedgeTakenCount) { - const SCEV *L = SE->getSCEV(OrigCond->getOperand(0)); - L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1)); - if (L != BackedgeTakenCount) - return false; - } - } + if (isHighCostExpansion(BackedgeTakenCount, BI, SE)) + return false; + return true; } /// getBackedgeIVType - Get the widest type used by the loop test after peeking /// through Truncs. /// -/// TODO: Unnecessary if LFTR does not force a canonical IV. -static const Type *getBackedgeIVType(Loop *L) { +/// TODO: Unnecessary when ForceLFTR is removed. +static Type *getBackedgeIVType(Loop *L) { if (!L->getExitingBlock()) return 0; @@ -1502,7 +1315,7 @@ static const Type *getBackedgeIVType(Loop *L) { if (!Cond) return 0; - const Type *Ty = 0; + Type *Ty = 0; for(User::op_iterator OI = Cond->op_begin(), OE = Cond->op_end(); OI != OE; ++OI) { assert((!Ty || Ty == (*OI)->getType()) && "bad icmp operand types"); @@ -1515,12 +1328,187 @@ static const Type *getBackedgeIVType(Loop *L) { return Ty; } +/// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop +/// invariant value to the phi.
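Because the canExpandBackedgeTakenCount hunk above shows only fragments, it may help to see the predicate's post-patch shape in one piece. The following is a reconstruction pieced together from those fragments, not a verbatim quote of the file:

// Reconstructed shape of canExpandBackedgeTakenCount after this change: a
// computable, non-zero count, a single exiting block ending in a branch,
// and a count that isHighCostExpansion does not veto.
static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) {
  const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
      BackedgeTakenCount->isZero())
    return false;

  if (!L->getExitingBlock())
    return false;

  BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
  if (!BI)
    return false;

  if (isHighCostExpansion(BackedgeTakenCount, BI, SE))
    return false;
  return true;
}

getLoopPhiForCounter, defined next, supplies the counter-recognition side of the new LFTR policy.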
+static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { + Instruction *IncI = dyn_cast<Instruction>(IncV); + if (!IncI) + return 0; + + switch (IncI->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + break; + case Instruction::GetElementPtr: + // An IV counter must preserve its type. + if (IncI->getNumOperands() == 2) + break; + default: + return 0; + } + + PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0)); + if (Phi && Phi->getParent() == L->getHeader()) { + if (isLoopInvariant(IncI->getOperand(1), L, DT)) + return Phi; + return 0; + } + if (IncI->getOpcode() == Instruction::GetElementPtr) + return 0; + + // Allow add/sub to be commuted. + Phi = dyn_cast<PHINode>(IncI->getOperand(1)); + if (Phi && Phi->getParent() == L->getHeader()) { + if (isLoopInvariant(IncI->getOperand(0), L, DT)) + return Phi; + } + return 0; +} + +/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show +/// that the current exit test is already sufficiently canonical. +static bool needsLFTR(Loop *L, DominatorTree *DT) { + assert(L->getExitingBlock() && "expected loop exit"); + + BasicBlock *LatchBlock = L->getLoopLatch(); + // Don't bother with LFTR if the loop is not properly simplified. + if (!LatchBlock) + return false; + + BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); + assert(BI && "expected exit branch"); + + // Do LFTR to simplify the exit condition to an ICMP. + ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!Cond) + return true; + + // Do LFTR to simplify the exit ICMP to EQ/NE + ICmpInst::Predicate Pred = Cond->getPredicate(); + if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ) + return true; + + // Look for a loop invariant RHS + Value *LHS = Cond->getOperand(0); + Value *RHS = Cond->getOperand(1); + if (!isLoopInvariant(RHS, L, DT)) { + if (!isLoopInvariant(LHS, L, DT)) + return true; + std::swap(LHS, RHS); + } + // Look for a simple IV counter LHS + PHINode *Phi = dyn_cast<PHINode>(LHS); + if (!Phi) + Phi = getLoopPhiForCounter(LHS, L, DT); + + if (!Phi) + return true; + + // Do LFTR if the exit condition's IV is *not* a simple counter. + Value *IncV = Phi->getIncomingValueForBlock(L->getLoopLatch()); + return Phi != getLoopPhiForCounter(IncV, L, DT); +} + +/// AlmostDeadIV - Return true if this IV has any uses other than the (soon to +/// be rewritten) loop exit test. +static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { + int LatchIdx = Phi->getBasicBlockIndex(LatchBlock); + Value *IncV = Phi->getIncomingValue(LatchIdx); + + for (Value::use_iterator UI = Phi->use_begin(), UE = Phi->use_end(); + UI != UE; ++UI) { + if (*UI != Cond && *UI != IncV) return false; + } + + for (Value::use_iterator UI = IncV->use_begin(), UE = IncV->use_end(); + UI != UE; ++UI) { + if (*UI != Cond && *UI != Phi) return false; + } + return true; +} + +/// FindLoopCounter - Find an affine IV in canonical form. +/// +/// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount +/// +/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride. +/// This is difficult in general for SCEV because of potential overflow. But we +/// could at least handle constant BECounts. +static PHINode * +FindLoopCounter(Loop *L, const SCEV *BECount, + ScalarEvolution *SE, DominatorTree *DT, const TargetData *TD) { + // I'm not sure how BECount could be a pointer type, but we definitely don't + // want to LFTR that. 
+ if (BECount->getType()->isPointerTy()) + return 0; + + uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); + + Value *Cond = + cast<BranchInst>(L->getExitingBlock()->getTerminator())->getCondition(); + + // Loop over all of the PHI nodes, looking for a simple counter. + PHINode *BestPhi = 0; + const SCEV *BestInit = 0; + BasicBlock *LatchBlock = L->getLoopLatch(); + assert(LatchBlock && "needsLFTR should guarantee a loop latch"); + + for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { + PHINode *Phi = cast<PHINode>(I); + if (!SE->isSCEVable(Phi->getType())) + continue; + + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); + if (!AR || AR->getLoop() != L || !AR->isAffine()) + continue; + + // AR may be a pointer type, while BECount is an integer type. + // AR may be wider than BECount. With eq/ne tests overflow is immaterial. + // AR may not be a narrower type, or we may never exit. + uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); + if (PhiWidth < BCWidth || (TD && !TD->isLegalInteger(PhiWidth))) + continue; + + const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)); + if (!Step || !Step->isOne()) + continue; + + int LatchIdx = Phi->getBasicBlockIndex(LatchBlock); + Value *IncV = Phi->getIncomingValue(LatchIdx); + if (getLoopPhiForCounter(IncV, L, DT) != Phi) + continue; + + const SCEV *Init = AR->getStart(); + + if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) { + // Don't force a live loop counter if another IV can be used. + if (AlmostDeadIV(Phi, LatchBlock, Cond)) + continue; + + // Prefer to count-from-zero. This is a more "canonical" counter form. It + // also prefers integer to pointer IVs. + if (BestInit->isZero() != Init->isZero()) { + if (BestInit->isZero()) + continue; + } + // If two IVs both count from zero or both count from nonzero then the + // narrower is likely a dead phi that has been widened. Use the wider phi + // to allow the other to be eliminated. + if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) + continue; + } + BestPhi = Phi; + BestInit = Init; + } + return BestPhi; +} + /// LinearFunctionTestReplace - This method rewrites the exit condition of the /// loop to be a canonical != comparison against the incremented loop induction /// variable. This pass is able to rewrite the exit tests of any loop where the /// SCEV analysis can determine a loop-invariant trip count of the loop, which /// is actually a much broader range than just linear tests. -ICmpInst *IndVarSimplify:: +Value *IndVarSimplify:: LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, @@ -1528,62 +1516,117 @@ LinearFunctionTestReplace(Loop *L, assert(canExpandBackedgeTakenCount(L, SE) && "precondition"); BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); + // LFTR can ignore IV overflow and truncate to the width of + // BECount. This avoids materializing the add(zext(add)) expression. + Type *CntTy = !EnableIVRewrite ? + BackedgeTakenCount->getType() : IndVar->getType(); + + const SCEV *IVLimit = BackedgeTakenCount; + // If the exiting block is not the same as the backedge block, we must compare // against the preincremented value, otherwise we prefer to compare against // the post-incremented value. Value *CmpIndVar; - const SCEV *RHS = BackedgeTakenCount; if (L->getExitingBlock() == L->getLoopLatch()) { // Add one to the "backedge-taken" count to get the trip count. 
// If this addition may overflow, we have to be more pessimistic and // cast the induction variable before doing the add. - const SCEV *Zero = SE->getConstant(BackedgeTakenCount->getType(), 0); const SCEV *N = - SE->getAddExpr(BackedgeTakenCount, - SE->getConstant(BackedgeTakenCount->getType(), 1)); - if ((isa<SCEVConstant>(N) && !N->isZero()) || - SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { - // No overflow. Cast the sum. - RHS = SE->getTruncateOrZeroExtend(N, IndVar->getType()); - } else { - // Potential overflow. Cast before doing the add. - RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount, - IndVar->getType()); - RHS = SE->getAddExpr(RHS, - SE->getConstant(IndVar->getType(), 1)); + SE->getAddExpr(IVLimit, SE->getConstant(IVLimit->getType(), 1)); + if (CntTy == IVLimit->getType()) + IVLimit = N; + else { + const SCEV *Zero = SE->getConstant(IVLimit->getType(), 0); + if ((isa<SCEVConstant>(N) && !N->isZero()) || + SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { + // No overflow. Cast the sum. + IVLimit = SE->getTruncateOrZeroExtend(N, CntTy); + } else { + // Potential overflow. Cast before doing the add. + IVLimit = SE->getTruncateOrZeroExtend(IVLimit, CntTy); + IVLimit = SE->getAddExpr(IVLimit, SE->getConstant(CntTy, 1)); + } } - // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock()); } else { // We have to use the preincremented value... - RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount, - IndVar->getType()); + IVLimit = SE->getTruncateOrZeroExtend(IVLimit, CntTy); CmpIndVar = IndVar; } + // For unit stride, IVLimit = Start + BECount with 2's complement overflow. + // So for non-zero start, compute the IVLimit here. + bool isPtrIV = false; + Type *CmpTy = CntTy; + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); + assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter"); + if (!AR->getStart()->isZero()) { + assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); + const SCEV *IVInit = AR->getStart(); + + // For pointer types, sign extend BECount in order to materialize a GEP. + // Note that without EnableIVRewrite, we never run SCEVExpander on a + // pointer type, because we must preserve the existing GEPs. Instead we + // directly generate a GEP later. + if (IVInit->getType()->isPointerTy()) { + isPtrIV = true; + CmpTy = SE->getEffectiveSCEVType(IVInit->getType()); + IVLimit = SE->getTruncateOrSignExtend(IVLimit, CmpTy); + } + // For integer types, truncate the IV before computing IVInit + BECount. + else { + if (SE->getTypeSizeInBits(IVInit->getType()) + > SE->getTypeSizeInBits(CmpTy)) + IVInit = SE->getTruncateExpr(IVInit, CmpTy); + + IVLimit = SE->getAddExpr(IVInit, IVLimit); + } + } // Expand the code for the iteration count. - assert(SE->isLoopInvariant(RHS, L) && + IRBuilder<> Builder(BI); + + assert(SE->isLoopInvariant(IVLimit, L) && "Computed iteration count is not loop invariant!"); - Value *ExitCnt = Rewriter.expandCodeFor(RHS, IndVar->getType(), BI); + Value *ExitCnt = Rewriter.expandCodeFor(IVLimit, CmpTy, BI); + + // Create a gep for IVInit + IVLimit from an existing pointer base.
+ assert(isPtrIV == IndVar->getType()->isPointerTy() && + "IndVar type must match IVInit type"); + if (isPtrIV) { + Value *IVStart = IndVar->getIncomingValueForBlock(L->getLoopPreheader()); + assert(AR->getStart() == SE->getSCEV(IVStart) && "bad loop counter"); + assert(SE->getSizeOfExpr( + cast<PointerType>(IVStart->getType())->getElementType())->isOne() + && "unit stride pointer IV must be i8*"); + + Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator()); + ExitCnt = Builder.CreateGEP(IVStart, ExitCnt, "lftr.limit"); + Builder.SetInsertPoint(BI); + } // Insert a new icmp_ne or icmp_eq instruction before the branch. - ICmpInst::Predicate Opcode; + ICmpInst::Predicate P; if (L->contains(BI->getSuccessor(0))) - Opcode = ICmpInst::ICMP_NE; + P = ICmpInst::ICMP_NE; else - Opcode = ICmpInst::ICMP_EQ; + P = ICmpInst::ICMP_EQ; DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n" << " LHS:" << *CmpIndVar << '\n' << " op:\t" - << (Opcode == ICmpInst::ICMP_NE ? "!=" : "==") << "\n" - << " RHS:\t" << *RHS << "\n"); + << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n" + << " RHS:\t" << *ExitCnt << "\n" + << " Expr:\t" << *IVLimit << "\n"); + + if (SE->getTypeSizeInBits(CmpIndVar->getType()) + > SE->getTypeSizeInBits(CmpTy)) { + CmpIndVar = Builder.CreateTrunc(CmpIndVar, CmpTy, "lftr.wideiv"); + } - ICmpInst *Cond = new ICmpInst(BI, Opcode, CmpIndVar, ExitCnt, "exitcond"); - Cond->setDebugLoc(BI->getDebugLoc()); + Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond"); Value *OrigCond = BI->getCondition(); // It's tempting to use replaceAllUsesWith here to fully replace the old // comparison, but that's not immediately safe, since users of the old @@ -1612,7 +1655,7 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) return; - Instruction *InsertPt = ExitBlock->getFirstNonPHI(); + Instruction *InsertPt = ExitBlock->getFirstInsertionPt(); BasicBlock::iterator I = Preheader->getTerminator(); while (I != Preheader->begin()) { --I; @@ -1633,6 +1676,10 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { if (isa<DbgInfoIntrinsic>(I)) continue; + // Skip landingpad instructions. + if (isa<LandingPadInst>(I)) + continue; + // Don't sink static AllocaInsts out of the entry block, which would // turn them into dynamic allocas! if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) @@ -1699,7 +1746,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!L->isLoopSimplifyForm()) return false; - if (!DisableIVRewrite) + if (EnableIVRewrite) IU = &getAnalysis<IVUsers>(); LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); @@ -1717,6 +1764,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Create a rewriter object which we'll use to transform the code with. SCEVExpander Rewriter(*SE, "indvars"); +#ifndef NDEBUG + Rewriter.setDebugType(DEBUG_TYPE); +#endif // Eliminate redundant IV users. // @@ -1724,9 +1774,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // attempt to avoid evaluating SCEVs for sign/zero extend operations until // other expressions involving loop IVs have been evaluated. This helps SCEV // set no-wrap flags before normalizing sign/zero extension. - if (DisableIVRewrite) { + if (!EnableIVRewrite) { Rewriter.disableCanonicalMode(); - SimplifyIVUsersNoRewrite(L, Rewriter); + SimplifyAndExtend(L, Rewriter, LPM); } // Check to see if this loop has a computable loop-invariant execution count. 
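One hazard handled by the LinearFunctionTestReplace rewrite above deserves a standalone statement: the exit limit is the backedge-taken count plus one, and that increment can wrap, which is exactly why the code widens before adding when overflow cannot be excluded. A small model (hypothetical helper):

#include <cstdint>

// Model of the "+1 may wrap" hazard: the trip count is the backedge-taken
// count plus one. If BECount is already the maximum 32-bit value, the sum
// only exists in a wider type.
bool tripCountFitsIn32(uint32_t BECount, uint64_t &TripCount) {
  TripCount = uint64_t(BECount) + 1;  // always exact in 64 bits
  return TripCount <= UINT32_MAX;     // false exactly when BECount == 2^32-1
}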
@@ -1739,25 +1789,25 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { RewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV users. - if (!DisableIVRewrite) - SimplifyIVUsers(Rewriter); + if (EnableIVRewrite) + Changed |= simplifyIVUsers(IU, SE, &LPM, DeadInsts); // Eliminate redundant IV cycles. - if (DisableIVRewrite) - SimplifyCongruentIVs(L); + if (!EnableIVRewrite) + NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); // Compute the type of the largest recurrence expression, and decide whether // a canonical induction variable should be inserted. - const Type *LargestType = 0; + Type *LargestType = 0; bool NeedCannIV = false; bool ExpandBECount = canExpandBackedgeTakenCount(L, SE); - if (ExpandBECount) { + if (EnableIVRewrite && ExpandBECount) { // If we have a known trip count and a single exit block, we'll be // rewriting the loop exit test condition below, which requires a // canonical induction variable. NeedCannIV = true; - const Type *Ty = BackedgeTakenCount->getType(); - if (DisableIVRewrite) { + Type *Ty = BackedgeTakenCount->getType(); + if (!EnableIVRewrite) { // In this mode, SimplifyIVUsers may have already widened the IV used by // the backedge test and inserted a Trunc on the compare's operand. Get // the wider type to avoid creating a redundant narrow IV only used by the @@ -1769,10 +1819,10 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { SE->getTypeSizeInBits(LargestType)) LargestType = SE->getEffectiveSCEVType(Ty); } - if (!DisableIVRewrite) { + if (EnableIVRewrite) { for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) { NeedCannIV = true; - const Type *Ty = + Type *Ty = SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType()); if (!LargestType || SE->getTypeSizeInBits(Ty) > @@ -1811,18 +1861,16 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // the end of the pass. while (!OldCannIVs.empty()) { PHINode *OldCannIV = OldCannIVs.pop_back_val(); - OldCannIV->insertBefore(L->getHeader()->getFirstNonPHI()); + OldCannIV->insertBefore(L->getHeader()->getFirstInsertionPt()); } } - + else if (!EnableIVRewrite && ExpandBECount && needsLFTR(L, DT)) { + IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); + } // If we have a trip count expression, rewrite the loop's exit condition // using it. We can currently only handle loops with a single exit. - ICmpInst *NewICmp = 0; - if (ExpandBECount) { - assert(canExpandBackedgeTakenCount(L, SE) && - "canonical IV disrupted BackedgeTaken expansion"); - assert(NeedCannIV && - "LinearFunctionTestReplace requires a canonical induction variable"); + Value *NewICmp = 0; + if (ExpandBECount && IndVar) { // Check preconditions for proper SCEVExpander operation. SCEV does not // express SCEVExpander's dependencies, such as LoopSimplify. Instead any // pass that uses the SCEVExpander must do it. This does not work well for @@ -1837,7 +1885,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter); } // Rewrite IV-derived expressions. - if (!DisableIVRewrite) + if (EnableIVRewrite) RewriteIVExpressions(L, Rewriter); // Clear the rewriter cache, because values that are in the rewriter's cache @@ -1860,12 +1908,34 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // For completeness, inform IVUsers of the IV use in the newly-created // loop exit test instruction. 
- if (NewICmp && IU) - IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0))); - + if (IU && NewICmp) { + ICmpInst *NewICmpInst = dyn_cast<ICmpInst>(NewICmp); + if (NewICmpInst) + IU->AddUsersIfInteresting(cast<Instruction>(NewICmpInst->getOperand(0))); + } // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader()); // Check a post-condition. - assert(L->isLCSSAForm(*DT) && "Indvars did not leave the loop in lcssa form!"); + assert(L->isLCSSAForm(*DT) && + "Indvars did not leave the loop in lcssa form!"); + + // Verify that LFTR and any other changes have not interfered with SCEV's + // ability to compute trip count. +#ifndef NDEBUG + if (!EnableIVRewrite && VerifyIndvars && + !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + SE->forgetLoop(L); + const SCEV *NewBECount = SE->getBackedgeTakenCount(L); + if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) < + SE->getTypeSizeInBits(NewBECount->getType())) + NewBECount = SE->getTruncateOrNoop(NewBECount, + BackedgeTakenCount->getType()); + else + BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, + NewBECount->getType()); + assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV"); + } +#endif + return Changed; } diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index b500d5b..f410af3 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -811,8 +811,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { /// important optimization that encourages jump threading, and needs to be run /// interlaced with other jump threading tasks. bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { - // Don't hack volatile loads. - if (LI->isVolatile()) return false; + // Don't hack volatile/atomic loads. + if (!LI->isSimple()) return false; // If the load is defined in a block with exactly one predecessor, it can't be // partially redundant. diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 66add6c..b79bb13 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -151,6 +151,11 @@ namespace { /// bool isSafeToExecuteUnconditionally(Instruction &I); + /// isGuaranteedToExecute - Check that the instruction is guaranteed to + /// execute. + /// + bool isGuaranteedToExecute(Instruction &I); + /// pointerInvalidatedByLoop - Return true if the body of this loop may /// store into the memory location pointed to by V. /// @@ -357,8 +362,8 @@ void LICM::HoistRegion(DomTreeNode *N) { bool LICM::canSinkOrHoistInst(Instruction &I) { // Loads have extra constraints we have to verify before we can hoist them. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { - if (LI->isVolatile()) - return false; // Don't hoist volatile loads! + if (!LI->isUnordered()) + return false; // Don't hoist volatile/atomic loads! // Loads from constant memory are always safe to move, even if they end up // in the same alias set as something that ends up being modified. @@ -461,7 +466,7 @@ void LICM::sink(Instruction &I) { } else { // Move the instruction to the start of the exit block, after any PHI // nodes in it. - I.moveBefore(ExitBlocks[0]->getFirstNonPHI()); + I.moveBefore(ExitBlocks[0]->getFirstInsertionPt()); // This instruction is no longer in the AST for the current loop, because // we just sunk it out of the loop. If we just sunk it into an outer @@ -504,7 +509,7 @@ void LICM::sink(Instruction &I) { continue; // Insert the code after the last PHI node.
- BasicBlock::iterator InsertPt = ExitBlock->getFirstNonPHI(); + BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt(); // If this is the first exit block processed, just move the original // instruction, otherwise clone the original instruction and insert @@ -577,6 +582,10 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { if (Inst.isSafeToSpeculativelyExecute()) return true; + return isGuaranteedToExecute(Inst); +} + +bool LICM::isGuaranteedToExecute(Instruction &Inst) { // Otherwise we have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. @@ -635,7 +644,7 @@ namespace { for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { BasicBlock *ExitBlock = LoopExitBlocks[i]; Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); - Instruction *InsertPos = ExitBlock->getFirstNonPHI(); + Instruction *InsertPos = ExitBlock->getFirstInsertionPt(); StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos); NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); @@ -713,34 +722,41 @@ void LICM::PromoteAliasSet(AliasSet &AS) { // If there is a non-load/store instruction in the loop, we can't promote // it. - unsigned InstAlignment; if (LoadInst *load = dyn_cast<LoadInst>(Use)) { - assert(!cast<LoadInst>(Use)->isVolatile() && "AST broken"); - InstAlignment = load->getAlignment(); + assert(!load->isVolatile() && "AST broken"); + if (!load->isSimple()) + return; } else if (StoreInst *store = dyn_cast<StoreInst>(Use)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. if (Use->getOperand(1) != ASIV) continue; - InstAlignment = store->getAlignment(); - assert(!cast<StoreInst>(Use)->isVolatile() && "AST broken"); + assert(!store->isVolatile() && "AST broken"); + if (!store->isSimple()) + return; + + // Note that we only check GuaranteedToExecute inside the store case + // so that we do not introduce stores where they did not exist before + // (which would break the LLVM concurrency model). + + // If the alignment of this instruction allows us to specify a more + // restrictive (and performant) alignment and if we are sure this + // instruction will be executed, update the alignment. + // Larger is better, with the exception of 0 being the best alignment. + unsigned InstAlignment = store->getAlignment(); + if ((InstAlignment > Alignment || InstAlignment == 0) + && (Alignment != 0)) + if (isGuaranteedToExecute(*Use)) { + GuaranteedToExecute = true; + Alignment = InstAlignment; + } + + if (!GuaranteedToExecute) + GuaranteedToExecute = isGuaranteedToExecute(*Use); + } else return; // Not a load or store. - // If the alignment of this instruction allows us to specify a more - // restrictive (and performant) alignment and if we are sure this - // instruction will be executed, update the alignment. - // Larger is better, with the exception of 0 being the best alignment.
- if ((InstAlignment > Alignment || InstAlignment == 0) - && (Alignment != 0)) - if (isSafeToExecuteUnconditionally(*Use)) { - GuaranteedToExecute = true; - Alignment = InstAlignment; - } - - if (!GuaranteedToExecute) - GuaranteedToExecute = isSafeToExecuteUnconditionally(*Use); - LoopUses.push_back(Use); } } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a0e41d9..ad15cbb 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -267,7 +267,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, /// processLoopStore - See if this store can be promoted to a memset or memcpy. bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - if (SI->isVolatile()) return false; + if (!SI->isSimple()) return false; Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); @@ -314,7 +314,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0))); if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() && - StoreEv->getOperand(1) == LoadEv->getOperand(1) && !LI->isVolatile()) + StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple()) if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount)) return true; } @@ -463,7 +463,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, SplatValue = 0; } else { // Otherwise, this isn't an idiom we can transform. For example, we can't - // do anything with a 3-byte store, for example. + // do anything with a 3-byte store. return false; } @@ -498,7 +498,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - const Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); + Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), @@ -604,7 +604,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - const Type *IntPtr = TD->getIntPtrType(SI->getContext()); + Type *IntPtr = TD->getIntPtrType(SI->getContext()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 509d026..3e122c2 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -70,12 +70,27 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include <algorithm> using namespace llvm; +namespace llvm { +cl::opt<bool> EnableNested( + "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops")); + +cl::opt<bool> EnableRetry( + "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry")); + +// Temporary flag to cleanup congruent phis after LSR phi expansion. +// It's currently disabled until we can determine whether it's truly useful or +// not. 
The flag should be removed after the v3.0 release. +cl::opt<bool> EnablePhiElim( + "enable-lsr-phielim", cl::Hidden, cl::desc("Enable LSR phi elimination")); +} + namespace { /// RegSortData - This class holds data which is used to order reuse candidates. @@ -219,7 +234,7 @@ struct Formula { void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); unsigned getNumRegs() const; - const Type *getType() const; + Type *getType() const; void DeleteBaseReg(const SCEV *&S); @@ -319,7 +334,7 @@ unsigned Formula::getNumRegs() const { /// getType - Return the type of this formula, if it has one, or null /// otherwise. This type is meaningless except for the bit size. -const Type *Formula::getType() const { +Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ? ScaledReg->getType() : AM.BaseGV ? AM.BaseGV->getType() : @@ -397,7 +412,7 @@ void Formula::dump() const { /// isAddRecSExtable - Return true if the given addrec can be sign-extended /// without changing its value. static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { - const Type *WideTy = + Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1); return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); } @@ -405,7 +420,7 @@ static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { /// isAddSExtable - Return true if the given add can be sign-extended /// without changing its value. static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { - const Type *WideTy = + Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1); return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy)); } @@ -413,7 +428,7 @@ static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { /// isMulSExtable - Return true if the given mul can be sign-extended /// without changing its value. static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { - const Type *WideTy = + Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(M->getType()) * M->getNumOperands()); return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy)); @@ -594,8 +609,8 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { } /// getAccessType - Return the type of the memory being accessed. -static const Type *getAccessType(const Instruction *Inst) { - const Type *AccessTy = Inst->getType(); +static Type *getAccessType(const Instruction *Inst) { + Type *AccessTy = Inst->getType(); if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) AccessTy = SI->getOperand(0)->getType(); else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { @@ -614,7 +629,7 @@ static const Type *getAccessType(const Instruction *Inst) { // All pointers have the same requirements, so canonicalize them to an // arbitrary pointer type to minimize variation. - if (const PointerType *PTy = dyn_cast<PointerType>(AccessTy)) + if (PointerType *PTy = dyn_cast<PointerType>(AccessTy)) AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), PTy->getAddressSpace()); @@ -670,6 +685,21 @@ public: void Loose(); +#ifndef NDEBUG + // Once any of the metrics loses, they must all remain losers. 
+ bool isValid() { + return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds + | ImmCost | SetupCost) != ~0u) + || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds + & ImmCost & SetupCost) == ~0u); + } +#endif + + bool isLoser() { + assert(isValid() && "invalid cost"); + return NumRegs == ~0u; + } + void RateFormula(const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, @@ -702,34 +732,48 @@ void Cost::RateRegister(const SCEV *Reg, if (AR->getLoop() == L) AddRecCost += 1; /// TODO: This should be a function of the stride. - // If this is an addrec for a loop that's already been visited by LSR, - // don't second-guess its addrec phi nodes. LSR isn't currently smart - // enough to reason about more than one loop at a time. Consider these - // registers free and leave them alone. + // If this is an addrec for another loop, don't second-guess its addrec phi + // nodes. LSR isn't currently smart enough to reason about more than one + // loop at a time. LSR has either already run on inner loops, will not run + // on other loops, or cannot be expected to change sibling loops. If the + // AddRec exists, consider its register free and leave it alone. Otherwise, + // do not consider this formula at all. + // FIXME: why do we need to generate such formulae? + else if (!EnableNested || L->contains(AR->getLoop()) || (!AR->getLoop()->contains(L) && DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) { for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); - PHINode *PN = dyn_cast<PHINode>(I); ++I) + PHINode *PN = dyn_cast<PHINode>(I); ++I) { if (SE.isSCEVable(PN->getType()) && (SE.getEffectiveSCEVType(PN->getType()) == SE.getEffectiveSCEVType(AR->getType())) && SE.getSCEV(PN) == AR) return; - + } + if (!EnableNested) { + Loose(); + return; + } // If this isn't one of the addrecs that the loop already has, it // would require a costly new phi and add. TODO: This isn't // precisely modeled right now. ++NumBaseAdds; - if (!Regs.count(AR->getStart())) + if (!Regs.count(AR->getStart())) { RateRegister(AR->getStart(), Regs, L, SE, DT); + if (isLoser()) + return; + } } // Add the step value register, if it needs one. // TODO: The non-affine case isn't precisely modeled here. - if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) - if (!Regs.count(AR->getStart())) + if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) { + if (!Regs.count(AR->getOperand(1))) { RateRegister(AR->getOperand(1), Regs, L, SE, DT); + if (isLoser()) + return; + } + } } ++NumRegs; @@ -769,6 +813,8 @@ void Cost::RateFormula(const Formula &F, return; } RatePrimaryRegister(ScaledReg, Regs, L, SE, DT); + if (isLoser()) + return; } for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { @@ -778,6 +824,8 @@ void Cost::RateFormula(const Formula &F, return; } RatePrimaryRegister(BaseReg, Regs, L, SE, DT); + if (isLoser()) + return; } // Determine how many (unfolded) adds we'll need inside the loop. @@ -795,6 +843,7 @@ void Cost::RateFormula(const Formula &F, else if (Offset != 0) ImmCost += APInt(64, Offset, true).getMinSignedBits(); } + assert(isValid() && "invalid cost"); } /// Loose - Set this cost to a losing value. @@ -980,7 +1029,7 @@ public: }; KindType Kind; - const Type *AccessTy; + Type *AccessTy; SmallVector<int64_t, 8> Offsets; int64_t MinOffset; @@ -995,7 +1044,7 @@ public: /// this LSRUse.
FindUseWithSimilarFormula can't consider uses with different /// max fixup widths to be equivalent, because the narrower one may be relying /// on the implicit truncation to truncate away bogus bits. - const Type *WidestFixupType; + Type *WidestFixupType; /// Formulae - A list of ways to build a value that can satisfy this user. /// After the list is populated, one of these is selected heuristically and @@ -1005,7 +1054,7 @@ public: /// Regs - The set of register candidates used by all formulae in this LSRUse. SmallPtrSet<const SCEV *, 4> Regs; - LSRUse(KindType K, const Type *T) : Kind(K), AccessTy(T), + LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T), MinOffset(INT64_MAX), MaxOffset(INT64_MIN), AllFixupsOutsideLoop(true), @@ -1127,7 +1176,7 @@ void LSRUse::dump() const { /// be completely folded into the user instruction at isel time. This includes /// address-mode folding and special icmp tricks. static bool isLegalUse(const TargetLowering::AddrMode &AM, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI) { switch (Kind) { case LSRUse::Address: @@ -1156,7 +1205,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. if (AM.BaseOffs != 0) { - if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs); + if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs); return false; } @@ -1176,7 +1225,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, static bool isLegalUse(TargetLowering::AddrMode AM, int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI) { // Check for overflow. if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) != @@ -1198,7 +1247,7 @@ static bool isLegalUse(TargetLowering::AddrMode AM, static bool isAlwaysFoldable(int64_t BaseOffs, GlobalValue *BaseGV, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI) { // Fast-path: zero is always foldable. if (BaseOffs == 0 && !BaseGV) return true; @@ -1224,7 +1273,7 @@ static bool isAlwaysFoldable(int64_t BaseOffs, static bool isAlwaysFoldable(const SCEV *S, int64_t MinOffset, int64_t MaxOffset, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI, ScalarEvolution &SE) { // Fast-path: zero is always foldable. @@ -1299,7 +1348,7 @@ class LSRInstance { SmallSetVector<int64_t, 8> Factors; /// Types - Interesting use types, to facilitate truncation reuse. - SmallSetVector<const Type *, 4> Types; + SmallSetVector<Type *, 4> Types; /// Fixups - The list of operands which are to be replaced. 
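One detail worth calling out from the isLegalUse hunk above: the immediate is now negated as -(uint64_t)AM.BaseOffs. Negating the most negative int64_t as a signed value is undefined behavior, while the unsigned negation wraps and produces the intended bit pattern. A standalone sketch (variable names are hypothetical):

    #include <cstdint>
    #include <limits>

    int main() {
      int64_t BaseOffs = std::numeric_limits<int64_t>::min();
      // int64_t Bad = -BaseOffs;          // signed overflow: undefined
      uint64_t Neg = -(uint64_t)BaseOffs;  // defined: wraps modulo 2^64
      (void)Neg;
      return 0;
    }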
SmallVector<LSRFixup, 16> Fixups; @@ -1330,11 +1379,11 @@ class LSRInstance { UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy); + LSRUse::KindType Kind, Type *AccessTy); std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind, - const Type *AccessTy); + Type *AccessTy); void DeleteUse(LSRUse &LU, size_t LUIdx); @@ -1426,7 +1475,8 @@ void LSRInstance::OptimizeShadowIV() { IVUsers::const_iterator CandidateUI = UI; ++UI; Instruction *ShadowUse = CandidateUI->getUser(); - const Type *DestTy = NULL; + Type *DestTy = NULL; + bool IsSigned = false; /* If shadow use is a int->float cast then insert a second IV to eliminate this cast. @@ -1440,10 +1490,14 @@ void LSRInstance::OptimizeShadowIV() { for (unsigned i = 0; i < n; ++i, ++d) foo(d); */ - if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) + if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) { + IsSigned = false; DestTy = UCast->getDestTy(); - else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) + } + else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) { + IsSigned = true; DestTy = SCast->getDestTy(); + } if (!DestTy) continue; if (TLI) { @@ -1457,7 +1511,7 @@ void LSRInstance::OptimizeShadowIV() { if (!PH) continue; if (PH->getNumIncomingValues() != 2) continue; - const Type *SrcTy = PH->getType(); + Type *SrcTy = PH->getType(); int Mantissa = DestTy->getFPMantissaWidth(); if (Mantissa == -1) continue; if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa) @@ -1474,7 +1528,9 @@ void LSRInstance::OptimizeShadowIV() { ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry)); if (!Init) continue; - Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue()); + Constant *NewInit = ConstantFP::get(DestTy, IsSigned ? + (double)Init->getSExtValue() : + (double)Init->getZExtValue()); BinaryOperator *Incr = dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch)); @@ -1776,7 +1832,7 @@ LSRInstance::OptimizeLoopTermCond() { if (!TLI) goto decline_post_inc; // Check for possible scaled-address reuse. - const Type *AccessTy = getAccessType(UI->getUser()); + Type *AccessTy = getAccessType(UI->getUser()); TargetLowering::AddrMode AM; AM.Scale = C->getSExtValue(); if (TLI->isLegalAddressingMode(AM, AccessTy)) @@ -1840,10 +1896,10 @@ LSRInstance::OptimizeLoopTermCond() { /// return true. bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy) { + LSRUse::KindType Kind, Type *AccessTy) { int64_t NewMinOffset = LU.MinOffset; int64_t NewMaxOffset = LU.MaxOffset; - const Type *NewAccessTy = AccessTy; + Type *NewAccessTy = AccessTy; // Check for a mismatched kind. It's tempting to collapse mismatched kinds to // something conservative, however this can pessimize in the case that one of @@ -1882,7 +1938,7 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, /// Either reuse an existing use or create a new one, as needed. 
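The OptimizeShadowIV hunk above now records whether the cast was sitofp or uitofp and seeds the floating-point shadow IV accordingly. A small sketch of why the distinction matters (hypothetical values):

    #include <cstdint>
    #include <cstdio>

    int main() {
      int32_t Init = -1;                        // integer IV initializer
      printf("%.1f\n", (double)(uint32_t)Init); // 4294967295.0: zext seed
      printf("%.1f\n", (double)Init);           // -1.0: sext seed
      return 0;                                 // sitofp needs the sext seed
    }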
std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr, - LSRUse::KindType Kind, const Type *AccessTy) { + LSRUse::KindType Kind, Type *AccessTy) { const SCEV *Copy = Expr; int64_t Offset = ExtractImmediate(Expr, SE); @@ -2044,7 +2100,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = UI->getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; - const Type *AccessTy = 0; + Type *AccessTy = 0; if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { Kind = LSRUse::Address; AccessTy = getAccessType(LF.UserInst); @@ -2464,7 +2520,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, if (LU.Kind != LSRUse::ICmpZero) return; // Determine the integer type for the base formula. - const Type *IntTy = Base.getType(); + Type *IntTy = Base.getType(); if (!IntTy) return; if (SE.getTypeSizeInBits(IntTy) > 64) return; @@ -2538,7 +2594,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, /// scaled-offset address modes, for example. void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // Determine the integer type for the base formula. - const Type *IntTy = Base.getType(); + Type *IntTy = Base.getType(); if (!IntTy) return; // If this Formula already has a scaled register, we can't add another one. @@ -2598,13 +2654,13 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { if (Base.AM.BaseGV) return; // Determine the integer type for the base formula. - const Type *DstTy = Base.getType(); + Type *DstTy = Base.getType(); if (!DstTy) return; DstTy = SE.getEffectiveSCEVType(DstTy); - for (SmallSetVector<const Type *, 4>::const_iterator + for (SmallSetVector<Type *, 4>::const_iterator I = Types.begin(), E = Types.end(); I != E; ++I) { - const Type *SrcTy = *I; + Type *SrcTy = *I; if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) { Formula F = Base; @@ -2741,7 +2797,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { int64_t Imm = WI.Imm; const SCEV *OrigReg = WI.OrigReg; - const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); + Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); unsigned BitWidth = SE.getTypeSizeInBits(IntTy); @@ -3275,6 +3331,9 @@ retry: skip:; } + if (!EnableRetry && !AnySatisfiedReqRegs) + return; + // If none of the formulae had all of the required registers, relax the // constraint so that we don't exclude all formulae. if (!AnySatisfiedReqRegs) { @@ -3298,6 +3357,10 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { // SolveRecurse does all the work. SolveRecurse(Solution, SolutionCost, Workspace, CurCost, CurRegs, VisitedRegs); + if (Solution.empty()) { + DEBUG(dbgs() << "\nNo Satisfactory Solution\n"); + return; + } // Ok, we've now made all our decisions. DEBUG(dbgs() << "\n" @@ -3416,6 +3479,9 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP, // Don't insert instructions before PHI nodes. while (isa<PHINode>(IP)) ++IP; + // Ignore landingpad instructions. + while (isa<LandingPadInst>(IP)) ++IP; + // Ignore debug intrinsics. while (isa<DbgInfoIntrinsic>(IP)) ++IP; @@ -3440,9 +3506,9 @@ Value *LSRInstance::Expand(const LSRFixup &LF, Rewriter.setPostInc(LF.PostIncLoops); // This is the type that the user actually needs. - const Type *OpTy = LF.OperandValToReplace->getType(); + Type *OpTy = LF.OperandValToReplace->getType(); // This will be the type that we'll initially expand to. 
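AdjustInsertPositionForExpand above now steps over landingpad instructions as well: the IR rules require a landingpad to be the first non-PHI instruction in its block, so expanded code must land below it. A self-contained abstraction of the skip order (the enum is a hypothetical stand-in for real instruction kinds):

    #include <cstddef>
    #include <vector>

    enum Kind { PHI, LandingPad, DbgIntrinsic, Other };

    // Mirrors the skip order above: PHIs first, then the landingpad, then
    // debug intrinsics; only after those may new instructions be inserted.
    size_t firstInsertionPoint(const std::vector<Kind> &Block) {
      size_t IP = 0;
      while (IP < Block.size() && Block[IP] == PHI) ++IP;
      while (IP < Block.size() && Block[IP] == LandingPad) ++IP;
      while (IP < Block.size() && Block[IP] == DbgIntrinsic) ++IP;
      return IP;
    }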
- const Type *Ty = F.getType(); + Type *Ty = F.getType(); if (!Ty) // No type known; just expand directly to the ultimate type. Ty = OpTy; @@ -3450,7 +3516,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand directly to the ultimate type if it's the right size. Ty = OpTy; // This is the type to do integer arithmetic in. - const Type *IntTy = SE.getEffectiveSCEVType(Ty); + Type *IntTy = SE.getEffectiveSCEVType(Ty); // Build up a list of operands to add together to form the full base. SmallVector<const SCEV *, 8> Ops; @@ -3527,7 +3593,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // The other interesting way of "folding" with an ICmpZero is to use a // negated immediate. if (!ICmpScaledV) - ICmpScaledV = ConstantInt::get(IntTy, -Offset); + ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); else { Ops.push_back(SE.getUnknown(ICmpScaledV)); ICmpScaledV = ConstantInt::get(IntTy, Offset); @@ -3611,10 +3677,20 @@ void LSRInstance::RewriteForPHI(PHINode *PN, // users. if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && !isa<IndirectBrInst>(BB->getTerminator())) { - Loop *PNLoop = LI.getLoopFor(PN->getParent()); - if (!PNLoop || PN->getParent() != PNLoop->getHeader()) { + BasicBlock *Parent = PN->getParent(); + Loop *PNLoop = LI.getLoopFor(Parent); + if (!PNLoop || Parent != PNLoop->getHeader()) { // Split the critical edge. - BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P); + BasicBlock *NewBB = 0; + if (!Parent->isLandingPad()) { + NewBB = SplitCriticalEdge(BB, Parent, P, + /*MergeIdenticalEdges=*/true, + /*DontDeleteUselessPhis=*/true); + } else { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); + NewBB = NewBBs[0]; + } // If PN is outside of the loop and BB is in the loop, we want to // move the block to be immediately before the PHI block, not @@ -3637,7 +3713,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. - const Type *OpTy = LF.OperandValToReplace->getType(); + Type *OpTy = LF.OperandValToReplace->getType(); if (FullV->getType() != OpTy) FullV = CastInst::Create(CastInst::getCastOpcode(FullV, false, @@ -3667,7 +3743,7 @@ void LSRInstance::Rewrite(const LSRFixup &LF, Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. - const Type *OpTy = LF.OperandValToReplace->getType(); + Type *OpTy = LF.OperandValToReplace->getType(); if (FullV->getType() != OpTy) { Instruction *Cast = CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false), @@ -3700,6 +3776,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, SCEVExpander Rewriter(SE, "lsr"); Rewriter.disableCanonicalMode(); + Rewriter.enableLSRMode(); Rewriter.setIVIncInsertPos(L, IVIncInsertPos); // Expand the new value definitions and update the users. @@ -3740,6 +3817,23 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) OptimizeShadowIV(); OptimizeLoopTermCond(); + // If loop preparation eliminates all interesting IV users, bail. + if (IU.empty()) return; + + // Skip nested loops until we can model them better with formulae. + if (!EnableNested && !L->empty()) { + + if (EnablePhiElim) { + // Remove any extra phis created by processing inner loops. 
+ SmallVector<WeakVH, 16> DeadInsts; + SCEVExpander Rewriter(SE, "lsr"); + Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); + Changed |= DeleteTriviallyDeadInstructions(DeadInsts); + } + DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); + return; + } + // Start collecting data and preparing for the solver. CollectInterestingTypesAndFactors(); CollectFixupsAndInitialFormulae(); @@ -3763,6 +3857,9 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) Types.clear(); RegUses.clear(); + if (Solution.empty()) + return; + #ifndef NDEBUG // Formulae should be legal. for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), @@ -3778,6 +3875,14 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) // Now that we've decided what we want, make it so. ImplementSolution(Solution, P); + + if (EnablePhiElim) { + // Remove any extra phis created by processing inner loops. + SmallVector<WeakVH, 16> DeadInsts; + SCEVExpander Rewriter(SE, "lsr"); + Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); + Changed |= DeleteTriviallyDeadInstructions(DeadInsts); + } } void LSRInstance::print_factors_and_types(raw_ostream &OS) const { @@ -3793,7 +3898,7 @@ void LSRInstance::print_factors_and_types(raw_ostream &OS) const { OS << '*' << *I; } - for (SmallSetVector<const Type *, 4>::const_iterator + for (SmallSetVector<Type *, 4>::const_iterator I = Types.begin(), E = Types.end(); I != E; ++I) { if (!First) OS << ", "; First = false; diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index fef6bc3..91395b2 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Target/TargetData.h" #include <climits> using namespace llvm; @@ -39,6 +40,11 @@ UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached.")); +// Temporary flag to be removed in 3.0 +static cl::opt<bool> +NoSCEVUnroll("disable-unroll-scev", cl::init(false), cl::Hidden, + cl::desc("Use ScalarEvolution to analyze loop trip counts for unrolling")); + namespace { class LoopUnroll : public LoopPass { public: @@ -49,7 +55,7 @@ namespace { CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P; UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); - + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -57,11 +63,11 @@ namespace { /// that the loop unroll should be performed regardless of how much /// code expansion would result. static const unsigned NoThreshold = UINT_MAX; - + // Threshold to use when optsize is specified (and there is no // explicit -unroll-threshold). static const unsigned OptSizeUnrollThreshold = 50; - + unsigned CurrentCount; unsigned CurrentThreshold; bool CurrentAllowPartial; @@ -79,6 +85,7 @@ namespace { AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); + AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next @@ -101,45 +108,62 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { } /// ApproximateLoopSize - Approximate the size of the loop. 
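A note on the -disable-unroll-scev flag introduced above: when ScalarEvolution is used, the pass asks for the latch trip count, which only promises that the loop cannot leave through the latch before that many iterations; other exits may still fire earlier, which is exactly the contract the updated UnrollLoop call below relies on. A source-level sketch (hypothetical input):

    // The latch condition i < 8 bounds latch-controlled iterations at 8,
    // even though the return may leave the loop earlier via another exit.
    int firstZero(const int *data) {
      for (int i = 0; i < 8; ++i) {
        if (data[i] == 0)
          return i;   // early exit not counted by the latch trip count
      }
      return -1;
    }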
-static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls) { +static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, + const TargetData *TD) { CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I); + Metrics.analyzeBasicBlock(*I, TD); NumCalls = Metrics.NumInlineCandidates; - + unsigned LoopSize = Metrics.NumInsts; - + // Don't allow an estimate of size zero. This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's // not a problem for code quality. if (LoopSize == 0) LoopSize = 1; - + return LoopSize; } bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { LoopInfo *LI = &getAnalysis<LoopInfo>(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() << "] Loop %" << Header->getName() << "\n"); (void)Header; - + // Determine the current unrolling threshold. While this is normally set // from UnrollThreshold, it is overridden to a smaller value if the current // function is marked as optimize-for-size, and the unroll threshold was // not user specified. unsigned Threshold = CurrentThreshold; - if (!UserThreshold && + if (!UserThreshold && Header->getParent()->hasFnAttr(Attribute::OptimizeForSize)) Threshold = OptSizeUnrollThreshold; - // Find trip count - unsigned TripCount = L->getSmallConstantTripCount(); - unsigned Count = CurrentCount; - + // Find trip count and trip multiple if count is not available + unsigned TripCount = 0; + unsigned TripMultiple = 1; + if (!NoSCEVUnroll) { + // Find "latch trip count". UnrollLoop assumes that control cannot exit + // via the loop latch on any iteration prior to TripCount. The loop may exit + // early via an earlier branch. + BasicBlock *LatchBlock = L->getLoopLatch(); + if (LatchBlock) { + TripCount = SE->getSmallConstantTripCount(L, LatchBlock); + TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); + } + } + else { + TripCount = L->getSmallConstantTripCount(); + if (TripCount == 0) + TripMultiple = L->getSmallConstantTripMultiple(); + } // Automatically select an unroll count. + unsigned Count = CurrentCount; if (Count == 0) { // Conservative heuristic: if we know the trip count, see if we can // completely unroll (subject to the threshold, checked below); otherwise @@ -152,8 +176,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Enforce the threshold. if (Threshold != NoThreshold) { + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); unsigned NumInlineCandidates; - unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates); + unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, TD); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (NumInlineCandidates != 0) { DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); @@ -182,12 +207,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. - Function *F = L->getHeader()->getParent(); - if (!UnrollLoop(L, Count, LI, &LPM)) + if (!UnrollLoop(L, Count, TripCount, TripMultiple, LI, &LPM)) return false; - // FIXME: Reconstruct dom info, because it is not preserved properly. 
- if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) - DT->runOnFunction(*F); return true; } diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 840c4b6..458949c 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -492,7 +492,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, Value *BranchVal = LIC; if (!isa<ConstantInt>(Val) || Val->getType() != Type::getInt1Ty(LIC->getContext())) - BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val, "tmp"); + BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val); else if (Val != ConstantInt::getTrue(Val->getContext())) // We want to enter the new loop when the condition is true. std::swap(TrueDest, FalseDest); @@ -561,10 +561,17 @@ void LoopUnswitch::SplitExitEdges(Loop *L, BasicBlock *ExitBlock = ExitBlocks[i]; SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock), pred_end(ExitBlock)); + // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. - SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(), - ".us-lcssa", this); + if (!ExitBlock->isLandingPad()) { + SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(), + ".us-lcssa", this); + } else { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa", + this, NewBBs); + } } } @@ -632,7 +639,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // as well. ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase()); } - + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]); // The new exit block should be in the same loop as the old one. @@ -653,6 +660,19 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, if (It != VMap.end()) V = It->second; PN->addIncoming(V, NewExit); } + + if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { + PN = PHINode::Create(LPad->getType(), 0, "", + ExitSucc->getFirstInsertionPt()); + + for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); + I != E; ++I) { + BasicBlock *BB = *I; + LandingPadInst *LPI = BB->getLandingPadInst(); + LPI->replaceAllUsesWith(PN); + PN->addIncoming(LPI, BB); + } + } } // Rewrite the code to refer to itself. 
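The LoopUnswitch hunks above teach both edge splitting and exit-block cloning about exception handling: a landing pad cannot simply gain a new predecessor block, and when cloned exits share a successor, the landingpad values are funneled through a PHI. A hedged usage sketch of the split pattern (3.0-era API as used in the hunk above; the suffix strings are placeholders):

    // Ordinary exit blocks can be split in place; a landing pad needs the
    // dedicated helper, which also clones the landingpad instruction into
    // the new predecessor blocks.
    if (!ExitBlock->isLandingPad()) {
      SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(),
                             ".split", this);
    } else {
      SmallVector<BasicBlock*, 2> NewBBs;
      SplitLandingPadPredecessors(ExitBlock, Preds, ".split", ".split",
                                  this, NewBBs);
    }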
diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 9087b46..689bbe9 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -20,98 +20,88 @@ #include "llvm/Support/IRBuilder.h" using namespace llvm; -static bool LowerAtomicIntrinsic(IntrinsicInst *II) { - IRBuilder<> Builder(II->getParent(), II); - unsigned IID = II->getIntrinsicID(); - switch (IID) { - case Intrinsic::memory_barrier: - break; +static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { + IRBuilder<> Builder(CXI->getParent(), CXI); + Value *Ptr = CXI->getPointerOperand(); + Value *Cmp = CXI->getCompareOperand(); + Value *Val = CXI->getNewValOperand(); + + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); + Value *Res = Builder.CreateSelect(Equal, Val, Orig); + Builder.CreateStore(Res, Ptr); + + CXI->replaceAllUsesWith(Orig); + CXI->eraseFromParent(); + return true; +} - case Intrinsic::atomic_load_add: - case Intrinsic::atomic_load_sub: - case Intrinsic::atomic_load_and: - case Intrinsic::atomic_load_nand: - case Intrinsic::atomic_load_or: - case Intrinsic::atomic_load_xor: - case Intrinsic::atomic_load_max: - case Intrinsic::atomic_load_min: - case Intrinsic::atomic_load_umax: - case Intrinsic::atomic_load_umin: { - Value *Ptr = II->getArgOperand(0), *Delta = II->getArgOperand(1); +static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { + IRBuilder<> Builder(RMWI->getParent(), RMWI); + Value *Ptr = RMWI->getPointerOperand(); + Value *Val = RMWI->getValOperand(); - LoadInst *Orig = Builder.CreateLoad(Ptr); - Value *Res = NULL; - switch (IID) { - default: assert(0 && "Unrecognized atomic modify operation"); - case Intrinsic::atomic_load_add: - Res = Builder.CreateAdd(Orig, Delta); - break; - case Intrinsic::atomic_load_sub: - Res = Builder.CreateSub(Orig, Delta); - break; - case Intrinsic::atomic_load_and: - Res = Builder.CreateAnd(Orig, Delta); - break; - case Intrinsic::atomic_load_nand: - Res = Builder.CreateNot(Builder.CreateAnd(Orig, Delta)); - break; - case Intrinsic::atomic_load_or: - Res = Builder.CreateOr(Orig, Delta); - break; - case Intrinsic::atomic_load_xor: - Res = Builder.CreateXor(Orig, Delta); - break; - case Intrinsic::atomic_load_max: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), - Delta, Orig); - break; - case Intrinsic::atomic_load_min: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), - Orig, Delta); - break; - case Intrinsic::atomic_load_umax: - Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), - Delta, Orig); - break; - case Intrinsic::atomic_load_umin: - Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), - Orig, Delta); - break; - } - Builder.CreateStore(Res, Ptr); + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Res = NULL; - II->replaceAllUsesWith(Orig); + switch (RMWI->getOperation()) { + default: llvm_unreachable("Unexpected RMW operation"); + case AtomicRMWInst::Xchg: + Res = Val; break; - } - - case Intrinsic::atomic_swap: { - Value *Ptr = II->getArgOperand(0), *Val = II->getArgOperand(1); - LoadInst *Orig = Builder.CreateLoad(Ptr); - Builder.CreateStore(Val, Ptr); - II->replaceAllUsesWith(Orig); + case AtomicRMWInst::Add: + Res = Builder.CreateAdd(Orig, Val); break; - } - - case Intrinsic::atomic_cmp_swap: { - Value *Ptr = II->getArgOperand(0), *Cmp = II->getArgOperand(1); - Value *Val = II->getArgOperand(2); - - LoadInst *Orig = Builder.CreateLoad(Ptr); - Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); 
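// A note on the replacement code above: LowerAtomicCmpXchgInst builds the
// single-threaded equivalent of cmpxchg, which behaves like:
//   T Orig = *Ptr;                      // load the current value
//   *Ptr = (Orig == Cmp) ? Val : Orig;  // store the new value only on match
//   result = Orig;                      // cmpxchg yields the old value
// Dropping atomicity is only sound because this pass targets configurations
// where the code is known not to run concurrently.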
- Value *Res = Builder.CreateSelect(Equal, Val, Orig); - Builder.CreateStore(Res, Ptr); - II->replaceAllUsesWith(Orig); + case AtomicRMWInst::Sub: + Res = Builder.CreateSub(Orig, Val); + break; + case AtomicRMWInst::And: + Res = Builder.CreateAnd(Orig, Val); + break; + case AtomicRMWInst::Nand: + Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val)); + break; + case AtomicRMWInst::Or: + Res = Builder.CreateOr(Orig, Val); + break; + case AtomicRMWInst::Xor: + Res = Builder.CreateXor(Orig, Val); + break; + case AtomicRMWInst::Max: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), + Val, Orig); + break; + case AtomicRMWInst::Min: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), + Orig, Val); + break; + case AtomicRMWInst::UMax: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), + Val, Orig); + break; + case AtomicRMWInst::UMin: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), + Orig, Val); break; } + Builder.CreateStore(Res, Ptr); + RMWI->replaceAllUsesWith(Orig); + RMWI->eraseFromParent(); + return true; +} - default: - return false; - } +static bool LowerFenceInst(FenceInst *FI) { + FI->eraseFromParent(); + return true; +} - assert(II->use_empty() && - "Lowering should have eliminated any uses of the intrinsic call!"); - II->eraseFromParent(); +static bool LowerLoadInst(LoadInst *LI) { + LI->setAtomic(NotAtomic); + return true; +} +static bool LowerStoreInst(StoreInst *SI) { + SI->setAtomic(NotAtomic); return true; } @@ -123,9 +113,22 @@ namespace { } bool runOnBasicBlock(BasicBlock &BB) { bool Changed = false; - for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DI++)) - Changed |= LowerAtomicIntrinsic(II); + for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { + Instruction *Inst = DI++; + if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) + Changed |= LowerFenceInst(FI); + else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) + Changed |= LowerAtomicCmpXchgInst(CXI); + else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst)) + Changed |= LowerAtomicRMWInst(RMWI); + else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + if (LI->isAtomic()) + LowerLoadInst(LI); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (SI->isAtomic()) + LowerStoreInst(SI); + } + } return Changed; } }; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 7ed3db6..eeb8931 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -54,7 +54,7 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, if (OpC->isZero()) continue; // No offset. // Handle struct indices, which add their field offset to the pointer. - if (const StructType *STy = dyn_cast<StructType>(*GTI)) { + if (StructType *STy = dyn_cast<StructType>(*GTI)) { Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); continue; } @@ -384,7 +384,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { // If this is a store, see if we can merge it in. - if (NextStore->isVolatile()) break; + if (!NextStore->isSimple()) break; // Check to see if this stored value is of the same byte-splattable value. 
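tryMergingIntoMemset above only merges neighboring stores whose value is byte-splattable, since memset can only replicate a single byte. A standalone sketch of that property (the helper name is hypothetical; the real check is isBytewiseValue):

    #include <cstdint>

    // True if every byte of v equals its low byte, the integer case of
    // what isBytewiseValue accepts.
    bool isByteSplat(uint32_t v) {
      return v == 0x01010101u * (uint8_t)v;
    }
    // isByteSplat(0xABABABABu) is true; isByteSplat(0x12345678u) is false.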
if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) @@ -448,7 +448,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Determine alignment unsigned Alignment = Range.Alignment; if (Alignment == 0) { - const Type *EltType = + Type *EltType = cast<PointerType>(StartPtr->getType())->getElementType(); Alignment = TD->getABITypeAlignment(EltType); } @@ -479,7 +479,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { - if (SI->isVolatile()) return false; + if (!SI->isSimple()) return false; if (TD == 0) return false; @@ -487,7 +487,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // happen to be using a load-store pair to implement it, rather than // a memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { - if (!LI->isVolatile() && LI->hasOneUse() && + if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { MemDepResult ldep = MD->getDependency(LI); CallInst *C = 0; @@ -616,7 +616,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (!A->hasStructRetAttr()) return false; - const Type *StructTy = cast<PointerType>(A->getType())->getElementType(); + Type *StructTy = cast<PointerType>(A->getType())->getElementType(); uint64_t destSize = TD->getTypeAllocSize(StructTy); if (destSize < srcSize) @@ -860,7 +860,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); - const Type *ByValTy =cast<PointerType>(ByValArg->getType())->getElementType(); + Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index ee132d3..da74e9c 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -180,7 +180,7 @@ static bool IsPotentialUse(const Value *Op) { Arg->hasStructRetAttr()) return false; // Only consider values with pointer types, and not function pointers. - const PointerType *Ty = dyn_cast<PointerType>(Op->getType()); + PointerType *Ty = dyn_cast<PointerType>(Op->getType()); if (!Ty || isa<FunctionType>(Ty->getElementType())) return false; // Conservatively assume anything else is a potential use. @@ -213,8 +213,8 @@ static InstructionClass GetFunctionClass(const Function *F) { const Argument *A0 = AI++; if (AI == AE) // Argument is a pointer. - if (const PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { - const Type *ETy = PTy->getElementType(); + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { + Type *ETy = PTy->getElementType(); // Argument is i8*. if (ETy->isIntegerTy(8)) return StringSwitch<InstructionClass>(F->getName()) @@ -234,7 +234,7 @@ static InstructionClass GetFunctionClass(const Function *F) { .Default(IC_CallOrUser); // Argument is i8** - if (const PointerType *Pte = dyn_cast<PointerType>(ETy)) + if (PointerType *Pte = dyn_cast<PointerType>(ETy)) if (Pte->getElementType()->isIntegerTy(8)) return StringSwitch<InstructionClass>(F->getName()) .Case("objc_loadWeakRetained", IC_LoadWeakRetained) @@ -246,11 +246,11 @@ static InstructionClass GetFunctionClass(const Function *F) { // Two arguments, first is i8**. 
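// For orientation in GetFunctionClass above, these are the prototype shapes
// being matched before any name check (typical declarations, shown as IR):
//   declare i8* @objc_retain(i8*)             ; one i8* argument
//   declare i8* @objc_loadWeakRetained(i8**)  ; one i8** argument
//   declare i8* @objc_storeWeak(i8**, i8*)    ; i8** then i8*
//   declare void @objc_moveWeak(i8**, i8**)   ; i8** then i8**
// Only calls whose prototype fits one of these shapes reach the
// StringSwitch on the callee name.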
const Argument *A1 = AI++; if (AI == AE) - if (const PointerType *PTy = dyn_cast<PointerType>(A0->getType())) - if (const PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) + if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) if (Pte->getElementType()->isIntegerTy(8)) - if (const PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { - const Type *ETy1 = PTy1->getElementType(); + if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { + Type *ETy1 = PTy1->getElementType(); // Second argument is i8* if (ETy1->isIntegerTy(8)) return StringSwitch<InstructionClass>(F->getName()) @@ -258,7 +258,7 @@ .Case("objc_storeWeak", IC_StoreWeak) .Case("objc_initWeak", IC_InitWeak) .Default(IC_CallOrUser); // Second argument is i8**. - if (const PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) + if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) if (Pte1->getElementType()->isIntegerTy(8)) return StringSwitch<InstructionClass>(F->getName()) .Case("objc_moveWeak", IC_MoveWeak) @@ -344,6 +344,10 @@ static InstructionClass GetInstructionClass(const Value *V) { break; default: // For anything else, check all the operands. + // Note that this includes both operands of a Store: while the first + // operand isn't actually being dereferenced, it is being stored to + // memory where we can no longer track who might read it and dereference + // it, so we have to consider it potentially used. for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) if (IsPotentialUse(*OI)) @@ -421,9 +425,10 @@ static bool IsAlwaysTail(InstructionClass Class) { /// IsNoThrow - Test if the given class represents instructions which are always /// safe to mark with the nounwind attribute.. static bool IsNoThrow(InstructionClass Class) { + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. return Class == IC_Retain || Class == IC_RetainRV || - Class == IC_RetainBlock || Class == IC_Release || Class == IC_Autorelease || Class == IC_AutoreleaseRV || @@ -515,6 +520,10 @@ static bool IsObjCIdentifiedObject(const Value *V) { const Value *Pointer = StripPointerCastsAndObjCCalls(LI->getPointerOperand()); if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (GV->isConstant()) + return true; StringRef Name = GV->getName(); // These special variables are known to hold values which are not // reference-counted pointers. @@ -738,7 +747,6 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { switch (GetBasicInstructionClass(CS.getInstruction())) { case IC_Retain: case IC_RetainRV: - case IC_RetainBlock: case IC_Autorelease: case IC_AutoreleaseRV: case IC_NoopCast: @@ -746,6 +754,8 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { case IC_FusedRetainAutorelease: case IC_FusedRetainAutoreleaseRV: // These functions don't access any memory visible to the compiler. + // Note that this doesn't include objc_retainBlock, because it updates + // pointers when it copies block data. return NoModRef; default: break; @@ -877,7 +887,9 @@ bool ObjCARCExpand::runOnFunction(Function &F) { // usually can't sink them past other calls, which would be the main // case where it would be useful.
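With the expander covered, the hunks below turn to ObjCARCOpt proper, whose core job is pairing retains with releases along the CFG. A source-level sketch of the canonical removable pair (hypothetical function, C-level view of the IR; the runtime declarations are real entry points):

    void *objc_retain(void *);
    void objc_release(void *);
    void use(void *);

    void f(void *x) {
      void *o = objc_retain(x);  // increment provably undone below
      use(o);                    // may use o, but no ref-count decrement
      objc_release(o);           // the pair is deleted when KnownSafe holds
    }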
-/// TODO: The pointer returned from objc_loadWeakRetained is retained. +// TODO: The pointer returned from objc_loadWeakRetained is retained. + +// TODO: Delete release+retain pairs (rare). #include "llvm/GlobalAlias.h" #include "llvm/Constants.h" @@ -1098,16 +1110,16 @@ static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) { if (A == S_None || B == S_None) return S_None; - // Note that we can't merge S_CanRelease and S_Use. if (A > B) std::swap(A, B); if (TopDown) { // Choose the side which is further along in the sequence. - if (A == S_Retain && (B == S_CanRelease || B == S_Use)) + if ((A == S_Retain || A == S_CanRelease) && + (B == S_CanRelease || B == S_Use)) return B; } else { // Choose the side which is further along in the sequence. if ((A == S_Use || A == S_CanRelease) && - (B == S_Release || B == S_Stop || B == S_MovableRelease)) + (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease)) return A; // If both sides are releases, choose the more conservative one. if (A == S_Stop && (B == S_Release || B == S_MovableRelease)) @@ -1124,13 +1136,19 @@ namespace { /// retain-decrement-use-release sequence or release-use-decrement-retain /// reverese sequence. struct RRInfo { - /// KnownIncremented - After an objc_retain, the reference count of the - /// referenced object is known to be positive. Similarly, before an - /// objc_release, the reference count of the referenced object is known to - /// be positive. If there are retain-release pairs in code regions where the - /// retain count is known to be positive, they can be eliminated, regardless - /// of any side effects between them. - bool KnownIncremented; + /// KnownSafe - After an objc_retain, the reference count of the referenced + /// object is known to be positive. Similarly, before an objc_release, the + /// reference count of the referenced object is known to be positive. If + /// there are retain-release pairs in code regions where the retain count + /// is known to be positive, they can be eliminated, regardless of any side + /// effects between them. + /// + /// Also, a retain+release pair nested within another retain+release + /// pair all on the known same pointer value can be eliminated, regardless + /// of any intervening side effects. + /// + /// KnownSafe is true when either of these conditions is satisfied. + bool KnownSafe; /// IsRetainBlock - True if the Calls are objc_retainBlock calls (as /// opposed to objc_retain calls). @@ -1153,7 +1171,7 @@ namespace { SmallPtrSet<Instruction *, 2> ReverseInsertPts; RRInfo() : - KnownIncremented(false), IsRetainBlock(false), IsTailCallRelease(false), + KnownSafe(false), IsRetainBlock(false), IsTailCallRelease(false), ReleaseMetadata(0) {} void clear(); @@ -1161,7 +1179,7 @@ namespace { } void RRInfo::clear() { - KnownIncremented = false; + KnownSafe = false; IsRetainBlock = false; IsTailCallRelease = false; ReleaseMetadata = 0; @@ -1176,6 +1194,9 @@ namespace { /// RefCount - The known minimum number of reference count increments. unsigned RefCount; + /// NestCount - The known minimum level of retain+release nesting. + unsigned NestCount; + /// Seq - The current position in the sequence. Sequence Seq; @@ -1184,7 +1205,11 @@ namespace { /// TODO: Encapsulate this better. 
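// A compact reading of the MergeSeqs hunk above (sketch): merging with
// S_None always yields S_None. Top-down, S_Retain or S_CanRelease merged
// with S_CanRelease or S_Use keeps the later of the two states. Bottom-up,
// S_Use or S_CanRelease merged with a release-flavored state keeps the
// use-side state, and S_Stop merged with S_Release or S_MovableRelease
// keeps the more conservative S_Stop.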
RRInfo RRI; - PtrState() : RefCount(0), Seq(S_None) {} + PtrState() : RefCount(0), NestCount(0), Seq(S_None) {} + + void SetAtLeastOneRefCount() { + if (RefCount == 0) RefCount = 1; + } void IncrementRefCount() { if (RefCount != UINT_MAX) ++RefCount; @@ -1194,14 +1219,22 @@ namespace { if (RefCount != 0) --RefCount; } - void ClearRefCount() { - RefCount = 0; - } - bool IsKnownIncremented() const { return RefCount > 0; } + void IncrementNestCount() { + if (NestCount != UINT_MAX) ++NestCount; + } + + void DecrementNestCount() { + if (NestCount != 0) --NestCount; + } + + bool IsKnownNested() const { + return NestCount > 0; + } + void SetSeq(Sequence NewSeq) { Seq = NewSeq; } @@ -1233,6 +1266,7 @@ void PtrState::Merge(const PtrState &Other, bool TopDown) { Seq = MergeSeqs(Seq, Other.Seq, TopDown); RefCount = std::min(RefCount, Other.RefCount); + NestCount = std::min(NestCount, Other.NestCount); // We can't merge a plain objc_retain with an objc_retainBlock. if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock) @@ -1245,7 +1279,7 @@ PtrState::Merge(const PtrState &Other, bool TopDown) { if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata) RRI.ReleaseMetadata = 0; - RRI.KnownIncremented = RRI.KnownIncremented && Other.RRI.KnownIncremented; + RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe; RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease; RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); RRI.ReverseInsertPts.insert(Other.RRI.ReverseInsertPts.begin(), @@ -1316,7 +1350,7 @@ namespace { } void clearBottomUpPointers() { - PerPtrTopDown.clear(); + PerPtrBottomUp.clear(); } void clearTopDownPointers() { @@ -1334,6 +1368,12 @@ namespace { unsigned GetAllPathCount() const { return TopDownPathCount * BottomUpPathCount; } + + /// IsVisitedTopDown - Test whether the block for this BBState has been + /// visited by the top-down portion of the algorithm. + bool isVisitedTopDown() const { + return TopDownPathCount != 0; + } }; } @@ -1364,7 +1404,7 @@ void BBState::MergePred(const BBState &Other) { /*TopDown=*/true); } - // For each entry in our set, if the other set doens't have an entry with the + // For each entry in our set, if the other set doesn't have an entry with the // same key, force it to merge with an empty entry. for (ptr_iterator MI = top_down_ptr_begin(), ME = top_down_ptr_end(); MI != ME; ++MI) @@ -1389,7 +1429,7 @@ void BBState::MergeSucc(const BBState &Other) { /*TopDown=*/false); } - // For each entry in our set, if the other set doens't have an entry + // For each entry in our set, if the other set doesn't have an entry // with the same key, force it to merge with an empty entry. for (ptr_iterator MI = bottom_up_ptr_begin(), ME = bottom_up_ptr_end(); MI != ME; ++MI) @@ -1406,15 +1446,11 @@ namespace { /// Run - A flag indicating whether this optimization pass should run. bool Run; - /// RetainFunc, RelaseFunc - Declarations for objc_retain, - /// objc_retainBlock, and objc_release. - Function *RetainFunc, *RetainBlockFunc, *RetainRVFunc, *ReleaseFunc; - /// RetainRVCallee, etc. - Declarations for ObjC runtime /// functions, for use in creating calls to them. These are initialized /// lazily to avoid cluttering up the Module with unused declarations. Constant *RetainRVCallee, *AutoreleaseRVCallee, *ReleaseCallee, - *RetainCallee, *AutoreleaseCallee; + *RetainCallee, *RetainBlockCallee, *AutoreleaseCallee; /// UsedInThisFunciton - Flags which determine whether each of the /// interesting runtine functions is in fact used in the current function. 
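Two details from the hunks above are easy to miss: clearBottomUpPointers used to clear the top-down map (a copy-paste bug fixed here), and the new NestCount lets a retain+release pair nested inside another pair on the same pointer be treated as KnownSafe. A source-level sketch of the nesting case (hypothetical function; the runtime declarations are real entry points):

    void *objc_retain(void *);
    void objc_release(void *);
    void use(void *);

    void g(void *x) {
      objc_retain(x);    // outer retain
      objc_retain(x);    // inner retain: nesting level is now 2
      use(x);            // whatever this does, the count stays >= 1
      objc_release(x);   // inner release: pair removable via KnownSafe
      objc_release(x);   // outer release
    }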
@@ -1428,6 +1464,7 @@ namespace { Constant *getAutoreleaseRVCallee(Module *M); Constant *getReleaseCallee(Module *M); Constant *getRetainCallee(Module *M); + Constant *getRetainBlockCallee(Module *M); Constant *getAutoreleaseCallee(Module *M); void OptimizeRetainCall(Function &F, Instruction *Retain); @@ -1452,11 +1489,13 @@ namespace { void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts); + SmallVectorImpl<Instruction *> &DeadInsts, + Module *M); bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases); + DenseMap<Value *, RRInfo> &Releases, + Module *M); void OptimizeWeakCalls(Function &F); @@ -1501,7 +1540,7 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); std::vector<Type *> Params; Params.push_back(I8X); - const FunctionType *FTy = + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); AttrListPtr Attributes; Attributes.addAttr(~0u, Attribute::NoUnwind); @@ -1518,7 +1557,7 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); std::vector<Type *> Params; Params.push_back(I8X); - const FunctionType *FTy = + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); AttrListPtr Attributes; Attributes.addAttr(~0u, Attribute::NoUnwind); @@ -1561,6 +1600,23 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) { return RetainCallee; } +Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { + if (!RetainBlockCallee) { + LLVMContext &C = M->getContext(); + std::vector<Type *> Params; + Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); + AttrListPtr Attributes; + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. + RetainBlockCallee = + M->getOrInsertFunction( + "objc_retainBlock", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + Attributes); + } + return RetainBlockCallee; +} + Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { if (!AutoreleaseCallee) { LLVMContext &C = M->getContext(); @@ -1904,12 +1960,19 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) { // Check for a return of the pointer value. 
const Value *Ptr = GetObjCArg(AutoreleaseRV); - for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); - UI != UE; ++UI) { - const User *I = *UI; - if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) - return; - } + SmallVector<const Value *, 2> Users; + Users.push_back(Ptr); + do { + Ptr = Users.pop_back_val(); + for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); + UI != UE; ++UI) { + const User *I = *UI; + if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) + return; + if (isa<BitCastInst>(I)) + Users.push_back(I); + } + } while (!Users.empty()); Changed = true; ++NumPeeps; @@ -1953,7 +2016,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_DestroyWeak: { CallInst *CI = cast<CallInst>(Inst); if (isNullOrUndef(CI->getArgOperand(0))) { - const Type *Ty = CI->getArgOperand(0)->getType(); + Type *Ty = CI->getArgOperand(0)->getType(); new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), CI); @@ -1968,7 +2031,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { CallInst *CI = cast<CallInst>(Inst); if (isNullOrUndef(CI->getArgOperand(0)) || isNullOrUndef(CI->getArgOperand(1))) { - const Type *Ty = CI->getArgOperand(0)->getType(); + Type *Ty = CI->getArgOperand(0)->getType(); new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), CI); @@ -2090,7 +2153,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { ++NumPartialNoops; // Clone the call into each predecessor that has a non-null value. CallInst *CInst = cast<CallInst>(Inst); - const Type *ParamTy = CInst->getArgOperand(0)->getType(); + Type *ParamTy = CInst->getArgOperand(0)->getType(); for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *Incoming = StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); @@ -2132,41 +2195,49 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; - for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) - switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) { + PtrState &S = MyStates.getPtrTopDownState(Arg); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg); + switch (SuccS.GetSeq()) { case S_None: - case S_CanRelease: - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); - SomeSuccHasSame = false; - break; + case S_CanRelease: { + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + S.ClearSequenceProgress(); + continue; + } case S_Use: SomeSuccHasSame = true; break; case S_Stop: case S_Release: case S_MovableRelease: - AllSuccsHaveSame = false; + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + AllSuccsHaveSame = false; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } + } // If the state at the other end of any of the successor edges // matches the current state, require all edges to match. This // guards against loops in the middle of a sequence. 
if (SomeSuccHasSame && !AllSuccsHaveSame) - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); + S.ClearSequenceProgress(); } case S_CanRelease: { const Value *Arg = I->first; const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; - for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) - switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) { - case S_None: - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); - SomeSuccHasSame = false; - break; + PtrState &S = MyStates.getPtrTopDownState(Arg); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg); + switch (SuccS.GetSeq()) { + case S_None: { + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + S.ClearSequenceProgress(); + continue; + } case S_CanRelease: SomeSuccHasSame = true; break; @@ -2174,16 +2245,18 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, case S_Release: case S_MovableRelease: case S_Use: - AllSuccsHaveSame = false; + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + AllSuccsHaveSame = false; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } + } // If the state at the other end of any of the successor edges // matches the current state, require all edges to match. This // guards against loops in the middle of a sequence. if (SomeSuccHasSame && !AllSuccsHaveSame) - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); + S.ClearSequenceProgress(); } } } @@ -2207,6 +2280,8 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, if (Succ == BB) continue; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ); + // If we haven't seen this node yet, then we've found a CFG cycle. + // Be optimistic here; it's CheckForCFGHazards' job to detect trouble. if (I == BBStates.end()) continue; MyStates.InitFromSucc(I->second); @@ -2245,11 +2320,12 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, S.SetSeqToRelease(Inst->getMetadata(ImpreciseReleaseMDKind)); S.RRI.clear(); - S.RRI.KnownIncremented = S.IsKnownIncremented(); + S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); S.RRI.Calls.insert(Inst); S.IncrementRefCount(); + S.IncrementNestCount(); break; } case IC_RetainBlock: @@ -2259,6 +2335,13 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, PtrState &S = MyStates.getPtrBottomUpState(Arg); S.DecrementRefCount(); + S.SetAtLeastOneRefCount(); + S.DecrementNestCount(); + + // An objc_retainBlock call with just a use still needs to be kept, + // because it may be copying a block from the stack to the heap. + if (Class == IC_RetainBlock && S.GetSeq() == S_Use) + S.SetSeq(S_CanRelease); switch (S.GetSeq()) { case S_Stop: @@ -2281,7 +2364,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } - break; + continue; } case IC_AutoreleasepoolPop: // Conservatively, clear MyStates for all known pointers. @@ -2305,26 +2388,22 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, PtrState &S = MI->second; Sequence Seq = S.GetSeq(); - // Check for possible retains and releases. + // Check for possible releases. if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - // Check for a retain (we're going bottom-up here). S.DecrementRefCount(); - - // Check for a release.
- if (!IsRetain(Class) && Class != IC_RetainBlock) - switch (Seq) { - case S_Use: - S.SetSeq(S_CanRelease); - continue; - case S_CanRelease: - case S_Release: - case S_MovableRelease: - case S_Stop: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } + switch (Seq) { + case S_Use: + S.SetSeq(S_CanRelease); + continue; + case S_CanRelease: + case S_Release: + case S_MovableRelease: + case S_Stop: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } } // Check for possible direct uses. @@ -2332,14 +2411,14 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, case S_Release: case S_MovableRelease: if (CanUse(Inst, Ptr, PA, Class)) { - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); S.SetSeq(S_Use); } else if (Seq == S_Release && (Class == IC_User || Class == IC_CallOrUser)) { // Non-movable releases depend on any possible objc pointer use. S.SetSeq(S_Stop); - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); } break; @@ -2378,14 +2457,18 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, if (Pred == BB) continue; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); - if (I == BBStates.end()) + assert(I != BBStates.end()); + // If we haven't seen this node yet, then we've found a CFG cycle. + // Be optimistic here; it's CheckForCFGHazards' job to detect trouble. + if (!I->second.isVisitedTopDown()) continue; MyStates.InitFromPred(I->second); while (PI != PE) { Pred = *PI++; if (Pred != BB) { I = BBStates.find(Pred); - if (I != BBStates.end()) + assert(I != BBStates.end()); + if (I->second.isVisitedTopDown()) MyStates.MergePred(I->second); } } @@ -2422,18 +2505,23 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, S.SetSeq(S_Retain); S.RRI.clear(); S.RRI.IsRetainBlock = Class == IC_RetainBlock; - S.RRI.KnownIncremented = S.IsKnownIncremented(); + // Don't check S.IsKnownIncremented() here because it's not + // sufficient. + S.RRI.KnownSafe = S.IsKnownNested(); S.RRI.Calls.insert(Inst); } + S.SetAtLeastOneRefCount(); S.IncrementRefCount(); - break; + S.IncrementNestCount(); + continue; } case IC_Release: { Arg = GetObjCArg(Inst); PtrState &S = MyStates.getPtrTopDownState(Arg); S.DecrementRefCount(); + S.DecrementNestCount(); switch (S.GetSeq()) { case S_Retain: @@ -2478,16 +2566,12 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, Sequence Seq = S.GetSeq(); // Check for possible releases. - if (!IsRetain(Class) && Class != IC_RetainBlock && - CanAlterRefCount(Inst, Ptr, PA, Class)) { - // Check for a release. + if (CanAlterRefCount(Inst, Ptr, PA, Class)) { S.DecrementRefCount(); - - // Check for a release. switch (Seq) { case S_Retain: S.SetSeq(S_CanRelease); - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); // One call can't cause a transition from S_Retain to S_CanRelease @@ -2511,8 +2595,18 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, if (CanUse(Inst, Ptr, PA, Class)) S.SetSeq(S_Use); break; - case S_Use: case S_Retain: + // An objc_retainBlock call may be responsible for copying the block + // data from the stack to the heap. Model this by moving it straight + // from S_Retain to S_Use.
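A toy model of the bottom-up Sequence transitions being applied above, with everything except the state changes stripped away (simplified; the real PtrState also tracks reference counts, nesting, and insertion points):

#include <cassert>

enum Sequence { S_None, S_Release, S_Use, S_CanRelease };

struct ToyPtrState {
  Sequence Seq = S_None;
  // Walking a block bottom-up:
  void visitRelease()          { Seq = S_Release; }                   // objc_release starts a sequence
  void visitUse()              { if (Seq == S_Release) Seq = S_Use; } // a use of the pointer
  void visitCanAlterRefCount() { if (Seq == S_Use) Seq = S_CanRelease; }
};

int main() {
  ToyPtrState S;
  S.visitRelease();          // saw: objc_release(p)
  S.visitUse();              // saw: use of p
  S.visitCanAlterRefCount(); // saw: call that may decrement p's count
  assert(S.Seq == S_CanRelease); // a retain above this point completes a pair
}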
+ if (S.RRI.IsRetainBlock && + CanUse(Inst, Ptr, PA, Class)) { + assert(S.RRI.ReverseInsertPts.empty()); + S.RRI.ReverseInsertPts.insert(Inst); + S.SetSeq(S_Use); + } + break; + case S_Use: case S_None: break; case S_Stop: @@ -2533,28 +2627,43 @@ ObjCARCOpt::Visit(Function &F, DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases) { - // Use postorder for bottom-up, and reverse-postorder for top-down, because we + // Use reverse-postorder on the reverse CFG for bottom-up, because we // magically know that loops will be well behaved, i.e. they won't repeatedly - // call retain on a single pointer without doing a release. + // call retain on a single pointer without doing a release. We can't use + // ReversePostOrderTraversal here because we want to walk up from each + // function exit point. + SmallPtrSet<BasicBlock *, 16> Visited; + SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> Stack; + SmallVector<BasicBlock *, 16> Order; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *BB = I; + if (BB->getTerminator()->getNumSuccessors() == 0) + Stack.push_back(std::make_pair(BB, pred_begin(BB))); + } + while (!Stack.empty()) { + pred_iterator End = pred_end(Stack.back().first); + while (Stack.back().second != End) { + BasicBlock *BB = *Stack.back().second++; + if (Visited.insert(BB)) + Stack.push_back(std::make_pair(BB, pred_begin(BB))); + } + Order.push_back(Stack.pop_back_val().first); + } bool BottomUpNestingDetected = false; - SmallVector<BasicBlock *, 8> PostOrder; - for (po_iterator<Function *> I = po_begin(&F), E = po_end(&F); I != E; ++I) { + for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = + Order.rbegin(), E = Order.rend(); I != E; ++I) { BasicBlock *BB = *I; - PostOrder.push_back(BB); - BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains); } - // Iterate through the post-order in reverse order, achieving a - // reverse-postorder traversal. We don't use the ReversePostOrderTraversal - // class here because it works by computing its own full postorder iteration, - // recording the sequence, and playing it back in reverse. Since we're already - // doing a full iteration above, we can just record the sequence manually and - // avoid the cost of having ReversePostOrderTraversal compute it. + // Use regular reverse-postorder for top-down. bool TopDownNestingDetected = false; - for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator - RI = PostOrder.rbegin(), RE = PostOrder.rend(); RI != RE; ++RI) - TopDownNestingDetected |= VisitTopDown(*RI, BBStates, Releases); + typedef ReversePostOrderTraversal<Function *> RPOTType; + RPOTType RPOT(&F); + for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { + BasicBlock *BB = *I; + TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases); + } return TopDownNestingDetected && BottomUpNestingDetected; } @@ -2565,12 +2674,10 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &ReleasesToMove, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts) { - const Type *ArgTy = Arg->getType(); - const Type *ParamTy = - (RetainRVFunc ? RetainRVFunc : - RetainFunc ? RetainFunc : - RetainBlockFunc)->arg_begin()->getType(); + SmallVectorImpl<Instruction *> &DeadInsts, + Module *M) { + Type *ArgTy = Arg->getType(); + Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext())); // Insert the new retain and release calls. 
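The traversal added to Visit() above computes a DFS postorder of the reverse CFG, seeded from every exit block, and then replays it in reverse. A self-contained sketch with plain adjacency lists (the block numbering is illustrative, not LLVM API):

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // succ[b] lists the successors of block b; block 3 is the only exit.
  std::vector<std::vector<int> > succ = {{1, 2}, {3}, {3}, {}};

  // Build predecessor lists: the successor lists of the reverse CFG.
  std::vector<std::vector<int> > pred(succ.size());
  for (unsigned b = 0; b != succ.size(); ++b)
    for (int s : succ[b])
      pred[s].push_back(b);

  std::vector<bool> visited(succ.size(), false);
  std::vector<std::pair<int, unsigned> > stack; // (block, next pred index)
  std::vector<int> order;                       // postorder on the reverse CFG

  for (unsigned b = 0; b != succ.size(); ++b)   // seed from the exit blocks
    if (succ[b].empty()) {
      visited[b] = true;
      stack.push_back(std::make_pair((int)b, 0u));
    }

  while (!stack.empty()) {
    if (stack.back().second < pred[stack.back().first].size()) {
      int p = pred[stack.back().first][stack.back().second++];
      if (!visited[p]) {
        visited[p] = true;
        stack.push_back(std::make_pair(p, 0u));
      }
    } else {
      order.push_back(stack.back().first);
      stack.pop_back();
    }
  }

  // Replaying 'order' in reverse visits each block after its successors
  // (cycles aside), which is the order the bottom-up pass wants.
  for (unsigned i = order.size(); i != 0; --i)
    std::printf("%d ", order[i - 1]);           // prints: 3 2 1 0
  std::printf("\n");
}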
for (SmallPtrSet<Instruction *, 2>::const_iterator @@ -2581,7 +2688,7 @@ void ObjCARCOpt::MoveCalls(Value *Arg, new BitCastInst(Arg, ParamTy, "", InsertPt); CallInst *Call = CallInst::Create(RetainsToMove.IsRetainBlock ? - RetainBlockFunc : RetainFunc, + getRetainBlockCallee(M) : getRetainCallee(M), MyArg, "", InsertPt); Call->setDoesNotThrow(); if (!RetainsToMove.IsRetainBlock) @@ -2598,8 +2705,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, // The invoke's return value isn't available in the unwind block, // but our releases will never depend on it, because they must be // paired with retains from before the invoke. - InsertPts[0] = II->getNormalDest()->getFirstNonPHI(); - InsertPts[1] = II->getUnwindDest()->getFirstNonPHI(); + InsertPts[0] = II->getNormalDest()->getFirstInsertionPt(); + InsertPts[1] = II->getUnwindDest()->getFirstInsertionPt(); } else { // Insert code immediately after the last use. InsertPts[0] = llvm::next(BasicBlock::iterator(LastUse)); @@ -2609,7 +2716,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, Instruction *InsertPt = *I; Value *MyArg = ArgTy == ParamTy ? Arg : new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = CallInst::Create(ReleaseFunc, MyArg, "", InsertPt); + CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, + "", InsertPt); // Attach a clang.imprecise_release metadata tag, if appropriate. if (MDNode *M = ReleasesToMove.ReleaseMetadata) Call->setMetadata(ImpreciseReleaseMDKind, M); @@ -2640,7 +2748,8 @@ bool ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases) { + DenseMap<Value *, RRInfo> &Releases, + Module *M) { bool AnyPairsCompletelyEliminated = false; RRInfo RetainsToMove; RRInfo ReleasesToMove; @@ -2649,21 +2758,36 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> SmallVector<Instruction *, 8> DeadInsts; for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), - E = Retains.end(); I != E; ) { - Value *V = (I++)->first; + E = Retains.end(); I != E; ++I) { + Value *V = I->first; if (!V) continue; // blotted Instruction *Retain = cast<Instruction>(V); Value *Arg = GetObjCArg(Retain); - // If the object being released is in static or stack storage, we know it's + // If the object being released is in static storage, we know it's // not being managed by ObjC reference counting, so we can delete pairs // regardless of what possible decrements or uses lie between them. - bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); + bool KnownSafe = isa<Constant>(Arg); + + // Same for stack storage, unless this is an objc_retainBlock call, + // which is responsible for copying the block data from the stack to + // the heap. + if (!I->second.IsRetainBlock && isa<AllocaInst>(Arg)) + KnownSafe = true; + + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) + if (const GlobalVariable *GV = + dyn_cast<GlobalVariable>( + StripPointerCastsAndObjCCalls(LI->getPointerOperand()))) + if (GV->isConstant()) + KnownSafe = true; // If a pair happens in a region where it is known that the reference count // is already incremented, we can similarly ignore possible decrements. 
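A hypothetical distillation of the KnownSafe tests above; the field and function names here are invented for illustration:

#include <cstdio>

struct ArgInfo {
  bool IsConstant;            // isa<Constant>(Arg)
  bool IsAlloca;              // isa<AllocaInst>(Arg)
  bool LoadedFromConstGlobal; // load of a GlobalVariable with isConstant()
};

// An object in static storage, on the stack (unless retained via
// objc_retainBlock, which may copy it to the heap), or whose pointer was
// loaded from a constant global can't be deallocated out from under us,
// so retain/release pairs on it can be removed more aggressively.
static bool isKnownSafe(const ArgInfo &A, bool IsRetainBlock) {
  if (A.IsConstant) return true;
  if (A.IsAlloca && !IsRetainBlock) return true;
  if (A.LoadedFromConstGlobal) return true;
  return false;
}

int main() {
  ArgInfo StackObject = {false, true, false};
  std::printf("%d %d\n",
              (int)isKnownSafe(StackObject, /*IsRetainBlock=*/false),  // 1
              (int)isKnownSafe(StackObject, /*IsRetainBlock=*/true));  // 0
}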
- bool KnownIncrementedTD = true, KnownIncrementedBU = true; + bool KnownSafeTD = true, KnownSafeBU = true; // Connect the dots between the top-down-collected RetainsToMove and // bottom-up-collected ReleasesToMove to form sets of related calls. @@ -2683,7 +2807,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain); assert(It != Retains.end()); const RRInfo &NewRetainRRI = It->second; - KnownIncrementedTD &= NewRetainRRI.KnownIncremented; + KnownSafeTD &= NewRetainRRI.KnownSafe; for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewRetainRRI.Calls.begin(), LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) { @@ -2739,7 +2863,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> Releases.find(NewRelease); assert(It != Releases.end()); const RRInfo &NewReleaseRRI = It->second; - KnownIncrementedBU &= NewReleaseRRI.KnownIncremented; + KnownSafeBU &= NewReleaseRRI.KnownSafe; for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewReleaseRRI.Calls.begin(), LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) { @@ -2787,12 +2911,19 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> if (NewRetains.empty()) break; } - // If the pointer is known incremented, we can safely delete the pair - // regardless of what's between them. - if (KnownIncrementedTD || KnownIncrementedBU) { + // If the pointer is known incremented or nested, we can safely delete the + // pair regardless of what's between them. + if (KnownSafeTD || KnownSafeBU) { RetainsToMove.ReverseInsertPts.clear(); ReleasesToMove.ReverseInsertPts.clear(); NewCount = 0; + } else { + // Determine whether the new insertion points we computed preserve the + // balance of retain and release calls through the program. + // TODO: If the fully aggressive solution isn't valid, try to find a + // less aggressive solution which is. + if (NewDelta != 0) + goto next_retain; } // Determine whether the original call points are balanced in the retain and @@ -2803,18 +2934,12 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> if (OldDelta != 0) goto next_retain; - // Determine whether the new insertion points we computed preserve the - // balance of retain and release calls through the program. - // TODO: If the fully aggressive solution isn't valid, try to find a - // less aggressive solution which is. - if (NewDelta != 0) - goto next_retain; - // Ok, everything checks out and we're all set. Let's move some code! Changed = true; AnyPairsCompletelyEliminated = NewCount == 0; NumRRs += OldCount - NewCount; - MoveCalls(Arg, RetainsToMove, ReleasesToMove, Retains, Releases, DeadInsts); + MoveCalls(Arg, RetainsToMove, ReleasesToMove, + Retains, Releases, DeadInsts, M); next_retain: NewReleases.clear(); @@ -2993,7 +3118,8 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) { bool NestingDetected = Visit(F, BBStates, Retains, Releases); // Transform. - return PerformCodePlacement(BBStates, Retains, Releases) && NestingDetected; + return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) && + NestingDetected; } /// OptimizeReturns - Look for this pattern: @@ -3072,7 +3198,8 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { // Check that there is nothing that can affect the reference // count between the retain and the call. - FindDependencies(CanChangeRetainCount, Arg, BB, Retain, + // Note that Retain need not be in BB. 
+ FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain, DependingInstructions, Visited, PA); if (DependingInstructions.size() != 1) goto next_block; @@ -3117,12 +3244,6 @@ bool ObjCARCOpt::doInitialization(Module &M) { ImpreciseReleaseMDKind = M.getContext().getMDKindID("clang.imprecise_release"); - // Identify the declarations for objc_retain and friends. - RetainFunc = M.getFunction("objc_retain"); - RetainBlockFunc = M.getFunction("objc_retainBlock"); - RetainRVFunc = M.getFunction("objc_retainAutoreleasedReturnValue"); - ReleaseFunc = M.getFunction("objc_release"); - // Intuitively, objc_retain and others are nocapture, however in practice // they are not, because they return their argument value. And objc_release // calls finalizers. @@ -3132,6 +3253,7 @@ bool ObjCARCOpt::doInitialization(Module &M) { AutoreleaseRVCallee = 0; ReleaseCallee = 0; RetainCallee = 0; + RetainBlockCallee = 0; AutoreleaseCallee = 0; return false; @@ -3294,7 +3416,7 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); std::vector<Type *> Params; Params.push_back(I8X); - const FunctionType *FTy = + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); AttrListPtr Attributes; Attributes.addAttr(~0u, Attribute::NoUnwind); @@ -3310,7 +3432,7 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); std::vector<Type *> Params; Params.push_back(I8X); - const FunctionType *FTy = + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); AttrListPtr Attributes; Attributes.addAttr(~0u, Attribute::NoUnwind); @@ -3377,7 +3499,7 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, void ObjCARCContract::ContractRelease(Instruction *Release, inst_iterator &Iter) { LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); - if (!Load || Load->isVolatile()) return; + if (!Load || !Load->isSimple()) return; // For now, require everything to be in one basic block. BasicBlock *BB = Release->getParent(); @@ -3393,7 +3515,7 @@ void ObjCARCContract::ContractRelease(Instruction *Release, !(AA->getModRefInfo(I, Loc) & AliasAnalysis::Mod))) ++I; StoreInst *Store = dyn_cast<StoreInst>(I); - if (!Store || Store->isVolatile()) return; + if (!Store || !Store->isSimple()) return; if (Store->getPointerOperand() != Loc.Ptr) return; Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); @@ -3411,8 +3533,8 @@ void ObjCARCContract::ContractRelease(Instruction *Release, ++NumStoreStrongs; LLVMContext &C = Release->getContext(); - const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - const Type *I8XX = PointerType::getUnqual(I8X); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *I8XX = PointerType::getUnqual(I8X); Value *Args[] = { Load->getPointerOperand(), New }; if (Args[0]->getType() != I8XX) @@ -3548,7 +3670,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { if (Inst != UserInst && DT->dominates(Inst, UserInst)) { Changed = true; Instruction *Replacement = Inst; - const Type *UseTy = U.get()->getType(); + Type *UseTy = U.get()->getType(); if (PHINode *PHI = dyn_cast<PHINode>(UserInst)) { // For PHI nodes, insert the bitcast in the predecessor block. 
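For reference alongside the ContractRelease peephole above, this is the usual description of what the fused call does, with toy retain/release stand-ins rather than the Objective-C runtime (a sketch, not the runtime's code):

#include <cstdio>

static void retain(void *p)  { if (p) std::printf("retain %p\n", p); }
static void release(void *p) { if (p) std::printf("release %p\n", p); }

// objc_storeStrong(location, newValue): retain the incoming value, store
// it, then release the displaced value. One call stands in for the
// load/retain/store/release sequence matched above.
static void storeStrong(void **location, void *newValue) {
  void *old = *location;  // the LoadInst
  retain(newValue);
  *location = newValue;   // the StoreInst
  release(old);
}

int main() {
  int a, b;
  void *slot = &a;
  storeStrong(&slot, &b); // prints a retain, then a release
  return 0;
}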
unsigned ValNo = diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index e6341ae..8f98a5b 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -309,7 +309,7 @@ void Reassociate::LinearizeExprTree(BinaryOperator *I, std::swap(LHS, RHS); bool Success = !I->swapOperands(); assert(Success && "swapOperands failed"); - Success = false; + (void)Success; MadeChange = true; } else if (RHSBO) { // Turn (A+B)+(C+D) -> (((A+B)+C)+D). This guarantees the RHS is not diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 083412e..196a847 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -156,7 +156,7 @@ namespace { /// class SCCPSolver : public InstVisitor<SCCPSolver> { const TargetData *TD; - SmallPtrSet<BasicBlock*, 8> BBExecutable;// The BBs that are executable. + SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. /// StructValueState - This maintains ValueState for values that have @@ -241,7 +241,7 @@ public: /// this method must be called. void AddTrackedFunction(Function *F) { // Add an entry, F -> undef. - if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) { + if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) { MRVFunctionsTracked.insert(F); for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i), @@ -302,7 +302,7 @@ public: /// markAnythingOverdefined - Mark the specified value overdefined. This /// works with both scalars and structs. void markAnythingOverdefined(Value *V) { - if (const StructType *STy = dyn_cast<StructType>(V->getType())) + if (StructType *STy = dyn_cast<StructType>(V->getType())) for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) markOverdefined(getStructValueState(V, i), V); else @@ -417,7 +417,7 @@ private: else if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) LV.markConstant(CS->getOperand(i)); // Constants are constant. else if (isa<ConstantAggregateZero>(C)) { - const Type *FieldTy = cast<StructType>(V->getType())->getElementType(i); + Type *FieldTy = cast<StructType>(V->getType())->getElementType(i); LV.markConstant(Constant::getNullValue(FieldTy)); } else LV.markOverdefined(); // Unknown sort of constant. @@ -471,9 +471,9 @@ private: /// UsersOfOverdefinedPHIs map for PN, remove them now. void RemoveFromOverdefinedPHIs(Instruction *I, PHINode *PN) { if (UsersOfOverdefinedPHIs.empty()) return; - std::multimap<PHINode*, Instruction*>::iterator It, E; - tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN); - while (It != E) { + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN); + for (ItTy It = Range.first, E = Range.second; It != E;) { if (It->second == I) UsersOfOverdefinedPHIs.erase(It++); else @@ -486,9 +486,9 @@ private: /// (Duplicate entries do not break anything directly, but can lead to /// exponential growth of the table in rare cases.) 
void InsertInOverdefinedPHIs(Instruction *I, PHINode *PN) { - std::multimap<PHINode*, Instruction*>::iterator J, E; - tie(J, E) = UsersOfOverdefinedPHIs.equal_range(PN); - for (; J != E; ++J) + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN); + for (ItTy J = Range.first, E = Range.second; J != E; ++J) if (J->second == I) return; UsersOfOverdefinedPHIs.insert(std::make_pair(PN, I)); @@ -515,6 +515,7 @@ private: void visitShuffleVectorInst(ShuffleVectorInst &I); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); + void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } // Instructions that cannot be folded away. void visitStoreInst (StoreInst &I); @@ -528,8 +529,12 @@ private: visitTerminatorInst(II); } void visitCallSite (CallSite CS); + void visitResumeInst (TerminatorInst &I) { /*returns void*/ } void visitUnwindInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } + void visitFenceInst (FenceInst &I) { /*returns void*/ } + void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); } + void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } void visitAllocaInst (Instruction &I) { markOverdefined(&I); } void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } @@ -577,6 +582,10 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, } if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) { + if (TI.getNumSuccessors() < 2) { + Succs[0] = true; + return; + } LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); @@ -637,6 +646,9 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { return true; if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (SI->getNumSuccessors() < 2) + return true; + LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); @@ -692,13 +704,14 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // There may be instructions using this PHI node that are not overdefined // themselves. If so, make sure that they know that the PHI node operand // changed. - std::multimap<PHINode*, Instruction*>::iterator I, E; - tie(I, E) = UsersOfOverdefinedPHIs.equal_range(&PN); - if (I == E) + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(&PN); + + if (Range.first == Range.second) return; SmallVector<Instruction*, 16> Users; - for (; I != E; ++I) + for (ItTy I = Range.first, E = Range.second; I != E; ++I) Users.push_back(I->second); while (!Users.empty()) visit(Users.pop_back_val()); @@ -772,7 +785,7 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) { // Handle functions that return multiple values. 
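The refactored loops above are the standard std::multimap equal_range idiom; here it is in isolation, including the post-increment erase that keeps the iterator valid (self-contained, not SCCP's code):

#include <cassert>
#include <map>
#include <utility>

int main() {
  std::multimap<int, char> M;
  M.insert(std::make_pair(1, 'a'));
  M.insert(std::make_pair(1, 'b'));
  M.insert(std::make_pair(2, 'c'));

  typedef std::multimap<int, char>::iterator ItTy;
  std::pair<ItTy, ItTy> Range = M.equal_range(1);

  // Erase selected entries in the range; erase() invalidates only the
  // erased iterator, so advance with a post-increment before erasing.
  for (ItTy It = Range.first, E = Range.second; It != E;) {
    if (It->second == 'a')
      M.erase(It++);
    else
      ++It;
  }

  assert(M.size() == 2 && M.count(1) == 1);
  return 0;
}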
if (!TrackedMultipleRetVals.empty()) { - if (const StructType *STy = dyn_cast<StructType>(ResultOp->getType())) + if (StructType *STy = dyn_cast<StructType>(ResultOp->getType())) if (MRVFunctionsTracked.count(F)) for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F, @@ -825,7 +838,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { } void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) { - const StructType *STy = dyn_cast<StructType>(IVI.getType()); + StructType *STy = dyn_cast<StructType>(IVI.getType()); if (STy == 0) return markOverdefined(&IVI); @@ -925,7 +938,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { // Could annihilate value. if (I.getOpcode() == Instruction::And) markConstant(IV, &I, Constant::getNullValue(I.getType())); - else if (const VectorType *PT = dyn_cast<VectorType>(I.getType())) + else if (VectorType *PT = dyn_cast<VectorType>(I.getType())) markConstant(IV, &I, Constant::getAllOnesValue(PT)); else markConstant(IV, &I, @@ -1179,8 +1192,8 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { } Constant *Ptr = Operands[0]; - markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, &Operands[0]+1, - Operands.size()-1)); + ArrayRef<Constant *> Indices(Operands.begin() + 1, Operands.end()); + markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, Indices)); } void SCCPSolver::visitStoreInst(StoreInst &SI) { @@ -1278,7 +1291,7 @@ CallOverdefined: // If we can constant fold this, mark the result of the call as a // constant. - if (Constant *C = ConstantFoldCall(F, Operands.data(), Operands.size())) + if (Constant *C = ConstantFoldCall(F, Operands)) return markConstant(I, C); } @@ -1303,7 +1316,7 @@ CallOverdefined: continue; } - if (const StructType *STy = dyn_cast<StructType>(AI->getType())) { + if (StructType *STy = dyn_cast<StructType>(AI->getType())) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { LatticeVal CallArg = getStructValueState(*CAI, i); mergeInValue(getStructValueState(AI, i), AI, CallArg); @@ -1315,7 +1328,7 @@ CallOverdefined: } // If this is a single/zero retval case, see if we're tracking the function. - if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) { + if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) { if (!MRVFunctionsTracked.count(F)) goto CallOverdefined; // Not tracking this callee. @@ -1419,67 +1432,116 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Look for instructions which produce undef values. if (I->getType()->isVoidTy()) continue; - if (const StructType *STy = dyn_cast<StructType>(I->getType())) { - // Only a few things that can be structs matter for undef. Just send - // all their results to overdefined. We could be more precise than this - // but it isn't worth bothering. - if (isa<CallInst>(I) || isa<SelectInst>(I)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - LatticeVal &LV = getStructValueState(I, i); - if (LV.isUndefined()) - markOverdefined(LV, I); - } + if (StructType *STy = dyn_cast<StructType>(I->getType())) { + // Only a few things that can be structs matter for undef. + + // Tracked calls must never be marked overdefined in ResolvedUndefsIn. + if (CallSite CS = CallSite(I)) + if (Function *F = CS.getCalledFunction()) + if (MRVFunctionsTracked.count(F)) + continue; + + // extractvalue and insertvalue don't need to be marked; they are + // tracked as precisely as their operands. 
+ if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I)) + continue; + + // Send the results of everything else to overdefined. We could be + // more precise than this but it isn't worth bothering. + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + LatticeVal &LV = getStructValueState(I, i); + if (LV.isUndefined()) + markOverdefined(LV, I); } continue; } - + LatticeVal &LV = getValueState(I); if (!LV.isUndefined()) continue; - // No instructions using structs need disambiguation. - if (I->getOperand(0)->getType()->isStructTy()) + // extractvalue is safe; check here because the argument is a struct. + if (isa<ExtractValueInst>(I)) continue; - // Get the lattice values of the first two operands for use below. + // Compute the operand LatticeVals, for convenience below. + // Anything taking a struct is conservatively assumed to require + // overdefined markings. + if (I->getOperand(0)->getType()->isStructTy()) { + markOverdefined(I); + return true; + } LatticeVal Op0LV = getValueState(I->getOperand(0)); LatticeVal Op1LV; if (I->getNumOperands() == 2) { - // No instructions using structs need disambiguation. - if (I->getOperand(1)->getType()->isStructTy()) - continue; - - // If this is a two-operand instruction, and if both operands are - // undefs, the result stays undef. + if (I->getOperand(1)->getType()->isStructTy()) { + markOverdefined(I); + return true; + } + Op1LV = getValueState(I->getOperand(1)); - if (Op0LV.isUndefined() && Op1LV.isUndefined()) - continue; } - // If this is an instruction whose result is defined even if the input is // not fully defined, propagate the information. - const Type *ITy = I->getType(); + Type *ITy = I->getType(); switch (I->getOpcode()) { - default: break; // Leave the instruction as an undef. + case Instruction::Add: + case Instruction::Sub: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + break; // Any undef -> undef + case Instruction::FSub: + case Instruction::FAdd: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + // Floating-point binary operation: be conservative. + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + markForcedConstant(I, Constant::getNullValue(ITy)); + else + markOverdefined(I); + return true; case Instruction::ZExt: - // After a zero extend, we know the top part is zero. SExt doesn't have - // to be handled here, because we don't know whether the top part is 1's - // or 0's. - case Instruction::SIToFP: // some FP values are not possible, just use 0. - case Instruction::UIToFP: // some FP values are not possible, just use 0. + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + // undef -> 0; some outputs are impossible markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Mul: case Instruction::And: + // Both operands undef -> undef + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + break; // undef * X -> 0. X could be zero. // undef & X -> 0. X could be zero. markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Or: + // Both operands undef -> undef + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + break; // undef | X -> -1. X could be -1.
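A worked check, on 8-bit values, that the constants forced above are consistent with some legal choice for the undef operand, which is all that is required (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t X = 0x5A;                     // an arbitrary known operand
  assert((uint8_t)(0x00 * X) == 0x00);  // undef * X -> 0: choose undef = 0
  assert((uint8_t)(0x00 & X) == 0x00);  // undef & X -> 0: choose undef = 0
  assert((uint8_t)(0xFF | X) == 0xFF);  // undef | X -> -1: choose undef = -1
  (void)X;
  return 0;
}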
markForcedConstant(I, Constant::getAllOnesValue(ITy)); return true; + case Instruction::Xor: + // undef ^ undef -> 0; strictly speaking, this is not + // necessary, but we try to be nice to people who expect this + // behavior in simple cases. + if (Op0LV.isUndefined() && Op1LV.isUndefined()) { + markForcedConstant(I, Constant::getNullValue(ITy)); + return true; + } + // undef ^ X -> undef + break; + case Instruction::SDiv: case Instruction::UDiv: case Instruction::SRem: @@ -1494,26 +1556,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { return true; case Instruction::AShr: - // undef >>s X -> undef. No change. - if (Op0LV.isUndefined()) break; - - // X >>s undef -> X. X could be 0, X could have the high-bit known set. - if (Op0LV.isConstant()) - markForcedConstant(I, Op0LV.getConstant()); - else - markOverdefined(I); + // X >>a undef -> undef. + if (Op1LV.isUndefined()) break; + + // undef >>a X -> all ones + markForcedConstant(I, Constant::getAllOnesValue(ITy)); return true; case Instruction::LShr: case Instruction::Shl: - // undef >> X -> undef. No change. - // undef << X -> undef. No change. - if (Op0LV.isUndefined()) break; - - // X >> undef -> 0. X could be 0. - // X << undef -> 0. X could be 0. + // X << undef -> undef. + // X >> undef -> undef. + if (Op1LV.isUndefined()) break; + + // undef << X -> 0 + // undef >> X -> 0 markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Select: + Op1LV = getValueState(I->getOperand(1)); // undef ? X : Y -> X or Y. There could be commonality between X/Y. if (Op0LV.isUndefined()) { if (!Op1LV.isConstant()) // Pick the constant one if there is any. @@ -1533,9 +1593,35 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { else markOverdefined(I); return true; + case Instruction::Load: + // A load here means one of two things: a load of undef from a global, + // or a load from an unknown pointer. Either way, having it return undef + // is okay. + break; + case Instruction::ICmp: + // X == undef -> undef. Other comparisons get more complicated. + if (cast<ICmpInst>(I)->isEquality()) + break; + markOverdefined(I); + return true; case Instruction::Call: - // If a call has an undef result, it is because it is constant foldable - // but one of the inputs was undef. Just force the result to + case Instruction::Invoke: { + // There are two reasons a call can have an undef result: + // 1. It could be tracked. + // 2. It could be constant-foldable. + // Because of the way we solve return values, tracked calls must + // never be marked overdefined in ResolvedUndefsIn. + if (Function *F = CallSite(I).getCalledFunction()) + if (TrackedRetVals.count(F)) + break; + + // If the call is constant-foldable, we mark it overdefined because + // we do not know what return values are valid. + markOverdefined(I); + return true; + } + default: + // If we don't know what should happen here, conservatively mark it // overdefined. markOverdefined(I); return true; @@ -1621,15 +1707,25 @@ FunctionPass *llvm::createSCCPPass() { static void DeleteInstructionInBlock(BasicBlock *BB) { DEBUG(dbgs() << " BasicBlock Dead:" << *BB); ++NumDeadBlocks; - - // Delete the instructions backwards, as it has a reduced likelihood of - // having to update as many def-use and use-def chains.
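The same style of check for the shift rules above; this relies on arithmetic right shift of negative integers, which C++ guarantees only from C++20 onward but which holds on all common targets (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned X = 0; X != 8; ++X) {
    // undef << X -> 0 and undef >>l X -> 0: choosing undef = 0 makes the
    // result 0 for every shift amount X.
    assert((uint8_t)(0x00 << X) == 0x00);
    assert((uint8_t)(0x00 >> X) == 0x00);
    // undef >>a X -> all ones: choosing undef = -1 keeps every bit set,
    // because the sign bit is replicated.
    assert((uint8_t)((int8_t)-1 >> X) == 0xFF);
  }
  return 0;
}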
- while (!isa<TerminatorInst>(BB->begin())) { - Instruction *I = --BasicBlock::iterator(BB->getTerminator()); - - if (!I->use_empty()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); - BB->getInstList().erase(I); + + // Check to see if there are non-terminating instructions to delete. + if (isa<TerminatorInst>(BB->begin())) + return; + + // Delete the instructions backwards, as it has a reduced likelihood of having + // to update as many def-use and use-def chains. + Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. + while (EndInst != BB->begin()) { + // Delete the next to last instruction. + BasicBlock::iterator I = EndInst; + Instruction *Inst = --I; + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + if (isa<LandingPadInst>(Inst)) { + EndInst = Inst; + continue; + } + BB->getInstList().erase(Inst); ++NumInstRemoved; } } diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 302c287..f6918de 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -63,7 +63,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeCFGSimplifyPassPass(Registry); initializeSimplifyLibCallsPass(Registry); initializeSinkingPass(Registry); - initializeTailDupPass(Registry); initializeTailCallElimPass(Registry); } @@ -187,3 +186,7 @@ void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBasicAliasAnalysisPass()); } + +void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerExpectIntrinsicPass()); +} diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 7d6349c..c6d9123 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -129,11 +129,11 @@ namespace { AllocaInfo &Info); void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info); void isSafeMemAccess(uint64_t Offset, uint64_t MemSize, - const Type *MemOpType, bool isStore, AllocaInfo &Info, + Type *MemOpType, bool isStore, AllocaInfo &Info, Instruction *TheAccess, bool AllowWholeAccess); - bool TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size); - uint64_t FindElementAndOffset(const Type *&T, uint64_t &Offset, - const Type *&IdxTy); + bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size); + uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, + Type *&IdxTy); void DoScalarReplacement(AllocaInst *AI, std::vector<AllocaInst*> &WorkList); @@ -145,6 +145,9 @@ namespace { SmallVector<AllocaInst*, 32> &NewElts); void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts); + void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, + uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts); void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); @@ -253,7 +256,7 @@ class ConvertToScalarInfo { /// VectorTy - This tracks the type that we should promote the vector to if /// it is possible to turn it into a vector. This starts out null, and if it /// isn't possible to turn into a vector type, it gets set to VoidTy. - const VectorType *VectorTy; + VectorType *VectorTy; /// HadNonMemTransferAccess - True if there is at least one access to the /// alloca that is not a MemTransferInst. 
We don't want to turn structs into @@ -269,11 +272,11 @@ public: private: bool CanConvertToScalar(Value *V, uint64_t Offset); - void MergeInTypeForLoadOrStore(const Type *In, uint64_t Offset); - bool MergeInVectorType(const VectorType *VInTy, uint64_t Offset); + void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset); + bool MergeInVectorType(VectorType *VInTy, uint64_t Offset); void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset); - Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType, + Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType, uint64_t Offset, IRBuilder<> &Builder); Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, uint64_t Offset, IRBuilder<> &Builder); @@ -295,8 +298,6 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { if (ScalarKind == Unknown) ScalarKind = Integer; - // FIXME: It should be possible to promote the vector type up to the alloca's - // size. if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8) ScalarKind = Integer; @@ -306,7 +307,7 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // random stuff that doesn't use vectors (e.g. <9 x double>) because then // we just get a lot of insert/extracts. If at least one vector is // involved, then we probably really do have a union of vector/array. - const Type *NewTy; + Type *NewTy; if (ScalarKind == Vector) { assert(VectorTy && "Missing type for vector scalar."); DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " @@ -331,20 +332,16 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { /// (VectorTy) so far at the offset specified by Offset (which is specified in /// bytes). /// -/// There are three cases we handle here: +/// There are two cases we handle here: /// 1) A union of vector types of the same size and potentially its elements. /// Here we turn element accesses into insert/extract element operations. /// This promotes a <4 x float> with a store of float to the third element /// into a <4 x float> that uses insert element. -/// 2) A union of vector types with power-of-2 size differences, e.g. a float, -/// <2 x float> and <4 x float>. Here we turn element accesses into insert -/// and extract element operations, and <2 x float> accesses into a cast to -/// <2 x double>, an extract, and a cast back to <2 x float>. -/// 3) A fully general blob of memory, which we turn into some (potentially +/// 2) A fully general blob of memory, which we turn into some (potentially /// large) integer type with extract and insert operations where the loads /// and stores would mutate the memory. We mark this by setting VectorTy /// to VoidTy. -void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In, +void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In, uint64_t Offset) { // If we already decided to turn this into a blob of integer memory, there is // nothing to be done. @@ -355,7 +352,7 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In, // If the In type is a vector that is the same size as the alloca, see if it // matches the existing VecTy. - if (const VectorType *VInTy = dyn_cast<VectorType>(In)) { + if (VectorType *VInTy = dyn_cast<VectorType>(In)) { if (MergeInVectorType(VInTy, Offset)) return; } else if (In->isFloatTy() || In->isDoubleTy() || @@ -371,20 +368,13 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In, // if the implied vector agrees with what we already have and if Offset is // compatible with it. 
if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 && - (!VectorTy || Offset * 8 < VectorTy->getPrimitiveSizeInBits())) { + (!VectorTy || EltSize == VectorTy->getElementType() + ->getPrimitiveSizeInBits()/8)) { if (!VectorTy) { ScalarKind = ImplicitVector; VectorTy = VectorType::get(In, AllocaSize/EltSize); - return; } - - unsigned CurrentEltSize = VectorTy->getElementType() - ->getPrimitiveSizeInBits()/8; - if (EltSize == CurrentEltSize) - return; - - if (In->isIntegerTy() && isPowerOf2_32(AllocaSize / EltSize)) - return; + return; } } @@ -395,74 +385,21 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In, /// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore, /// returning true if the type was successfully merged and false otherwise. -bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy, +bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy, uint64_t Offset) { - // TODO: Support nonzero offsets? - if (Offset != 0) - return false; - - // Only allow vectors that are a power-of-2 away from the size of the alloca. - if (!isPowerOf2_64(AllocaSize / (VInTy->getBitWidth() / 8))) - return false; - - // If this the first vector we see, remember the type so that we know the - // element size. - if (!VectorTy) { + if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { + // If we're storing/loading a vector of the right size, allow it as a + // vector. If this the first vector we see, remember the type so that + // we know the element size. If this is a subsequent access, ignore it + // even if it is a differing type but the same size. Worst case we can + // bitcast the resultant vectors. + if (!VectorTy) + VectorTy = VInTy; ScalarKind = Vector; - VectorTy = VInTy; return true; } - unsigned BitWidth = VectorTy->getBitWidth(); - unsigned InBitWidth = VInTy->getBitWidth(); - - // Vectors of the same size can be converted using a simple bitcast. - if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8)) { - ScalarKind = Vector; - return true; - } - - const Type *ElementTy = VectorTy->getElementType(); - const Type *InElementTy = VInTy->getElementType(); - - // Do not allow mixed integer and floating-point accesses from vectors of - // different sizes. - if (ElementTy->isFloatingPointTy() != InElementTy->isFloatingPointTy()) - return false; - - if (ElementTy->isFloatingPointTy()) { - // Only allow floating-point vectors of different sizes if they have the - // same element type. - // TODO: This could be loosened a bit, but would anything benefit? - if (ElementTy != InElementTy) - return false; - - // There are no arbitrary-precision floating-point types, which limits the - // number of legal vector types with larger element types that we can form - // to bitcast and extract a subvector. - // TODO: We could support some more cases with mixed fp128 and double here. - if (!(BitWidth == 64 || BitWidth == 128) || - !(InBitWidth == 64 || InBitWidth == 128)) - return false; - } else { - assert(ElementTy->isIntegerTy() && "Vector elements must be either integer " - "or floating-point."); - unsigned BitWidth = ElementTy->getPrimitiveSizeInBits(); - unsigned InBitWidth = InElementTy->getPrimitiveSizeInBits(); - - // Do not allow integer types smaller than a byte or types whose widths are - // not a multiple of a byte. - if (BitWidth < 8 || InBitWidth < 8 || - BitWidth % 8 != 0 || InBitWidth % 8 != 0) - return false; - } - - // Pick the largest of the two vector types. 
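A minimal model of the simplified merge rule above, using invented names: only a vector access that covers the whole alloca at offset zero keeps (or sets) the vector form, and everything else falls back to the integer form:

#include <cassert>

enum ToyScalarKind { Unknown, Vector, Integer };

struct ToyAllocaPlan {
  unsigned AllocaSize;             // in bytes
  ToyScalarKind Kind;
  unsigned VecBits;

  // Mirrors the shape of MergeInVectorType after this patch.
  bool mergeVectorAccess(unsigned VInBits, unsigned OffsetBytes) {
    if (VInBits / 8 == AllocaSize && OffsetBytes == 0) {
      if (Kind != Vector) {        // the first vector seen fixes the type;
        Kind = Vector;             // same-size latecomers can be bitcast
        VecBits = VInBits;
      }
      return true;
    }
    return false;                  // caller demotes the alloca to Integer
  }
};

int main() {
  ToyAllocaPlan P = {16, Unknown, 0};    // a 16-byte alloca
  assert(P.mergeVectorAccess(128, 0));   // <4 x i32> covering it: accepted
  assert(!P.mergeVectorAccess(64, 0));   // half-size <2 x i32>: rejected
  assert(!P.mergeVectorAccess(128, 4));  // full-size at an offset: rejected
  return 0;
}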
- ScalarKind = Vector; - if (InBitWidth > BitWidth) - VectorTy = VInTy; - - return true; + return false; } /// CanConvertToScalar - V is a pointer. If we can convert the pointee and all @@ -480,7 +417,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (LoadInst *LI = dyn_cast<LoadInst>(User)) { // Don't break volatile loads. - if (LI->isVolatile()) + if (!LI->isSimple()) return false; // Don't touch MMX operations. if (LI->getType()->isX86_MMXTy()) @@ -492,7 +429,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Storing the pointer, not into the value? - if (SI->getOperand(0) == V || SI->isVolatile()) return false; + if (SI->getOperand(0) == V || !SI->isSimple()) return false; // Don't touch MMX operations. if (SI->getOperand(0)->getType()->isX86_MMXTy()) return false; @@ -502,7 +439,8 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { } if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { - IsNotTrivial = true; // Can't be mem2reg'd. + if (!onlyUsedByLifetimeMarkers(BCI)) + IsNotTrivial = true; // Can't be mem2reg'd. if (!CanConvertToScalar(BCI, Offset)) return false; continue; @@ -516,7 +454,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), - &Indices[0], Indices.size()); + Indices); // See if all uses can be converted. if (!CanConvertToScalar(GEP, Offset+GEPOffset)) return false; @@ -560,6 +498,14 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { continue; } + // If this is a lifetime intrinsic, we can handle it. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + continue; + } + } + // Otherwise, we cannot handle this! return false; } @@ -589,7 +535,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), - &Indices[0], Indices.size()); + Indices); ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8); GEP->eraseFromParent(); continue; @@ -599,7 +545,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, if (LoadInst *LI = dyn_cast<LoadInst>(User)) { // The load is a bit extract from NewAI shifted right by Offset bits. - Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp"); + Value *LoadedVal = Builder.CreateLoad(NewAI); Value *NewLoadVal = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder); LI->replaceAllUsesWith(NewLoadVal); @@ -668,8 +614,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // pointer (bitcasted), then a store to our new alloca. 
assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); Value *SrcPtr = MTI->getSource(); - const PointerType* SPTy = cast<PointerType>(SrcPtr->getType()); - const PointerType* AIPTy = cast<PointerType>(NewAI->getType()); + PointerType* SPTy = cast<PointerType>(SrcPtr->getType()); + PointerType* AIPTy = cast<PointerType>(NewAI->getType()); if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) { AIPTy = PointerType::get(AIPTy->getElementType(), SPTy->getAddressSpace()); @@ -685,8 +631,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval"); - const PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType()); - const PointerType* AIPTy = cast<PointerType>(NewAI->getType()); + PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType()); + PointerType* AIPTy = cast<PointerType>(NewAI->getType()); if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) { AIPTy = PointerType::get(AIPTy->getElementType(), DPTy->getAddressSpace()); @@ -703,65 +649,18 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, continue; } - llvm_unreachable("Unsupported operation!"); - } -} + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + // There's no need to preserve these, as the resulting alloca will be + // converted to a register anyways. + II->eraseFromParent(); + continue; + } + } -/// getScaledElementType - Gets a scaled element type for a partial vector -/// access of an alloca. The input types must be integer or floating-point -/// scalar or vector types, and the resulting type is an integer, float or -/// double. -static const Type *getScaledElementType(const Type *Ty1, const Type *Ty2, - unsigned NewBitWidth) { - bool IsFP1 = Ty1->isFloatingPointTy() || - (Ty1->isVectorTy() && - cast<VectorType>(Ty1)->getElementType()->isFloatingPointTy()); - bool IsFP2 = Ty2->isFloatingPointTy() || - (Ty2->isVectorTy() && - cast<VectorType>(Ty2)->getElementType()->isFloatingPointTy()); - - LLVMContext &Context = Ty1->getContext(); - - // Prefer floating-point types over integer types, as integer types may have - // been created by earlier scalar replacement. - if (IsFP1 || IsFP2) { - if (NewBitWidth == 32) - return Type::getFloatTy(Context); - if (NewBitWidth == 64) - return Type::getDoubleTy(Context); + llvm_unreachable("Unsupported operation!"); } - - return Type::getIntNTy(Context, NewBitWidth); -} - -/// CreateShuffleVectorCast - Creates a shuffle vector to convert one vector -/// to another vector of the same element type which has the same allocation -/// size but different primitive sizes (e.g. <3 x i32> and <4 x i32>). 
-static Value *CreateShuffleVectorCast(Value *FromVal, const Type *ToType, - IRBuilder<> &Builder) { - const Type *FromType = FromVal->getType(); - const VectorType *FromVTy = cast<VectorType>(FromType); - const VectorType *ToVTy = cast<VectorType>(ToType); - assert((ToVTy->getElementType() == FromVTy->getElementType()) && - "Vectors must have the same element type"); - Value *UnV = UndefValue::get(FromType); - unsigned numEltsFrom = FromVTy->getNumElements(); - unsigned numEltsTo = ToVTy->getNumElements(); - - SmallVector<Constant*, 3> Args; - const Type* Int32Ty = Builder.getInt32Ty(); - unsigned minNumElts = std::min(numEltsFrom, numEltsTo); - unsigned i; - for (i=0; i != minNumElts; ++i) - Args.push_back(ConstantInt::get(Int32Ty, i)); - - if (i < numEltsTo) { - Constant* UnC = UndefValue::get(Int32Ty); - for (; i != numEltsTo; ++i) - Args.push_back(UnC); - } - Constant *Mask = ConstantVector::get(Args); - return Builder.CreateShuffleVector(FromVal, UnV, Mask, "tmpV"); } /// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer @@ -775,50 +674,20 @@ static Value *CreateShuffleVectorCast(Value *FromVal, const Type *ToType, /// Offset is an offset from the original alloca, in bits that need to be /// shifted to the right. Value *ConvertToScalarInfo:: -ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, +ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, uint64_t Offset, IRBuilder<> &Builder) { // If the load is of the whole new alloca, no conversion is needed. - const Type *FromType = FromVal->getType(); + Type *FromType = FromVal->getType(); if (FromType == ToType && Offset == 0) return FromVal; // If the result alloca is a vector type, this is either an element // access or a bitcast to another vector type of the same size. - if (const VectorType *VTy = dyn_cast<VectorType>(FromType)) { + if (VectorType *VTy = dyn_cast<VectorType>(FromType)) { unsigned FromTypeSize = TD.getTypeAllocSize(FromType); unsigned ToTypeSize = TD.getTypeAllocSize(ToType); - if (FromTypeSize == ToTypeSize) { - // If the two types have the same primitive size, use a bit cast. - // Otherwise, it is two vectors with the same element type that has - // the same allocation size but different number of elements so use - // a shuffle vector. - if (FromType->getPrimitiveSizeInBits() == - ToType->getPrimitiveSizeInBits()) - return Builder.CreateBitCast(FromVal, ToType, "tmp"); - else - return CreateShuffleVectorCast(FromVal, ToType, Builder); - } - - if (isPowerOf2_64(FromTypeSize / ToTypeSize)) { - assert(!(ToType->isVectorTy() && Offset != 0) && "Can't extract a value " - "of a smaller vector type at a nonzero offset."); - - const Type *CastElementTy = getScaledElementType(FromType, ToType, - ToTypeSize * 8); - unsigned NumCastVectorElements = FromTypeSize / ToTypeSize; - - LLVMContext &Context = FromVal->getContext(); - const Type *CastTy = VectorType::get(CastElementTy, - NumCastVectorElements); - Value *Cast = Builder.CreateBitCast(FromVal, CastTy, "tmp"); - - unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy); - unsigned Elt = Offset/EltSize; - assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); - Value *Extract = Builder.CreateExtractElement(Cast, ConstantInt::get( - Type::getInt32Ty(Context), Elt), "tmp"); - return Builder.CreateBitCast(Extract, ToType, "tmp"); - } + if (FromTypeSize == ToTypeSize) + return Builder.CreateBitCast(FromVal, ToType); // Otherwise it must be an element access. 
unsigned Elt = 0; @@ -828,40 +697,39 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); } // Return the element extracted out of it. - Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get( - Type::getInt32Ty(FromVal->getContext()), Elt), "tmp"); + Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt)); if (V->getType() != ToType) - V = Builder.CreateBitCast(V, ToType, "tmp"); + V = Builder.CreateBitCast(V, ToType); return V; } // If ToType is a first class aggregate, extract out each of the pieces and // use insertvalue's to form the FCA. - if (const StructType *ST = dyn_cast<StructType>(ToType)) { + if (StructType *ST = dyn_cast<StructType>(ToType)) { const StructLayout &Layout = *TD.getStructLayout(ST); Value *Res = UndefValue::get(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), Offset+Layout.getElementOffsetInBits(i), Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } - if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) { + if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) { uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); Value *Res = UndefValue::get(AT); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), Offset+i*EltSize, Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } // Otherwise, this must be a union that was converted to an integer value. - const IntegerType *NTy = cast<IntegerType>(FromVal->getType()); + IntegerType *NTy = cast<IntegerType>(FromVal->getType()); // If this is a big-endian system and the load is narrower than the // full alloca type, we need to do a shift to get the right bits. @@ -881,33 +749,31 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, // only some bits are used. if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth()) FromVal = Builder.CreateLShr(FromVal, - ConstantInt::get(FromVal->getType(), - ShAmt), "tmp"); + ConstantInt::get(FromVal->getType(), ShAmt)); else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) FromVal = Builder.CreateShl(FromVal, - ConstantInt::get(FromVal->getType(), - -ShAmt), "tmp"); + ConstantInt::get(FromVal->getType(), -ShAmt)); // Finally, unconditionally truncate the integer to the right width. unsigned LIBitWidth = TD.getTypeSizeInBits(ToType); if (LIBitWidth < NTy->getBitWidth()) FromVal = Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); + LIBitWidth)); else if (LIBitWidth > NTy->getBitWidth()) FromVal = Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); + LIBitWidth)); // If the result is an integer, this is a trunc or bitcast. if (ToType->isIntegerTy()) { // Should be done. } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) { // Just do a bitcast, we know the sizes match up. - FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp"); + FromVal = Builder.CreateBitCast(FromVal, ToType); } else { // Otherwise must be a pointer. 
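A concrete instance of the integer extraction path above, on a little-endian layout: shift right by the bit offset, then truncate (plain C++ model, not IR):

#include <cassert>
#include <cstdint>

// The "alloca" is one big integer; a field at BitOffset is recovered with
// lshr + trunc, the same pairing as CreateLShr/CreateTrunc above.
static uint16_t extractU16(uint64_t Alloca, unsigned BitOffset) {
  return (uint16_t)(Alloca >> BitOffset);
}

int main() {
  // Four 16-bit fields packed into one 64-bit "alloca".
  uint64_t Alloca = 0;
  Alloca |= (uint64_t)0x1111 << 0;
  Alloca |= (uint64_t)0x2222 << 16;
  Alloca |= (uint64_t)0x3333 << 32;
  Alloca |= (uint64_t)0x4444 << 48;

  assert(extractU16(Alloca, 16) == 0x2222);
  assert(extractU16(Alloca, 48) == 0x4444);
  return 0;
}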
- FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp"); + FromVal = Builder.CreateIntToPtr(FromVal, ToType); } assert(FromVal->getType() == ToType && "Didn't convert right?"); return FromVal; @@ -927,65 +793,30 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, uint64_t Offset, IRBuilder<> &Builder) { // Convert the stored type to the actual type, shift it left to insert // then 'or' into place. - const Type *AllocaType = Old->getType(); + Type *AllocaType = Old->getType(); LLVMContext &Context = Old->getContext(); - if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { + if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy); uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType()); // Changing the whole vector with memset or with an access of a different // vector type? - if (ValSize == VecSize) { - // If the two types have the same primitive size, use a bit cast. - // Otherwise, it is two vectors with the same element type that has - // the same allocation size but different number of elements so use - // a shuffle vector. - if (VTy->getPrimitiveSizeInBits() == - SV->getType()->getPrimitiveSizeInBits()) - return Builder.CreateBitCast(SV, AllocaType, "tmp"); - else - return CreateShuffleVectorCast(SV, VTy, Builder); - } - - if (isPowerOf2_64(VecSize / ValSize)) { - assert(!(SV->getType()->isVectorTy() && Offset != 0) && "Can't insert a " - "value of a smaller vector type at a nonzero offset."); - - const Type *CastElementTy = getScaledElementType(VTy, SV->getType(), - ValSize); - unsigned NumCastVectorElements = VecSize / ValSize; - - LLVMContext &Context = SV->getContext(); - const Type *OldCastTy = VectorType::get(CastElementTy, - NumCastVectorElements); - Value *OldCast = Builder.CreateBitCast(Old, OldCastTy, "tmp"); - - Value *SVCast = Builder.CreateBitCast(SV, CastElementTy, "tmp"); - - unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy); - unsigned Elt = Offset/EltSize; - assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); - Value *Insert = - Builder.CreateInsertElement(OldCast, SVCast, ConstantInt::get( - Type::getInt32Ty(Context), Elt), "tmp"); - return Builder.CreateBitCast(Insert, AllocaType, "tmp"); - } + if (ValSize == VecSize) + return Builder.CreateBitCast(SV, AllocaType); // Must be an element insertion. assert(SV->getType() == VTy->getElementType()); uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); unsigned Elt = Offset/EltSize; - return Builder.CreateInsertElement(Old, SV, - ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt), - "tmp"); + return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt)); } // If SV is a first-class aggregate value, insert each value recursively. 
- if (const StructType *ST = dyn_cast<StructType>(SV->getType())) { + if (StructType *ST = dyn_cast<StructType>(SV->getType())) { const StructLayout &Layout = *TD.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), Builder); @@ -993,10 +824,10 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, return Old; } - if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { + if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder); } return Old; @@ -1009,20 +840,19 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType()); unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType); if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) - SV = Builder.CreateBitCast(SV, - IntegerType::get(SV->getContext(),SrcWidth), "tmp"); + SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()), "tmp"); + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { if (SV->getType()->getPrimitiveSizeInBits() < AllocaType->getPrimitiveSizeInBits()) - SV = Builder.CreateZExt(SV, AllocaType, "tmp"); + SV = Builder.CreateZExt(SV, AllocaType); else { // Truncation may be needed if storing more than the alloca can hold // (undefined behavior). - SV = Builder.CreateTrunc(SV, AllocaType, "tmp"); + SV = Builder.CreateTrunc(SV, AllocaType); SrcWidth = DestWidth; SrcStoreWidth = DestStoreWidth; } @@ -1045,12 +875,10 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, // only some bits in the structure are set. APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { - SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), - ShAmt), "tmp"); + SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt)); Mask <<= ShAmt; } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { - SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), - -ShAmt), "tmp"); + SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt)); Mask = Mask.lshr(-ShAmt); } @@ -1196,7 +1024,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end(); UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || LI->isVolatile()) return false; + if (LI == 0 || !LI->isSimple()) return false; // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. 
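Note: throughout these hunks the old isVolatile() tests become !isSimple(). A minimal sketch of the predicate now being relied on; the helper name is ours, and the LLVM API of this period is assumed:

  #include "llvm/Instructions.h"
  using namespace llvm;

  // isSimple() holds only for loads that are neither volatile nor atomic,
  // so every rewrite above now refuses atomic loads as well, instead of
  // screening out only the volatile ones.
  static bool isRewritableLoad(const LoadInst *LI) {
    return LI->isSimple(); // same as !LI->isVolatile() && !LI->isAtomic()
  }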
@@ -1237,7 +1065,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end(); UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || LI->isVolatile()) return false; + if (LI == 0 || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is a // common case that happens when instcombine merges two loads through a PHI. @@ -1258,17 +1086,21 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { // trapping load in the predecessor if it is a critical edge. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); + Value *InVal = PN->getIncomingValue(i); + + // If the terminator of the predecessor has side-effects (an invoke), + // there is no safe place to put a load in the predecessor. + if (Pred->getTerminator()->mayHaveSideEffects()) + return false; + + // If the value is produced by the terminator of the predecessor + // (an invoke), there is no valid place to put a load in the predecessor. + if (Pred->getTerminator() == InVal) + return false; // If the predecessor has a single successor, then the edge isn't critical. if (Pred->getTerminator()->getNumSuccessors() == 1) continue; - - Value *InVal = PN->getIncomingValue(i); - - // If the InVal is an invoke in the pred, we can't put a load on the edge. - if (InvokeInst *II = dyn_cast<InvokeInst>(InVal)) - if (II->getParent() == Pred) - return false; // If this pointer is always safe to load, or if we can prove that there is // already a load in the block, then we can move the load to the pred block. @@ -1295,13 +1127,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { UI != UE; ++UI) { User *U = *UI; if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return false; continue; } if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - if (SI->getOperand(0) == AI || SI->isVolatile()) + if (SI->getOperand(0) == AI || !SI->isSimple()) return false; // Don't allow a store OF the AI, only INTO the AI. continue; } @@ -1343,6 +1175,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { continue; } + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { + if (onlyUsedByLifetimeMarkers(BCI)) { + InstsToRewrite.insert(BCI); + continue; + } + } + return false; } @@ -1354,6 +1193,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // If we have instructions that need to be rewritten for this to be promotable // take care of it now. for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { + if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) { + // This could only be a bitcast used by nothing but lifetime intrinsics. + for (BitCastInst::use_iterator I = BCI->use_begin(), E = BCI->use_end(); + I != E;) { + Use &U = I.getUse(); + ++I; + cast<Instruction>(U.getUser())->eraseFromParent(); + } + BCI->eraseFromParent(); + continue; + } + if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) { // Selects in InstsToRewrite only have load uses. Rewrite each as two // loads with a new select. 
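Note: the two new checks in isSafePHIToSpeculate share one observation. A condensed sketch, with a hypothetical helper name and this period's LLVM API assumed:

  #include "llvm/Instructions.h"
  using namespace llvm;

  // An invoke terminator leaves no insertion point inside Pred for a
  // speculated load, and an invoke's own result does not exist until the
  // terminator has run, so either condition forbids hoisting into Pred.
  static bool predBlocksLoadSpeculation(BasicBlock *Pred, Value *InVal) {
    TerminatorInst *TI = Pred->getTerminator();
    return TI->mayHaveSideEffects() || TI == InVal;
  }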
@@ -1393,7 +1244,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { continue; } - const Type *LoadTy = cast<PointerType>(PN->getType())->getElementType(); + Type *LoadTy = cast<PointerType>(PN->getType())->getElementType(); PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(), PN->getName()+".ld", PN); @@ -1483,13 +1334,13 @@ bool SROA::performPromotion(Function &F) { /// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for /// SROA. It must be a struct or array type with a small number of elements. static bool ShouldAttemptScalarRepl(AllocaInst *AI) { - const Type *T = AI->getAllocatedType(); + Type *T = AI->getAllocatedType(); // Do not promote any struct into more than 32 separate vars. - if (const StructType *ST = dyn_cast<StructType>(T)) + if (StructType *ST = dyn_cast<StructType>(T)) return ST->getNumElements() <= 32; // Arrays are much less likely to be safe for SROA; only consider // them if they are very small. - if (const ArrayType *AT = dyn_cast<ArrayType>(T)) + if (ArrayType *AT = dyn_cast<ArrayType>(T)) return AT->getNumElements() <= 8; return false; } @@ -1594,7 +1445,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI, std::vector<AllocaInst*> &WorkList) { DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n'); SmallVector<AllocaInst*, 32> ElementAllocas; - if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { + if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { ElementAllocas.reserve(ST->getNumContainedTypes()); for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) { AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0, @@ -1604,9 +1455,9 @@ void SROA::DoScalarReplacement(AllocaInst *AI, WorkList.push_back(NA); // Add to worklist for recursive processing } } else { - const ArrayType *AT = cast<ArrayType>(AI->getAllocatedType()); + ArrayType *AT = cast<ArrayType>(AI->getAllocatedType()); ElementAllocas.reserve(AT->getNumElements()); - const Type *ElTy = AT->getElementType(); + Type *ElTy = AT->getElementType(); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(), AI->getName() + "." 
+ Twine(i), AI); @@ -1670,22 +1521,26 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, UI.getOperandNo() == 0, Info, MI, true /*AllowWholeAccess*/); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return MarkUnsafe(Info, User); - const Type *LIType = LI->getType(); + Type *LIType = LI->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), LIType, false, Info, LI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer - if (SI->isVolatile() || SI->getOperand(0) == I) + if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); - const Type *SIType = SI->getOperand(0)->getType(); + Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return MarkUnsafe(Info, User); } else if (isa<PHINode>(User) || isa<SelectInst>(User)) { isSafePHISelectUseForScalarRepl(User, Offset, Info); } else { @@ -1725,19 +1580,19 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, return MarkUnsafe(Info, User); isSafePHISelectUseForScalarRepl(GEPI, Offset, Info); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return MarkUnsafe(Info, User); - const Type *LIType = LI->getType(); + Type *LIType = LI->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), LIType, false, Info, LI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer - if (SI->isVolatile() || SI->getOperand(0) == I) + if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); - const Type *SIType = SI->getOperand(0)->getType(); + Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; @@ -1776,8 +1631,7 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, // Compute the offset due to this GEP and check if the alloca has a // component element at that offset. SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); - Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), - &Indices[0], Indices.size()); + Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0)) MarkUnsafe(Info, GEPI); } @@ -1786,14 +1640,14 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, /// elements of the same type (which is always true for arrays). If so, /// return true with NumElts and EltTy set to the number of elements and the /// element type, respectively. -static bool isHomogeneousAggregate(const Type *T, unsigned &NumElts, - const Type *&EltTy) { - if (const ArrayType *AT = dyn_cast<ArrayType>(T)) { +static bool isHomogeneousAggregate(Type *T, unsigned &NumElts, + Type *&EltTy) { + if (ArrayType *AT = dyn_cast<ArrayType>(T)) { NumElts = AT->getNumElements(); EltTy = (NumElts == 0 ? 
0 : AT->getElementType()); return true; } - if (const StructType *ST = dyn_cast<StructType>(T)) { + if (StructType *ST = dyn_cast<StructType>(T)) { NumElts = ST->getNumContainedTypes(); EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0)); for (unsigned n = 1; n < NumElts; ++n) { @@ -1807,12 +1661,12 @@ static bool isHomogeneousAggregate(const Type *T, unsigned &NumElts, /// isCompatibleAggregate - Check if T1 and T2 are either the same type or are /// "homogeneous" aggregates with the same element type and number of elements. -static bool isCompatibleAggregate(const Type *T1, const Type *T2) { +static bool isCompatibleAggregate(Type *T1, Type *T2) { if (T1 == T2) return true; unsigned NumElts1, NumElts2; - const Type *EltTy1, *EltTy2; + Type *EltTy1, *EltTy2; if (isHomogeneousAggregate(T1, NumElts1, EltTy1) && isHomogeneousAggregate(T2, NumElts2, EltTy2) && NumElts1 == NumElts2 && @@ -1830,7 +1684,7 @@ static bool isCompatibleAggregate(const Type *T1, const Type *T2) { /// If AllowWholeAccess is true, then this allows uses of the entire alloca as a /// unit. If false, it only allows accesses known to be in a single element. void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, - const Type *MemOpType, bool isStore, + Type *MemOpType, bool isStore, AllocaInfo &Info, Instruction *TheAccess, bool AllowWholeAccess) { // Check if this is a load/store of the entire alloca. @@ -1857,7 +1711,7 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, } } // Check if the offset/size correspond to a component within the alloca type. - const Type *T = Info.AI->getAllocatedType(); + Type *T = Info.AI->getAllocatedType(); if (TypeHasComponent(T, Offset, MemSize)) { Info.hasSubelementAccess = true; return; @@ -1868,16 +1722,16 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, /// TypeHasComponent - Return true if T has a component type with the /// specified offset and size. If Size is zero, do not check the size. -bool SROA::TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size) { - const Type *EltTy; +bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { + Type *EltTy; uint64_t EltSize; - if (const StructType *ST = dyn_cast<StructType>(T)) { + if (StructType *ST = dyn_cast<StructType>(T)) { const StructLayout *Layout = TD->getStructLayout(ST); unsigned EltIdx = Layout->getElementContainingOffset(Offset); EltTy = ST->getContainedType(EltIdx); EltSize = TD->getTypeAllocSize(EltTy); Offset -= Layout->getElementOffset(EltIdx); - } else if (const ArrayType *AT = dyn_cast<ArrayType>(T)) { + } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { EltTy = AT->getElementType(); EltSize = TD->getTypeAllocSize(EltTy); if (Offset >= AT->getNumElements() * EltSize) @@ -1924,9 +1778,17 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, // address operand will be updated, so nothing else needs to be done. 
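Note: both the lifetime-marker whitelisting in tryToMakeAllocaBePromotable and isSafeForScalarRepl above and the rewrite dispatch below reduce to the same intrinsic test. A sketch (helper name ours, API of this period assumed):

  #include "llvm/IntrinsicInst.h"
  using namespace llvm;

  // Only the two lifetime markers are tolerated as intrinsic users of an
  // alloca being scalarized; anything else marks the alloca unsafe.
  static bool isLifetimeMarker(const IntrinsicInst *II) {
    return II->getIntrinsicID() == Intrinsic::lifetime_start ||
           II->getIntrinsicID() == Intrinsic::lifetime_end;
  }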
continue; } + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + RewriteLifetimeIntrinsic(II, AI, Offset, NewElts); + } + continue; + } if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - const Type *LIType = LI->getType(); + Type *LIType = LI->getType(); if (isCompatibleAggregate(LIType, AI->getAllocatedType())) { // Replace: @@ -1956,7 +1818,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, if (StoreInst *SI = dyn_cast<StoreInst>(User)) { Value *Val = SI->getOperand(0); - const Type *SIType = Val->getType(); + Type *SIType = Val->getType(); if (isCompatibleAggregate(SIType, AI->getAllocatedType())) { // Replace: // store { i32, i32 } %val, { i32, i32 }* %alloc @@ -2026,10 +1888,10 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, /// Sets T to the type of the element and Offset to the offset within that /// element. IdxTy is set to the type of the index result to be used in a /// GEP instruction. -uint64_t SROA::FindElementAndOffset(const Type *&T, uint64_t &Offset, - const Type *&IdxTy) { +uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, + Type *&IdxTy) { uint64_t Idx = 0; - if (const StructType *ST = dyn_cast<StructType>(T)) { + if (StructType *ST = dyn_cast<StructType>(T)) { const StructLayout *Layout = TD->getStructLayout(ST); Idx = Layout->getElementContainingOffset(Offset); T = ST->getContainedType(Idx); @@ -2037,7 +1899,7 @@ uint64_t SROA::FindElementAndOffset(const Type *&T, uint64_t &Offset, IdxTy = Type::getInt32Ty(T->getContext()); return Idx; } - const ArrayType *AT = cast<ArrayType>(T); + ArrayType *AT = cast<ArrayType>(T); T = AT->getElementType(); uint64_t EltSize = TD->getTypeAllocSize(T); Idx = Offset / EltSize; @@ -2053,13 +1915,12 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts) { uint64_t OldOffset = Offset; SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); - Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), - &Indices[0], Indices.size()); + Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); RewriteForScalarRepl(GEPI, AI, Offset, NewElts); - const Type *T = AI->getAllocatedType(); - const Type *IdxTy; + Type *T = AI->getAllocatedType(); + Type *IdxTy; uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy); if (GEPI->getOperand(0) == AI) OldIdx = ~0ULL; // Force the GEP to be rewritten. @@ -2073,7 +1934,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, if (Idx == OldIdx) return; - const Type *i32Ty = Type::getInt32Ty(AI->getContext()); + Type *i32Ty = Type::getInt32Ty(AI->getContext()); SmallVector<Value*, 8> NewArgs; NewArgs.push_back(Constant::getNullValue(i32Ty)); while (EltOffset != 0) { @@ -2082,8 +1943,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, } Instruction *Val = NewElts[Idx]; if (NewArgs.size() > 1) { - Val = GetElementPtrInst::CreateInBounds(Val, NewArgs.begin(), - NewArgs.end(), "", GEPI); + Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI); Val->takeName(GEPI); } if (Val->getType() != GEPI->getType()) @@ -2092,6 +1952,62 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, DeadInsts.push_back(GEPI); } +/// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. 
Rewrite it +/// to mark the lifetime of the scalarized memory. +void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, + uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts) { + ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0)); + // Put matching lifetime markers on everything from Offset up to + // Offset+OldSize. + Type *AIType = AI->getAllocatedType(); + uint64_t NewOffset = Offset; + Type *IdxTy; + uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy); + + IRBuilder<> Builder(II); + uint64_t Size = OldSize->getLimitedValue(); + + if (NewOffset) { + // Splice the first element and index 'NewOffset' bytes in. SROA will + // split the alloca again later. + Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy()); + V = Builder.CreateGEP(V, Builder.getInt64(NewOffset)); + + IdxTy = NewElts[Idx]->getAllocatedType(); + uint64_t EltSize = TD->getTypeAllocSize(IdxTy) - NewOffset; + if (EltSize > Size) { + EltSize = Size; + Size = 0; + } else { + Size -= EltSize; + } + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize)); + else + Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize)); + ++Idx; + } + + for (; Idx != NewElts.size() && Size; ++Idx) { + IdxTy = NewElts[Idx]->getAllocatedType(); + uint64_t EltSize = TD->getTypeAllocSize(IdxTy); + if (EltSize > Size) { + EltSize = Size; + Size = 0; + } else { + Size -= EltSize; + } + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Builder.CreateLifetimeStart(NewElts[Idx], + Builder.getInt64(EltSize)); + else + Builder.CreateLifetimeEnd(NewElts[Idx], + Builder.getInt64(EltSize)); + } + DeadInsts.push_back(II); +} + /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. /// Rewrite it to copy or set the elements of the scalarized memory. void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, @@ -2139,7 +2055,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // If the pointer is not the right type, insert a bitcast to the right // type. 
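Note: a worked example of the splitting loop in RewriteLifetimeIntrinsic above (element sizes assumed; the bitcasts to i8* that CreateLifetimeStart/End insert are elided):

  //   %a = alloca { i32, i32 }                      ; split into %a.0, %a.1
  //   call void @llvm.lifetime.start(i64 8, i8* %a) ; covers both words
  // is distributed one marker per element alloca, each piece clamped to
  // the size remaining from the original marker:
  //   call void @llvm.lifetime.start(i64 4, i8* %a.0)
  //   call void @llvm.lifetime.start(i64 4, i8* %a.1)
  // A marker starting at byte 4 instead takes the NewOffset path and
  // yields a single 4-byte marker on %a.1.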
- const Type *NewTy = + Type *NewTy = PointerType::get(AI->getType()->getElementType(), AddrSpace); if (OtherPtr->getType() != NewTy) @@ -2159,16 +2075,16 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, if (OtherPtr) { Value *Idx[2] = { Zero, ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) }; - OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, Idx + 2, + OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, OtherPtr->getName()+"."+Twine(i), MI); uint64_t EltOffset; - const PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); - const Type *OtherTy = OtherPtrTy->getElementType(); - if (const StructType *ST = dyn_cast<StructType>(OtherTy)) { + PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); + Type *OtherTy = OtherPtrTy->getElementType(); + if (StructType *ST = dyn_cast<StructType>(OtherTy)) { EltOffset = TD->getStructLayout(ST)->getElementOffset(i); } else { - const Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); + Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); EltOffset = TD->getTypeAllocSize(EltTy)*i; } @@ -2181,7 +2097,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, } Value *EltPtr = NewElts[i]; - const Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType(); + Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType(); // If we got down to a scalar, insert a load or store as appropriate. if (EltTy->isSingleValueType()) { @@ -2207,7 +2123,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0> } else { // If EltTy is a vector type, get the element type. - const Type *ValTy = EltTy->getScalarType(); + Type *ValTy = EltTy->getScalarType(); // Construct an integer with the right value. unsigned EltSize = TD->getTypeSizeInBits(ValTy); @@ -2228,8 +2144,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, assert(StoreVal->getType() == ValTy && "Type mismatch!"); // If the requested value was a vector constant, create it. - if (EltTy != ValTy) { - unsigned NumElts = cast<VectorType>(ValTy)->getNumElements(); + if (EltTy->isVectorTy()) { + unsigned NumElts = cast<VectorType>(EltTy)->getNumElements(); SmallVector<Constant*, 16> Elts(NumElts, StoreVal); StoreVal = ConstantVector::get(Elts); } @@ -2271,7 +2187,7 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // Extract each element out of the integer according to its structure offset // and store the element value to the individual alloca. Value *SrcVal = SI->getOperand(0); - const Type *AllocaEltTy = AI->getAllocatedType(); + Type *AllocaEltTy = AI->getAllocatedType(); uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); IRBuilder<> Builder(SI); @@ -2286,12 +2202,12 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. - if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { + if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { const StructLayout *Layout = TD->getStructLayout(EltSTy); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Get the number of bits to shift SrcVal to get the value. 
- const Type *FieldTy = EltSTy->getElementType(i); + Type *FieldTy = EltSTy->getElementType(i); uint64_t Shift = Layout->getElementOffsetInBits(i); if (TD->isBigEndian()) @@ -2327,8 +2243,8 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } } else { - const ArrayType *ATy = cast<ArrayType>(AllocaEltTy); - const Type *ArrayEltTy = ATy->getElementType(); + ArrayType *ATy = cast<ArrayType>(AllocaEltTy); + Type *ArrayEltTy = ATy->getElementType(); uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy); uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy); @@ -2384,7 +2300,7 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts) { // Extract each element out of the NewElts according to its structure offset // and form the result value. - const Type *AllocaEltTy = AI->getAllocatedType(); + Type *AllocaEltTy = AI->getAllocatedType(); uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI @@ -2394,10 +2310,10 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, // have different ways to compute the element offset. const StructLayout *Layout = 0; uint64_t ArrayEltBitOffset = 0; - if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { + if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { Layout = TD->getStructLayout(EltSTy); } else { - const Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType(); + Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType(); ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy); } @@ -2408,14 +2324,14 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, // Load the value from the alloca. If the NewElt is an aggregate, cast // the pointer to an integer of the same size before doing the load. Value *SrcField = NewElts[i]; - const Type *FieldTy = + Type *FieldTy = cast<PointerType>(SrcField->getType())->getElementType(); uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; - const IntegerType *FieldIntTy = IntegerType::get(LI->getContext(), + IntegerType *FieldIntTy = IntegerType::get(LI->getContext(), FieldSizeBits); if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() && !FieldTy->isVectorTy()) @@ -2468,14 +2384,14 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, /// HasPadding - Return true if the specified type has any structure or /// alignment padding in between the elements that would be split apart /// by SROA; return false otherwise. -static bool HasPadding(const Type *Ty, const TargetData &TD) { - if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { +static bool HasPadding(Type *Ty, const TargetData &TD) { + if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { Ty = ATy->getElementType(); return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty); } // SROA currently handles only Arrays and Structs. - const StructType *STy = cast<StructType>(Ty); + StructType *STy = cast<StructType>(Ty); const StructLayout *SL = TD.getStructLayout(STy); unsigned PrevFieldBitOffset = 0; for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { @@ -2530,7 +2446,7 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { // and fusion code. if (!Info.hasSubelementAccess && Info.hasALoadOrStore) { // If the struct/array just has one element, use basic SRoA. 
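Note: the shift bookkeeping in RewriteStoreUserOfWholeAlloca and RewriteLoadUserOfWholeAlloca above, instantiated on a small case (little-endian assumed; the shift amounts are mirrored on big-endian targets):

  //   %a = alloca { i32, i32 }             ; elements %a.0 and %a.1
  //   store i64 %v, i64* %p                ; %p is %a viewed as i64*
  // becomes
  //   %lo = trunc i64 %v to i32            ; field 0, shift amount 0
  //   %sh = lshr i64 %v, 32
  //   %hi = trunc i64 %sh to i32           ; field 1, shift amount 32
  //   store i32 %lo, i32* %a.0
  //   store i32 %hi, i32* %a.1
  // and the whole-alloca load runs the same arithmetic in reverse
  // (zext, shl, or) to reassemble the i64 from the element allocas.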
- if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { + if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { if (ST->getNumElements() > 1) return false; } else { if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1) @@ -2576,7 +2492,7 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, if (LoadInst *LI = dyn_cast<LoadInst>(U)) { // Ignore non-volatile loads, they are always ok. - if (LI->isVolatile()) return false; + if (!LI->isSimple()) return false; continue; } diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 7c415e5..fbb9465 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -134,7 +134,7 @@ namespace { struct StrCatOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strcat" function prototype. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || @@ -184,7 +184,7 @@ struct StrCatOpt : public LibCallOptimization { struct StrNCatOpt : public StrCatOpt { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strncat" function prototype. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || @@ -232,7 +232,7 @@ struct StrNCatOpt : public StrCatOpt { struct StrChrOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strchr" function prototype. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || @@ -282,7 +282,7 @@ struct StrChrOpt : public LibCallOptimization { struct StrRChrOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strrchr" function prototype. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || @@ -323,7 +323,7 @@ struct StrRChrOpt : public LibCallOptimization { struct StrCmpOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strcmp" function prototype. 
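Note: the StrCmpOpt hunk just below reorders the foldings and fixes a sign bug: strcmp("", x) had been folded to *x, but the result must be negative whenever the first string orders before the second. A host-side check of the intended semantics (illustrative only, standard C library):

  #include <cassert>
  #include <cstring>

  int main() {
    // "" orders before "a", so folding strcmp("", x) must produce a
    // negative value, i.e. -*x rather than the old *x.
    assert(strcmp("", "a") < 0);
    assert(strcmp("a", "") > 0);
    return 0;
  }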
- const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || @@ -338,16 +338,17 @@ struct StrCmpOpt : public LibCallOptimization { bool HasStr1 = GetConstantStringInfo(Str1P, Str1); bool HasStr2 = GetConstantStringInfo(Str2P, Str2); - if (HasStr1 && Str1.empty()) // strcmp("", x) -> *x - return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()); - - if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x - return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - // strcmp(x, y) -> cnst (if both x and y are constant strings) if (HasStr1 && HasStr2) return ConstantInt::get(CI->getType(), - strcmp(Str1.c_str(),Str2.c_str())); + StringRef(Str1).compare(Str2)); + + if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); + + if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x + return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); // strcmp(P, "x") -> memcmp(P, "x", 2) uint64_t Len1 = GetStringLength(Str1P); @@ -371,7 +372,7 @@ struct StrCmpOpt : public LibCallOptimization { struct StrNCmpOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strncmp" function prototype. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || @@ -400,16 +401,20 @@ struct StrNCmpOpt : public LibCallOptimization { bool HasStr1 = GetConstantStringInfo(Str1P, Str1); bool HasStr2 = GetConstantStringInfo(Str2P, Str2); - if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> *x - return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()); + // strncmp(x, y) -> cnst (if both x and y are constant strings) + if (HasStr1 && HasStr2) { + StringRef SubStr1 = StringRef(Str1).substr(0, Length); + StringRef SubStr2 = StringRef(Str2).substr(0, Length); + return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); + } + + if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - // strncmp(x, y) -> cnst (if both x and y are constant strings) - if (HasStr1 && HasStr2) - return ConstantInt::get(CI->getType(), - strncmp(Str1.c_str(), Str2.c_str(), Length)); return 0; } }; @@ -426,7 +431,7 @@ struct StrCpyOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strcpy" function prototype. unsigned NumParams = OptChkCall ? 
3 : 2; - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != NumParams || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || @@ -462,7 +467,7 @@ struct StrCpyOpt : public LibCallOptimization { struct StrNCpyOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy() || @@ -511,7 +516,7 @@ struct StrNCpyOpt : public LibCallOptimization { struct StrLenOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 1 || FT->getParamType(0) != B.getInt8PtrTy() || !FT->getReturnType()->isIntegerTy()) @@ -537,7 +542,7 @@ struct StrLenOpt : public LibCallOptimization { struct StrPBrkOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || @@ -575,7 +580,7 @@ struct StrPBrkOpt : public LibCallOptimization { struct StrToOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy()) @@ -597,7 +602,7 @@ struct StrToOpt : public LibCallOptimization { struct StrSpnOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || @@ -626,7 +631,7 @@ struct StrSpnOpt : public LibCallOptimization { struct StrCSpnOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || @@ -658,7 +663,7 @@ struct StrCSpnOpt : public LibCallOptimization { struct StrStrOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || @@ -722,7 +727,7 @@ struct StrStrOpt : public LibCallOptimization { struct MemCmpOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || 
!FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy(32)) @@ -773,7 +778,7 @@ struct MemCpyOpt : public LibCallOptimization { // These optimizations require TargetData. if (!TD) return 0; - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || @@ -795,7 +800,7 @@ struct MemMoveOpt : public LibCallOptimization { // These optimizations require TargetData. if (!TD) return 0; - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || @@ -817,7 +822,7 @@ struct MemSetOpt : public LibCallOptimization { // These optimizations require TargetData. if (!TD) return 0; - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || @@ -840,7 +845,7 @@ struct MemSetOpt : public LibCallOptimization { struct PowOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 2 arguments of the same FP type, which match the // result type. if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || @@ -874,8 +879,8 @@ struct PowOpt : public LibCallOptimization { Callee->getAttributes()); Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes()); - Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf, "tmp"); - Value *Sel = B.CreateSelect(FCmp, Inf, FAbs, "tmp"); + Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf); + Value *Sel = B.CreateSelect(FCmp, Inf, FAbs); return Sel; } @@ -895,7 +900,7 @@ struct PowOpt : public LibCallOptimization { struct Exp2Opt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the // result type. 
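Note: the select retained in the PowOpt hunk above exists because pow(x, 0.5) is not plain sqrt(x) under IEEE rules. A quick host-side check of the corner case being protected (illustrative only, C99 semantics assumed):

  #include <cassert>
  #include <cmath>
  #include <limits>

  int main() {
    double NegInf = -std::numeric_limits<double>::infinity();
    // pow(-inf, 0.5) is +inf, while sqrt(-inf) is NaN; hence the
    // expansion (x == -inf) ? +inf : fabs(sqrt(x)).
    assert(std::pow(NegInf, 0.5) ==
           std::numeric_limits<double>::infinity());
    double S = std::sqrt(NegInf);
    assert(S != S); // NaN is the only value unequal to itself
    return 0;
  }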
if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || @@ -908,10 +913,10 @@ struct Exp2Opt : public LibCallOptimization { Value *LdExpArg = 0; if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) - LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); + LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) - LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); + LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); } if (LdExpArg) { @@ -946,7 +951,7 @@ struct Exp2Opt : public LibCallOptimization { struct UnaryDoubleFPOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || !FT->getParamType(0)->isDoubleTy()) return 0; @@ -973,7 +978,7 @@ struct UnaryDoubleFPOpt : public LibCallOptimization { struct FFSOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 integer argument, which matches the // result type. if (FT->getNumParams() != 1 || @@ -996,10 +1001,10 @@ struct FFSOpt : public LibCallOptimization { Value *F = Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType); Value *V = B.CreateCall(F, Op, "cttz"); - V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1), "tmp"); - V = B.CreateIntCast(V, B.getInt32Ty(), false, "tmp"); + V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); + V = B.CreateIntCast(V, B.getInt32Ty(), false); - Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp"); + Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType)); return B.CreateSelect(Cond, V, B.getInt32(0)); } }; @@ -1009,7 +1014,7 @@ struct IsDigitOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // We require integer(i32) if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || !FT->getParamType(0)->isIntegerTy(32)) return 0; @@ -1028,7 +1033,7 @@ struct IsAsciiOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // We require integer(i32) if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || !FT->getParamType(0)->isIntegerTy(32)) return 0; @@ -1046,7 +1051,7 @@ struct AbsOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // We require integer(integer) where the types agree. 
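Note: the FFSOpt hunk above lowers ffs(x) to (x == 0) ? 0 : cttz(x) + 1. The identity is easy to sanity-check on the host (illustrative only, GCC/Clang builtins assumed):

  #include <cassert>

  int main() {
    // ffs is 1-based: ffs(8) == 4, while cttz(8) == 3.
    for (unsigned x = 1; x < 256; ++x)
      assert(__builtin_ffs(x) == __builtin_ctz(x) + 1);
    assert(__builtin_ffs(0) == 0); // the select supplies the zero case
    return 0;
  }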
if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || FT->getParamType(0) != FT->getReturnType()) return 0; @@ -1067,7 +1072,7 @@ struct ToAsciiOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // We require i32(i32) if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isIntegerTy(32)) return 0; @@ -1147,7 +1152,7 @@ struct PrintFOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Require one fixed pointer argument and an integer/void result. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy())) return 0; @@ -1241,7 +1246,7 @@ struct SPrintFOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Require two fixed pointer arguments and an integer result. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) return 0; @@ -1272,7 +1277,7 @@ struct FWriteOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Require a pointer, an integer, an integer, a pointer, returning integer. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || !FT->getParamType(2)->isIntegerTy() || @@ -1310,7 +1315,7 @@ struct FPutsOpt : public LibCallOptimization { if (!TD) return 0; // Require two pointers. Also, we can't optimize if return value is used. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !CI->use_empty()) return 0; @@ -1379,7 +1384,7 @@ struct FPrintFOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Require two fixed parameters as pointers and integer result. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) return 0; @@ -1410,7 +1415,7 @@ struct PutsOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Require one fixed pointer argument and an integer/void result. 
- const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy())) @@ -1685,7 +1690,7 @@ void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) { void SimplifyLibCalls::inferPrototypeAttributes(Function &F) { - const FunctionType *FTy = F.getFunctionType(); + FunctionType *FTy = F.getFunctionType(); StringRef Name = F.getName(); switch (Name[0]) { diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index 705f442..c83f56c 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -153,9 +153,13 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, SmallPtrSet<Instruction *, 8> &Stores) { - if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { - if (L->isVolatile()) return false; + if (Inst->mayWriteToMemory()) { + Stores.insert(Inst); + return false; + } + + if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { AliasAnalysis::Location Loc = AA->getLocation(L); for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(), E = Stores.end(); I != E; ++I) @@ -163,11 +167,6 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, return false; } - if (Inst->mayWriteToMemory()) { - Stores.insert(Inst); - return false; - } - if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst)) return false; diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp deleted file mode 100644 index 9dd83c0..0000000 --- a/lib/Transforms/Scalar/TailDuplication.cpp +++ /dev/null @@ -1,373 +0,0 @@ -//===- TailDuplication.cpp - Simplify CFG through tail duplication --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass performs a limited form of tail duplication, intended to simplify -// CFGs by removing some unconditional branches. This pass is necessary to -// straighten out loops created by the C front-end, but also is capable of -// making other code nicer. After this pass is run, the CFG simplify pass -// should be run to clean up the mess. -// -// This pass could be enhanced in the future to use profile information to be -// more aggressive. 
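Note: on the Sink.cpp hunk above, the reorder leans on an assumption about the Instruction API of this period: a volatile (or otherwise non-simple) load reports mayWriteToMemory(), so the hoisted test both records it in Stores and rejects moving it, subsuming the dropped isVolatile() early-out. A sketch of that assumption:

  #include "llvm/Instructions.h"
  using namespace llvm;

  // A load that may "write" (volatile, and under the new atomics work
  // also atomic) is caught by the hoisted mayWriteToMemory() branch,
  // which is why the explicit isVolatile() check on loads could go.
  static bool loadHandledAsStore(const LoadInst *LI) {
    return LI->mayWriteToMemory();
  }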
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "tailduplicate" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Constant.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Pass.h" -#include "llvm/Type.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Local.h" -#include <map> -using namespace llvm; - -STATISTIC(NumEliminated, "Number of unconditional branches eliminated"); - -static cl::opt<unsigned> -TailDupThreshold("taildup-threshold", - cl::desc("Max block size to tail duplicate"), - cl::init(1), cl::Hidden); - -namespace { - class TailDup : public FunctionPass { - bool runOnFunction(Function &F); - public: - static char ID; // Pass identification, replacement for typeid - TailDup() : FunctionPass(ID) { - initializeTailDupPass(*PassRegistry::getPassRegistry()); - } - - private: - inline bool shouldEliminateUnconditionalBranch(TerminatorInst *, unsigned); - inline void eliminateUnconditionalBranch(BranchInst *BI); - SmallPtrSet<BasicBlock*, 4> CycleDetector; - }; -} - -char TailDup::ID = 0; -INITIALIZE_PASS(TailDup, "tailduplicate", "Tail Duplication", false, false) - -// Public interface to the Tail Duplication pass -FunctionPass *llvm::createTailDuplicationPass() { return new TailDup(); } - -/// runOnFunction - Top level algorithm - Loop over each unconditional branch in -/// the function, eliminating it if it looks attractive enough. CycleDetector -/// prevents infinite loops by checking that we aren't redirecting a branch to -/// a place it already pointed to earlier; see PR 2323. -bool TailDup::runOnFunction(Function &F) { - bool Changed = false; - CycleDetector.clear(); - for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { - if (shouldEliminateUnconditionalBranch(I->getTerminator(), - TailDupThreshold)) { - eliminateUnconditionalBranch(cast<BranchInst>(I->getTerminator())); - Changed = true; - } else { - ++I; - CycleDetector.clear(); - } - } - return Changed; -} - -/// shouldEliminateUnconditionalBranch - Return true if this branch looks -/// attractive to eliminate. We eliminate the branch if the destination basic -/// block has <= 5 instructions in it, not counting PHI nodes. In practice, -/// since one of these is a terminator instruction, this means that we will add -/// up to 4 instructions to the new block. -/// -/// We don't count PHI nodes in the count since they will be removed when the -/// contents of the block are copied over. -/// -bool TailDup::shouldEliminateUnconditionalBranch(TerminatorInst *TI, - unsigned Threshold) { - BranchInst *BI = dyn_cast<BranchInst>(TI); - if (!BI || !BI->isUnconditional()) return false; // Not an uncond branch! - - BasicBlock *Dest = BI->getSuccessor(0); - if (Dest == BI->getParent()) return false; // Do not loop infinitely! - - // Do not inline a block if we will just get another branch to the same block! - TerminatorInst *DTI = Dest->getTerminator(); - if (BranchInst *DBI = dyn_cast<BranchInst>(DTI)) - if (DBI->isUnconditional() && DBI->getSuccessor(0) == Dest) - return false; // Do not loop infinitely! - - // FIXME: DemoteRegToStack cannot yet demote invoke instructions to the stack, - // because doing so would require breaking critical edges. 
This should be - // fixed eventually. - if (!DTI->use_empty()) - return false; - - // Do not bother with blocks with only a single predecessor: simplify - // CFG will fold these two blocks together! - pred_iterator PI = pred_begin(Dest), PE = pred_end(Dest); - ++PI; - if (PI == PE) return false; // Exactly one predecessor! - - BasicBlock::iterator I = Dest->getFirstNonPHI(); - - for (unsigned Size = 0; I != Dest->end(); ++I) { - if (Size == Threshold) return false; // The block is too large. - - // Don't tail duplicate call instructions. They are very large compared to - // other instructions. - if (isa<CallInst>(I) || isa<InvokeInst>(I)) return false; - - // Also alloca and malloc. - if (isa<AllocaInst>(I)) return false; - - // Some vector instructions can expand into a number of instructions. - if (isa<ShuffleVectorInst>(I) || isa<ExtractElementInst>(I) || - isa<InsertElementInst>(I)) return false; - - // Only count instructions that are not debugger intrinsics. - if (!isa<DbgInfoIntrinsic>(I)) ++Size; - } - - // Do not tail duplicate a block that has thousands of successors into a block - // with a single successor if the block has many other predecessors. This can - // cause an N^2 explosion in CFG edges (and PHI node entries), as seen in - // cases that have a large number of indirect gotos. - unsigned NumSuccs = DTI->getNumSuccessors(); - if (NumSuccs > 8) { - unsigned TooMany = 128; - if (NumSuccs >= TooMany) return false; - TooMany = TooMany/NumSuccs; - for (; PI != PE; ++PI) - if (TooMany-- == 0) return false; - } - - // If this unconditional branch is a fall-through, be careful about - // tail duplicating it. In particular, we don't want to taildup it if the - // original block will still be there after taildup is completed: doing so - // would eliminate the fall-through, requiring unconditional branches. - Function::iterator DestI = Dest; - if (&*--DestI == BI->getParent()) { - // The uncond branch is a fall-through. Tail duplication of the block - // will eliminate the fall-through-ness and end up cloning the terminator - // at the end of the Dest block. Since the original Dest block will - // continue to exist, this means that one or the other will not be able to - // fall through. One typical example that this helps with is code like: - // if (a) - // foo(); - // if (b) - // foo(); - // Cloning the 'if b' block into the end of the first foo block is messy. - - // The messy case is when the fall-through block falls through to other - // blocks. This is what we would be preventing if we cloned the block. - DestI = Dest; - if (++DestI != Dest->getParent()->end()) { - BasicBlock *DestSucc = DestI; - // If any of Dest's successors are fall-throughs, don't do this xform. - for (succ_iterator SI = succ_begin(Dest), SE = succ_end(Dest); - SI != SE; ++SI) - if (*SI == DestSucc) - return false; - } - } - - // Finally, check that we haven't redirected to this target block earlier; - // there are cases where we loop forever if we don't check this (PR 2323). - if (!CycleDetector.insert(Dest)) - return false; - - return true; -} - -/// FindObviousSharedDomOf - We know there is a branch from SrcBlock to -/// DestBlock, and that SrcBlock is not the only predecessor of DstBlock. If we -/// can find a predecessor of SrcBlock that is a dominator of both SrcBlock and -/// DstBlock, return it. -static BasicBlock *FindObviousSharedDomOf(BasicBlock *SrcBlock, - BasicBlock *DstBlock) { - // SrcBlock must have a single predecessor. 
- pred_iterator PI = pred_begin(SrcBlock), PE = pred_end(SrcBlock); - if (PI == PE || ++PI != PE) return 0; - - BasicBlock *SrcPred = *pred_begin(SrcBlock); - - // Look at the predecessors of DstBlock. One of them will be SrcBlock. If - // there is only one other pred, get it, otherwise we can't handle it. - PI = pred_begin(DstBlock); PE = pred_end(DstBlock); - BasicBlock *DstOtherPred = 0; - BasicBlock *P = *PI; - if (P == SrcBlock) { - if (++PI == PE) return 0; - DstOtherPred = *PI; - if (++PI != PE) return 0; - } else { - DstOtherPred = P; - if (++PI == PE || *PI != SrcBlock || ++PI != PE) return 0; - } - - // We can handle two situations here: "if then" and "if then else" blocks. An - // 'if then' situation is just where DstOtherPred == SrcPred. - if (DstOtherPred == SrcPred) - return SrcPred; - - // Check to see if we have an "if then else" situation, which means that - // DstOtherPred will have a single predecessor and it will be SrcPred. - PI = pred_begin(DstOtherPred); PE = pred_end(DstOtherPred); - if (PI != PE && *PI == SrcPred) { - if (++PI != PE) return 0; // Not a single pred. - return SrcPred; // Otherwise, it's an "if then" situation. Return the if. - } - - // Otherwise, this is something we can't handle. - return 0; -} - - -/// eliminateUnconditionalBranch - Clone the instructions from the destination -/// block into the source block, eliminating the specified unconditional branch. -/// If the destination block defines values used by successors of the dest -/// block, we may need to insert PHI nodes. -/// -void TailDup::eliminateUnconditionalBranch(BranchInst *Branch) { - BasicBlock *SourceBlock = Branch->getParent(); - BasicBlock *DestBlock = Branch->getSuccessor(0); - assert(SourceBlock != DestBlock && "Our predicate is broken!"); - - DEBUG(dbgs() << "TailDuplication[" << SourceBlock->getParent()->getName() - << "]: Eliminating branch: " << *Branch); - - // See if we can avoid duplicating code by moving it up to a dominator of both - // blocks. - if (BasicBlock *DomBlock = FindObviousSharedDomOf(SourceBlock, DestBlock)) { - DEBUG(dbgs() << "Found shared dominator: " << DomBlock->getName() << "\n"); - - // If there are non-phi instructions in DestBlock that have no operands - // defined in DestBlock, and if the instruction has no side effects, we can - // move the instruction to DomBlock instead of duplicating it. - BasicBlock::iterator BBI = DestBlock->getFirstNonPHI(); - while (!isa<TerminatorInst>(BBI)) { - Instruction *I = BBI++; - - bool CanHoist = I->isSafeToSpeculativelyExecute() && - !I->mayReadFromMemory(); - if (CanHoist) { - for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) - if (Instruction *OpI = dyn_cast<Instruction>(I->getOperand(op))) - if (OpI->getParent() == DestBlock || - (isa<InvokeInst>(OpI) && OpI->getParent() == DomBlock)) { - CanHoist = false; - break; - } - if (CanHoist) { - // Remove from DestBlock, move right before the term in DomBlock. - DestBlock->getInstList().remove(I); - DomBlock->getInstList().insert(DomBlock->getTerminator(), I); - DEBUG(dbgs() << "Hoisted: " << *I); - } - } - } - } - - // Tail duplication can not update SSA properties correctly if the values - // defined in the duplicated tail are used outside of the tail itself. For - // this reason, we spill all values that are used outside of the tail to the - // stack. - for (BasicBlock::iterator I = DestBlock->begin(); I != DestBlock->end(); ++I) - if (I->isUsedOutsideOfBlock(DestBlock)) { - // We found a use outside of the tail. 
Create a new stack slot to - // break this inter-block usage pattern. - DemoteRegToStack(*I); - } - - // We are going to have to map operands from the original block B to the new - // copy of the block B'. If there are PHI nodes in the DestBlock, these PHI - // nodes also define part of this mapping. Loop over these PHI nodes, adding - // them to our mapping. - // - std::map<Value*, Value*> ValueMapping; - - BasicBlock::iterator BI = DestBlock->begin(); - bool HadPHINodes = isa<PHINode>(BI); - for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - ValueMapping[PN] = PN->getIncomingValueForBlock(SourceBlock); - - // Clone the non-phi instructions of the dest block into the source block, - // keeping track of the mapping... - // - for (; BI != DestBlock->end(); ++BI) { - Instruction *New = BI->clone(); - New->setName(BI->getName()); - SourceBlock->getInstList().push_back(New); - ValueMapping[BI] = New; - } - - // Now that we have built the mapping information and cloned all of the - // instructions (giving us a new terminator, among other things), walk the new - // instructions, rewriting references of old instructions to use new - // instructions. - // - BI = Branch; ++BI; // Get an iterator to the first new instruction - for (; BI != SourceBlock->end(); ++BI) - for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { - std::map<Value*, Value*>::const_iterator I = - ValueMapping.find(BI->getOperand(i)); - if (I != ValueMapping.end()) - BI->setOperand(i, I->second); - } - - // Next we check to see if any of the successors of DestBlock had PHI nodes. - // If so, we need to add entries to the PHI nodes for SourceBlock now. - for (succ_iterator SI = succ_begin(DestBlock), SE = succ_end(DestBlock); - SI != SE; ++SI) { - BasicBlock *Succ = *SI; - for (BasicBlock::iterator PNI = Succ->begin(); isa<PHINode>(PNI); ++PNI) { - PHINode *PN = cast<PHINode>(PNI); - // Ok, we have a PHI node. Figure out what the incoming value was for the - // DestBlock. - Value *IV = PN->getIncomingValueForBlock(DestBlock); - - // Remap the value if necessary... - std::map<Value*, Value*>::const_iterator I = ValueMapping.find(IV); - if (I != ValueMapping.end()) - IV = I->second; - PN->addIncoming(IV, SourceBlock); - } - } - - // Next, remove the old branch instruction, and any PHI node entries that we - // had. - BI = Branch; ++BI; // Get an iterator to the first new instruction - DestBlock->removePredecessor(SourceBlock); // Remove entries in PHI nodes... - SourceBlock->getInstList().erase(Branch); // Destroy the uncond branch... - - // Final step: now that we have finished everything up, walk the cloned - // instructions one last time, constant propagating and DCE'ing them, because - // they may not be needed anymore. - // - if (HadPHINodes) { - while (BI != SourceBlock->end()) { - Instruction *Inst = BI++; - if (isInstructionTriviallyDead(Inst)) - Inst->eraseFromParent(); - else if (Value *V = SimplifyInstruction(Inst)) { - Inst->replaceAllUsesWith(V); - Inst->eraseFromParent(); - } - } - } - - ++NumEliminated; // We just killed a branch! 
-} diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp index be7bed1..8e5a1eb 100644 --- a/lib/Transforms/Utils/AddrModeMatcher.cpp +++ b/lib/Transforms/Utils/AddrModeMatcher.cpp @@ -222,7 +222,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, const TargetData *TD = TLI.getTargetData(); gep_type_iterator GTI = gep_type_begin(AddrInst); for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) { - if (const StructType *STy = dyn_cast<StructType>(*GTI)) { + if (StructType *STy = dyn_cast<StructType>(*GTI)) { const StructLayout *SL = TD->getStructLayout(STy); unsigned Idx = cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); @@ -557,7 +557,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, Value *Address = User->getOperand(OpNo); if (!Address->getType()->isPointerTy()) return false; - const Type *AddressAccessTy = + Type *AddressAccessTy = cast<PointerType>(Address->getType())->getElementType(); // Do a match against the root of this address, ignoring profitability. This diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index b4f74f9..a7f9efd 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -287,7 +287,7 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { /// BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) { BasicBlock::iterator SplitIt = SplitPt; - while (isa<PHINode>(SplitIt)) + while (isa<PHINode>(SplitIt) || isa<LandingPadInst>(SplitIt)) ++SplitIt; BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split"); @@ -299,138 +299,114 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) { if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) { // Old dominates New. New node dominates all other nodes dominated by Old. - DomTreeNode *OldNode = DT->getNode(Old); - std::vector<DomTreeNode *> Children; - for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end(); - I != E; ++I) - Children.push_back(*I); + if (DomTreeNode *OldNode = DT->getNode(Old)) { + std::vector<DomTreeNode *> Children; + for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end(); + I != E; ++I) + Children.push_back(*I); DomTreeNode *NewNode = DT->addNewBlock(New,Old); for (std::vector<DomTreeNode *>::iterator I = Children.begin(), E = Children.end(); I != E; ++I) DT->changeImmediateDominator(*I, NewNode); + } } return New; } +/// UpdateAnalysisInformation - Update DominatorTree, LoopInfo, and LCSSA +/// analysis information. +static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB, + ArrayRef<BasicBlock *> Preds, + Pass *P, bool &HasLoopExit) { + if (!P) return; -/// SplitBlockPredecessors - This method transforms BB by introducing a new -/// basic block into the function, and moving some of the predecessors of BB to -/// be predecessors of the new block. The new predecessors are indicated by the -/// Preds array, which has NumPreds elements in it. The new block is given a -/// suffix of 'Suffix'. -/// -/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree, -/// LoopInfo, and LCSSA but no other analyses. In particular, it does not -/// preserve LoopSimplify (because it's complicated to handle the case where one -/// of the edges being split is an exit of a loop with other exits).
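The hunk above begins replacing SplitBlockPredecessors' inline bookkeeping with the UpdateAnalysisInformation and UpdatePHINodes helpers while leaving the public signature untouched; the old definition is deleted here and re-added further down in terms of those helpers. A minimal caller-side sketch of that signature, assuming a pass context and an illustrative block BB (neither name is from this patch):

    // Route every current predecessor of BB through a fresh forwarding block.
    SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
    BasicBlock *Fwd = SplitBlockPredecessors(BB, &Preds[0], Preds.size(),
                                             ".split", this);
    // Fwd now ends in an unconditional branch to BB, and BB's PHI nodes have
    // been rewritten so the moved edges flow in through Fwd.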
-/// -BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, - BasicBlock *const *Preds, - unsigned NumPreds, const char *Suffix, - Pass *P) { - // Create new basic block, insert right before the original block. - BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix, - BB->getParent(), BB); - - // The new block unconditionally branches to the old block. - BranchInst *BI = BranchInst::Create(BB, NewBB); - - LoopInfo *LI = P ? P->getAnalysisIfAvailable<LoopInfo>() : 0; - Loop *L = LI ? LI->getLoopFor(BB) : 0; - bool PreserveLCSSA = P->mustPreserveAnalysisID(LCSSAID); + LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>(); + Loop *L = LI ? LI->getLoopFor(OldBB) : 0; - // Move the edges from Preds to point to NewBB instead of BB. - // While here, if we need to preserve loop analyses, collect - // some information about how this split will affect loops. - bool HasLoopExit = false; + // If we need to preserve loop analyses, collect some information about how + // this split will affect loops. bool IsLoopEntry = !!L; bool SplitMakesNewLoopHeader = false; - for (unsigned i = 0; i != NumPreds; ++i) { - // This is slightly more strict than necessary; the minimum requirement - // is that there be no more than one indirectbr branching to BB. And - // all BlockAddress uses would need to be updated. - assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) && - "Cannot split an edge from an IndirectBrInst"); - - Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB); - - if (LI) { - // If we need to preserve LCSSA, determine if any of - // the preds is a loop exit. + if (LI) { + bool PreserveLCSSA = P->mustPreserveAnalysisID(LCSSAID); + for (ArrayRef<BasicBlock*>::iterator + i = Preds.begin(), e = Preds.end(); i != e; ++i) { + BasicBlock *Pred = *i; + + // If we need to preserve LCSSA, determine if any of the preds is a loop + // exit. if (PreserveLCSSA) - if (Loop *PL = LI->getLoopFor(Preds[i])) - if (!PL->contains(BB)) + if (Loop *PL = LI->getLoopFor(Pred)) + if (!PL->contains(OldBB)) HasLoopExit = true; - // If we need to preserve LoopInfo, note whether any of the - // preds crosses an interesting loop boundary. - if (L) { - if (L->contains(Preds[i])) - IsLoopEntry = false; - else - SplitMakesNewLoopHeader = true; - } + + // If we need to preserve LoopInfo, note whether any of the preds crosses + // an interesting loop boundary. + if (!L) continue; + if (L->contains(Pred)) + IsLoopEntry = false; + else + SplitMakesNewLoopHeader = true; } } // Update dominator tree if available. - DominatorTree *DT = P ? P->getAnalysisIfAvailable<DominatorTree>() : 0; + DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>(); if (DT) DT->splitBlock(NewBB); - // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI - // node becomes an incoming value for BB's phi node. However, if the Preds - // list is empty, we need to insert dummy entries into the PHI nodes in BB to - // account for the newly created predecessor. - if (NumPreds == 0) { - // Insert dummy values as the incoming value. - for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I) - cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB); - return NewBB; + if (!L) return; + + if (IsLoopEntry) { + // Add the new block to the nearest enclosing loop (and not an adjacent + // loop). To find this, examine each of the predecessors and determine which + // loops enclose them, and select the most-nested loop which contains the + // loop containing the block being split. 
+ Loop *InnermostPredLoop = 0; + for (ArrayRef<BasicBlock*>::iterator + i = Preds.begin(), e = Preds.end(); i != e; ++i) { + BasicBlock *Pred = *i; + if (Loop *PredLoop = LI->getLoopFor(Pred)) { + // Seek a loop which actually contains the block being split (to avoid + // adjacent loops). + while (PredLoop && !PredLoop->contains(OldBB)) + PredLoop = PredLoop->getParentLoop(); + + // Select the most-nested of these loops which contains the block. + if (PredLoop && PredLoop->contains(OldBB) && + (!InnermostPredLoop || + InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth())) + InnermostPredLoop = PredLoop; + } + } + + if (InnermostPredLoop) + InnermostPredLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + } else { + L->addBasicBlockToLoop(NewBB, LI->getBase()); + if (SplitMakesNewLoopHeader) + L->moveToHeader(NewBB); } +} +/// UpdatePHINodes - Update the PHI nodes in OrigBB to include the values coming +/// from NewBB. This also updates AliasAnalysis, if available. +static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, + ArrayRef<BasicBlock*> Preds, BranchInst *BI, + Pass *P, bool HasLoopExit) { + // Create a new PHI node in NewBB for each PHI node in OrigBB. AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : 0; - - if (L) { - if (IsLoopEntry) { - // Add the new block to the nearest enclosing loop (and not an - // adjacent loop). To find this, examine each of the predecessors and - // determine which loops enclose them, and select the most-nested loop - // which contains the loop containing the block being split. - Loop *InnermostPredLoop = 0; - for (unsigned i = 0; i != NumPreds; ++i) - if (Loop *PredLoop = LI->getLoopFor(Preds[i])) { - // Seek a loop which actually contains the block being split (to - // avoid adjacent loops). - while (PredLoop && !PredLoop->contains(BB)) - PredLoop = PredLoop->getParentLoop(); - // Select the most-nested of these loops which contains the block. - if (PredLoop && - PredLoop->contains(BB) && - (!InnermostPredLoop || - InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth())) - InnermostPredLoop = PredLoop; - } - if (InnermostPredLoop) - InnermostPredLoop->addBasicBlockToLoop(NewBB, LI->getBase()); - } else { - L->addBasicBlockToLoop(NewBB, LI->getBase()); - if (SplitMakesNewLoopHeader) - L->moveToHeader(NewBB); - } - } - - // Otherwise, create a new PHI node in NewBB for each PHI node in BB. - for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ) { + for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) { PHINode *PN = cast<PHINode>(I++); - + // Check to see if all of the values coming in are the same. If so, we // don't need to create a new PHI node, unless it's needed for LCSSA. Value *InVal = 0; if (!HasLoopExit) { InVal = PN->getIncomingValueForBlock(Preds[0]); - for (unsigned i = 1; i != NumPreds; ++i) + for (unsigned i = 1, e = Preds.size(); i != e; ++i) if (InVal != PN->getIncomingValueForBlock(Preds[i])) { InVal = 0; break; @@ -441,31 +417,191 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, // If all incoming values for the new PHI would be the same, just don't // make a new PHI. Instead, just remove the incoming values from the old // PHI. - for (unsigned i = 0; i != NumPreds; ++i) + for (unsigned i = 0, e = Preds.size(); i != e; ++i) PN->removeIncomingValue(Preds[i], false); } else { // If the values coming into the block are not the same, we need a PHI.
// Create the new PHI node, insert it into NewBB at the end of the block PHINode *NewPHI = - PHINode::Create(PN->getType(), NumPreds, PN->getName()+".ph", BI); + PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI); if (AA) AA->copyValue(PN, NewPHI); // Move all of the PHI values for 'Preds' to the new PHI. - for (unsigned i = 0; i != NumPreds; ++i) { + for (unsigned i = 0, e = Preds.size(); i != e; ++i) { Value *V = PN->removeIncomingValue(Preds[i], false); NewPHI->addIncoming(V, Preds[i]); } + InVal = NewPHI; } - + // Add an incoming value to the PHI node in the loop for the preheader // edge. PN->addIncoming(InVal, NewBB); } +} + +/// SplitBlockPredecessors - This method transforms BB by introducing a new +/// basic block into the function, and moving some of the predecessors of BB to +/// be predecessors of the new block. The new predecessors are indicated by the +/// Preds array, which has NumPreds elements in it. The new block is given a +/// suffix of 'Suffix'. +/// +/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree, +/// LoopInfo, and LCSSA but no other analyses. In particular, it does not +/// preserve LoopSimplify (because it's complicated to handle the case where one +/// of the edges being split is an exit of a loop with other exits). +/// +BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, + BasicBlock *const *Preds, + unsigned NumPreds, const char *Suffix, + Pass *P) { + // Create new basic block, insert right before the original block. + BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix, + BB->getParent(), BB); + // The new block unconditionally branches to the old block. + BranchInst *BI = BranchInst::Create(BB, NewBB); + + // Move the edges from Preds to point to NewBB instead of BB. + for (unsigned i = 0; i != NumPreds; ++i) { + // This is slightly more strict than necessary; the minimum requirement + // is that there be no more than one indirectbr branching to BB. And + // all BlockAddress uses would need to be updated. + assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) && + "Cannot split an edge from an IndirectBrInst"); + Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB); + } + + // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI + // node becomes an incoming value for BB's phi node. However, if the Preds + // list is empty, we need to insert dummy entries into the PHI nodes in BB to + // account for the newly created predecessor. + if (NumPreds == 0) { + // Insert dummy values as the incoming value. + for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I) + cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB); + return NewBB; + } + + // Update DominatorTree, LoopInfo, and LCSSA analysis information. + bool HasLoopExit = false; + UpdateAnalysisInformation(BB, NewBB, ArrayRef<BasicBlock*>(Preds, NumPreds), + P, HasLoopExit); + + // Update the PHI nodes in BB with the values coming from NewBB. + UpdatePHINodes(BB, NewBB, ArrayRef<BasicBlock*>(Preds, NumPreds), BI, + P, HasLoopExit); return NewBB; } +/// SplitLandingPadPredecessors - This method transforms the landing pad, +/// OrigBB, by introducing two new basic blocks into the function. One of those +/// new basic blocks gets the predecessors listed in Preds. The other basic +/// block gets the remaining predecessors of OrigBB. The landingpad instruction +/// in OrigBB is cloned into both of the new basic blocks.
The new blocks are given +/// the suffixes 'Suffix1' and 'Suffix2', and are returned in the NewBBs vector. +/// +/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree, +/// DominanceFrontier, LoopInfo, and LCSSA but no other analyses. In particular, +/// it does not preserve LoopSimplify (because it's complicated to handle the +/// case where one of the edges being split is an exit of a loop with other +/// exits). +/// +void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, + ArrayRef<BasicBlock*> Preds, + const char *Suffix1, const char *Suffix2, + Pass *P, + SmallVectorImpl<BasicBlock*> &NewBBs) { + assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!"); + + // Create a new basic block for OrigBB's predecessors listed in Preds. Insert + // it right before the original block. + BasicBlock *NewBB1 = BasicBlock::Create(OrigBB->getContext(), + OrigBB->getName() + Suffix1, + OrigBB->getParent(), OrigBB); + NewBBs.push_back(NewBB1); + + // The new block unconditionally branches to the old block. + BranchInst *BI1 = BranchInst::Create(OrigBB, NewBB1); + + // Move the edges from Preds to point to NewBB1 instead of OrigBB. + for (unsigned i = 0, e = Preds.size(); i != e; ++i) { + // This is slightly more strict than necessary; the minimum requirement + // is that there be no more than one indirectbr branching to BB. And + // all BlockAddress uses would need to be updated. + assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) && + "Cannot split an edge from an IndirectBrInst"); + Preds[i]->getTerminator()->replaceUsesOfWith(OrigBB, NewBB1); + } + + // Update DominatorTree, LoopInfo, and LCSSA analysis information. + bool HasLoopExit = false; + UpdateAnalysisInformation(OrigBB, NewBB1, Preds, P, HasLoopExit); + + // Update the PHI nodes in OrigBB with the values coming from NewBB1. + UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, P, HasLoopExit); + + // Move the remaining edges from OrigBB to point to NewBB2. + SmallVector<BasicBlock*, 8> NewBB2Preds; + for (pred_iterator i = pred_begin(OrigBB), e = pred_end(OrigBB); + i != e; ) { + BasicBlock *Pred = *i++; + if (Pred == NewBB1) continue; + assert(!isa<IndirectBrInst>(Pred->getTerminator()) && + "Cannot split an edge from an IndirectBrInst"); + NewBB2Preds.push_back(Pred); + e = pred_end(OrigBB); + } + + BasicBlock *NewBB2 = 0; + if (!NewBB2Preds.empty()) { + // Create another basic block for the rest of OrigBB's predecessors. + NewBB2 = BasicBlock::Create(OrigBB->getContext(), + OrigBB->getName() + Suffix2, + OrigBB->getParent(), OrigBB); + NewBBs.push_back(NewBB2); + + // The new block unconditionally branches to the old block. + BranchInst *BI2 = BranchInst::Create(OrigBB, NewBB2); + + // Move the remaining edges from OrigBB to point to NewBB2. + for (SmallVectorImpl<BasicBlock*>::iterator + i = NewBB2Preds.begin(), e = NewBB2Preds.end(); i != e; ++i) + (*i)->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2); + + // Update DominatorTree, LoopInfo, and LCSSA analysis information. + HasLoopExit = false; + UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, P, HasLoopExit); + + // Update the PHI nodes in OrigBB with the values coming from NewBB2.
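For callers of the new utility, the contract is: NewBBs[0] receives the predecessors passed in Preds, and a second block is created only if OrigBB keeps other predecessors. A hedged usage sketch with illustrative names (the call site this patch actually adds is in LoopSimplify, further down):

    // InsidePreds: a SmallVector<BasicBlock*, 8> of predecessors collected by
    // the caller; it converts implicitly to ArrayRef<BasicBlock*>.
    SmallVector<BasicBlock*, 2> NewBBs;
    SplitLandingPadPredecessors(LPadBB, InsidePreds, ".split1", ".split2",
                                P, NewBBs);
    // Each new block receives its own clone of LPadBB's landingpad; when two
    // blocks are made, a PHI merges the cloned exception values.

The UpdatePHINodes call that follows completes the PHI rewiring for NewBB2.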
+ UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, P, HasLoopExit); + } + + LandingPadInst *LPad = OrigBB->getLandingPadInst(); + Instruction *Clone1 = LPad->clone(); + Clone1->setName(Twine("lpad") + Suffix1); + NewBB1->getInstList().insert(NewBB1->getFirstInsertionPt(), Clone1); + + if (NewBB2) { + Instruction *Clone2 = LPad->clone(); + Clone2->setName(Twine("lpad") + Suffix2); + NewBB2->getInstList().insert(NewBB2->getFirstInsertionPt(), Clone2); + + // Create a PHI node for the two cloned landingpad instructions. + PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad); + PN->addIncoming(Clone1, NewBB1); + PN->addIncoming(Clone2, NewBB2); + LPad->replaceAllUsesWith(PN); + LPad->eraseFromParent(); + } else { + // There is no second clone. Just replace the landing pad with the first + // clone. + LPad->replaceAllUsesWith(Clone1); + LPad->eraseFromParent(); + } +} + /// FindFunctionBackedges - Analyze the specified function to find all of the /// loop backedges in the function and return them. This is a relatively cheap /// (compared to computing dominators and loop info) analysis. diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index 92ce500..c052910 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -102,7 +102,7 @@ bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum, ++I; // Skip one edge due to the incoming arc from TI. if (!AllowIdenticalEdges) return I != E; - + // If AllowIdenticalEdges is true, then we allow this edge to be considered // non-critical iff all preds come from TI's block. while (I != E) { @@ -155,10 +155,10 @@ static void CreatePHIsForSplitLoopExit(SmallVectorImpl<BasicBlock *> &Preds, /// This returns the new block if the edge was split, null otherwise. /// /// If MergeIdenticalEdges is true (not the default), *all* edges from TI to the -/// specified successor will be merged into the same critical edge block. -/// This is most commonly interesting with switch instructions, which may +/// specified successor will be merged into the same critical edge block. +/// This is most commonly interesting with switch instructions, which may /// have many edges to any one destination. This ensures that all edges to that -/// dest go to one block instead of each going to a different block, but isn't +/// dest go to one block instead of each going to a different block, but isn't /// the standard definition of a "critical edge". /// /// It is invalid to call this function on a critical edge that starts at an @@ -167,15 +167,20 @@ static void CreatePHIsForSplitLoopExit(SmallVectorImpl<BasicBlock *> &Preds, /// to. /// BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, - Pass *P, bool MergeIdenticalEdges) { + Pass *P, bool MergeIdenticalEdges, + bool DontDeleteUselessPhis) { if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return 0; - + assert(!isa<IndirectBrInst>(TI) && "Cannot split critical edge from IndirectBrInst"); - + BasicBlock *TIBB = TI->getParent(); BasicBlock *DestBB = TI->getSuccessor(SuccNum); + // Splitting the critical edge to a landing pad block is non-trivial. Don't do + // it in this generic function. + if (DestBB->isLandingPad()) return 0; + // Create a new basic block, linking it into the CFG. BasicBlock *NewBB = BasicBlock::Create(TI->getContext(), TIBB->getName() + "." 
+ DestBB->getName() + "_crit_edge"); @@ -190,7 +195,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, Function &F = *TIBB->getParent(); Function::iterator FBBI = TIBB; F.getBasicBlockList().insert(++FBBI, NewBB); - + // If there are any PHI nodes in DestBB, we need to update them so that they // merge incoming values from NewBB instead of from TIBB. { @@ -207,35 +212,35 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, // happens because the BB list of PHI nodes is usually in the same // order. if (PN->getIncomingBlock(BBIdx) != TIBB) - BBIdx = PN->getBasicBlockIndex(TIBB); + BBIdx = PN->getBasicBlockIndex(TIBB); PN->setIncomingBlock(BBIdx, NewBB); } } - + // If there are any other edges from TIBB to DestBB, update those to go // through the split block, making those edges non-critical as well (and // reducing the number of phi entries in the DestBB if relevant). if (MergeIdenticalEdges) { for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) { if (TI->getSuccessor(i) != DestBB) continue; - + // Remove an entry for TIBB from DestBB phi nodes. - DestBB->removePredecessor(TIBB); - + DestBB->removePredecessor(TIBB, DontDeleteUselessPhis); + // We found another edge to DestBB, go to NewBB instead. TI->setSuccessor(i, NewBB); } } - - + + // If we don't have a pass object, we can't update anything... if (P == 0) return NewBB; - + DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>(); LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>(); ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>(); - + // If we have nothing to update, just return. if (DT == 0 && LI == 0 && PI == 0) return NewBB; @@ -263,7 +268,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, } bool NewBBDominatesDestBB = true; - + // Should we update DominatorTree information? if (DT) { DomTreeNode *TINode = DT->getNode(TIBB); @@ -274,7 +279,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, if (TINode) { // Don't break unreachable code! DomTreeNode *NewBBNode = DT->addNewBlock(NewBB, TIBB); DomTreeNode *DestBBNode = 0; - + // If NewBBDominatesDestBB hasn't been computed yet, do so with DT. if (!OtherPreds.empty()) { DestBBNode = DT->getNode(DestBB); @@ -285,7 +290,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, } OtherPreds.clear(); } - + // If NewBBDominatesDestBB, then NewBB dominates DestBB, otherwise it // doesn't dominate anything. if (NewBBDominatesDestBB) { @@ -337,6 +342,8 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, } // For each unique exit block... + // FIXME: This code is functionally equivalent to the corresponding + // loop in LoopSimplify. SmallVector<BasicBlock *, 4> ExitBlocks; TIL->getExitBlocks(ExitBlocks); for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { @@ -348,10 +355,15 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { BasicBlock *P = *I; - if (TIL->contains(P)) + if (TIL->contains(P)) { + if (isa<IndirectBrInst>(P->getTerminator())) { + Preds.clear(); + break; + } Preds.push_back(P); - else + } else { + HasPredOutsideOfLoop = true; + } } // If there are any preds not in the loop, we'll need to split // the edges.
The Preds.empty() check is needed because a block diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index 14bb17f..4b5f45b 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -58,8 +58,8 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, AttributeWithIndex AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind); - const Type *I8Ptr = B.getInt8PtrTy(); - const Type *I32Ty = B.getInt32Ty(); + Type *I8Ptr = B.getInt8PtrTy(); + Type *I32Ty = B.getInt32Ty(); Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(&AWI, 1), I8Ptr, I8Ptr, I32Ty, NULL); CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B), @@ -102,7 +102,7 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, AttributeWithIndex AWI[2]; AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); - const Type *I8Ptr = B.getInt8PtrTy(); + Type *I8Ptr = B.getInt8PtrTy(); Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2), I8Ptr, I8Ptr, I8Ptr, NULL); CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B), @@ -120,7 +120,7 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, AttributeWithIndex AWI[2]; AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); - const Type *I8Ptr = B.getInt8PtrTy(); + Type *I8Ptr = B.getInt8PtrTy(); Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2), I8Ptr, I8Ptr, I8Ptr, Len->getType(), NULL); @@ -361,7 +361,7 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { this->CI = CI; Function *Callee = CI->getCalledFunction(); StringRef Name = Callee->getName(); - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); LLVMContext &Context = CI->getParent()->getContext(); IRBuilder<> B(CI); diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index 204c2c6..7adc5f1 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -21,9 +21,17 @@ add_llvm_library(LLVMTransformUtils PromoteMemoryToRegister.cpp SSAUpdater.cpp SimplifyCFG.cpp + SimplifyIndVar.cpp SimplifyInstructions.cpp UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp ) +add_llvm_library_dependencies(LLVMTransformUtils + LLVMAnalysis + LLVMCore + LLVMSupport + LLVMTarget + LLVMipa + ) diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 6ea831f..cf21f1e 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -331,15 +331,10 @@ ConstantFoldMappedInstruction(const Instruction *I) { TD); if (const LoadInst *LI = dyn_cast<LoadInst>(I)) - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0])) - if (!LI->isVolatile() && CE->getOpcode() == Instruction::GetElementPtr) - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0))) - if (GV->isConstant() && GV->hasDefinitiveInitializer()) - return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), - CE); - - return ConstantFoldInstOperands(I->getOpcode(), I->getType(), &Ops[0], - Ops.size(), TD); + if (!LI->isVolatile()) + return ConstantFoldLoadFromConstPtr(Ops[0], TD); + + return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Ops, TD); } /// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto, diff 
--git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index a08fa35..a0e027b 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -50,10 +50,12 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { I != E; ++I) { GlobalVariable *GV = new GlobalVariable(*New, I->getType()->getElementType(), - false, - GlobalValue::ExternalLinkage, 0, - I->getName()); - GV->setAlignment(I->getAlignment()); + I->isConstant(), I->getLinkage(), + (Constant*) 0, I->getName(), + (GlobalVariable*) 0, + I->isThreadLocal(), + I->getType()->getAddressSpace()); + GV->copyAttributesFrom(I); VMap[I] = GV; } @@ -61,16 +63,19 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) { Function *NF = Function::Create(cast<FunctionType>(I->getType()->getElementType()), - GlobalValue::ExternalLinkage, I->getName(), New); + I->getLinkage(), I->getName(), New); NF->copyAttributesFrom(I); VMap[I] = NF; } // Loop over the aliases in the module for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); - I != E; ++I) - VMap[I] = new GlobalAlias(I->getType(), GlobalAlias::ExternalLinkage, - I->getName(), NULL, New); + I != E; ++I) { + GlobalAlias *GA = new GlobalAlias(I->getType(), I->getLinkage(), + I->getName(), NULL, New); + GA->copyAttributesFrom(I); + VMap[I] = GA; + } // Now that all of the things that global variable initializer can refer to // have been created, loop through and copy the global variable referrers @@ -81,9 +86,6 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { GlobalVariable *GV = cast<GlobalVariable>(VMap[I]); if (I->hasInitializer()) GV->setInitializer(MapValue(I->getInitializer(), VMap)); - GV->setLinkage(I->getLinkage()); - GV->setThreadLocal(I->isThreadLocal()); - GV->setConstant(I->isConstant()); } // Similarly, copy over function bodies now... @@ -101,15 +103,12 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. 
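The CloneModule hunk above replaces the old pattern of creating bare external-linkage stand-ins and patching linkage afterwards with a create-then-copyAttributesFrom idiom. A distilled sketch for the global-variable case (ElemTy stands in for I->getType()->getElementType(); the names are illustrative, not from the patch):

    GlobalVariable *GV =
      new GlobalVariable(*New, ElemTy, I->isConstant(), I->getLinkage(),
                         /*Initializer=*/0, I->getName(),
                         /*InsertBefore=*/0, I->isThreadLocal(),
                         I->getType()->getAddressSpace());
    GV->copyAttributesFrom(I);  // alignment, section, visibility, ...

Because the attributes are now copied at creation time, the later per-global setLinkage/setThreadLocal/setConstant fixups become dead weight, which is exactly what the deletions in this hunk remove; the CloneFunctionInto call below is unchanged apart from that.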
CloneFunctionInto(F, I, VMap, /*ModuleLevelChanges=*/true, Returns); } - - F->setLinkage(I->getLinkage()); } // And aliases for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E; ++I) { GlobalAlias *GA = cast<GlobalAlias>(VMap[I]); - GA->setLinkage(I->getLinkage()); if (const Constant *C = I->getAliasee()) GA->setAliasee(MapValue(C, VMap)); } diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 0813523..5f47ebb 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -50,14 +50,14 @@ namespace { DominatorTree* DT; bool AggregateArgs; unsigned NumExitBlocks; - const Type *RetTy; + Type *RetTy; public: CodeExtractor(DominatorTree* dt = 0, bool AggArgs = false) : DT(dt), AggregateArgs(AggArgs||AggregateArgsOpt), NumExitBlocks(~0U) {} - Function *ExtractCodeRegion(const std::vector<BasicBlock*> &code); + Function *ExtractCodeRegion(ArrayRef<BasicBlock*> code); - bool isEligible(const std::vector<BasicBlock*> &code); + bool isEligible(ArrayRef<BasicBlock*> code); private: /// definedInRegion - Return true if the specified value is defined in the @@ -290,7 +290,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs, paramTy.clear(); paramTy.push_back(StructPtr); } - const FunctionType *funcType = + FunctionType *funcType = FunctionType::get(RetTy, paramTy, false); // Create the new function @@ -317,8 +317,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs, Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i); TerminatorInst *TI = newFunction->begin()->getTerminator(); GetElementPtrInst *GEP = - GetElementPtrInst::Create(AI, Idx, Idx+2, - "gep_" + inputs[i]->getName(), TI); + GetElementPtrInst::Create(AI, Idx, "gep_" + inputs[i]->getName(), TI); RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI); } else RewriteVal = AI++; @@ -420,7 +419,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i); GetElementPtrInst *GEP = - GetElementPtrInst::Create(Struct, Idx, Idx + 2, + GetElementPtrInst::Create(Struct, Idx, "gep_" + StructValues[i]->getName()); codeReplacer->getInstList().push_back(GEP); StoreInst *SI = new StoreInst(StructValues[i], GEP); @@ -446,7 +445,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i); GetElementPtrInst *GEP - = GetElementPtrInst::Create(Struct, Idx, Idx + 2, + = GetElementPtrInst::Create(Struct, Idx, "gep_reload_" + outputs[i]->getName()); codeReplacer->getInstList().push_back(GEP); Output = GEP; @@ -561,7 +560,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut+out); GetElementPtrInst *GEP = - GetElementPtrInst::Create(OAI, Idx, Idx + 2, + GetElementPtrInst::Create(OAI, Idx, "gep_" + outputs[out]->getName(), NTRet); new StoreInst(outputs[out], GEP, NTRet); @@ -580,7 +579,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, } // Now that we've done the deed, simplify the switch instruction. 
- const Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType(); + Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType(); switch (NumExitBlocks) { case 0: // There are no successors (the block containing the switch itself), which @@ -655,7 +654,7 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) { /// computed result back into memory. /// Function *CodeExtractor:: -ExtractCodeRegion(const std::vector<BasicBlock*> &code) { +ExtractCodeRegion(ArrayRef<BasicBlock*> code) { if (!isEligible(code)) return 0; @@ -755,9 +754,13 @@ ExtractCodeRegion(const std::vector<BasicBlock*> &code) { return newFunction; } -bool CodeExtractor::isEligible(const std::vector<BasicBlock*> &code) { +bool CodeExtractor::isEligible(ArrayRef<BasicBlock*> code) { + // Deny a single basic block that's a landing pad block. + if (code.size() == 1 && code[0]->isLandingPad()) + return false; + // Deny code region if it contains allocas or vastarts. - for (std::vector<BasicBlock*>::const_iterator BB = code.begin(), e=code.end(); + for (ArrayRef<BasicBlock*>::iterator BB = code.begin(), e=code.end(); BB != e; ++BB) for (BasicBlock::const_iterator I = (*BB)->begin(), Ie = (*BB)->end(); I != Ie; ++I) @@ -771,25 +774,23 @@ bool CodeExtractor::isEligible(const std::vector<BasicBlock*> &code) { } -/// ExtractCodeRegion - slurp a sequence of basic blocks into a brand new -/// function +/// ExtractCodeRegion - Slurp a sequence of basic blocks into a brand new +/// function. /// Function* llvm::ExtractCodeRegion(DominatorTree &DT, - const std::vector<BasicBlock*> &code, + ArrayRef<BasicBlock*> code, bool AggregateArgs) { return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(code); } -/// ExtractBasicBlock - slurp a natural loop into a brand new function +/// ExtractLoop - Slurp a natural loop into a brand new function. /// Function* llvm::ExtractLoop(DominatorTree &DT, Loop *L, bool AggregateArgs) { return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(L->getBlocks()); } -/// ExtractBasicBlock - slurp a basic block into a brand new function +/// ExtractBasicBlock - Slurp a basic block into a brand new function. /// -Function* llvm::ExtractBasicBlock(BasicBlock *BB, bool AggregateArgs) { - std::vector<BasicBlock*> Blocks; - Blocks.push_back(BB); - return CodeExtractor(0, AggregateArgs).ExtractCodeRegion(Blocks); +Function* llvm::ExtractBasicBlock(ArrayRef<BasicBlock*> BBs, bool AggregateArgs){ + return CodeExtractor(0, AggregateArgs).ExtractCodeRegion(BBs); } diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index d5b382e..5464dbc 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -45,6 +45,9 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI) { return InlineFunction(CallSite(II), IFI); } +// FIXME: New EH - Remove the functions marked [LIBUNWIND] when new EH is +// turned on. + /// [LIBUNWIND] Look for an llvm.eh.exception call in the given block. static EHExceptionInst *findExceptionInBlock(BasicBlock *bb) { for (BasicBlock::iterator i = bb->begin(), e = bb->end(); i != e; i++) { @@ -250,20 +253,32 @@ namespace { PHINode *InnerSelectorPHI; SmallVector<Value*, 8> UnwindDestPHIValues; + // FIXME: New EH - These will replace the analogous ones above. + BasicBlock *OuterResumeDest; //< Destination of the invoke's unwind. + BasicBlock *InnerResumeDest; //< Destination for the callee's resume. + LandingPadInst *CallerLPad; //< LandingPadInst associated with the invoke. 
+ PHINode *InnerEHValuesPHI; //< PHI for EH values from landingpad insts. + public: - InvokeInliningInfo(InvokeInst *II) : - OuterUnwindDest(II->getUnwindDest()), OuterSelector(0), - InnerUnwindDest(0), InnerExceptionPHI(0), InnerSelectorPHI(0) { - - // If there are PHI nodes in the unwind destination block, we - // need to keep track of which values came into them from the - // invoke before removing the edge from this block. - llvm::BasicBlock *invokeBB = II->getParent(); - for (BasicBlock::iterator I = OuterUnwindDest->begin(); - isa<PHINode>(I); ++I) { + InvokeInliningInfo(InvokeInst *II) + : OuterUnwindDest(II->getUnwindDest()), OuterSelector(0), + InnerUnwindDest(0), InnerExceptionPHI(0), InnerSelectorPHI(0), + OuterResumeDest(II->getUnwindDest()), InnerResumeDest(0), + CallerLPad(0), InnerEHValuesPHI(0) { + // If there are PHI nodes in the unwind destination block, we need to keep + // track of which values came into them from the invoke before removing + // the edge from this block. + llvm::BasicBlock *InvokeBB = II->getParent(); + BasicBlock::iterator I = OuterUnwindDest->begin(); + for (; isa<PHINode>(I); ++I) { // Save the value to use for this edge. - PHINode *phi = cast<PHINode>(I); - UnwindDestPHIValues.push_back(phi->getIncomingValueForBlock(invokeBB)); + PHINode *PHI = cast<PHINode>(I); + UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB)); + } + + // FIXME: With the new EH, this if/dyn_cast should be a 'cast'. + if (LandingPadInst *LPI = dyn_cast<LandingPadInst>(I)) { + CallerLPad = LPI; } } @@ -281,11 +296,23 @@ namespace { BasicBlock *getInnerUnwindDest(); + // FIXME: New EH - Rename when new EH is turned on. + BasicBlock *getInnerUnwindDestNewEH(); + + LandingPadInst *getLandingPadInst() const { return CallerLPad; } + bool forwardEHResume(CallInst *call, BasicBlock *src); - /// Add incoming-PHI values to the unwind destination block for - /// the given basic block, using the values for the original - /// invoke's source block. + /// forwardResume - Forward the 'resume' instruction to the caller's landing + /// pad block. When the landing pad block has only one predecessor, this is + /// a simple branch. When there is more than one predecessor, we need to + /// split the landing pad block after the landingpad instruction and jump + /// to there. + void forwardResume(ResumeInst *RI); + + /// addIncomingPHIValuesFor - Add incoming-PHI values to the unwind + /// destination block for the given basic block, using the values for the + /// original invoke's source block. void addIncomingPHIValuesFor(BasicBlock *BB) const { addIncomingPHIValuesForInto(BB, OuterUnwindDest); } @@ -300,7 +327,7 @@ namespace { }; } -/// Get or create a target for the branch out of rewritten calls to +/// [LIBUNWIND] Get or create a target for the branch out of rewritten calls to /// llvm.eh.resume. BasicBlock *InvokeInliningInfo::getInnerUnwindDest() { if (InnerUnwindDest) return InnerUnwindDest; @@ -404,6 +431,60 @@ bool InvokeInliningInfo::forwardEHResume(CallInst *call, BasicBlock *src) { return true; } +/// Get or create a target for the branch from ResumeInsts. +BasicBlock *InvokeInliningInfo::getInnerUnwindDestNewEH() { + // FIXME: New EH - rename this function when new EH is turned on. + if (InnerResumeDest) return InnerResumeDest; + + // Split the landing pad. 
+ BasicBlock::iterator SplitPoint = CallerLPad; ++SplitPoint; + InnerResumeDest = + OuterResumeDest->splitBasicBlock(SplitPoint, + OuterResumeDest->getName() + ".body"); + + // The number of incoming edges we expect to the inner landing pad. + const unsigned PHICapacity = 2; + + // Create corresponding new PHIs for all the PHIs in the outer landing pad. + BasicBlock::iterator InsertPoint = InnerResumeDest->begin(); + BasicBlock::iterator I = OuterResumeDest->begin(); + for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) { + PHINode *OuterPHI = cast<PHINode>(I); + PHINode *InnerPHI = PHINode::Create(OuterPHI->getType(), PHICapacity, + OuterPHI->getName() + ".lpad-body", + InsertPoint); + OuterPHI->replaceAllUsesWith(InnerPHI); + InnerPHI->addIncoming(OuterPHI, OuterResumeDest); + } + + // Create a PHI for the exception values. + InnerEHValuesPHI = PHINode::Create(CallerLPad->getType(), PHICapacity, + "eh.lpad-body", InsertPoint); + CallerLPad->replaceAllUsesWith(InnerEHValuesPHI); + InnerEHValuesPHI->addIncoming(CallerLPad, OuterResumeDest); + + // All done. + return InnerResumeDest; +} + +/// forwardResume - Forward the 'resume' instruction to the caller's landing pad +/// block. When the landing pad block has only one predecessor, this is a simple +/// branch. When there is more than one predecessor, we need to split the +/// landing pad block after the landingpad instruction and jump to there. +void InvokeInliningInfo::forwardResume(ResumeInst *RI) { + BasicBlock *Dest = getInnerUnwindDestNewEH(); + BasicBlock *Src = RI->getParent(); + + BranchInst::Create(Dest, Src); + + // Update the PHIs in the destination. They were inserted in an order which + // makes this work. + addIncomingPHIValuesForInto(Src, Dest); + + InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src); + RI->eraseFromParent(); +} + /// [LIBUNWIND] Check whether this selector is "only cleanups": /// call i32 @llvm.eh.selector(blah, blah, i32 0) static bool isCleanupOnlySelector(EHSelectorInst *selector) { @@ -421,9 +502,19 @@ static bool isCleanupOnlySelector(EHSelectorInst *selector) { /// Returns true to indicate that the next block should be skipped. static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, InvokeInliningInfo &Invoke) { + LandingPadInst *LPI = Invoke.getLandingPadInst(); + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { Instruction *I = BBI++; - + + if (LPI) // FIXME: New EH - This won't be NULL in the new EH. + if (LandingPadInst *L = dyn_cast<LandingPadInst>(I)) { + unsigned NumClauses = LPI->getNumClauses(); + L->reserveClauses(NumClauses); + for (unsigned i = 0; i != NumClauses; ++i) + L->addClause(LPI->getClause(i)); + } + // We only need to check for function calls: inlined invoke // instructions require no special handling. CallInst *CI = dyn_cast<CallInst>(I); @@ -557,6 +648,10 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, // there is now a new entry in them. Invoke.addIncomingPHIValuesFor(BB); } + + if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) { + Invoke.forwardResume(RI); + } } // Now that everything is happy, we have one final detail. 
The PHI nodes in @@ -636,7 +731,7 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, const Function *CalledFunc, InlineFunctionInfo &IFI, unsigned ByValAlignment) { - const Type *AggTy = cast<PointerType>(Arg->getType())->getElementType(); + Type *AggTy = cast<PointerType>(Arg->getType())->getElementType(); // If the called function is readonly, then it could not mutate the caller's // copy of the byval'd memory. In this case, it is safe to elide the copy and @@ -726,7 +821,7 @@ static bool isUsedByLifetimeMarker(Value *V) { // hasLifetimeMarkers - Check whether the given alloca already has // lifetime.start or lifetime.end intrinsics. static bool hasLifetimeMarkers(AllocaInst *AI) { - const Type *Int8PtrTy = Type::getInt8PtrTy(AI->getType()->getContext()); + Type *Int8PtrTy = Type::getInt8PtrTy(AI->getType()->getContext()); if (AI->getType() == Int8PtrTy) return isUsedByLifetimeMarker(AI); @@ -770,8 +865,15 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { DebugLoc DL = BI->getDebugLoc(); - if (!DL.isUnknown()) + if (!DL.isUnknown()) { BI->setDebugLoc(updateInlinedAtInfo(DL, TheCallDL, BI->getContext())); + if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(BI)) { + LLVMContext &Ctx = BI->getContext(); + MDNode *InlinedAt = BI->getDebugLoc().getInlinedAt(Ctx); + DVI->setOperand(2, createInlinedVariable(DVI->getVariable(), + InlinedAt, Ctx)); + } + } } } } @@ -822,6 +924,40 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { return false; } + // Find the personality function used by the landing pads of the caller. If it + // exists, then check to see that it matches the personality function used in + // the callee. + for (Function::const_iterator + I = Caller->begin(), E = Caller->end(); I != E; ++I) + if (const InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) { + const BasicBlock *BB = II->getUnwindDest(); + // FIXME: This 'isa' here should go away once the new EH system is + // in place. + if (!isa<LandingPadInst>(BB->getFirstNonPHI())) + continue; + const LandingPadInst *LP = cast<LandingPadInst>(BB->getFirstNonPHI()); + const Value *CallerPersFn = LP->getPersonalityFn(); + + // If the personality functions match, then we can perform the + // inlining. Otherwise, we can't inline. + // TODO: This isn't 100% true. Some personality functions are proper + // supersets of others and can be used in place of the other. + for (Function::const_iterator + I = CalledFunc->begin(), E = CalledFunc->end(); I != E; ++I) + if (const InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) { + const BasicBlock *BB = II->getUnwindDest(); + // FIXME: This 'if/dyn_cast' here should become a normal 'cast' once + // the new EH system is in place. + if (const LandingPadInst *LP = + dyn_cast<LandingPadInst>(BB->getFirstNonPHI())) + if (CallerPersFn != LP->getPersonalityFn()) + return false; + break; + } + + break; + } + // Get an iterator to the last basic block in the function, which will have // the new function inlined after it. // @@ -1090,7 +1226,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { // Handle all of the return instructions that we just cloned in, and eliminate // any users of the original call/invoke instruction.
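The personality-function guard added above reduces to one rule; a hedged restatement as a hypothetical helper (not part of the patch), using the same LandingPadInst accessor the hunk relies on:

    // Inlining across EH boundaries is refused when caller and callee landing
    // pads name different personality functions.
    static bool personalitiesMatch(const LandingPadInst *CallerLP,
                                   const LandingPadInst *CalleeLP) {
      // As the TODO above notes, some personalities are proper supersets of
      // others and could substitute; that case is conservatively rejected too.
      return CallerLP->getPersonalityFn() == CalleeLP->getPersonalityFn();
    }

The return-instruction handling continues below.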
- const Type *RTy = CalledFunc->getReturnType(); + Type *RTy = CalledFunc->getReturnType(); PHINode *PHI = 0; if (Returns.size() > 1) { diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 0f6d9ae..7034feb 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/DebugInfo.h" #include "llvm/Analysis/DIBuilder.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -227,13 +226,17 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { bool llvm::isInstructionTriviallyDead(Instruction *I) { if (!I->use_empty() || isa<TerminatorInst>(I)) return false; + // We don't want the landingpad instruction removed by anything this general. + if (isa<LandingPadInst>(I)) + return false; + // We don't want debug info removed by anything this general, unless // debug info is empty. if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(I)) { - if (DDI->getAddress()) + if (DDI->getAddress()) return false; return true; - } + } if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(I)) { if (DVI->getValue()) return false; @@ -244,10 +247,16 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) { // Special case intrinsics that "may have side effects" but can be deleted // when dead. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { // Safe to delete llvm.stacksave if dead. if (II->getIntrinsicID() == Intrinsic::stacksave) return true; + + // Lifetime intrinsics are dead when their right-hand operand is undef. + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) + return isa<UndefValue>(II->getArgOperand(1)); + } return false; } @@ -712,10 +721,14 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { /// their preferred alignment from the beginning. /// static unsigned enforceKnownAlignment(Value *V, unsigned Align, - unsigned PrefAlign) { + unsigned PrefAlign, const TargetData *TD) { V = V->stripPointerCasts(); if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) { + // If the preferred alignment is greater than the natural stack alignment + // then don't round up. This avoids dynamic stack realignment. + if (TD && TD->exceedsNaturalStackAlignment(PrefAlign)) + return Align; // If there is a requested alignment and if this is an alloca, round up. if (AI->getAlignment() >= PrefAlign) return AI->getAlignment(); @@ -766,7 +779,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, Align = std::min(Align, +Value::MaximumAlignment); if (PrefAlign > Align) - Align = enforceKnownAlignment(V, Align, PrefAlign); + Align = enforceKnownAlignment(V, Align, PrefAlign, TD); // We don't need to make any adjustment. return Align; diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index e79fb5a..cbd54a8 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -213,7 +213,7 @@ ReprocessLoop: // predecessors from outside of the loop, split the edge now.
SmallVector<BasicBlock*, 8> ExitBlocks; L->getExitBlocks(ExitBlocks); - + SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end()); for (SmallSetVector<BasicBlock *, 8>::iterator I = ExitBlockSet.begin(), @@ -325,6 +325,14 @@ ReprocessLoop: DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block " << ExitingBlock->getName() << "\n"); + // If any reachable control flow within this loop has changed, notify + // ScalarEvolution. Currently assume the parent loop doesn't change + // (splitting edges doesn't count). If blocks, CFG edges, or other values + // in the parent loop change, then we need to call forgetLoop() for the + // parent instead. + if (SE) + SE->forgetLoop(L); + assert(pred_begin(ExitingBlock) == pred_end(ExitingBlock)); Changed = true; LI->removeBlock(ExitingBlock); @@ -402,13 +410,24 @@ BasicBlock *LoopSimplify::RewriteLoopExitBlock(Loop *L, BasicBlock *Exit) { } assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); - BasicBlock *NewBB = SplitBlockPredecessors(Exit, &LoopBlocks[0], - LoopBlocks.size(), ".loopexit", - this); + BasicBlock *NewExitBB = 0; + + if (Exit->isLandingPad()) { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(Exit, ArrayRef<BasicBlock*>(&LoopBlocks[0], + LoopBlocks.size()), + ".loopexit", ".nonloopexit", + this, NewBBs); + NewExitBB = NewBBs[0]; + } else { + NewExitBB = SplitBlockPredecessors(Exit, &LoopBlocks[0], + LoopBlocks.size(), ".loopexit", + this); + } DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " - << NewBB->getName() << "\n"); - return NewBB; + << NewExitBB->getName() << "\n"); + return NewExitBB; } /// AddBlockAndPredsToSet - Add the specified block, and all of its @@ -467,23 +486,23 @@ void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB, if (&*BBI == SplitPreds[i]) return; } - + // If it isn't already after an outside block, move it after one. This is // always good as it makes the uncond branch from the outside block into a // fall-through. - + // Figure out *which* outside block to put this after. Prefer an outside // block that neighbors a BB actually in the loop. BasicBlock *FoundBB = 0; for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { Function::iterator BBI = SplitPreds[i]; - if (++BBI != NewBB->getParent()->end() && + if (++BBI != NewBB->getParent()->end() && L->contains(BBI)) { FoundBB = SplitPreds[i]; break; } } - + // If our heuristic for a *good* bb to place this after doesn't find // anything, just pick something. It's likely better than leaving it within // the loop. @@ -544,7 +563,7 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM) { // Make sure that NewBB is put someplace intelligent, which doesn't mess up // code layout too horribly. PlaceSplitBlockCarefully(NewBB, OuterLoopPreds, L); - + // Create the new outer loop. Loop *NewOuter = new Loop(); @@ -735,6 +754,7 @@ void LoopSimplify::verifyAnalysis() const { } assert(HasIndBrPred && "LoopSimplify has no excuse for missing loop header info!"); + (void)HasIndBrPred; } // Indirectbr can interfere with exit block canonicalization.
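The ScalarEvolution notification added above follows the usual invalidation pattern: a transform that deletes blocks or CFG edges inside a loop must drop SE's cached trip counts before continuing. In isolation (SE is LoopSimplify's ScalarEvolution pointer, which may be null when the analysis isn't in the pipeline):

    if (SE)
      SE->forgetLoop(L);  // discard cached backedge-taken counts etc. for L

As the new comment says, if the parent loop's blocks or edges changed as well, the same call would be needed for the parent loop instead. The verifyAnalysis hunk continues below.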
@@ -742,12 +762,15 @@ bool HasIndBrExiting = false; SmallVector<BasicBlock*, 8> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); - for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) + for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { if (isa<IndirectBrInst>((ExitingBlocks[i])->getTerminator())) { HasIndBrExiting = true; break; } + } + assert(HasIndBrExiting && "LoopSimplify has no excuse for missing exit block info!"); + (void)HasIndBrExiting; } } diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index 6772511..62e4fa2 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -11,9 +11,6 @@ // actual pass or policy, but provides a single function to perform loop // unrolling. // -// It works best when loops have been canonicalized by the -indvars pass, -// allowing it to determine the trip counts of loops easily. -// // The process of unrolling can produce extraneous basic blocks linked with // unconditional branches. This will be corrected in the future. // @@ -24,6 +21,7 @@ #include "llvm/BasicBlock.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Support/Debug.h" @@ -31,6 +29,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; // TODO: Should these be here or in LoopUnroll? @@ -61,7 +60,8 @@ static inline void RemapInstruction(Instruction *I, /// only has one predecessor, and that predecessor only has one successor. /// The LoopInfo Analysis that is passed will be kept consistent. /// Returns the new combined block. -static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI) { +static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, + LPPassManager *LPM) { // Merge basic blocks into their predecessor if there is only one distinct // pred, and if there is only one distinct successor of the predecessor, and // if there are no PHI nodes. @@ -93,6 +93,12 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI) { std::string OldName = BB->getName(); // Erase basic block from the function... + + // ScalarEvolution holds references to loop exit blocks. + if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { + if (Loop *L = LI->getLoopFor(BB)) + SE->forgetLoop(L); + } LI->removeBlock(BB); BB->eraseFromParent(); @@ -109,12 +115,27 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI) { /// branch instruction. However, if the trip count (and multiple) are not known, /// loop unrolling will mostly produce more code that is no faster. /// +/// TripCount is generally defined as the number of times the loop header +/// executes. UnrollLoop relaxes the definition to permit early exits: here +/// TripCount is the iteration on which control exits LatchBlock if no early +/// exits were taken. Note that UnrollLoop assumes that the loop counter test +/// terminates LatchBlock in order to remove unnecessary instances of the +/// test. In other words, control may exit the loop prior to TripCount +/// iterations via an early branch, but control may not exit the loop from the +/// LatchBlock's terminator prior to TripCount iterations.
+/// +/// Similarly, TripMultiple divides the number of times that the LatchBlock may +/// execute without exiting the loop. +/// /// The LoopInfo Analysis that is passed will be kept consistent. /// /// If a LoopPassManager is passed in, and the loop is fully removed, it will be /// removed from the LoopPassManager as well. LPM can also be NULL. -bool llvm::UnrollLoop(Loop *L, unsigned Count, - LoopInfo *LI, LPPassManager *LPM) { +/// +/// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are +/// available it must also preserve those analyses. +bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, + unsigned TripMultiple, LoopInfo *LI, LPPassManager *LPM) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); @@ -129,14 +150,14 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, BasicBlock *Header = L->getHeader(); BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); - + if (!BI || BI->isUnconditional()) { // The loop-rotate pass can be helpful to avoid this in many cases. DEBUG(dbgs() << " Can't unroll; loop not terminated by a conditional branch.\n"); return false; } - + if (Header->hasAddressTaken()) { // The loop-rotate pass can be helpful to avoid this in many cases. DEBUG(dbgs() << @@ -146,16 +167,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, // Notify ScalarEvolution that the loop will be substantially changed, // if not outright eliminated. - if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) + ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); + if (SE) SE->forgetLoop(L); - // Find trip count - unsigned TripCount = L->getSmallConstantTripCount(); - // Find trip multiple if count is not available - unsigned TripMultiple = 1; - if (TripCount == 0) - TripMultiple = L->getSmallConstantTripMultiple(); - if (TripCount != 0) DEBUG(dbgs() << " Trip Count = " << TripCount << "\n"); if (TripMultiple != 1) @@ -208,12 +223,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, ValueToValueMapTy LastValueMap; std::vector<PHINode*> OrigPHINode; for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { - PHINode *PN = cast<PHINode>(I); - OrigPHINode.push_back(PN); - if (Instruction *I = - dyn_cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock))) - if (L->contains(I)) - LastValueMap[I] = I; + OrigPHINode.push_back(cast<PHINode>(I)); } std::vector<BasicBlock*> Headers; @@ -221,11 +231,20 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, Headers.push_back(Header); Latches.push_back(LatchBlock); + // The current on-the-fly SSA update requires blocks to be processed in + // reverse postorder so that LastValueMap contains the correct value at each + // exit. + LoopBlocksDFS DFS(L); + DFS.perform(LI); + + // Stash the DFS iterators before adding blocks to the loop. + LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO(); + LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO(); + for (unsigned It = 1; It != Count; ++It) { std::vector<BasicBlock*> NewBlocks; - - for (std::vector<BasicBlock*>::iterator BB = LoopBlocks.begin(), - E = LoopBlocks.end(); BB != E; ++BB) { + + for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { ValueToValueMapTy VMap; BasicBlock *New = CloneBasicBlock(*BB, VMap, "." 
+ Twine(It)); Header->getParent()->getBasicBlockList().push_back(New); @@ -251,75 +270,55 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, L->addBasicBlockToLoop(New, LI->getBase()); - // Add phi entries for newly created values to all exit blocks except - // the successor of the latch block. The successor of the exit block will - // be updated specially after unrolling all the way. - if (*BB != LatchBlock) - for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB); SI != SE; - ++SI) - if (!L->contains(*SI)) - for (BasicBlock::iterator BBI = (*SI)->begin(); - PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) { - Value *Incoming = phi->getIncomingValueForBlock(*BB); - phi->addIncoming(Incoming, New); - } - + // Add phi entries for newly created values to all exit blocks. + for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB); + SI != SE; ++SI) { + if (L->contains(*SI)) + continue; + for (BasicBlock::iterator BBI = (*SI)->begin(); + PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) { + Value *Incoming = phi->getIncomingValueForBlock(*BB); + ValueToValueMapTy::iterator It = LastValueMap.find(Incoming); + if (It != LastValueMap.end()) + Incoming = It->second; + phi->addIncoming(Incoming, New); + } + } // Keep track of new headers and latches as we create them, so that // we can insert the proper branches later. if (*BB == Header) Headers.push_back(New); - if (*BB == LatchBlock) { + if (*BB == LatchBlock) Latches.push_back(New); - // Also, clear out the new latch's back edge so that it doesn't look - // like a new loop, so that it's amenable to being merged with adjacent - // blocks later on. - TerminatorInst *Term = New->getTerminator(); - assert(L->contains(Term->getSuccessor(!ContinueOnTrue))); - assert(Term->getSuccessor(ContinueOnTrue) == LoopExit); - Term->setSuccessor(!ContinueOnTrue, NULL); - } - NewBlocks.push_back(New); } - + // Remap all instructions in the most recent iteration for (unsigned i = 0; i < NewBlocks.size(); ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) ::RemapInstruction(I, LastValueMap); } - - // The latch block exits the loop. If there are any PHI nodes in the - // successor blocks, update them to use the appropriate values computed as the - // last iteration of the loop. - if (Count != 1) { - BasicBlock *LastIterationBB = cast<BasicBlock>(LastValueMap[LatchBlock]); - for (succ_iterator SI = succ_begin(LatchBlock), SE = succ_end(LatchBlock); - SI != SE; ++SI) { - for (BasicBlock::iterator BBI = (*SI)->begin(); - PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) { - Value *InVal = PN->removeIncomingValue(LatchBlock, false); - // If this value was defined in the loop, take the value defined by the - // last iteration of the loop. - if (Instruction *InValI = dyn_cast<Instruction>(InVal)) { - if (L->contains(InValI)) - InVal = LastValueMap[InVal]; - } - PN->addIncoming(InVal, LastIterationBB); - } - } - } - // Now, if we're doing complete unrolling, loop over the PHI nodes in the - // original block, setting them to their incoming values. - if (CompletelyUnroll) { - BasicBlock *Preheader = L->getLoopPreheader(); - for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) { - PHINode *PN = OrigPHINode[i]; + // Loop over the PHI nodes in the original block, setting incoming values. 
+ for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) { + PHINode *PN = OrigPHINode[i]; + if (CompletelyUnroll) { PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader)); Header->getInstList().erase(PN); } + else if (Count > 1) { + Value *InVal = PN->removeIncomingValue(LatchBlock, false); + // If this value was defined in the loop, take the value defined by the + // last iteration of the loop. + if (Instruction *InValI = dyn_cast<Instruction>(InVal)) { + if (L->contains(InValI)) + InVal = LastValueMap[InVal]; + } + assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch"); + PN->addIncoming(InVal, Latches.back()); + } } // Now that all the basic blocks for the unrolled iterations are in place, @@ -351,6 +350,19 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, // iteration. Term->setSuccessor(!ContinueOnTrue, Dest); } else { + // Remove phi operands at this loop exit + if (Dest != LoopExit) { + BasicBlock *BB = Latches[i]; + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); + SI != SE; ++SI) { + if (*SI == Headers[i]) + continue; + for (BasicBlock::iterator BBI = (*SI)->begin(); + PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) { + Phi->removeIncomingValue(BB, false); + } + } + } // Replace the conditional branch with an unconditional one. BranchInst::Create(Dest, Term); Term->eraseFromParent(); @@ -362,11 +374,29 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator()); if (Term->isUnconditional()) { BasicBlock *Dest = Term->getSuccessor(0); - if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI)) + if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM)) std::replace(Latches.begin(), Latches.end(), Dest, Fold); } } - + + // FIXME: Reconstruct dom info, because it is not preserved properly. + // Incrementally updating domtree after loop unrolling would be easy. + if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>()) + DT->runOnFunction(*L->getHeader()->getParent()); + + // Simplify any new induction variables in the partially unrolled loop. + if (SE && !CompletelyUnroll) { + SmallVector<WeakVH, 16> DeadInsts; + simplifyLoopIVs(L, SE, LPM, DeadInsts); + + // Aggressively clean up dead instructions that simplifyLoopIVs already + // identified. Any remaining should be cleaned up below. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); + } + // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. 
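With trip count discovery deleted from UnrollLoop (the removed getSmallConstantTripCount/getSmallConstantTripMultiple block above), every caller must now compute and pass TripCount and TripMultiple itself. A plausible call site, sketched under the assumption that the caller is a loop pass holding LI and LPM; the unroll factor of 4 is illustrative:

    // Trip count is 0 when unknown; the multiple is a fallback divisor of
    // the latch execution count, mirroring the logic removed above.
    unsigned TripCount = L->getSmallConstantTripCount();
    unsigned TripMultiple = 1;
    if (TripCount == 0)
      TripMultiple = L->getSmallConstantTripMultiple();
    if (!UnrollLoop(L, /*Count=*/4, TripCount, TripMultiple, LI, &LPM))
      return false;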
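The SmallVector<WeakVH, 16> in the new cleanup loop above is load-bearing: RecursivelyDeleteTriviallyDeadInstructions may delete instructions that are still queued in DeadInsts, and a WeakVH nulls itself when its value is destroyed, which is why each popped handle is filtered through dyn_cast_or_null before use. A condensed sketch of the idiom; SomeDeadInst is a hypothetical instruction pointer:

    SmallVector<WeakVH, 16> Dead;
    Dead.push_back(SomeDeadInst);    // the handle tracks the instruction
    SomeDeadInst->eraseFromParent(); // the handle silently becomes null
    while (!Dead.empty())
      // dyn_cast_or_null skips entries that died behind our back.
      if (Instruction *I = dyn_cast_or_null<Instruction>(&*Dead.pop_back_val()))
        RecursivelyDeleteTriviallyDeadInstructions(I);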
diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp index c1213fa..61ab3f6 100644 --- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp @@ -58,7 +58,7 @@ bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) { return false; LLVMContext &Context = CI->getContext(); - const Type *Int32Ty = Type::getInt32Ty(Context); + Type *Int32Ty = Type::getInt32Ty(Context); unsigned caseNo = SI->findCaseValue(ExpectedValue); std::vector<Value *> Vec; @@ -105,7 +105,7 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { return false; LLVMContext &Context = CI->getContext(); - const Type *Int32Ty = Type::getInt32Ty(Context); + Type *Int32Ty = Type::getInt32Ty(Context); bool Likely = ExpectedValue->isOne(); // If expect value is equal to 1 it means that we are more likely to take diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index f77d19d..c96c8fc 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -120,18 +120,18 @@ FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI, // doInitialization - Make sure that there is a prototype for abort in the // current module. bool LowerInvoke::doInitialization(Module &M) { - const Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); + Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); if (useExpensiveEHSupport) { // Insert a type for the linked list of jump buffers. unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0; JBSize = JBSize ? JBSize : 200; Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize); - JBLinkTy = StructType::createNamed(M.getContext(), "llvm.sjljeh.jmpbufty"); + JBLinkTy = StructType::create(M.getContext(), "llvm.sjljeh.jmpbufty"); Type *Elts[] = { JmpBufTy, PointerType::getUnqual(JBLinkTy) }; JBLinkTy->setBody(Elts); - const Type *PtrJBList = PointerType::getUnqual(JBLinkTy); + Type *PtrJBList = PointerType::getUnqual(JBLinkTy); // Now that we've done that, insert the jmpbuf list head global, unless it // already exists. @@ -240,14 +240,14 @@ void LowerInvoke::rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo, CallInst* StackSaveRet = CallInst::Create(StackSaveFn, "ssret", II); new StoreInst(StackSaveRet, StackPtr, true, II); // volatile - BasicBlock::iterator NI = II->getNormalDest()->getFirstNonPHI(); + BasicBlock::iterator NI = II->getNormalDest()->getFirstInsertionPt(); // nonvolatile. new StoreInst(Constant::getNullValue(Type::getInt32Ty(II->getContext())), InvokeNum, false, NI); - Instruction* StackPtrLoad = new LoadInst(StackPtr, "stackptr.restore", true, - II->getUnwindDest()->getFirstNonPHI() - ); + Instruction* StackPtrLoad = + new LoadInst(StackPtr, "stackptr.restore", true, + II->getUnwindDest()->getFirstInsertionPt()); CallInst::Create(StackRestoreFn, StackPtrLoad, "")->insertAfter(StackPtrLoad); // Add a switch case to our unwind block. @@ -305,7 +305,7 @@ splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) { ++AfterAllocaInsertPt; for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI) { - const Type *Ty = AI->getType(); + Type *Ty = AI->getType(); // Aggregate types can't be cast, but are legal argument types, so we have // to handle them differently. We use an extract/insert pair as a // lightweight method to achieve the same goal. 
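For context on the HandleSwitchExpect/HandleIfExpect hunks above: this pass lowers the llvm.expect intrinsic that front ends emit for programmer-supplied branch hints, turning the expected value into branch-weight metadata and then deleting the intrinsic call. A source-level illustration of what feeds the pass, assuming Clang's __builtin_expect; the function names are hypothetical:

    // Hints that the null case is rare. Clang lowers the builtin to a call
    // to llvm.expect, which LowerExpectIntrinsic converts into
    // "branch_weights" metadata on the resulting conditional branch.
    if (__builtin_expect(ptr == 0, 0))
      handle_rare_null_case();
    else
      fast_path();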
@@ -406,6 +406,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { SmallVector<ReturnInst*,16> Returns; SmallVector<UnwindInst*,16> Unwinds; SmallVector<InvokeInst*,16> Invokes; + UnreachableInst* UnreachablePlaceholder = 0; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { @@ -455,8 +456,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())), ConstantInt::get(Type::getInt32Ty(F.getContext()), 1) }; - OldJmpBufPtr = GetElementPtrInst::Create(JmpBuf, &Idx[0], &Idx[2], - "OldBuf", + OldJmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx, "OldBuf", EntryBB->getTerminator()); // Copy the JBListHead to the alloca. @@ -487,9 +487,10 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { // Insert a load in the Catch block, and a switch on its value. By default, // we go to a block that just does an unwind (which is the correct action - // for a standard call). + // for a standard call). We insert an unreachable instruction here and + // modify the block to jump to the correct unwinding pad later. BasicBlock *UnwindBB = BasicBlock::Create(F.getContext(), "unwindbb", &F); - Unwinds.push_back(new UnwindInst(F.getContext(), UnwindBB)); + UnreachablePlaceholder = new UnreachableInst(F.getContext(), UnwindBB); Value *CatchLoad = new LoadInst(InvokeNum, "invoke.num", true, CatchBB); SwitchInst *CatchSwitch = @@ -502,8 +503,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { "setjmp.cont"); Idx[1] = ConstantInt::get(Type::getInt32Ty(F.getContext()), 0); - Value *JmpBufPtr = GetElementPtrInst::Create(JmpBuf, &Idx[0], &Idx[2], - "TheJmpBuf", + Value *JmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx, "TheJmpBuf", EntryBB->getTerminator()); JmpBufPtr = new BitCastInst(JmpBufPtr, Type::getInt8PtrTy(F.getContext()), @@ -557,8 +557,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { // Get a pointer to the jmpbuf and longjmp. Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())), ConstantInt::get(Type::getInt32Ty(F.getContext()), 0) }; - Idx[0] = GetElementPtrInst::Create(BufPtr, &Idx[0], &Idx[2], "JmpBuf", - UnwindBlock); + Idx[0] = GetElementPtrInst::Create(BufPtr, Idx, "JmpBuf", UnwindBlock); Idx[0] = new BitCastInst(Idx[0], Type::getInt8PtrTy(F.getContext()), "tmp", UnwindBlock); @@ -580,6 +579,12 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) { Unwinds[i]->eraseFromParent(); } + // Replace the inserted unreachable with a branch to the unwind handler. + if (UnreachablePlaceholder) { + BranchInst::Create(UnwindHandler, UnreachablePlaceholder); + UnreachablePlaceholder->eraseFromParent(); + } + // Finally, for any returns from this function, if this function contains an // invoke, restore the old jmpbuf pointer to its input value. if (OldJmpBufPtr) { diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index ed733d3..686178c 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -277,11 +277,11 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) { BasicBlock *CurBlock = SI->getParent(); BasicBlock *OrigBlock = CurBlock; Function *F = CurBlock->getParent(); - Value *Val = SI->getOperand(0); // The value we are switching on... + Value *Val = SI->getCondition(); // The value we are switching on... 
BasicBlock* Default = SI->getDefaultDest(); // If there is only the default destination, don't bother with the code below. - if (SI->getNumOperands() == 2) { + if (SI->getNumCases() == 1) { BranchInst::Create(SI->getDefaultDest(), CurBlock); CurBlock->getInstList().erase(SI); return; diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index e5a00f4..db3e942 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -86,11 +86,15 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { UI != UE; ++UI) { // Loop over all of the uses of the alloca const User *U = *UI; if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { + // Note that atomic loads can be transformed; atomic semantics do + // not have any meaning for a local alloca. if (LI->isVolatile()) return false; } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { if (SI->getOperand(0) == AI) return false; // Don't allow a store OF the AI, only INTO the AI. + // Note that atomic stores can be transformed; atomic semantics do + // not have any meaning for a local alloca. if (SI->isVolatile()) return false; } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index b47a7cc..fa8061c 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -16,6 +16,7 @@ #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/TinyPtrVector.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Support/AlignOf.h" #include "llvm/Support/Allocator.h" @@ -43,7 +44,7 @@ SSAUpdater::~SSAUpdater() { /// Initialize - Reset this object to get ready for a new set of SSA /// updates with type 'Ty'. PHI nodes get a name based on 'Name'. -void SSAUpdater::Initialize(const Type *Ty, StringRef Name) { +void SSAUpdater::Initialize(Type *Ty, StringRef Name) { if (AV == 0) AV = new AvailableValsTy(); else @@ -378,8 +379,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // First step: bucket up uses of the alloca by the block they occur in. // This is important because we have to handle multiple defs/uses in a block // ourselves: SSAUpdater is purely for cross-block references. - // FIXME: Want a TinyVector<Instruction*> since there is often 0/1 element. - DenseMap<BasicBlock*, std::vector<Instruction*> > UsesByBlock; + DenseMap<BasicBlock*, TinyPtrVector<Instruction*> > UsesByBlock; for (unsigned i = 0, e = Insts.size(); i != e; ++i) { Instruction *User = Insts[i]; @@ -395,7 +395,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { for (unsigned i = 0, e = Insts.size(); i != e; ++i) { Instruction *User = Insts[i]; BasicBlock *BB = User->getParent(); - std::vector<Instruction*> &BlockUses = UsesByBlock[BB]; + TinyPtrVector<Instruction*> &BlockUses = UsesByBlock[BB]; // If this block has already been processed, ignore this repeat use. 
if (BlockUses.empty()) continue; diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 9d9c324..b8c3ab4 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -63,6 +63,7 @@ class SimplifyCFGOpt { bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI, IRBuilder<> &Builder); + bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder); bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder); bool SimplifyUnwind(UnwindInst *UI, IRBuilder<> &Builder); bool SimplifyUnreachable(UnreachableInst *UI); @@ -322,7 +323,7 @@ static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) { // This is some kind of pointer constant. Turn it into a pointer-sized // ConstantInt if possible. - const IntegerType *PtrTy = TD->getIntPtrType(V->getContext()); + IntegerType *PtrTy = TD->getIntPtrType(V->getContext()); // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*). if (isa<ConstantPointerNull>(V)) @@ -2138,6 +2139,52 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD, return true; } +bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { + // If this is a trivial landing pad that just continues unwinding the caught + // exception then zap the landing pad, turning its invokes into calls. + BasicBlock *BB = RI->getParent(); + LandingPadInst *LPInst = dyn_cast<LandingPadInst>(BB->getFirstNonPHI()); + if (RI->getValue() != LPInst) + // Not a landing pad, or the resume is not unwinding the exception that + // caused control to branch here. + return false; + + // Check that there are no other instructions except for debug intrinsics. + BasicBlock::iterator I = LPInst, E = RI; + while (++I != E) + if (!isa<DbgInfoIntrinsic>(I)) + return false; + + // Turn all invokes that unwind here into calls and delete the basic block. + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { + InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator()); + SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); + // Insert a call instruction before the invoke. + CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); + Call->takeName(II); + Call->setCallingConv(II->getCallingConv()); + Call->setAttributes(II->getAttributes()); + Call->setDebugLoc(II->getDebugLoc()); + + // Anything that used the value produced by the invoke instruction now uses + // the value produced by the call instruction. Note that we do this even + // for void functions and calls with no uses so that the callgraph edge is + // updated. + II->replaceAllUsesWith(Call); + BB->removePredecessor(II->getParent()); + + // Insert a branch to the normal destination right before the invoke. + BranchInst::Create(II->getNormalDest(), II); + + // Finally, delete the invoke instruction! + II->eraseFromParent(); + } + + // The landingpad is now unreachable. Zap it. + BB->eraseFromParent(); + return true; +} + bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { BasicBlock *BB = RI->getParent(); if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false; @@ -2244,18 +2291,34 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { while (UI != BB->begin()) { BasicBlock::iterator BBI = UI; --BBI; - // Do not delete instructions that can have side effects, like calls - // (which may never return) and volatile loads and stores. 
+ // Do not delete instructions that can have side effects which might cause + // the unreachable to not be reachable; specifically, calls and volatile + // operations may have this effect. if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) break; - - if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) - if (SI->isVolatile()) - break; - - if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) - if (LI->isVolatile()) + + if (BBI->mayHaveSideEffects()) { + if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { + if (SI->isVolatile()) + break; + } else if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) { + if (LI->isVolatile()) + break; + } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(BBI)) { + if (RMWI->isVolatile()) + break; + } else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) { + if (CXI->isVolatile()) + break; + } else if (!isa<FenceInst>(BBI) && !isa<VAArgInst>(BBI) && + !isa<LandingPadInst>(BBI)) { break; - + } + // Note that deleting LandingPads here is in fact okay, although it + // involves a bit of subtle reasoning. If this inst is a LandingPad, + // all the predecessors of this block will be the unwind edges of Invokes, + // and we can therefore guarantee this block will be erased. + } + // Delete this instruction (any uses are guaranteed to be dead) if (!BBI->use_empty()) BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); @@ -2707,6 +2770,71 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { return false; } +/// Check if passing a value to an instruction will cause undefined behavior. +static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { + Constant *C = dyn_cast<Constant>(V); + if (!C) + return false; + + if (!I->hasOneUse()) // Only look at single-use instructions, for compile time + return false; + + if (C->isNullValue()) { + Instruction *Use = I->use_back(); + + // Now make sure that there are no instructions in between that can alter + // control flow (e.g. calls) + for (BasicBlock::iterator i = ++BasicBlock::iterator(I); &*i != Use; ++i) + if (i == I->getParent()->end() || i->mayHaveSideEffects()) + return false; + + // Look through GEPs. A load from a GEP derived from NULL is still undefined + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Use)) + if (GEP->getPointerOperand() == I) + return passingValueIsAlwaysUndefined(V, GEP); + + // Look through bitcasts. + if (BitCastInst *BC = dyn_cast<BitCastInst>(Use)) + return passingValueIsAlwaysUndefined(V, BC); + + // Load from null is undefined. + if (LoadInst *LI = dyn_cast<LoadInst>(Use)) + return LI->getPointerAddressSpace() == 0; + + // Store to null is undefined. + if (StoreInst *SI = dyn_cast<StoreInst>(Use)) + return SI->getPointerAddressSpace() == 0 && SI->getPointerOperand() == I; + } + return false; +} + +/// If BB has an incoming value that will always trigger undefined behavior +/// (e.g. null pointer dereference), remove the branch leading here. +static bool removeUndefIntroducingPredecessor(BasicBlock *BB) { + for (BasicBlock::iterator i = BB->begin(); + PHINode *PHI = dyn_cast<PHINode>(i); ++i) + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) + if (passingValueIsAlwaysUndefined(PHI->getIncomingValue(i), PHI)) { + TerminatorInst *T = PHI->getIncomingBlock(i)->getTerminator(); + IRBuilder<> Builder(T); + if (BranchInst *BI = dyn_cast<BranchInst>(T)) { + BB->removePredecessor(PHI->getIncomingBlock(i)); + // Turn unconditional branches into unreachables and remove the dead + // destination from conditional branches.
+ if (BI->isUnconditional()) + Builder.CreateUnreachable(); + else + Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1) : + BI->getSuccessor(0)); + BI->eraseFromParent(); + return true; + } + // TODO: SwitchInst. + } + + return false; +} + bool SimplifyCFGOpt::run(BasicBlock *BB) { bool Changed = false; @@ -2730,6 +2858,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { // Check for and eliminate duplicate PHI nodes in this block. Changed |= EliminateDuplicatePHINodes(BB); + // Check for and remove branches that will always cause undefined behavior. + Changed |= removeUndefIntroducingPredecessor(BB); + // Merge basic blocks into their predecessor if there is only one distinct // pred, and if there is only one distinct successor of the predecessor, and // if there are no PHI nodes. @@ -2752,6 +2883,8 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { } else { if (SimplifyCondBranch(BI, Builder)) return true; } + } else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) { + if (SimplifyResume(RI, Builder)) return true; } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { if (SimplifyReturn(RI, Builder)) return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp new file mode 100644 index 0000000..76289c0 --- /dev/null +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -0,0 +1,432 @@ +//===-- SimplifyIndVar.cpp - Induction variable simplification ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements induction variable simplification. It does +// not define any actual pass or policy, but provides a single function to +// simplify a loop's induction variables based on ScalarEvolution. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "indvars" + +#include "llvm/Instructions.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/IVUsers.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/SimplifyIndVar.h" +#include "llvm/Target/TargetData.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" + +using namespace llvm; + +STATISTIC(NumElimIdentity, "Number of IV identities eliminated"); +STATISTIC(NumElimOperand, "Number of IV operands folded into a use"); +STATISTIC(NumElimRem , "Number of IV remainder operations eliminated"); +STATISTIC(NumElimCmp , "Number of IV comparisons eliminated"); + +namespace { + /// SimplifyIndvar - This is a utility for simplifying induction variables + /// based on ScalarEvolution. It is the primary instrument of the + /// IndvarSimplify pass, but it may also be directly invoked to clean up after + /// other loop passes that preserve SCEV.
+ class SimplifyIndvar { + Loop *L; + LoopInfo *LI; + DominatorTree *DT; + ScalarEvolution *SE; + IVUsers *IU; // NULL for DisableIVRewrite + const TargetData *TD; // May be NULL + + SmallVectorImpl<WeakVH> &DeadInsts; + + bool Changed; + + public: + SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LPPassManager *LPM, + SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = NULL) : + L(Loop), + LI(LPM->getAnalysisIfAvailable<LoopInfo>()), + SE(SE), + IU(IVU), + TD(LPM->getAnalysisIfAvailable<TargetData>()), + DeadInsts(Dead), + Changed(false) { + assert(LI && "IV simplification requires LoopInfo"); + } + + bool hasChanged() const { return Changed; } + + /// Iteratively perform simplification on a worklist of users of the + /// specified induction variable. This is the top-level driver that applies + /// all simplifications to users of an IV. + void simplifyUsers(PHINode *CurrIV, IVVisitor *V = NULL); + + Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand); + + bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand); + void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); + void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand, + bool IsSigned); + }; +} + +/// foldIVUser - Fold an IV operand into its use. This removes increments of an +/// aligned IV when used by an instruction that ignores the low bits. +/// +/// IVOperand is guaranteed SCEVable, but UseInst may not be. +/// +/// Return the operand of IVOperand for this induction variable if IVOperand can +/// be folded (in case more folding opportunities have been exposed). +/// Otherwise return null. +Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) { + Value *IVSrc = 0; + unsigned OperIdx = 0; + const SCEV *FoldedExpr = 0; + switch (UseInst->getOpcode()) { + default: + return 0; + case Instruction::UDiv: + case Instruction::LShr: + // We're only interested in the case where we know something about + // the numerator and have a constant denominator. + if (IVOperand != UseInst->getOperand(OperIdx) || + !isa<ConstantInt>(UseInst->getOperand(1))) + return 0; + + // Attempt to fold a binary operator with constant operand. + // e.g. ((I + 1) >> 2) => I >> 2 + if (IVOperand->getNumOperands() != 2 || + !isa<ConstantInt>(IVOperand->getOperand(1))) + return 0; + + IVSrc = IVOperand->getOperand(0); + // IVSrc must be the (SCEVable) IV, since the other operand is const. + assert(SE->isSCEVable(IVSrc->getType()) && "Expect SCEVable IV operand"); + + ConstantInt *D = cast<ConstantInt>(UseInst->getOperand(1)); + if (UseInst->getOpcode() == Instruction::LShr) { + // Get a constant for the divisor. See createSCEV. + uint32_t BitWidth = cast<IntegerType>(UseInst->getType())->getBitWidth(); + if (D->getValue().uge(BitWidth)) + return 0; + + D = ConstantInt::get(UseInst->getContext(), + APInt(BitWidth, 1).shl(D->getZExtValue())); + } + FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D)); + } + // We have something that might fold its operand. Compare SCEVs. + if (!SE->isSCEVable(UseInst->getType())) + return 0; + + // Bypass the operand if SCEV can prove it has no effect.
+ if (SE->getSCEV(UseInst) != FoldedExpr) + return 0; + + DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand + << " -> " << *UseInst << '\n'); + + UseInst->setOperand(OperIdx, IVSrc); + assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper"); + + ++NumElimOperand; + Changed = true; + if (IVOperand->use_empty()) + DeadInsts.push_back(IVOperand); + return IVSrc; +} + +/// eliminateIVComparison - SimplifyIVUsers helper for eliminating useless +/// comparisons against an induction variable. +void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { + unsigned IVOperIdx = 0; + ICmpInst::Predicate Pred = ICmp->getPredicate(); + if (IVOperand != ICmp->getOperand(0)) { + // Swapped + assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand"); + IVOperIdx = 1; + Pred = ICmpInst::getSwappedPredicate(Pred); + } + + // Get the SCEVs for the ICmp operands. + const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx)); + const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx)); + + // Simplify unnecessary loops away. + const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent()); + S = SE->getSCEVAtScope(S, ICmpLoop); + X = SE->getSCEVAtScope(X, ICmpLoop); + + // If the condition is always true or always false, replace it with + // a constant value. + if (SE->isKnownPredicate(Pred, S, X)) + ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); + else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) + ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); + else + return; + + DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); + ++NumElimCmp; + Changed = true; + DeadInsts.push_back(ICmp); +} + +/// eliminateIVRemainder - SimplifyIVUsers helper for eliminating useless +/// remainder operations operating on an induction variable. +void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem, + Value *IVOperand, + bool IsSigned) { + // We're only interested in the case where we know something about + // the numerator. + if (IVOperand != Rem->getOperand(0)) + return; + + // Get the SCEVs for the Rem operands. + const SCEV *S = SE->getSCEV(Rem->getOperand(0)); + const SCEV *X = SE->getSCEV(Rem->getOperand(1)); + + // Simplify unnecessary loops away. + const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent()); + S = SE->getSCEVAtScope(S, ICmpLoop); + X = SE->getSCEVAtScope(X, ICmpLoop); + + // i % n --> i if i is in [0,n). + if ((!IsSigned || SE->isKnownNonNegative(S)) && + SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + S, X)) + Rem->replaceAllUsesWith(Rem->getOperand(0)); + else { + // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). + const SCEV *LessOne = + SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); + if (IsSigned && !SE->isKnownNonNegative(LessOne)) + return; + + if (!SE->isKnownPredicate(IsSigned ? + ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + LessOne, X)) + return; + + ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, + Rem->getOperand(0), Rem->getOperand(1)); + SelectInst *Sel = + SelectInst::Create(ICmp, + ConstantInt::get(Rem->getType(), 0), + Rem->getOperand(0), "tmp", Rem); + Rem->replaceAllUsesWith(Sel); + } + + // Inform IVUsers about the new users.
+ if (IU) { + if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) + IU->AddUsersIfInteresting(I); + } + DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); + ++NumElimRem; + Changed = true; + DeadInsts.push_back(Rem); +} + +/// eliminateIVUser - Eliminate an operation that consumes a simple IV and has +/// no observable side-effect given the range of IV values. +/// IVOperand is guaranteed SCEVable, but UseInst may not be. +bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst, + Instruction *IVOperand) { + if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { + eliminateIVComparison(ICmp, IVOperand); + return true; + } + if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { + bool IsSigned = Rem->getOpcode() == Instruction::SRem; + if (IsSigned || Rem->getOpcode() == Instruction::URem) { + eliminateIVRemainder(Rem, IVOperand, IsSigned); + return true; + } + } + + // Eliminate any operation that SCEV can prove is an identity function. + if (!SE->isSCEVable(UseInst->getType()) || + (UseInst->getType() != IVOperand->getType()) || + (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand))) + return false; + + DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n'); + + UseInst->replaceAllUsesWith(IVOperand); + ++NumElimIdentity; + Changed = true; + DeadInsts.push_back(UseInst); + return true; +} + +/// pushIVUsers - Add all uses of Def to the current IV's worklist. +/// +static void pushIVUsers( + Instruction *Def, + SmallPtrSet<Instruction*,16> &Simplified, + SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) { + + for (Value::use_iterator UI = Def->use_begin(), E = Def->use_end(); + UI != E; ++UI) { + Instruction *User = cast<Instruction>(*UI); + + // Avoid infinite or exponential worklist processing. + // Also ensure unique worklist users. + // If Def is a LoopPhi, it may not be in the Simplified set, so check for + // self edges first. + if (User != Def && Simplified.insert(User)) + SimpleIVUsers.push_back(std::make_pair(User, Def)); + } +} + +/// isSimpleIVUser - Return true if this instruction generates a simple SCEV +/// expression in terms of that IV. +/// +/// This is similar to IVUsers' isInteresting() but processes each instruction +/// non-recursively when the operand is already known to be a simpleIVUser. +/// +static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) { + if (!SE->isSCEVable(I->getType())) + return false; + + // Get the symbolic expression for this instruction. + const SCEV *S = SE->getSCEV(I); + + // Only consider affine recurrences. + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S); + if (AR && AR->getLoop() == L) + return true; + + return false; +} + +/// simplifyUsers - Iteratively perform simplification on a worklist of users +/// of the specified induction variable. Each successive simplification may push +/// more users which may themselves be candidates for simplification. +/// +/// This algorithm does not require IVUsers analysis. Instead, it simplifies +/// instructions in-place during analysis. Rather than rewriting induction +/// variables bottom-up from their users, it transforms a chain of IVUsers +/// top-down, updating the IR only when it encounters a clear optimization +/// opportunity. +/// +/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers. +/// +void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) { + if (!SE->isSCEVable(CurrIV->getType())) + return; + + // Instructions processed by SimplifyIndvar for CurrIV.
+ SmallPtrSet<Instruction*,16> Simplified; + + // Use-def pairs of IV users waiting to be processed for CurrIV. + SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers; + + // Push users of the current LoopPhi. In rare cases, pushIVUsers may be + // called multiple times for the same LoopPhi. This is the proper thing to + // do for loop header phis that use each other. + pushIVUsers(CurrIV, Simplified, SimpleIVUsers); + + while (!SimpleIVUsers.empty()) { + std::pair<Instruction*, Instruction*> UseOper = + SimpleIVUsers.pop_back_val(); + // Bypass back edges to avoid extra work. + if (UseOper.first == CurrIV) continue; + + Instruction *IVOperand = UseOper.second; + for (unsigned N = 0; IVOperand; ++N) { + assert(N <= Simplified.size() && "runaway iteration"); + + Value *NewOper = foldIVUser(UseOper.first, IVOperand); + if (!NewOper) + break; // done folding + IVOperand = dyn_cast<Instruction>(NewOper); + } + if (!IVOperand) + continue; + + if (eliminateIVUser(UseOper.first, IVOperand)) { + pushIVUsers(IVOperand, Simplified, SimpleIVUsers); + continue; + } + CastInst *Cast = dyn_cast<CastInst>(UseOper.first); + if (V && Cast) { + V->visitCast(Cast); + continue; + } + if (isSimpleIVUser(UseOper.first, L, SE)) { + pushIVUsers(UseOper.first, Simplified, SimpleIVUsers); + } + } +} + +namespace llvm { + +/// simplifyUsersOfIV - Simplify instructions that use this induction variable +/// by using ScalarEvolution to analyze the IV's recurrence. +bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM, + SmallVectorImpl<WeakVH> &Dead, IVVisitor *V) +{ + LoopInfo *LI = &LPM->getAnalysis<LoopInfo>(); + SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LPM, Dead); + SIV.simplifyUsers(CurrIV, V); + return SIV.hasChanged(); +} + +/// simplifyLoopIVs - Simplify users of induction variables within this +/// loop. This does not actually change or add IVs. +bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, LPPassManager *LPM, +                     SmallVectorImpl<WeakVH> &Dead) { + bool Changed = false; + for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { + Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, LPM, Dead); + } + return Changed; +} + +/// simplifyIVUsers - Perform simplification on instructions recorded by the +/// IVUsers pass. +/// +/// This is the old approach to IV simplification to be replaced by +/// SimplifyLoopIVs. +bool simplifyIVUsers(IVUsers *IU, ScalarEvolution *SE, LPPassManager *LPM, +                     SmallVectorImpl<WeakVH> &Dead) { + SimplifyIndvar SIV(IU->getLoop(), SE, LPM, Dead); + + // Each round of simplification involves a round of eliminating operations + // followed by a round of widening IVs. A single IVUsers worklist is used + // across all rounds. The inner loop advances the user. If widening exposes + // more uses, then another pass through the outer loop is triggered.
+ for (IVUsers::iterator I = IU->begin(); I != IU->end(); ++I) { + Instruction *UseInst = I->getUser(); + Value *IVOperand = I->getOperandValToReplace(); + + if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { + SIV.eliminateIVComparison(ICmp, IVOperand); + continue; + } + if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { + bool IsSigned = Rem->getOpcode() == Instruction::SRem; + if (IsSigned || Rem->getOpcode() == Instruction::URem) { + SIV.eliminateIVRemainder(Rem, IVOperand, IsSigned); + continue; + } + } + } + return SIV.hasChanged(); +} + +} // namespace llvm diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 973b105..fc2538d 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -183,10 +183,9 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, } } - // Remap attached metadata. Don't bother remapping DebugLoc, it can never - // have mappings to do. + // Remap attached metadata. SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; - I->getAllMetadataOtherThanDebugLoc(MDs); + I->getAllMetadata(MDs); for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) { MDNode *Old = MI->second; |
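The foldIVUser fold in SimplifyIndVar.cpp above is easiest to see with concrete numbers. If SCEV knows the IV steps in multiples of 4, then in a use such as (i + 1) >> 2 the +1 can never reach the bits that survive the shift, so SE->getSCEV(UseInst) equals the folded expression and the operand is rewritten to the bare IV. An illustrative source-level analogue; A and consume are hypothetical:

    // i is always a multiple of 4, so (i + 1) >> 2 == i >> 2 on every
    // iteration; foldIVUser drops the increment from the shift's operand.
    for (unsigned i = 0; i != 64; i += 4)
      consume(A[(i + 1) >> 2]);   // effectively A[i >> 2]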
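eliminateIVRemainder and eliminateIVComparison likewise lean on SCEV range facts rather than syntactic patterns. The canonical remainder case is an index already bounded by the modulus; n and B are hypothetical:

    for (unsigned i = 0; i < n; ++i) {
      B[i % n] = 0;   // SCEV proves 0 <= i < n, so i % n == i (NumElimRem)
      // A comparison such as (i < n) inside the body is known true on every
      // iteration and folds to a constant instead (NumElimCmp).
    }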
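The ValueMapper change at the end means RemapInstruction now rewrites every kind of attached metadata, including debug locations, instead of skipping !dbg. A typical cloning sequence that relies on this, sketched from the CloneBasicBlock/RemapInstruction calls seen in the LoopUnroll hunks; RF_IgnoreMissingEntries is assumed here so operands defined outside the cloned block are left untouched:

    ValueToValueMapTy VMap;
    BasicBlock *New = CloneBasicBlock(BB, VMap, ".clone", BB->getParent());
    for (BasicBlock::iterator I = New->begin(), E = New->end(); I != E; ++I)
      // Remaps operands and, after this patch, all attached metadata.
      RemapInstruction(I, VMap, RF_IgnoreMissingEntries);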