author    rdivacky <rdivacky@FreeBSD.org>    2009-11-18 14:58:34 +0000
committer rdivacky <rdivacky@FreeBSD.org>    2009-11-18 14:58:34 +0000
commit    d2e985fd323c167e20f77b045a1d99ad166e65db (patch)
tree      6a111e552c75afc66228e3d8f19b6731e4013f10 /lib/Transforms
parent    ded64d5d348ce8d8c5aa42cf63f6de9dd84b7e89 (diff)
Update LLVM to r89205.
Diffstat (limited to 'lib/Transforms')
-rw-r--r--  lib/Transforms/Hello/CMakeLists.txt                |   2
-rw-r--r--  lib/Transforms/IPO/GlobalOpt.cpp                   | 261
-rw-r--r--  lib/Transforms/IPO/Inliner.cpp                     |  46
-rw-r--r--  lib/Transforms/IPO/LoopExtractor.cpp               |   4
-rw-r--r--  lib/Transforms/IPO/StripSymbols.cpp                |  54
-rw-r--r--  lib/Transforms/Scalar/ABCD.cpp                     |  37
-rw-r--r--  lib/Transforms/Scalar/CMakeLists.txt               |   1
-rw-r--r--  lib/Transforms/Scalar/CondPropagate.cpp            | 289
-rw-r--r--  lib/Transforms/Scalar/ConstantProp.cpp             |   2
-rw-r--r--  lib/Transforms/Scalar/DeadStoreElimination.cpp     | 193
-rw-r--r--  lib/Transforms/Scalar/GVN.cpp                      |  40
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp           |   4
-rw-r--r--  lib/Transforms/Scalar/InstructionCombining.cpp     | 631
-rw-r--r--  lib/Transforms/Scalar/JumpThreading.cpp            | 735
-rw-r--r--  lib/Transforms/Scalar/LICM.cpp                     |   9
-rw-r--r--  lib/Transforms/Scalar/LoopDeletion.cpp             |   4
-rw-r--r--  lib/Transforms/Scalar/LoopIndexSplit.cpp           |   4
-rw-r--r--  lib/Transforms/Scalar/LoopRotation.cpp             |  13
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp       | 876
-rw-r--r--  lib/Transforms/Scalar/LoopUnroll.cpp               | 177
-rw-r--r--  lib/Transforms/Scalar/LoopUnswitch.cpp             |   7
-rw-r--r--  lib/Transforms/Scalar/Reassociate.cpp              |  36
-rw-r--r--  lib/Transforms/Scalar/SCCP.cpp                     |  11
-rw-r--r--  lib/Transforms/Scalar/SCCVN.cpp                    |   6
-rw-r--r--  lib/Transforms/Scalar/Scalar.cpp                   |   4
-rw-r--r--  lib/Transforms/Scalar/SimplifyLibCalls.cpp         |  41
-rw-r--r--  lib/Transforms/Scalar/TailDuplication.cpp          |   3
-rw-r--r--  lib/Transforms/Scalar/TailRecursionElimination.cpp |  32
-rw-r--r--  lib/Transforms/Utils/BasicBlockUtils.cpp           |   8
-rw-r--r--  lib/Transforms/Utils/CloneFunction.cpp             |  71
-rw-r--r--  lib/Transforms/Utils/InlineFunction.cpp            |   2
-rw-r--r--  lib/Transforms/Utils/LCSSA.cpp                     |  18
-rw-r--r--  lib/Transforms/Utils/Local.cpp                     | 223
-rw-r--r--  lib/Transforms/Utils/LoopSimplify.cpp              | 112
-rw-r--r--  lib/Transforms/Utils/LoopUnroll.cpp                |  16
-rw-r--r--  lib/Transforms/Utils/SimplifyCFG.cpp               | 168
36 files changed, 2298 insertions, 1842 deletions
diff --git a/lib/Transforms/Hello/CMakeLists.txt b/lib/Transforms/Hello/CMakeLists.txt
index b80d15b..917b745 100644
--- a/lib/Transforms/Hello/CMakeLists.txt
+++ b/lib/Transforms/Hello/CMakeLists.txt
@@ -1,3 +1,3 @@
-add_llvm_library( LLVMHello
+add_llvm_loadable_module( LLVMHello
Hello.cpp
)
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 234d0ec..442f2fb 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -20,7 +20,6 @@
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/ConstantFolding.h"
@@ -245,8 +244,7 @@ static bool AnalyzeGlobal(Value *V, GlobalStatus &GS,
return false;
}
-static Constant *getAggregateConstantElement(Constant *Agg, Constant *Idx,
- LLVMContext &Context) {
+static Constant *getAggregateConstantElement(Constant *Agg, Constant *Idx) {
ConstantInt *CI = dyn_cast<ConstantInt>(Idx);
if (!CI) return 0;
unsigned IdxV = CI->getZExtValue();
@@ -282,8 +280,7 @@ static Constant *getAggregateConstantElement(Constant *Agg, Constant *Idx,
/// users of the global, cleaning up the obvious ones. This is largely just a
/// quick scan over the use list to clean up the easy and obvious cruft. This
/// returns true if it made a change.
-static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
- LLVMContext &Context) {
+static bool CleanupConstantGlobalUsers(Value *V, Constant *Init) {
bool Changed = false;
for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;) {
User *U = *UI++;
@@ -304,11 +301,11 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
Constant *SubInit = 0;
if (Init)
SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
- Changed |= CleanupConstantGlobalUsers(CE, SubInit, Context);
+ Changed |= CleanupConstantGlobalUsers(CE, SubInit);
} else if (CE->getOpcode() == Instruction::BitCast &&
isa<PointerType>(CE->getType())) {
// Pointer cast, delete any stores and memsets to the global.
- Changed |= CleanupConstantGlobalUsers(CE, 0, Context);
+ Changed |= CleanupConstantGlobalUsers(CE, 0);
}
if (CE->use_empty()) {
@@ -322,11 +319,11 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
Constant *SubInit = 0;
if (!isa<ConstantExpr>(GEP->getOperand(0))) {
ConstantExpr *CE =
- dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP, Context));
+ dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP));
if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
}
- Changed |= CleanupConstantGlobalUsers(GEP, SubInit, Context);
+ Changed |= CleanupConstantGlobalUsers(GEP, SubInit);
if (GEP->use_empty()) {
GEP->eraseFromParent();
@@ -344,7 +341,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
if (SafeToDestroyConstant(C)) {
C->destroyConstant();
// This could have invalidated UI, start over from scratch.
- CleanupConstantGlobalUsers(V, Init, Context);
+ CleanupConstantGlobalUsers(V, Init);
return true;
}
}
@@ -469,8 +466,7 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
/// behavior of the program in a more fine-grained way. We have determined that
/// this transformation is safe already. We return the first global variable we
/// insert so that the caller can reprocess it.
-static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD,
- LLVMContext &Context) {
+static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
// Make sure this global only has simple uses that we can SRA.
if (!GlobalUsersSafeToSRA(GV))
return 0;
@@ -492,11 +488,9 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD,
const StructLayout &Layout = *TD.getStructLayout(STy);
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
Constant *In = getAggregateConstantElement(Init,
- ConstantInt::get(Type::getInt32Ty(Context), i),
- Context);
+ ConstantInt::get(Type::getInt32Ty(STy->getContext()), i));
assert(In && "Couldn't get element of initializer?");
- GlobalVariable *NGV = new GlobalVariable(Context,
- STy->getElementType(i), false,
+ GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false,
GlobalVariable::InternalLinkage,
In, GV->getName()+"."+Twine(i),
GV->isThreadLocal(),
@@ -527,12 +521,10 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD,
unsigned EltAlign = TD.getABITypeAlignment(STy->getElementType());
for (unsigned i = 0, e = NumElements; i != e; ++i) {
Constant *In = getAggregateConstantElement(Init,
- ConstantInt::get(Type::getInt32Ty(Context), i),
- Context);
+ ConstantInt::get(Type::getInt32Ty(Init->getContext()), i));
assert(In && "Couldn't get element of initializer?");
- GlobalVariable *NGV = new GlobalVariable(Context,
- STy->getElementType(), false,
+ GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false,
GlobalVariable::InternalLinkage,
In, GV->getName()+"."+Twine(i),
GV->isThreadLocal(),
@@ -554,7 +546,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD,
DEBUG(errs() << "PERFORMING GLOBAL SRA ON: " << *GV);
- Constant *NullInt = Constant::getNullValue(Type::getInt32Ty(Context));
+ Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
// Loop over all of the uses of the global, replacing the constantexpr geps,
// with smaller constantexpr geps or direct references.
@@ -678,8 +670,7 @@ static bool AllUsesOfLoadedValueWillTrapIfNull(GlobalVariable *GV) {
return true;
}
-static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV,
- LLVMContext &Context) {
+static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
bool Changed = false;
for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ) {
Instruction *I = cast<Instruction>(*UI++);
@@ -712,7 +703,7 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV,
} else if (CastInst *CI = dyn_cast<CastInst>(I)) {
Changed |= OptimizeAwayTrappingUsesOfValue(CI,
ConstantExpr::getCast(CI->getOpcode(),
- NewV, CI->getType()), Context);
+ NewV, CI->getType()));
if (CI->use_empty()) {
Changed = true;
CI->eraseFromParent();
@@ -730,7 +721,7 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV,
if (Idxs.size() == GEPI->getNumOperands()-1)
Changed |= OptimizeAwayTrappingUsesOfValue(GEPI,
ConstantExpr::getGetElementPtr(NewV, &Idxs[0],
- Idxs.size()), Context);
+ Idxs.size()));
if (GEPI->use_empty()) {
Changed = true;
GEPI->eraseFromParent();
@@ -746,8 +737,7 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV,
/// value stored into it. If there are uses of the loaded value that would trap
/// if the loaded value is dynamically null, then we know that they cannot be
/// reachable with a null optimize away the load.
-static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
- LLVMContext &Context) {
+static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV) {
bool Changed = false;
// Keep track of whether we are able to remove all the uses of the global
@@ -758,7 +748,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
for (Value::use_iterator GUI = GV->use_begin(), E = GV->use_end(); GUI != E;){
User *GlobalUser = *GUI++;
if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
- Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV, Context);
+ Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
// If we were able to delete all uses of the loads
if (LI->use_empty()) {
LI->eraseFromParent();
@@ -789,7 +779,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
// nor is the global.
if (AllNonStoreUsesGone) {
DEBUG(errs() << " *** GLOBAL NOW DEAD!\n");
- CleanupConstantGlobalUsers(GV, 0, Context);
+ CleanupConstantGlobalUsers(GV, 0);
if (GV->use_empty()) {
GV->eraseFromParent();
++NumDeleted;
@@ -801,10 +791,10 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
/// ConstantPropUsersOf - Walk the use list of V, constant folding all of the
/// instructions that are foldable.
-static void ConstantPropUsersOf(Value *V, LLVMContext &Context) {
+static void ConstantPropUsersOf(Value *V) {
for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; )
if (Instruction *I = dyn_cast<Instruction>(*UI++))
- if (Constant *NewC = ConstantFoldInstruction(I, Context)) {
+ if (Constant *NewC = ConstantFoldInstruction(I)) {
I->replaceAllUsesWith(NewC);
// Advance UI to the next non-I use to avoid invalidating it!
@@ -824,11 +814,10 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
CallInst *CI,
const Type *AllocTy,
Value* NElems,
- LLVMContext &Context,
TargetData* TD) {
DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n');
- const Type *IntPtrTy = TD->getIntPtrType(Context);
+ const Type *IntPtrTy = TD->getIntPtrType(GV->getContext());
// CI has either 0 or 1 bitcast uses (getMallocType() would otherwise have
// returned NULL and we would not be here).
@@ -883,10 +872,10 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
// If there is a comparison against null, we will insert a global bool to
// keep track of whether the global was initialized yet or not.
GlobalVariable *InitBool =
- new GlobalVariable(Context, Type::getInt1Ty(Context), false,
+ new GlobalVariable(Type::getInt1Ty(GV->getContext()), false,
GlobalValue::InternalLinkage,
- ConstantInt::getFalse(Context), GV->getName()+".init",
- GV->isThreadLocal());
+ ConstantInt::getFalse(GV->getContext()),
+ GV->getName()+".init", GV->isThreadLocal());
bool InitBoolUsed = false;
// Loop over all uses of GV, processing them in turn.
@@ -905,8 +894,8 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
switch (ICI->getPredicate()) {
default: llvm_unreachable("Unknown ICmp Predicate!");
case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_SLT:
- LV = ConstantInt::getFalse(Context); // X < null -> always false
+ case ICmpInst::ICMP_SLT: // X < null -> always false
+ LV = ConstantInt::getFalse(GV->getContext());
break;
case ICmpInst::ICMP_ULE:
case ICmpInst::ICMP_SLE:
@@ -928,7 +917,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
} else {
StoreInst *SI = cast<StoreInst>(GV->use_back());
// The global is initialized when the store to it occurs.
- new StoreInst(ConstantInt::getTrue(Context), InitBool, SI);
+ new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, SI);
SI->eraseFromParent();
}
@@ -949,9 +938,9 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
// To further other optimizations, loop over all users of NewGV and try to
// constant prop them. This will promote GEP instructions with constant
// indices into GEP constant-exprs, which will allow global-opt to hack on it.
- ConstantPropUsersOf(NewGV, Context);
+ ConstantPropUsersOf(NewGV);
if (RepValue != NewGV)
- ConstantPropUsersOf(RepValue, Context);
+ ConstantPropUsersOf(RepValue);
return NewGV;
}
@@ -1153,8 +1142,7 @@ static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(GlobalVariable *GV,
static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
- std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite,
- LLVMContext &Context) {
+ std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
std::vector<Value*> &FieldVals = InsertedScalarizedValues[V];
if (FieldNo >= FieldVals.size())
@@ -1172,7 +1160,7 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
// a new Load of the scalarized global.
Result = new LoadInst(GetHeapSROAValue(LI->getOperand(0), FieldNo,
InsertedScalarizedValues,
- PHIsToRewrite, Context),
+ PHIsToRewrite),
LI->getName()+".f"+Twine(FieldNo), LI);
} else if (PHINode *PN = dyn_cast<PHINode>(V)) {
// PN's type is pointer to struct. Make a new PHI of pointer to struct
@@ -1196,16 +1184,14 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
/// the load, rewrite the derived value to use the HeapSRoA'd load.
static void RewriteHeapSROALoadUser(Instruction *LoadUser,
DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
- std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite,
- LLVMContext &Context) {
+ std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
// If this is a comparison against null, handle it.
if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
// If we have a setcc of the loaded pointer, we can use a setcc of any
// field.
Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0,
- InsertedScalarizedValues, PHIsToRewrite,
- Context);
+ InsertedScalarizedValues, PHIsToRewrite);
Value *New = new ICmpInst(SCI, SCI->getPredicate(), NPtr,
Constant::getNullValue(NPtr->getType()),
@@ -1223,8 +1209,7 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,
// Load the pointer for this field.
unsigned FieldNo = cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo,
- InsertedScalarizedValues, PHIsToRewrite,
- Context);
+ InsertedScalarizedValues, PHIsToRewrite);
// Create the new GEP idx vector.
SmallVector<Value*, 8> GEPIdx;
@@ -1256,8 +1241,7 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,
// users.
for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ) {
Instruction *User = cast<Instruction>(*UI++);
- RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite,
- Context);
+ RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
}
}
@@ -1267,13 +1251,11 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,
/// AllGlobalLoadUsesSimpleEnoughForHeapSRA.
static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
- std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite,
- LLVMContext &Context) {
+ std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
for (Value::use_iterator UI = Load->use_begin(), E = Load->use_end();
UI != E; ) {
Instruction *User = cast<Instruction>(*UI++);
- RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite,
- Context);
+ RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
}
if (Load->use_empty()) {
@@ -1285,8 +1267,7 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
/// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break
/// it up into multiple allocations of arrays of the fields.
static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
- Value* NElems, LLVMContext &Context,
- TargetData *TD) {
+ Value* NElems, TargetData *TD) {
DEBUG(errs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n');
const Type* MAT = getMallocAllocatedType(CI);
const StructType *STy = cast<StructType>(MAT);
@@ -1315,14 +1296,16 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
FieldGlobals.push_back(NGV);
unsigned TypeSize = TD->getTypeAllocSize(FieldTy);
- if (const StructType* ST = dyn_cast<StructType>(FieldTy))
+ if (const StructType *ST = dyn_cast<StructType>(FieldTy))
TypeSize = TD->getStructLayout(ST)->getSizeInBytes();
- const Type* IntPtrTy = TD->getIntPtrType(Context);
+ const Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
ConstantInt::get(IntPtrTy, TypeSize),
NElems,
CI->getName() + ".f" + Twine(FieldNo));
- FieldMallocs.push_back(NMI);
+ CallInst *NCI = dyn_cast<BitCastInst>(NMI) ?
+ extractMallocCallFromBitCast(NMI) : cast<CallInst>(NMI);
+ FieldMallocs.push_back(NCI);
new StoreInst(NMI, NGV, CI);
}
@@ -1338,15 +1321,15 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
// if (F1) { free(F1); F1 = 0; }
// if (F2) { free(F2); F2 = 0; }
// }
- Value *RunningOr = 0;
+ // The malloc can also fail if its argument is too large.
+ Constant *ConstantZero = ConstantInt::get(CI->getOperand(1)->getType(), 0);
+ Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getOperand(1),
+ ConstantZero, "isneg");
for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
Value *Cond = new ICmpInst(CI, ICmpInst::ICMP_EQ, FieldMallocs[i],
Constant::getNullValue(FieldMallocs[i]->getType()),
"isnull");
- if (!RunningOr)
- RunningOr = Cond; // First seteq
- else
- RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", CI);
+ RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", CI);
}
// Split the basic block at the old malloc.
@@ -1355,7 +1338,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
// Create the block to check the first condition. Put all these blocks at the
// end of the function as they are unlikely to be executed.
- BasicBlock *NullPtrBlock = BasicBlock::Create(Context, "malloc_ret_null",
+ BasicBlock *NullPtrBlock = BasicBlock::Create(OrigBB->getContext(),
+ "malloc_ret_null",
OrigBB->getParent());
// Remove the uncond branch from OrigBB to ContBB, turning it into a cond
@@ -1370,9 +1354,9 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal,
Constant::getNullValue(GVVal->getType()),
"tmp");
- BasicBlock *FreeBlock = BasicBlock::Create(Context, "free_it",
+ BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it",
OrigBB->getParent());
- BasicBlock *NextBlock = BasicBlock::Create(Context, "next",
+ BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next",
OrigBB->getParent());
Instruction *BI = BranchInst::Create(FreeBlock, NextBlock,
Cmp, NullPtrBlock);
@@ -1406,8 +1390,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
Instruction *User = cast<Instruction>(*UI++);
if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
- RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite,
- Context);
+ RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite);
continue;
}
@@ -1438,7 +1421,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *InVal = PN->getIncomingValue(i);
InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
- PHIsToRewrite, Context);
+ PHIsToRewrite);
FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
}
}
@@ -1477,8 +1460,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
CallInst *CI,
const Type *AllocTy,
Module::global_iterator &GVI,
- TargetData *TD,
- LLVMContext &Context) {
+ TargetData *TD) {
// If this is a malloc of an abstract type, don't touch it.
if (!AllocTy->isSized())
return false;
@@ -1508,15 +1490,14 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
// This eliminates dynamic allocation, avoids an indirection accessing the
// data, and exposes the resultant global to further GlobalOpt.
// We cannot optimize the malloc if we cannot determine malloc array size.
- if (Value *NElems = getMallocArraySize(CI, Context, TD)) {
+ if (Value *NElems = getMallocArraySize(CI, TD, true)) {
if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
// Restrict this transformation to only working on small allocations
// (2048 bytes currently), as we don't want to introduce a 16M global or
// something.
if (TD &&
NElements->getZExtValue() * TD->getTypeAllocSize(AllocTy) < 2048) {
- GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElems,
- Context, TD);
+ GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElems, TD);
return true;
}
@@ -1540,7 +1521,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
// structs. malloc [100 x struct],1 -> malloc struct, 100
if (const ArrayType *AT =
dyn_cast<ArrayType>(getMallocAllocatedType(CI))) {
- const Type *IntPtrTy = TD->getIntPtrType(Context);
+ const Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes();
Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
@@ -1551,12 +1532,10 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
CI->replaceAllUsesWith(Cast);
CI->eraseFromParent();
CI = dyn_cast<BitCastInst>(Malloc) ?
- extractMallocCallFromBitCast(Malloc):
- cast<CallInst>(Malloc);
+ extractMallocCallFromBitCast(Malloc) : cast<CallInst>(Malloc);
}
- GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, Context, TD),
- Context, TD);
+ GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true),TD);
return true;
}
}
@@ -1569,7 +1548,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
// that only one value (besides its initializer) is ever stored to the global.
static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
Module::global_iterator &GVI,
- TargetData *TD, LLVMContext &Context) {
+ TargetData *TD) {
// Ignore no-op GEPs and bitcasts.
StoredOnceVal = StoredOnceVal->stripPointerCasts();
@@ -1585,12 +1564,12 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
// Optimize away any trapping uses of the loaded value.
- if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, Context))
+ if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC))
return true;
} else if (CallInst *CI = extractMallocCall(StoredOnceVal)) {
const Type* MallocType = getMallocAllocatedType(CI);
if (MallocType && TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
- GVI, TD, Context))
+ GVI, TD))
return true;
}
}
@@ -1602,8 +1581,7 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
/// two values ever stored into GV are its initializer and OtherVal. See if we
/// can shrink the global into a boolean and select between the two values
/// whenever it is used. This exposes the values to other scalar optimizations.
-static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal,
- LLVMContext &Context) {
+static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
const Type *GVElType = GV->getType()->getElementType();
// If GVElType is already i1, it is already shrunk. If the type of the GV is
@@ -1611,7 +1589,8 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal,
// between them is very expensive and unlikely to lead to later
// simplification. In these cases, we typically end up with "cond ? v1 : v2"
// where v1 and v2 both require constant pool loads, a big loss.
- if (GVElType == Type::getInt1Ty(Context) || GVElType->isFloatingPoint() ||
+ if (GVElType == Type::getInt1Ty(GV->getContext()) ||
+ GVElType->isFloatingPoint() ||
isa<PointerType>(GVElType) || isa<VectorType>(GVElType))
return false;
@@ -1624,15 +1603,16 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal,
DEBUG(errs() << " *** SHRINKING TO BOOL: " << *GV);
// Create the new global, initializing it to false.
- GlobalVariable *NewGV = new GlobalVariable(Context,
- Type::getInt1Ty(Context), false,
- GlobalValue::InternalLinkage, ConstantInt::getFalse(Context),
+ GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()),
+ false,
+ GlobalValue::InternalLinkage,
+ ConstantInt::getFalse(GV->getContext()),
GV->getName()+".b",
GV->isThreadLocal());
GV->getParent()->getGlobalList().insert(GV, NewGV);
Constant *InitVal = GV->getInitializer();
- assert(InitVal->getType() != Type::getInt1Ty(Context) &&
+ assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) &&
"No reason to shrink to bool!");
// If initialized to zero and storing one into the global, we can use a cast
@@ -1649,7 +1629,8 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal,
// Only do this if we weren't storing a loaded value.
Value *StoreVal;
if (StoringOther || SI->getOperand(0) == InitVal)
- StoreVal = ConstantInt::get(Type::getInt1Ty(Context), StoringOther);
+ StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()),
+ StoringOther);
else {
// Otherwise, we are storing a previously loaded copy. To do this,
// change the copy from copying the original value to just copying the
@@ -1708,24 +1689,26 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
if (!AnalyzeGlobal(GV, GS, PHIUsers)) {
#if 0
- cerr << "Global: " << *GV;
- cerr << " isLoaded = " << GS.isLoaded << "\n";
- cerr << " StoredType = ";
+ DEBUG(errs() << "Global: " << *GV);
+ DEBUG(errs() << " isLoaded = " << GS.isLoaded << "\n");
+ DEBUG(errs() << " StoredType = ");
switch (GS.StoredType) {
- case GlobalStatus::NotStored: cerr << "NEVER STORED\n"; break;
- case GlobalStatus::isInitializerStored: cerr << "INIT STORED\n"; break;
- case GlobalStatus::isStoredOnce: cerr << "STORED ONCE\n"; break;
- case GlobalStatus::isStored: cerr << "stored\n"; break;
+ case GlobalStatus::NotStored: DEBUG(errs() << "NEVER STORED\n"); break;
+ case GlobalStatus::isInitializerStored: DEBUG(errs() << "INIT STORED\n");
+ break;
+ case GlobalStatus::isStoredOnce: DEBUG(errs() << "STORED ONCE\n"); break;
+ case GlobalStatus::isStored: DEBUG(errs() << "stored\n"); break;
}
if (GS.StoredType == GlobalStatus::isStoredOnce && GS.StoredOnceValue)
- cerr << " StoredOnceValue = " << *GS.StoredOnceValue << "\n";
+ DEBUG(errs() << " StoredOnceValue = " << *GS.StoredOnceValue << "\n");
if (GS.AccessingFunction && !GS.HasMultipleAccessingFunctions)
- cerr << " AccessingFunction = " << GS.AccessingFunction->getName()
- << "\n";
- cerr << " HasMultipleAccessingFunctions = "
- << GS.HasMultipleAccessingFunctions << "\n";
- cerr << " HasNonInstructionUser = " << GS.HasNonInstructionUser<<"\n";
- cerr << "\n";
+ DEBUG(errs() << " AccessingFunction = " << GS.AccessingFunction->getName()
+ << "\n");
+ DEBUG(errs() << " HasMultipleAccessingFunctions = "
+ << GS.HasMultipleAccessingFunctions << "\n");
+ DEBUG(errs() << " HasNonInstructionUser = "
+ << GS.HasNonInstructionUser<<"\n");
+ DEBUG(errs() << "\n");
#endif
// If this is a first class global and has only one accessing function
@@ -1764,8 +1747,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
// Delete any stores we can find to the global. We may not be able to
// make it completely dead though.
- bool Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(),
- GV->getContext());
+ bool Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer());
// If the global is dead now, delete it.
if (GV->use_empty()) {
@@ -1780,7 +1762,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
GV->setConstant(true);
// Clean up any obviously simplifiable users now.
- CleanupConstantGlobalUsers(GV, GV->getInitializer(), GV->getContext());
+ CleanupConstantGlobalUsers(GV, GV->getInitializer());
// If the global is dead now, just nuke it.
if (GV->use_empty()) {
@@ -1794,8 +1776,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
return true;
} else if (!GV->getInitializer()->getType()->isSingleValueType()) {
if (TargetData *TD = getAnalysisIfAvailable<TargetData>())
- if (GlobalVariable *FirstNewGV = SRAGlobal(GV, *TD,
- GV->getContext())) {
+ if (GlobalVariable *FirstNewGV = SRAGlobal(GV, *TD)) {
GVI = FirstNewGV; // Don't skip the newly produced globals!
return true;
}
@@ -1810,8 +1791,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
GV->setInitializer(SOVConstant);
// Clean up any obviously simplifiable users now.
- CleanupConstantGlobalUsers(GV, GV->getInitializer(),
- GV->getContext());
+ CleanupConstantGlobalUsers(GV, GV->getInitializer());
if (GV->use_empty()) {
DEBUG(errs() << " *** Substituting initializer allowed us to "
@@ -1828,14 +1808,13 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
// Try to optimize globals based on the knowledge that only one value
// (besides its initializer) is ever stored to the global.
if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GVI,
- getAnalysisIfAvailable<TargetData>(),
- GV->getContext()))
+ getAnalysisIfAvailable<TargetData>()))
return true;
// Otherwise, if the global was not a boolean, we can shrink it to be a
// boolean.
if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
- if (TryToShrinkGlobalToBoolean(GV, SOVConstant, GV->getContext())) {
+ if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
++NumShrunkToBool;
return true;
}
@@ -1987,11 +1966,10 @@ static std::vector<Function*> ParseGlobalCtors(GlobalVariable *GV) {
/// InstallGlobalCtors - Given a specified llvm.global_ctors list, install the
/// specified array, returning the new global to use.
static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
- const std::vector<Function*> &Ctors,
- LLVMContext &Context) {
+ const std::vector<Function*> &Ctors) {
// If we made a change, reassemble the initializer list.
std::vector<Constant*> CSVals;
- CSVals.push_back(ConstantInt::get(Type::getInt32Ty(Context), 65535));
+ CSVals.push_back(ConstantInt::get(Type::getInt32Ty(GCL->getContext()),65535));
CSVals.push_back(0);
// Create the new init list.
@@ -2000,12 +1978,14 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
if (Ctors[i]) {
CSVals[1] = Ctors[i];
} else {
- const Type *FTy = FunctionType::get(Type::getVoidTy(Context), false);
+ const Type *FTy = FunctionType::get(Type::getVoidTy(GCL->getContext()),
+ false);
const PointerType *PFTy = PointerType::getUnqual(FTy);
CSVals[1] = Constant::getNullValue(PFTy);
- CSVals[0] = ConstantInt::get(Type::getInt32Ty(Context), 2147483647);
+ CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()),
+ 2147483647);
}
- CAList.push_back(ConstantStruct::get(Context, CSVals, false));
+ CAList.push_back(ConstantStruct::get(GCL->getContext(), CSVals, false));
}
// Create the array initializer.
@@ -2021,8 +2001,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
}
// Create the new global and insert it next to the existing list.
- GlobalVariable *NGV = new GlobalVariable(Context, CA->getType(),
- GCL->isConstant(),
+ GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(),
GCL->getLinkage(), CA, "",
GCL->isThreadLocal());
GCL->getParent()->getGlobalList().insert(GCL, NGV);
@@ -2056,7 +2035,7 @@ static Constant *getVal(DenseMap<Value*, Constant*> &ComputedValues,
/// enough for us to understand. In particular, if it is a cast of something,
/// we punt. We basically just support direct accesses to globals and GEP's of
/// globals. This should be kept up to date with CommitValueTo.
-static bool isSimpleEnoughPointerToCommit(Constant *C, LLVMContext &Context) {
+static bool isSimpleEnoughPointerToCommit(Constant *C) {
// Conservatively, avoid aggregate types. This is because we don't
// want to worry about them partially overlapping other stores.
if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType())
@@ -2096,8 +2075,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C, LLVMContext &Context) {
/// initializer. This returns 'Init' modified to reflect 'Val' stored into it.
/// At this point, the GEP operands of Addr [0, OpNo) have been stepped into.
static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
- ConstantExpr *Addr, unsigned OpNo,
- LLVMContext &Context) {
+ ConstantExpr *Addr, unsigned OpNo) {
// Base case of the recursion.
if (OpNo == Addr->getNumOperands()) {
assert(Val->getType() == Init->getType() && "Type mismatch!");
@@ -2126,10 +2104,11 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
unsigned Idx = CU->getZExtValue();
assert(Idx < STy->getNumElements() && "Struct index out of range!");
- Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1, Context);
+ Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
// Return the modified struct.
- return ConstantStruct::get(Context, &Elts[0], Elts.size(), STy->isPacked());
+ return ConstantStruct::get(Init->getContext(), &Elts[0], Elts.size(),
+ STy->isPacked());
} else {
ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
const ArrayType *ATy = cast<ArrayType>(Init->getType());
@@ -2152,15 +2131,14 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
assert(CI->getZExtValue() < ATy->getNumElements());
Elts[CI->getZExtValue()] =
- EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1, Context);
+ EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
return ConstantArray::get(ATy, Elts);
}
}
/// CommitValueTo - We have decided that Addr (which satisfies the predicate
/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen.
-static void CommitValueTo(Constant *Val, Constant *Addr,
- LLVMContext &Context) {
+static void CommitValueTo(Constant *Val, Constant *Addr) {
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
assert(GV->hasInitializer());
GV->setInitializer(Val);
@@ -2171,7 +2149,7 @@ static void CommitValueTo(Constant *Val, Constant *Addr,
GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
Constant *Init = GV->getInitializer();
- Init = EvaluateStoreInto(Init, Val, CE, 2, Context);
+ Init = EvaluateStoreInto(Init, Val, CE, 2);
GV->setInitializer(Init);
}
@@ -2179,8 +2157,7 @@ static void CommitValueTo(Constant *Val, Constant *Addr,
/// P after the stores reflected by 'memory' have been performed. If we can't
/// decide, return null.
static Constant *ComputeLoadResult(Constant *P,
- const DenseMap<Constant*, Constant*> &Memory,
- LLVMContext &Context) {
+ const DenseMap<Constant*, Constant*> &Memory) {
// If this memory location has been recently stored, use the stored value: it
// is the most up-to-date.
DenseMap<Constant*, Constant*>::const_iterator I = Memory.find(P);
@@ -2218,8 +2195,6 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
if (std::find(CallStack.begin(), CallStack.end(), F) != CallStack.end())
return false;
- LLVMContext &Context = F->getContext();
-
CallStack.push_back(F);
/// Values - As we compute SSA register values, we store their contents here.
@@ -2246,7 +2221,7 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
if (SI->isVolatile()) return false; // no volatile accesses.
Constant *Ptr = getVal(Values, SI->getOperand(1));
- if (!isSimpleEnoughPointerToCommit(Ptr, Context))
+ if (!isSimpleEnoughPointerToCommit(Ptr))
// If this is too complex for us to commit, reject it.
return false;
Constant *Val = getVal(Values, SI->getOperand(0));
@@ -2280,12 +2255,12 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
} else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
if (LI->isVolatile()) return false; // no volatile accesses.
InstResult = ComputeLoadResult(getVal(Values, LI->getOperand(0)),
- MutatedMemory, Context);
+ MutatedMemory);
if (InstResult == 0) return false; // Could not evaluate load.
} else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
if (AI->isArrayAllocation()) return false; // Cannot handle array allocs.
const Type *Ty = AI->getType()->getElementType();
- AllocaTmps.push_back(new GlobalVariable(Context, Ty, false,
+ AllocaTmps.push_back(new GlobalVariable(Ty, false,
GlobalValue::InternalLinkage,
UndefValue::get(Ty),
AI->getName()));
@@ -2423,7 +2398,7 @@ static bool EvaluateStaticConstructor(Function *F) {
<< " stores.\n");
for (DenseMap<Constant*, Constant*>::iterator I = MutatedMemory.begin(),
E = MutatedMemory.end(); I != E; ++I)
- CommitValueTo(I->second, I->first, F->getContext());
+ CommitValueTo(I->second, I->first);
}
// At this point, we are done interpreting. If we created any 'alloca'
@@ -2480,7 +2455,7 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) {
if (!MadeChange) return false;
- GCL = InstallGlobalCtors(GCL, Ctors, GCL->getContext());
+ GCL = InstallGlobalCtors(GCL, Ctors);
return true;
}
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index ea47366..6918fe8 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -19,10 +19,11 @@
#include "llvm/IntrinsicInst.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Support/CallSite.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/IPO/InlinerPass.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CallSite.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -32,6 +33,7 @@
using namespace llvm;
STATISTIC(NumInlined, "Number of functions inlined");
+STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined");
STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
STATISTIC(NumMergedAllocas, "Number of allocas merged together");
@@ -336,23 +338,38 @@ bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) {
for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
CallSite CS = CallSites[CSi];
+ Function *Caller = CS.getCaller();
Function *Callee = CS.getCalledFunction();
- // We can only inline direct calls to non-declarations.
- if (Callee == 0 || Callee->isDeclaration()) continue;
+
+ // If this call site is dead and it is to a readonly function, we should
+ // just delete the call instead of trying to inline it, regardless of
+ // size. This happens because IPSCCP propagates the result out of the
+ // call and then we're left with the dead call.
+ if (isInstructionTriviallyDead(CS.getInstruction())) {
+ DEBUG(errs() << " -> Deleting dead call: "
+ << *CS.getInstruction() << "\n");
+ // Update the call graph by deleting the edge from Callee to Caller.
+ CG[Caller]->removeCallEdgeFor(CS);
+ CS.getInstruction()->eraseFromParent();
+ ++NumCallsDeleted;
+ } else {
+ // We can only inline direct calls to non-declarations.
+ if (Callee == 0 || Callee->isDeclaration()) continue;
- // If the policy determines that we should inline this function,
- // try to do so.
- if (!shouldInline(CS))
- continue;
+ // If the policy determines that we should inline this function,
+ // try to do so.
+ if (!shouldInline(CS))
+ continue;
- Function *Caller = CS.getCaller();
- // Attempt to inline the function...
- if (!InlineCallIfPossible(CS, CG, TD, InlinedArrayAllocas))
- continue;
+ // Attempt to inline the function...
+ if (!InlineCallIfPossible(CS, CG, TD, InlinedArrayAllocas))
+ continue;
+ ++NumInlined;
+ }
- // If we inlined the last possible call site to the function, delete the
- // function body now.
- if (Callee->use_empty() && Callee->hasLocalLinkage() &&
+ // If we inlined or deleted the last possible call site to the function,
+ // delete the function body now.
+ if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
// TODO: Can remove if in SCC now.
!SCCFunctions.count(Callee) &&
@@ -391,7 +408,6 @@ bool Inliner::runOnSCC(std::vector<CallGraphNode*> &SCC) {
}
--CSi;
- ++NumInlined;
Changed = true;
LocalChange = true;
}
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
index fd69aeb..cb81330 100644
--- a/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -75,6 +75,10 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
if (L->getParentLoop())
return false;
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
DominatorTree &DT = getAnalysis<DominatorTree>();
bool Changed = false;
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 4f6369e..0b5e007 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -202,53 +202,35 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
// llvm.dbg.region.end calls, and any globals they point to if now dead.
static bool StripDebugInfo(Module &M) {
+ bool Changed = false;
+
// Remove all of the calls to the debugger intrinsics, and remove them from
// the module.
- Function *FuncStart = M.getFunction("llvm.dbg.func.start");
- Function *StopPoint = M.getFunction("llvm.dbg.stoppoint");
- Function *RegionStart = M.getFunction("llvm.dbg.region.start");
- Function *RegionEnd = M.getFunction("llvm.dbg.region.end");
- Function *Declare = M.getFunction("llvm.dbg.declare");
-
- if (FuncStart) {
- while (!FuncStart->use_empty()) {
- CallInst *CI = cast<CallInst>(FuncStart->use_back());
- CI->eraseFromParent();
- }
- FuncStart->eraseFromParent();
- }
- if (StopPoint) {
- while (!StopPoint->use_empty()) {
- CallInst *CI = cast<CallInst>(StopPoint->use_back());
- CI->eraseFromParent();
- }
- StopPoint->eraseFromParent();
- }
- if (RegionStart) {
- while (!RegionStart->use_empty()) {
- CallInst *CI = cast<CallInst>(RegionStart->use_back());
- CI->eraseFromParent();
- }
- RegionStart->eraseFromParent();
- }
- if (RegionEnd) {
- while (!RegionEnd->use_empty()) {
- CallInst *CI = cast<CallInst>(RegionEnd->use_back());
- CI->eraseFromParent();
- }
- RegionEnd->eraseFromParent();
- }
- if (Declare) {
+ if (Function *Declare = M.getFunction("llvm.dbg.declare")) {
while (!Declare->use_empty()) {
CallInst *CI = cast<CallInst>(Declare->use_back());
CI->eraseFromParent();
}
Declare->eraseFromParent();
+ Changed = true;
}
NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv");
- if (NMD)
+ if (NMD) {
+ Changed = true;
NMD->eraseFromParent();
+ }
+ MetadataContext &TheMetadata = M.getContext().getMetadata();
+ unsigned MDDbgKind = TheMetadata.getMDKind("dbg");
+ if (!MDDbgKind)
+ return Changed;
+
+ for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI)
+ for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE;
+ ++FI)
+ for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE;
+ ++BI)
+ TheMetadata.removeMD(MDDbgKind, BI);
return true;
}
diff --git a/lib/Transforms/Scalar/ABCD.cpp b/lib/Transforms/Scalar/ABCD.cpp
index c8541d7..e58fa63 100644
--- a/lib/Transforms/Scalar/ABCD.cpp
+++ b/lib/Transforms/Scalar/ABCD.cpp
@@ -412,7 +412,9 @@ class ABCD : public FunctionPass {
/// If PN_op1 and PN_o2 are different from NULL, create a constraint
/// PN_op2 -> PN_op1 with value. In case any of them is NULL, replace
/// with the respective V_op#, if V_op# is a ConstantInt.
- void createConstraintSigSig(PHINode *SIG_op1, PHINode *SIG_op2, APInt value);
+ void createConstraintSigSig(PHINode *SIG_op1, PHINode *SIG_op2,
+ ConstantInt *V_op1, ConstantInt *V_op2,
+ APInt value);
/// Returns the sigma representing the Instruction I in BasicBlock BB.
/// Returns NULL in case there is no sigma for this Instruction in this
@@ -735,25 +737,27 @@ void ABCD::createConstraintCmpInst(ICmpInst *ICI, TerminatorInst *TI) {
APInt Zero = APInt::getNullValue(width);
CmpInst::Predicate Pred = ICI->getPredicate();
+ ConstantInt *CI1 = dyn_cast<ConstantInt>(V_op1);
+ ConstantInt *CI2 = dyn_cast<ConstantInt>(V_op2);
switch (Pred) {
case CmpInst::ICMP_SGT: // signed greater than
- createConstraintSigSig(SIG_op2_t, SIG_op1_t, MinusOne);
- createConstraintSigSig(SIG_op1_f, SIG_op2_f, Zero);
+ createConstraintSigSig(SIG_op2_t, SIG_op1_t, CI2, CI1, MinusOne);
+ createConstraintSigSig(SIG_op1_f, SIG_op2_f, CI1, CI2, Zero);
break;
case CmpInst::ICMP_SGE: // signed greater or equal
- createConstraintSigSig(SIG_op2_t, SIG_op1_t, Zero);
- createConstraintSigSig(SIG_op1_f, SIG_op2_f, MinusOne);
+ createConstraintSigSig(SIG_op2_t, SIG_op1_t, CI2, CI1, Zero);
+ createConstraintSigSig(SIG_op1_f, SIG_op2_f, CI1, CI2, MinusOne);
break;
case CmpInst::ICMP_SLT: // signed less than
- createConstraintSigSig(SIG_op1_t, SIG_op2_t, MinusOne);
- createConstraintSigSig(SIG_op2_f, SIG_op1_f, Zero);
+ createConstraintSigSig(SIG_op1_t, SIG_op2_t, CI1, CI2, MinusOne);
+ createConstraintSigSig(SIG_op2_f, SIG_op1_f, CI2, CI1, Zero);
break;
case CmpInst::ICMP_SLE: // signed less or equal
- createConstraintSigSig(SIG_op1_t, SIG_op2_t, Zero);
- createConstraintSigSig(SIG_op2_f, SIG_op1_f, MinusOne);
+ createConstraintSigSig(SIG_op1_t, SIG_op2_t, CI1, CI2, Zero);
+ createConstraintSigSig(SIG_op2_f, SIG_op1_f, CI2, CI1, MinusOne);
break;
default:
@@ -772,6 +776,10 @@ void ABCD::createConstraintCmpInst(ICmpInst *ICI, TerminatorInst *TI) {
/// b->a and c->a with weight 0 in the lower bound graph, and the edges
/// a->b and a->c with weight 0 in the upper bound graph.
void ABCD::createConstraintPHINode(PHINode *PN) {
+ // FIXME: We really want to disallow sigma nodes, but I don't know the best
+ // way to detect the other than this.
+ if (PN->getNumOperands() == 2) return;
+
int32_t width = cast<IntegerType>(PN->getType())->getBitWidth();
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *V = PN->getIncomingValue(i);
@@ -796,13 +804,11 @@ void ABCD::createConstraintSigInst(Instruction *I_op, BasicBlock *BB_succ_t,
int32_t width = cast<IntegerType>((*SIG_op_t)->getType())->getBitWidth();
inequality_graph.addEdge(I_op, *SIG_op_t, APInt(width, 0), true);
inequality_graph.addEdge(*SIG_op_t, I_op, APInt(width, 0), false);
- created.insert(*SIG_op_t);
}
if (*SIG_op_f) {
int32_t width = cast<IntegerType>((*SIG_op_f)->getType())->getBitWidth();
inequality_graph.addEdge(I_op, *SIG_op_f, APInt(width, 0), true);
inequality_graph.addEdge(*SIG_op_f, I_op, APInt(width, 0), false);
- created.insert(*SIG_op_f);
}
}
@@ -810,10 +816,17 @@ void ABCD::createConstraintSigInst(Instruction *I_op, BasicBlock *BB_succ_t,
/// PN_op2 -> PN_op1 with value. In case any of them is NULL, replace
/// with the respective V_op#, if V_op# is a ConstantInt.
void ABCD::createConstraintSigSig(PHINode *SIG_op1, PHINode *SIG_op2,
+ ConstantInt *V_op1, ConstantInt *V_op2,
APInt value) {
if (SIG_op1 && SIG_op2) {
inequality_graph.addEdge(SIG_op2, SIG_op1, value, true);
inequality_graph.addEdge(SIG_op1, SIG_op2, -value, false);
+ } else if (SIG_op1 && V_op2) {
+ inequality_graph.addEdge(V_op2, SIG_op1, value, true);
+ inequality_graph.addEdge(SIG_op1, V_op2, -value, false);
+ } else if (SIG_op2 && V_op1) {
+ inequality_graph.addEdge(SIG_op2, V_op1, value, true);
+ inequality_graph.addEdge(V_op1, SIG_op2, -value, false);
}
}
@@ -1036,7 +1049,7 @@ void ABCD::InequalityGraph::printHeader(raw_ostream &OS, Function &F) const {
/// Prints the body of the dot file
void ABCD::InequalityGraph::printBody(raw_ostream &OS) const {
- DenseMap<Value *, SmallPtrSet<Edge *, 16> >::iterator begin =
+ DenseMap<Value *, SmallPtrSet<Edge *, 16> >::const_iterator begin =
graph.begin(), end = graph.end();
for (; begin != end ; ++begin) {
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index e048518..5a92399 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -3,7 +3,6 @@ add_llvm_library(LLVMScalarOpts
ADCE.cpp
BasicBlockPlacement.cpp
CodeGenPrepare.cpp
- CondPropagate.cpp
ConstantProp.cpp
DCE.cpp
DeadStoreElimination.cpp
diff --git a/lib/Transforms/Scalar/CondPropagate.cpp b/lib/Transforms/Scalar/CondPropagate.cpp
deleted file mode 100644
index 8a6c556..0000000
--- a/lib/Transforms/Scalar/CondPropagate.cpp
+++ /dev/null
@@ -1,289 +0,0 @@
-//===-- CondPropagate.cpp - Propagate Conditional Expressions -------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass propagates information about conditional expressions through the
-// program, allowing it to eliminate conditional branches in some cases.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "condprop"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Pass.h"
-#include "llvm/Type.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallVector.h"
-using namespace llvm;
-
-STATISTIC(NumBrThread, "Number of CFG edges threaded through branches");
-STATISTIC(NumSwThread, "Number of CFG edges threaded through switches");
-
-namespace {
- struct CondProp : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- CondProp() : FunctionPass(&ID) {}
-
- virtual bool runOnFunction(Function &F);
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequiredID(BreakCriticalEdgesID);
- //AU.addRequired<DominanceFrontier>();
- }
-
- private:
- bool MadeChange;
- SmallVector<BasicBlock *, 4> DeadBlocks;
- void SimplifyBlock(BasicBlock *BB);
- void SimplifyPredecessors(BranchInst *BI);
- void SimplifyPredecessors(SwitchInst *SI);
- void RevectorBlockTo(BasicBlock *FromBB, BasicBlock *ToBB);
- bool RevectorBlockTo(BasicBlock *FromBB, Value *Cond, BranchInst *BI);
- };
-}
-
-char CondProp::ID = 0;
-static RegisterPass<CondProp> X("condprop", "Conditional Propagation");
-
-FunctionPass *llvm::createCondPropagationPass() {
- return new CondProp();
-}
-
-bool CondProp::runOnFunction(Function &F) {
- bool EverMadeChange = false;
- DeadBlocks.clear();
-
- // While we are simplifying blocks, keep iterating.
- do {
- MadeChange = false;
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E;)
- SimplifyBlock(BB++);
- EverMadeChange = EverMadeChange || MadeChange;
- } while (MadeChange);
-
- if (EverMadeChange) {
- while (!DeadBlocks.empty()) {
- BasicBlock *BB = DeadBlocks.back(); DeadBlocks.pop_back();
- DeleteDeadBlock(BB);
- }
- }
- return EverMadeChange;
-}
-
-void CondProp::SimplifyBlock(BasicBlock *BB) {
- if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
- // If this is a conditional branch based on a phi node that is defined in
- // this block, see if we can simplify predecessors of this block.
- if (BI->isConditional() && isa<PHINode>(BI->getCondition()) &&
- cast<PHINode>(BI->getCondition())->getParent() == BB)
- SimplifyPredecessors(BI);
-
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
- if (isa<PHINode>(SI->getCondition()) &&
- cast<PHINode>(SI->getCondition())->getParent() == BB)
- SimplifyPredecessors(SI);
- }
-
- // If possible, simplify the terminator of this block.
- if (ConstantFoldTerminator(BB))
- MadeChange = true;
-
- // If this block ends with an unconditional branch and the only successor has
- // only this block as a predecessor, merge the two blocks together.
- if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
- if (BI->isUnconditional() && BI->getSuccessor(0)->getSinglePredecessor() &&
- BB != BI->getSuccessor(0)) {
- BasicBlock *Succ = BI->getSuccessor(0);
-
- // If Succ has any PHI nodes, they are all single-entry PHI's. Eliminate
- // them.
- FoldSingleEntryPHINodes(Succ);
-
- // Remove BI.
- BI->eraseFromParent();
-
- // Move over all of the instructions.
- BB->getInstList().splice(BB->end(), Succ->getInstList());
-
- // Any phi nodes that had entries for Succ now have entries from BB.
- Succ->replaceAllUsesWith(BB);
-
- // Succ is now dead, but we cannot delete it without potentially
- // invalidating iterators elsewhere. Just insert an unreachable
- // instruction in it and delete this block later on.
- new UnreachableInst(BB->getContext(), Succ);
- DeadBlocks.push_back(Succ);
- MadeChange = true;
- }
-}
-
-// SimplifyPredecessors(branches) - We know that BI is a conditional branch
-// based on a PHI node defined in this block. If the phi node contains constant
-// operands, then the blocks corresponding to those operands can be modified to
-// jump directly to the destination instead of going through this block.
-void CondProp::SimplifyPredecessors(BranchInst *BI) {
- // TODO: We currently only handle the most trival case, where the PHI node has
- // one use (the branch), and is the only instruction besides the branch and dbg
- // intrinsics in the block.
- PHINode *PN = cast<PHINode>(BI->getCondition());
-
- if (PN->getNumIncomingValues() == 1) {
- // Eliminate single-entry PHI nodes.
- FoldSingleEntryPHINodes(PN->getParent());
- return;
- }
-
-
- if (!PN->hasOneUse()) return;
-
- BasicBlock *BB = BI->getParent();
- if (&*BB->begin() != PN)
- return;
- BasicBlock::iterator BBI = BB->begin();
- BasicBlock::iterator BBE = BB->end();
- while (BBI != BBE && isa<DbgInfoIntrinsic>(++BBI)) /* empty */;
- if (&*BBI != BI)
- return;
-
- // Ok, we have this really simple case, walk the PHI operands, looking for
- // constants. Walk from the end to remove operands from the end when
- // possible, and to avoid invalidating "i".
- for (unsigned i = PN->getNumIncomingValues(); i != 0; --i) {
- Value *InVal = PN->getIncomingValue(i-1);
- if (!RevectorBlockTo(PN->getIncomingBlock(i-1), InVal, BI))
- continue;
-
- ++NumBrThread;
-
- // If there were two predecessors before this simplification, or if the
- // PHI node contained all the same value except for the one we just
- // substituted, the PHI node may be deleted. Don't iterate through it the
- // last time.
- if (BI->getCondition() != PN) return;
- }
-}
-
-// SimplifyPredecessors(switch) - We know that SI is switch based on a PHI node
-// defined in this block. If the phi node contains constant operands, then the
-// blocks corresponding to those operands can be modified to jump directly to
-// the destination instead of going through this block.
-void CondProp::SimplifyPredecessors(SwitchInst *SI) {
- // TODO: We currently only handle the most trival case, where the PHI node has
- // one use (the branch), and is the only instruction besides the branch and
- // dbg intrinsics in the block.
- PHINode *PN = cast<PHINode>(SI->getCondition());
- if (!PN->hasOneUse()) return;
-
- BasicBlock *BB = SI->getParent();
- if (&*BB->begin() != PN)
- return;
- BasicBlock::iterator BBI = BB->begin();
- BasicBlock::iterator BBE = BB->end();
- while (BBI != BBE && isa<DbgInfoIntrinsic>(++BBI)) /* empty */;
- if (&*BBI != SI)
- return;
-
- // Ok, we have this really simple case, walk the PHI operands, looking for
- // constants. Walk from the end to remove operands from the end when
- // possible, and to avoid invalidating "i".
- for (unsigned i = PN->getNumIncomingValues(); i != 0; --i)
- if (ConstantInt *CI = dyn_cast<ConstantInt>(PN->getIncomingValue(i-1))) {
- BasicBlock *PredBB = PN->getIncomingBlock(i-1);
- if (isa<BranchInst>(PredBB->getTerminator())) {
- // If we have a constant, forward the edge from its current to its
- // ultimate destination.
- unsigned DestCase = SI->findCaseValue(CI);
- RevectorBlockTo(PredBB, SI->getSuccessor(DestCase));
- ++NumSwThread;
-
- // If there were two predecessors before this simplification, or if the
- // PHI node contained all the same value except for the one we just
- // substituted, the PHI node may be deleted. Don't iterate through it the
- // last time.
- if (SI->getCondition() != PN) return;
- }
- }
-}
-
-
-// RevectorBlockTo - Revector the unconditional branch at the end of FromBB to
-// the ToBB block, which is one of the successors of its current successor.
-void CondProp::RevectorBlockTo(BasicBlock *FromBB, BasicBlock *ToBB) {
- BranchInst *FromBr = cast<BranchInst>(FromBB->getTerminator());
- assert(FromBr->isUnconditional() && "FromBB should end with uncond br!");
-
- // Get the old block we are threading through.
- BasicBlock *OldSucc = FromBr->getSuccessor(0);
-
- // OldSucc had multiple successors. If ToBB has multiple predecessors, then
- // the edge between them would be critical, which we already took care of.
-  // If ToBB has a single-operand PHI node, then take care of it here.
- FoldSingleEntryPHINodes(ToBB);
-
- // Update PHI nodes in OldSucc to know that FromBB no longer branches to it.
- OldSucc->removePredecessor(FromBB);
-
- // Change FromBr to branch to the new destination.
- FromBr->setSuccessor(0, ToBB);
-
- MadeChange = true;
-}
-
-bool CondProp::RevectorBlockTo(BasicBlock *FromBB, Value *Cond, BranchInst *BI){
- BranchInst *FromBr = cast<BranchInst>(FromBB->getTerminator());
- if (!FromBr->isUnconditional())
- return false;
-
- // Get the old block we are threading through.
- BasicBlock *OldSucc = FromBr->getSuccessor(0);
-
- // If the condition is a constant, simply revector the unconditional branch at
- // the end of FromBB to one of the successors of its current successor.
- if (ConstantInt *CB = dyn_cast<ConstantInt>(Cond)) {
- BasicBlock *ToBB = BI->getSuccessor(CB->isZero());
-
- // OldSucc had multiple successors. If ToBB has multiple predecessors, then
- // the edge between them would be critical, which we already took care of.
-    // If ToBB has a single-operand PHI node, then take care of it here.
- FoldSingleEntryPHINodes(ToBB);
-
- // Update PHI nodes in OldSucc to know that FromBB no longer branches to it.
- OldSucc->removePredecessor(FromBB);
-
- // Change FromBr to branch to the new destination.
- FromBr->setSuccessor(0, ToBB);
- } else {
- BasicBlock *Succ0 = BI->getSuccessor(0);
-    // Do not perform the transform if the new destination has PHI nodes; the
-    // transform would add new predecessors to the PHIs.
- if (isa<PHINode>(Succ0->begin()))
- return false;
-
- BasicBlock *Succ1 = BI->getSuccessor(1);
- if (isa<PHINode>(Succ1->begin()))
- return false;
-
- // Insert the new conditional branch.
- BranchInst::Create(Succ0, Succ1, Cond, FromBr);
-
- FoldSingleEntryPHINodes(Succ0);
- FoldSingleEntryPHINodes(Succ1);
-
- // Update PHI nodes in OldSucc to know that FromBB no longer branches to it.
- OldSucc->removePredecessor(FromBB);
-
- // Delete the old branch.
- FromBr->eraseFromParent();
- }
-
- MadeChange = true;
- return true;
-}
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
index 4fee327..ea20813 100644
--- a/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -66,7 +66,7 @@ bool ConstantPropagation::runOnFunction(Function &F) {
WorkList.erase(WorkList.begin()); // Get an element from the worklist...
if (!I->use_empty()) // Don't muck with dead instructions...
- if (Constant *C = ConstantFoldInstruction(I, F.getContext())) {
+ if (Constant *C = ConstantFoldInstruction(I)) {
// Add all of the users of this instruction to the worklist, they might
// be constant propagatable now...
for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 90436f4..b0988b5 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -78,19 +78,96 @@ static RegisterPass<DSE> X("dse", "Dead Store Elimination");
FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
-/// isValueAtLeastAsBigAs - Return true if V1 is greater than or equal to the
-/// stored size of V2. This returns false if we don't know.
+/// doesClobberMemory - Does this instruction clobber (write without reading)
+/// some memory?
+static bool doesClobberMemory(Instruction *I) {
+ if (isa<StoreInst>(I))
+ return true;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::memset: case Intrinsic::memmove: case Intrinsic::memcpy:
+ case Intrinsic::init_trampoline: case Intrinsic::lifetime_end: return true;
+ }
+ }
+ return false;
+}
+
+/// isElidable - If the value of this instruction and the memory it writes to
+/// are unused, may we delete this instruction?
+static bool isElidable(Instruction *I) {
+ assert(doesClobberMemory(I));
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ return II->getIntrinsicID() != Intrinsic::lifetime_end;
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return !SI->isVolatile();
+ return true;
+}
+
+/// getPointerOperand - Return the pointer that is being clobbered.
+static Value *getPointerOperand(Instruction *I) {
+ assert(doesClobberMemory(I));
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperand();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return MI->getOperand(1);
+ IntrinsicInst *II = cast<IntrinsicInst>(I);
+ switch (II->getIntrinsicID()) {
+ default:
+ assert(false && "Unexpected intrinsic!");
+ case Intrinsic::init_trampoline:
+ return II->getOperand(1);
+ case Intrinsic::lifetime_end:
+ return II->getOperand(2);
+ }
+}
+
+/// getStoreSize - Return the length in bytes of the write by the clobbering
+/// instruction. If variable or unknown, returns -1.
+static unsigned getStoreSize(Instruction *I, const TargetData *TD) {
+ assert(doesClobberMemory(I));
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (!TD) return -1u;
+ return TD->getTypeStoreSize(SI->getOperand(0)->getType());
+ }
+
+ Value *Len;
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
+ Len = MI->getLength();
+ } else {
+ IntrinsicInst *II = cast<IntrinsicInst>(I);
+ switch (II->getIntrinsicID()) {
+ default:
+ assert(false && "Unexpected intrinsic!");
+ case Intrinsic::init_trampoline:
+ return -1u;
+ case Intrinsic::lifetime_end:
+ Len = II->getOperand(1);
+ break;
+ }
+ }
+ if (ConstantInt *LenCI = dyn_cast<ConstantInt>(Len))
+ if (!LenCI->isAllOnesValue())
+ return LenCI->getZExtValue();
+ return -1u;
+}
+
+/// isStoreAtLeastAsWideAs - Return true if the size of the store in I1 is
+/// greater than or equal to the size of the store in I2.  This returns false
+/// if we don't know.
///
-static bool isValueAtLeastAsBigAs(Value *V1, Value *V2, const TargetData *TD) {
- const Type *V1Ty = V1->getType(), *V2Ty = V2->getType();
+static bool isStoreAtLeastAsWideAs(Instruction *I1, Instruction *I2,
+ const TargetData *TD) {
+ const Type *I1Ty = getPointerOperand(I1)->getType();
+ const Type *I2Ty = getPointerOperand(I2)->getType();
// Exactly the same type, must have exactly the same size.
- if (V1Ty == V2Ty) return true;
+ if (I1Ty == I2Ty) return true;
- // If we don't have target data, we don't know.
- if (TD == 0) return false;
+ int I1Size = getStoreSize(I1, TD);
+ int I2Size = getStoreSize(I2, TD);
- return TD->getTypeStoreSize(V1Ty) >= TD->getTypeStoreSize(V2Ty);
+ return I1Size != -1 && I2Size != -1 && I1Size >= I2Size;
}
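// To make the new width test concrete, here is a standalone C++ sketch (not
// part of the patch) using the same -1u "unknown" sentinel that getStoreSize
// returns above.
#include <cassert>
static const unsigned Unknown = -1u; // getStoreSize's sentinel
// The later store kills the earlier one only when both sizes are known and
// the later store covers at least as many bytes.
static bool atLeastAsWide(unsigned LaterSize, unsigned EarlierSize) {
  return LaterSize != Unknown && EarlierSize != Unknown &&
         LaterSize >= EarlierSize;
}
int main() {
  assert(atLeastAsWide(8, 4));        // i64 store over an i32 store: dead
  assert(!atLeastAsWide(4, 8));       // i32 over i64: high bytes survive
  assert(!atLeastAsWide(Unknown, 4)); // variable-length memset: keep it
  return 0;
}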
bool DSE::runOnBasicBlock(BasicBlock &BB) {
@@ -104,14 +181,9 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
Instruction *Inst = BBI++;
// If we find a store or a free, get its memory dependence.
- if (!isa<StoreInst>(Inst) && !isFreeCall(Inst))
+ if (!doesClobberMemory(Inst) && !isFreeCall(Inst))
continue;
- // Don't molest volatile stores or do queries that will return "clobber".
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
- if (SI->isVolatile())
- continue;
-
MemDepResult InstDep = MD.getDependency(Inst);
// Ignore non-local stores.
@@ -124,16 +196,16 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
continue;
}
- StoreInst *SI = cast<StoreInst>(Inst);
-
// If not a definite must-alias dependency, ignore it.
if (!InstDep.isDef())
continue;
// If this is a store-store dependence, then the previous store is dead so
// long as this store is at least as big as it.
- if (StoreInst *DepStore = dyn_cast<StoreInst>(InstDep.getInst()))
- if (isValueAtLeastAsBigAs(SI->getOperand(0), DepStore->getOperand(0),TD)){
+ if (doesClobberMemory(InstDep.getInst())) {
+ Instruction *DepStore = InstDep.getInst();
+ if (isStoreAtLeastAsWideAs(Inst, DepStore, TD) &&
+ isElidable(DepStore)) {
// Delete the store and now-dead instructions that feed it.
DeleteDeadInstruction(DepStore);
NumFastStores++;
@@ -146,37 +218,43 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
--BBI;
continue;
}
+ }
+
+ if (!isElidable(Inst))
+ continue;
// If we're storing the same value back to a pointer that we just
// loaded from, then the store can be removed.
- if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) {
- if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
- SI->getOperand(0) == DepLoad) {
- // DeleteDeadInstruction can delete the current instruction. Save BBI
- // in case we need it.
- WeakVH NextInst(BBI);
-
- DeleteDeadInstruction(SI);
-
- if (NextInst == 0) // Next instruction deleted.
- BBI = BB.begin();
- else if (BBI != BB.begin()) // Revisit this instruction if possible.
- --BBI;
- NumFastStores++;
- MadeChange = true;
- continue;
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) {
+ if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
+ SI->getOperand(0) == DepLoad) {
+ // DeleteDeadInstruction can delete the current instruction. Save BBI
+ // in case we need it.
+ WeakVH NextInst(BBI);
+
+ DeleteDeadInstruction(SI);
+
+ if (NextInst == 0) // Next instruction deleted.
+ BBI = BB.begin();
+ else if (BBI != BB.begin()) // Revisit this instruction if possible.
+ --BBI;
+ NumFastStores++;
+ MadeChange = true;
+ continue;
+ }
}
}
// If this is a lifetime end marker, we can throw away the store.
- if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(InstDep.getInst())) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(InstDep.getInst())) {
if (II->getIntrinsicID() == Intrinsic::lifetime_end) {
// Delete the store and now-dead instructions that feed it.
// DeleteDeadInstruction can delete the current instruction. Save BBI
// in case we need it.
WeakVH NextInst(BBI);
- DeleteDeadInstruction(SI);
+ DeleteDeadInstruction(Inst);
if (NextInst == 0) // Next instruction deleted.
BBI = BB.begin();
@@ -202,11 +280,11 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
bool DSE::handleFreeWithNonTrivialDependency(Instruction *F, MemDepResult Dep) {
AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
- StoreInst *Dependency = dyn_cast_or_null<StoreInst>(Dep.getInst());
- if (!Dependency || Dependency->isVolatile())
+ Instruction *Dependency = Dep.getInst();
+ if (!Dependency || !doesClobberMemory(Dependency) || !isElidable(Dependency))
return false;
- Value *DepPointer = Dependency->getPointerOperand()->getUnderlyingObject();
+ Value *DepPointer = getPointerOperand(Dependency)->getUnderlyingObject();
// Check for aliasing.
if (AA.alias(F->getOperand(1), 1, DepPointer, 1) !=
@@ -251,39 +329,28 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
--BBI;
// If we find a store whose pointer is dead.
- if (StoreInst* S = dyn_cast<StoreInst>(BBI)) {
- if (!S->isVolatile()) {
+ if (doesClobberMemory(BBI)) {
+ if (isElidable(BBI)) {
// See through pointer-to-pointer bitcasts
- Value* pointerOperand = S->getPointerOperand()->getUnderlyingObject();
+ Value *pointerOperand = getPointerOperand(BBI)->getUnderlyingObject();
// Alloca'd pointers or byval arguments (which are functionally like
// alloca's) are valid candidates for removal.
if (deadPointers.count(pointerOperand)) {
// DCE instructions only used to calculate that store.
+ Instruction *Dead = BBI;
BBI++;
- DeleteDeadInstruction(S, &deadPointers);
+ DeleteDeadInstruction(Dead, &deadPointers);
NumFastStores++;
MadeChange = true;
+ continue;
}
}
- continue;
- }
-
- // We can also remove memcpy's to local variables at the end of a function.
- if (MemCpyInst *M = dyn_cast<MemCpyInst>(BBI)) {
- Value *dest = M->getDest()->getUnderlyingObject();
-
- if (deadPointers.count(dest)) {
- BBI++;
- DeleteDeadInstruction(M, &deadPointers);
- NumFastOther++;
- MadeChange = true;
+ // Because a memcpy or memmove is also a load, we can't skip it if we
+ // didn't remove it.
+ if (!isa<MemTransferInst>(BBI))
continue;
- }
-
- // Because a memcpy is also a load, we can't skip it if we didn't remove
- // it.
}
Value* killPointer = 0;
@@ -304,11 +371,11 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
killPointer = L->getPointerOperand();
} else if (VAArgInst* V = dyn_cast<VAArgInst>(BBI)) {
killPointer = V->getOperand(0);
- } else if (isa<MemCpyInst>(BBI) &&
- isa<ConstantInt>(cast<MemCpyInst>(BBI)->getLength())) {
- killPointer = cast<MemCpyInst>(BBI)->getSource();
+ } else if (isa<MemTransferInst>(BBI) &&
+ isa<ConstantInt>(cast<MemTransferInst>(BBI)->getLength())) {
+ killPointer = cast<MemTransferInst>(BBI)->getSource();
killPointerSize = cast<ConstantInt>(
- cast<MemCpyInst>(BBI)->getLength())->getZExtValue();
+ cast<MemTransferInst>(BBI)->getLength())->getZExtValue();
} else if (AllocaInst* A = dyn_cast<AllocaInst>(BBI)) {
deadPointers.erase(A);
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 0e3f750..a8f39c1 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -443,6 +443,11 @@ uint32_t ValueTable::lookup_or_add_call(CallInst* C) {
valueNumbering[C] = e;
return e;
}
+ if (!MD) {
+ e = nextValueNumber++;
+ valueNumbering[C] = e;
+ return e;
+ }
MemDepResult local_dep = MD->getDependency(C);
@@ -624,7 +629,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) {
/// lookup - Returns the value number of the specified value. Fails if
/// the value has not yet been numbered.
uint32_t ValueTable::lookup(Value *V) const {
- DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+ DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
assert(VI != valueNumbering.end() && "Value not numbered?");
return VI->second;
}
@@ -644,7 +649,7 @@ void ValueTable::erase(Value *V) {
/// verifyRemoved - Verify that the value is removed from all internal data
/// structures.
void ValueTable::verifyRemoved(const Value *V) const {
- for (DenseMap<Value*, uint32_t>::iterator
+ for (DenseMap<Value*, uint32_t>::const_iterator
I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
assert(I->first != V && "Inst still occurs in value numbering map!");
}
@@ -669,10 +674,12 @@ namespace {
bool runOnFunction(Function &F);
public:
static char ID; // Pass identification, replacement for typeid
- GVN(bool nopre = false) : FunctionPass(&ID), NoPRE(nopre) { }
+ explicit GVN(bool nopre = false, bool noloads = false)
+ : FunctionPass(&ID), NoPRE(nopre), NoLoads(noloads), MD(0) { }
private:
bool NoPRE;
+ bool NoLoads;
MemoryDependenceAnalysis *MD;
DominatorTree *DT;
@@ -682,7 +689,8 @@ namespace {
// This transformation requires dominator postdominator info
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<DominatorTree>();
- AU.addRequired<MemoryDependenceAnalysis>();
+ if (!NoLoads)
+ AU.addRequired<MemoryDependenceAnalysis>();
AU.addRequired<AliasAnalysis>();
AU.addPreserved<DominatorTree>();
@@ -711,7 +719,9 @@ namespace {
}
// createGVNPass - The public interface to this file...
-FunctionPass *llvm::createGVNPass(bool NoPRE) { return new GVN(NoPRE); }
+FunctionPass *llvm::createGVNPass(bool NoPRE, bool NoLoads) {
+ return new GVN(NoPRE, NoLoads);
+}
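// A hedged client-side sketch (not part of the patch; assumes the LLVM
// 2.6-era PassManager headers): the new NoLoads argument lets a pipeline
// schedule GVN without load elimination, which also avoids the
// MemoryDependenceAnalysis requirement per the getAnalysisUsage change above.
#include "llvm/PassManager.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
static void addScalarOpts(PassManager &PM) {
  // Keep PRE, but skip load elimination; with NoLoads set, processLoad
  // bails out early and MemoryDependenceAnalysis is never requested.
  PM.add(createGVNPass(/*NoPRE=*/false, /*NoLoads=*/true));
}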
static RegisterPass<GVN> X("gvn",
"Global Value Numbering");
@@ -1476,6 +1486,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
/// processLoad - Attempt to eliminate a load, first by eliminating it
/// locally, and then attempting non-local elimination if that fails.
bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
+ if (!MD)
+ return false;
+
if (L->isVolatile())
return false;
@@ -1686,7 +1699,7 @@ bool GVN::processInstruction(Instruction *I,
if (constVal) {
p->replaceAllUsesWith(constVal);
- if (isa<PointerType>(constVal->getType()))
+ if (MD && isa<PointerType>(constVal->getType()))
MD->invalidateCachedPointerInfo(constVal);
VN.erase(p);
@@ -1707,7 +1720,7 @@ bool GVN::processInstruction(Instruction *I,
// Remove it!
VN.erase(I);
I->replaceAllUsesWith(repl);
- if (isa<PointerType>(repl->getType()))
+ if (MD && isa<PointerType>(repl->getType()))
MD->invalidateCachedPointerInfo(repl);
toErase.push_back(I);
return true;
@@ -1721,7 +1734,8 @@ bool GVN::processInstruction(Instruction *I,
/// runOnFunction - This is the main transformation entry point for a function.
bool GVN::runOnFunction(Function& F) {
- MD = &getAnalysis<MemoryDependenceAnalysis>();
+ if (!NoLoads)
+ MD = &getAnalysis<MemoryDependenceAnalysis>();
DT = &getAnalysis<DominatorTree>();
VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
VN.setMemDep(MD);
@@ -1793,7 +1807,7 @@ bool GVN::processBlock(BasicBlock *BB) {
for (SmallVector<Instruction*, 4>::iterator I = toErase.begin(),
E = toErase.end(); I != E; ++I) {
DEBUG(errs() << "GVN removed: " << **I << '\n');
- MD->removeInstruction(*I);
+ if (MD) MD->removeInstruction(*I);
(*I)->eraseFromParent();
DEBUG(verifyRemoved(*I));
}
@@ -1946,12 +1960,12 @@ bool GVN::performPRE(Function &F) {
localAvail[CurrentBlock]->table[ValNo] = Phi;
CurInst->replaceAllUsesWith(Phi);
- if (isa<PointerType>(Phi->getType()))
+ if (MD && isa<PointerType>(Phi->getType()))
MD->invalidateCachedPointerInfo(Phi);
VN.erase(CurInst);
DEBUG(errs() << "GVN PRE removed: " << *CurInst << '\n');
- MD->removeInstruction(CurInst);
+ if (MD) MD->removeInstruction(CurInst);
CurInst->eraseFromParent();
DEBUG(verifyRemoved(CurInst));
Changed = true;
@@ -2011,12 +2025,12 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
// Walk through the value number scope to make sure the instruction isn't
// ferreted away in it.
- for (DenseMap<BasicBlock*, ValueNumberScope*>::iterator
+ for (DenseMap<BasicBlock*, ValueNumberScope*>::const_iterator
I = localAvail.begin(), E = localAvail.end(); I != E; ++I) {
const ValueNumberScope *VNS = I->second;
while (VNS) {
- for (DenseMap<uint32_t, Value*>::iterator
+ for (DenseMap<uint32_t, Value*>::const_iterator
II = VNS->table.begin(), IE = VNS->table.end(); II != IE; ++II) {
assert(II->second != Inst && "Inst still in value numbering scope!");
}
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index b0bc70c..2912421 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -536,8 +536,10 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) {
BasicBlock *ExitBlock = L->getExitBlock();
if (!ExitBlock) return;
- Instruction *InsertPt = ExitBlock->getFirstNonPHI();
BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) return;
+
+ Instruction *InsertPt = ExitBlock->getFirstNonPHI();
BasicBlock::iterator I = Preheader->getTerminator();
while (I != Preheader->begin()) {
--I;
diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp
index 7e75cfb..1c48366 100644
--- a/lib/Transforms/Scalar/InstructionCombining.cpp
+++ b/lib/Transforms/Scalar/InstructionCombining.cpp
@@ -42,6 +42,7 @@
#include "llvm/GlobalVariable.h"
#include "llvm/Operator.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Target/TargetData.h"
@@ -283,6 +284,8 @@ namespace {
Instruction *visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
Instruction *visitCallInst(CallInst &CI);
Instruction *visitInvokeInst(InvokeInst &II);
+
+ Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
Instruction *visitPHINode(PHINode &PN);
Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
Instruction *visitAllocaInst(AllocaInst &AI);
@@ -380,10 +383,6 @@ namespace {
/// commutative operators.
bool SimplifyCommutative(BinaryOperator &I);
- /// SimplifyCompare - This reorders the operands of a CmpInst to get them in
- /// most-complex to least-complex order.
- bool SimplifyCompare(CmpInst &I);
-
/// SimplifyDemandedUseBits - Attempts to replace V with a simpler value
/// based on the demanded bits.
Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
@@ -478,6 +477,34 @@ static const Type *getPromotedType(const Type *Ty) {
return Ty;
}
+/// ShouldChangeType - Return true if it is desirable to convert a computation
+/// from 'From' to 'To'. We don't want to convert from a legal to an illegal
+/// type for example, or from a smaller to a larger illegal type.
+static bool ShouldChangeType(const Type *From, const Type *To,
+ const TargetData *TD) {
+ assert(isa<IntegerType>(From) && isa<IntegerType>(To));
+
+ // If we don't have TD, we don't know if the source/dest are legal.
+ if (!TD) return false;
+
+ unsigned FromWidth = From->getPrimitiveSizeInBits();
+ unsigned ToWidth = To->getPrimitiveSizeInBits();
+ bool FromLegal = TD->isLegalInteger(FromWidth);
+ bool ToLegal = TD->isLegalInteger(ToWidth);
+
+ // If this is a legal integer from type, and the result would be an illegal
+ // type, don't do the transformation.
+ if (FromLegal && !ToLegal)
+ return false;
+
+ // Otherwise, if both are illegal, do not increase the size of the result. We
+ // do allow things like i160 -> i64, but not i64 -> i160.
+ if (!FromLegal && !ToLegal && ToWidth > FromWidth)
+ return false;
+
+ return true;
+}
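// As a standalone model of this heuristic (assuming a target whose legal
// integer widths are 8, 16, 32, and 64 -- the usual answer from
// TD->isLegalInteger), the decision reduces to pure width arithmetic:
#include <cassert>
static bool isLegalWidth(unsigned W) {
  return W == 8 || W == 16 || W == 32 || W == 64;
}
// Same decision procedure as ShouldChangeType, with bit widths for types.
static bool shouldChangeWidth(unsigned FromW, unsigned ToW) {
  bool FromLegal = isLegalWidth(FromW), ToLegal = isLegalWidth(ToW);
  if (FromLegal && !ToLegal)
    return false;               // never turn a legal type illegal
  if (!FromLegal && !ToLegal && ToW > FromW)
    return false;               // don't grow one odd type into a bigger one
  return true;
}
int main() {
  assert(shouldChangeWidth(160, 64));  // shrink illegal to legal: allowed
  assert(!shouldChangeWidth(64, 160)); // legal to illegal: refused
  assert(!shouldChangeWidth(93, 160)); // illegal growing larger: refused
  assert(shouldChangeWidth(32, 16));   // legal to legal: allowed
  return 0;
}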
+
/// getBitCastOperand - If the specified operand is a CastInst, a constant
/// expression bitcast, or a GetElementPtrInst with all zero indices, return the
/// operand value, otherwise return null.
@@ -584,17 +611,6 @@ bool InstCombiner::SimplifyCommutative(BinaryOperator &I) {
return Changed;
}
-/// SimplifyCompare - For a CmpInst this function just orders the operands
-/// so that they are listed from right (least complex) to left (most complex).
-/// This puts constants before unary operators before binary operators.
-bool InstCombiner::SimplifyCompare(CmpInst &I) {
- if (getComplexity(I.getOperand(0)) >= getComplexity(I.getOperand(1)))
- return false;
- I.swapOperands();
- // Compare instructions are not associative so there's nothing else we can do.
- return true;
-}
-
// dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction
// if the LHS is a constant zero (which is the 'negate' form).
//
@@ -4304,25 +4320,15 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
bool Changed = SimplifyCommutative(I);
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (isa<UndefValue>(Op1)) // X & undef -> 0
- return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
-
- // and X, X = X
- if (Op0 == Op1)
- return ReplaceInstUsesWith(I, Op1);
+ if (Value *V = SimplifyAndInst(Op0, Op1, TD))
+ return ReplaceInstUsesWith(I, V);
+
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(I))
return &I;
- if (isa<VectorType>(I.getType())) {
- if (ConstantVector *CP = dyn_cast<ConstantVector>(Op1)) {
- if (CP->isAllOnesValue()) // X & <-1,-1> -> X
- return ReplaceInstUsesWith(I, I.getOperand(0));
- } else if (isa<ConstantAggregateZero>(Op1)) {
- return ReplaceInstUsesWith(I, Op1); // X & <0,0> -> <0,0>
- }
- }
+
if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
const APInt &AndRHSMask = AndRHS->getValue();
@@ -4443,42 +4449,29 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
return NV;
}
- Value *Op0NotVal = dyn_castNotVal(Op0);
- Value *Op1NotVal = dyn_castNotVal(Op1);
-
- if (Op0NotVal == Op1 || Op1NotVal == Op0) // A & ~A == ~A & A == 0
- return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
// (~A & ~B) == (~(A | B)) - De Morgan's Law
- if (Op0NotVal && Op1NotVal && isOnlyUse(Op0) && isOnlyUse(Op1)) {
- Value *Or = Builder->CreateOr(Op0NotVal, Op1NotVal,
- I.getName()+".demorgan");
- return BinaryOperator::CreateNot(Or);
- }
-
+ if (Value *Op0NotVal = dyn_castNotVal(Op0))
+ if (Value *Op1NotVal = dyn_castNotVal(Op1))
+ if (Op0->hasOneUse() && Op1->hasOneUse()) {
+ Value *Or = Builder->CreateOr(Op0NotVal, Op1NotVal,
+ I.getName()+".demorgan");
+ return BinaryOperator::CreateNot(Or);
+ }
+
{
Value *A = 0, *B = 0, *C = 0, *D = 0;
- if (match(Op0, m_Or(m_Value(A), m_Value(B)))) {
- if (A == Op1 || B == Op1) // (A | ?) & A --> A
- return ReplaceInstUsesWith(I, Op1);
-
- // (A|B) & ~(A&B) -> A^B
- if (match(Op1, m_Not(m_And(m_Value(C), m_Value(D))))) {
- if ((A == C && B == D) || (A == D && B == C))
- return BinaryOperator::CreateXor(A, B);
- }
- }
+ // (A|B) & ~(A&B) -> A^B
+ if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_And(m_Value(C), m_Value(D)))) &&
+ ((A == C && B == D) || (A == D && B == C)))
+ return BinaryOperator::CreateXor(A, B);
- if (match(Op1, m_Or(m_Value(A), m_Value(B)))) {
- if (A == Op0 || B == Op0) // A & (A | ?) --> A
- return ReplaceInstUsesWith(I, Op0);
-
- // ~(A&B) & (A|B) -> A^B
- if (match(Op0, m_Not(m_And(m_Value(C), m_Value(D))))) {
- if ((A == C && B == D) || (A == D && B == C))
- return BinaryOperator::CreateXor(A, B);
- }
- }
+ // ~(A&B) & (A|B) -> A^B
+ if (match(Op1, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op0, m_Not(m_And(m_Value(C), m_Value(D)))) &&
+ ((A == C && B == D) || (A == D && B == C)))
+ return BinaryOperator::CreateXor(A, B);
if (Op0->hasOneUse() &&
match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
@@ -5010,27 +5003,15 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
bool Changed = SimplifyCommutative(I);
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (isa<UndefValue>(Op1)) // X | undef -> -1
- return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
-
- // or X, X = X
- if (Op0 == Op1)
- return ReplaceInstUsesWith(I, Op0);
-
+ if (Value *V = SimplifyOrInst(Op0, Op1, TD))
+ return ReplaceInstUsesWith(I, V);
+
+
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(I))
return &I;
- if (isa<VectorType>(I.getType())) {
- if (isa<ConstantAggregateZero>(Op1)) {
- return ReplaceInstUsesWith(I, Op0); // X | <0,0> -> X
- } else if (ConstantVector *CP = dyn_cast<ConstantVector>(Op1)) {
- if (CP->isAllOnesValue()) // X | <-1,-1> -> <-1,-1>
- return ReplaceInstUsesWith(I, I.getOperand(1));
- }
- }
- // or X, -1 == -1
if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
ConstantInt *C1 = 0; Value *X = 0;
// (X & C1) | C2 --> (X | C2) & (C1|C2)
@@ -5063,13 +5044,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
Value *A = 0, *B = 0;
ConstantInt *C1 = 0, *C2 = 0;
- if (match(Op0, m_And(m_Value(A), m_Value(B))))
- if (A == Op1 || B == Op1) // (A & ?) | A --> A
- return ReplaceInstUsesWith(I, Op1);
- if (match(Op1, m_And(m_Value(A), m_Value(B))))
- if (A == Op0 || B == Op0) // A | (A & ?) --> A
- return ReplaceInstUsesWith(I, Op0);
-
// (A | B) | C and A | (B | C) -> bswap if possible.
// (A >> B) | (C << D) and (A << B) | (B >> C) -> bswap if possible.
if (match(Op0, m_Or(m_Value(), m_Value())) ||
@@ -5203,23 +5177,14 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Ret) return Ret;
}
- if ((A = dyn_castNotVal(Op0))) { // ~A | Op1
- if (A == Op1) // ~A | A == -1
- return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
- } else {
- A = 0;
- }
- // Note, A is still live here!
- if ((B = dyn_castNotVal(Op1))) { // Op0 | ~B
- if (Op0 == B)
- return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
-
- // (~A | ~B) == (~(A & B)) - De Morgan's Law
- if (A && isOnlyUse(Op0) && isOnlyUse(Op1)) {
- Value *And = Builder->CreateAnd(A, B, I.getName()+".demorgan");
- return BinaryOperator::CreateNot(And);
- }
- }
+ // (~A | ~B) == (~(A & B)) - De Morgan's Law
+ if (Value *Op0NotVal = dyn_castNotVal(Op0))
+ if (Value *Op1NotVal = dyn_castNotVal(Op1))
+ if (Op0->hasOneUse() && Op1->hasOneUse()) {
+ Value *And = Builder->CreateAnd(Op0NotVal, Op1NotVal,
+ I.getName()+".demorgan");
+ return BinaryOperator::CreateNot(And);
+ }
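// Both this fold and its visitAnd counterpart above rest on De Morgan's
// laws; a standalone check over machine integers verifies them exhaustively
// on a small range:
#include <cassert>
#include <cstdint>
int main() {
  for (uint32_t a = 0; a != 64; ++a)
    for (uint32_t b = 0; b != 64; ++b) {
      assert((~a & ~b) == ~(a | b)); // the 'and' form folded in visitAnd
      assert((~a | ~b) == ~(a & b)); // the 'or' form folded here
    }
  return 0;
}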
// (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) {
@@ -5942,28 +5907,25 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
}
Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
- bool Changed = SimplifyCompare(I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ bool Changed = false;
+
+ /// Orders the operands of the compare so that they are listed from most
+ /// complex to least complex. This puts constants before unary operators,
+  /// which come before binary operators.
+ if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) {
+ I.swapOperands();
+ Changed = true;
+ }
- // Fold trivial predicates.
- if (I.getPredicate() == FCmpInst::FCMP_FALSE)
- return ReplaceInstUsesWith(I, ConstantInt::get(I.getType(), 0));
- if (I.getPredicate() == FCmpInst::FCMP_TRUE)
- return ReplaceInstUsesWith(I, ConstantInt::get(I.getType(), 1));
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, TD))
+ return ReplaceInstUsesWith(I, V);
+
// Simplify 'fcmp pred X, X'
if (Op0 == Op1) {
switch (I.getPredicate()) {
default: llvm_unreachable("Unknown predicate!");
- case FCmpInst::FCMP_UEQ: // True if unordered or equal
- case FCmpInst::FCMP_UGE: // True if unordered, greater than, or equal
- case FCmpInst::FCMP_ULE: // True if unordered, less than, or equal
- return ReplaceInstUsesWith(I, ConstantInt::get(I.getType(), 1));
- case FCmpInst::FCMP_OGT: // True if ordered and greater than
- case FCmpInst::FCMP_OLT: // True if ordered and less than
- case FCmpInst::FCMP_ONE: // True if ordered and operands are unequal
- return ReplaceInstUsesWith(I, ConstantInt::get(I.getType(), 0));
-
case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y)
case FCmpInst::FCMP_ULT: // True if unordered or less than
case FCmpInst::FCMP_UGT: // True if unordered or greater than
@@ -5984,23 +5946,8 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
}
}
- if (isa<UndefValue>(Op1)) // fcmp pred X, undef -> undef
- return ReplaceInstUsesWith(I, UndefValue::get(I.getType()));
-
// Handle fcmp with constant RHS
if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
- // If the constant is a nan, see if we can fold the comparison based on it.
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
- if (CFP->getValueAPF().isNaN()) {
- if (FCmpInst::isOrdered(I.getPredicate())) // True if ordered and...
- return ReplaceInstUsesWith(I, ConstantInt::getFalse(*Context));
- assert(FCmpInst::isUnordered(I.getPredicate()) &&
- "Comparison must be either ordered or unordered!");
- // True if unordered.
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(*Context));
- }
- }
-
if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
switch (LHSI->getOpcode()) {
case Instruction::PHI:
@@ -6047,26 +5994,22 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
}
Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
- bool Changed = SimplifyCompare(I);
+ bool Changed = false;
+
+ /// Orders the operands of the compare so that they are listed from most
+ /// complex to least complex. This puts constants before unary operators,
+  /// which come before binary operators.
+ if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) {
+ I.swapOperands();
+ Changed = true;
+ }
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- const Type *Ty = Op0->getType();
-
- // icmp X, X
- if (Op0 == Op1)
- return ReplaceInstUsesWith(I, ConstantInt::get(I.getType(),
- I.isTrueWhenEqual()));
-
- if (isa<UndefValue>(Op1)) // X icmp undef -> undef
- return ReplaceInstUsesWith(I, UndefValue::get(I.getType()));
- // icmp <global/alloca*/null>, <global/alloca*/null> - Global/Stack value
- // addresses never equal each other! We already know that Op0 != Op1.
- if ((isa<GlobalValue>(Op0) || isa<AllocaInst>(Op0) ||
- isa<ConstantPointerNull>(Op0)) &&
- (isa<GlobalValue>(Op1) || isa<AllocaInst>(Op1) ||
- isa<ConstantPointerNull>(Op1)))
- return ReplaceInstUsesWith(I, ConstantInt::get(Type::getInt1Ty(*Context),
- !I.isTrueWhenEqual()));
+ if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, TD))
+ return ReplaceInstUsesWith(I, V);
+
+ const Type *Ty = Op0->getType();
// icmp's with boolean values can always be turned into bitwise operations
if (Ty == Type::getInt1Ty(*Context)) {
@@ -6131,27 +6074,24 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// If we have an icmp le or icmp ge instruction, turn it into the
// appropriate icmp lt or icmp gt instruction. This allows us to rely on
- // them being folded in the code below.
+ // them being folded in the code below. The SimplifyICmpInst code has
+ // already handled the edge cases for us, so we just assert on them.
switch (I.getPredicate()) {
default: break;
case ICmpInst::ICMP_ULE:
- if (CI->isMaxValue(false)) // A <=u MAX -> TRUE
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(*Context));
+ assert(!CI->isMaxValue(false)); // A <=u MAX -> TRUE
return new ICmpInst(ICmpInst::ICMP_ULT, Op0,
AddOne(CI));
case ICmpInst::ICMP_SLE:
- if (CI->isMaxValue(true)) // A <=s MAX -> TRUE
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(*Context));
+ assert(!CI->isMaxValue(true)); // A <=s MAX -> TRUE
return new ICmpInst(ICmpInst::ICMP_SLT, Op0,
AddOne(CI));
case ICmpInst::ICMP_UGE:
- if (CI->isMinValue(false)) // A >=u MIN -> TRUE
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(*Context));
+ assert(!CI->isMinValue(false)); // A >=u MIN -> TRUE
return new ICmpInst(ICmpInst::ICMP_UGT, Op0,
SubOne(CI));
case ICmpInst::ICMP_SGE:
- if (CI->isMinValue(true)) // A >=s MIN -> TRUE
- return ReplaceInstUsesWith(I, ConstantInt::getTrue(*Context));
+ assert(!CI->isMinValue(true)); // A >=s MIN -> TRUE
return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
SubOne(CI));
}
@@ -8083,8 +8023,7 @@ bool InstCombiner::CanEvaluateInDifferentType(Value *V, const Type *Ty,
Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty,
bool isSigned) {
if (Constant *C = dyn_cast<Constant>(V))
- return ConstantExpr::getIntegerCast(C, Ty,
- isSigned /*Sext or ZExt*/);
+ return ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
// Otherwise, it must be an instruction.
Instruction *I = cast<Instruction>(V);
@@ -8117,8 +8056,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty,
return I->getOperand(0);
// Otherwise, must be the same type of cast, so just reinsert a new one.
- Res = CastInst::Create(cast<CastInst>(I)->getOpcode(), I->getOperand(0),
- Ty);
+ Res = CastInst::Create(cast<CastInst>(I)->getOpcode(), I->getOperand(0),Ty);
break;
case Instruction::Select: {
Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
@@ -8167,9 +8105,15 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
return NV;
// If we are casting a PHI then fold the cast into the PHI
- if (isa<PHINode>(Src))
- if (Instruction *NV = FoldOpIntoPhi(CI))
- return NV;
+ if (isa<PHINode>(Src)) {
+ // We don't do this if this would create a PHI node with an illegal type if
+ // it is currently legal.
+ if (!isa<IntegerType>(Src->getType()) ||
+ !isa<IntegerType>(CI.getType()) ||
+ ShouldChangeType(CI.getType(), Src->getType(), TD))
+ if (Instruction *NV = FoldOpIntoPhi(CI))
+ return NV;
+ }
return 0;
}
@@ -8289,23 +8233,6 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
return commonCastTransforms(CI);
}
-/// isSafeIntegerType - Return true if this is a basic integer type, not a crazy
-/// type like i42. We don't want to introduce operations on random non-legal
-/// integer types where they don't already exist in the code. In the future,
-/// we should consider making this based off target-data, so that 32-bit targets
-/// won't get i64 operations etc.
-static bool isSafeIntegerType(const Type *Ty) {
- switch (Ty->getPrimitiveSizeInBits()) {
- case 8:
- case 16:
- case 32:
- case 64:
- return true;
- default:
- return false;
- }
-}
-
/// commonIntCastTransforms - This function implements the common transforms
/// for trunc, zext, and sext.
Instruction *InstCombiner::commonIntCastTransforms(CastInst &CI) {
@@ -8334,8 +8261,8 @@ Instruction *InstCombiner::commonIntCastTransforms(CastInst &CI) {
// Only do this if the dest type is a simple type, don't convert the
// expression tree to something weird like i93 unless the source is also
// strange.
- if ((isSafeIntegerType(DestTy->getScalarType()) ||
- !isSafeIntegerType(SrcI->getType()->getScalarType())) &&
+ if ((isa<VectorType>(DestTy) ||
+ ShouldChangeType(SrcI->getType(), DestTy, TD)) &&
CanEvaluateInDifferentType(SrcI, DestTy,
CI.getOpcode(), NumCastsRemoved)) {
// If this cast is a truncate, evaluting in a different type always
@@ -8356,6 +8283,7 @@ Instruction *InstCombiner::commonIntCastTransforms(CastInst &CI) {
break;
case Instruction::ZExt: {
DoXForm = NumCastsRemoved >= 1;
+
if (!DoXForm && 0) {
// If it's unnecessary to issue an AND to clear the high bits, it's
// always profitable to do this xform.
@@ -8522,7 +8450,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
return BinaryOperator::CreateLShr(V1, V2);
}
}
-
+
return 0;
}
@@ -10880,9 +10808,10 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
}
-// FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
-// operator and they all are only used by the PHI, PHI together their
-// inputs, and do the operation once, to the result of the PHI.
+
+/// FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
+/// operator and they all are only used by the PHI, PHI together their
+/// inputs, and do the operation once, to the result of the PHI.
Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
@@ -10900,6 +10829,13 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
if (isa<CastInst>(FirstInst)) {
CastSrcTy = FirstInst->getOperand(0)->getType();
+
+ // Be careful about transforming integer PHIs. We don't want to pessimize
+ // the code by turning an i32 into an i1293.
+ if (isa<IntegerType>(PN.getType()) && isa<IntegerType>(CastSrcTy)) {
+ if (!ShouldChangeType(PN.getType(), CastSrcTy, TD))
+ return 0;
+ }
} else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
// Can fold binop, compare or shift here if the RHS is a constant,
// otherwise call FoldPHIArgBinOpIntoPHI.
@@ -11012,6 +10948,222 @@ static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
}
+namespace {
+struct PHIUsageRecord {
+  unsigned PHIId;  // The ID # of the PHI (something deterministic to sort on)
+ unsigned Shift; // The amount shifted.
+ Instruction *Inst; // The trunc instruction.
+
+ PHIUsageRecord(unsigned pn, unsigned Sh, Instruction *User)
+ : PHIId(pn), Shift(Sh), Inst(User) {}
+
+ bool operator<(const PHIUsageRecord &RHS) const {
+ if (PHIId < RHS.PHIId) return true;
+ if (PHIId > RHS.PHIId) return false;
+ if (Shift < RHS.Shift) return true;
+ if (Shift > RHS.Shift) return false;
+ return Inst->getType()->getPrimitiveSizeInBits() <
+ RHS.Inst->getType()->getPrimitiveSizeInBits();
+ }
+};
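// This ordering feeds array_pod_sort later on; a standalone model (plain
// std::sort over a hypothetical UseRec type, not part of the patch) shows how
// sorting by PHI id, then shift, then extracted width brings duplicate
// extracts together:
#include <algorithm>
#include <cassert>
#include <vector>
struct UseRec { unsigned PHIId, Shift, Width; }; // Width models the trunc type
static bool lessThan(const UseRec &L, const UseRec &R) {
  if (L.PHIId != R.PHIId) return L.PHIId < R.PHIId;
  if (L.Shift != R.Shift) return L.Shift < R.Shift;
  return L.Width < R.Width;
}
int main() {
  std::vector<UseRec> V = {{1, 0, 32}, {0, 32, 16}, {0, 0, 8}};
  std::sort(V.begin(), V.end(), lessThan);
  // Sorted by PHI, then offset, then width: identical extracts are now
  // adjacent, so reusing a previously lowered PHI is a simple map lookup.
  assert(V[0].PHIId == 0 && V[0].Shift == 0);
  assert(V[1].PHIId == 0 && V[1].Shift == 32);
  assert(V[2].PHIId == 1);
  return 0;
}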
+
+struct LoweredPHIRecord {
+ PHINode *PN; // The PHI that was lowered.
+ unsigned Shift; // The amount shifted.
+ unsigned Width; // The width extracted.
+
+ LoweredPHIRecord(PHINode *pn, unsigned Sh, const Type *Ty)
+ : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {}
+
+ // Ctor form used by DenseMap.
+ LoweredPHIRecord(PHINode *pn, unsigned Sh)
+ : PN(pn), Shift(Sh), Width(0) {}
+};
+}
+
+namespace llvm {
+ template<>
+ struct DenseMapInfo<LoweredPHIRecord> {
+ static inline LoweredPHIRecord getEmptyKey() {
+ return LoweredPHIRecord(0, 0);
+ }
+ static inline LoweredPHIRecord getTombstoneKey() {
+ return LoweredPHIRecord(0, 1);
+ }
+ static unsigned getHashValue(const LoweredPHIRecord &Val) {
+ return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^
+ (Val.Width>>3);
+ }
+ static bool isEqual(const LoweredPHIRecord &LHS,
+ const LoweredPHIRecord &RHS) {
+ return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift &&
+ LHS.Width == RHS.Width;
+ }
+ static bool isPod() { return true; }
+ };
+}
+
+
+/// SliceUpIllegalIntegerPHI - This is an integer PHI and we know that it has an
+/// illegal type: see if it is only used by trunc or trunc(lshr) operations. If
+/// so, we split the PHI into the various pieces being extracted. This sort of
+/// thing is introduced when SROA promotes an aggregate to large integer values.
+///
+/// TODO: The user of the trunc may be a bitcast to float/double/vector or an
+/// inttoptr. We should produce new PHIs in the right type.
+///
+Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
+ // PHIUsers - Keep track of all of the truncated values extracted from a set
+ // of PHIs, along with their offset. These are the things we want to rewrite.
+ SmallVector<PHIUsageRecord, 16> PHIUsers;
+
+ // PHIs are often mutually cyclic, so we keep track of a whole set of PHI
+  // nodes which are extracted from.  PHIsInspected is a set we use to avoid
+  // revisiting PHIs; PHIsToSlice is an ordered list of PHIs that we need to
+ // check the uses of (to ensure they are all extracts).
+ SmallVector<PHINode*, 8> PHIsToSlice;
+ SmallPtrSet<PHINode*, 8> PHIsInspected;
+
+ PHIsToSlice.push_back(&FirstPhi);
+ PHIsInspected.insert(&FirstPhi);
+
+ for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) {
+ PHINode *PN = PHIsToSlice[PHIId];
+
+ for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end();
+ UI != E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+
+ // If the user is a PHI, inspect its uses recursively.
+ if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (PHIsInspected.insert(UserPN))
+ PHIsToSlice.push_back(UserPN);
+ continue;
+ }
+
+ // Truncates are always ok.
+ if (isa<TruncInst>(User)) {
+ PHIUsers.push_back(PHIUsageRecord(PHIId, 0, User));
+ continue;
+ }
+
+ // Otherwise it must be a lshr which can only be used by one trunc.
+ if (User->getOpcode() != Instruction::LShr ||
+ !User->hasOneUse() || !isa<TruncInst>(User->use_back()) ||
+ !isa<ConstantInt>(User->getOperand(1)))
+ return 0;
+
+ unsigned Shift = cast<ConstantInt>(User->getOperand(1))->getZExtValue();
+ PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, User->use_back()));
+ }
+ }
+
+  // If we have no users, they must all be self-uses; just nuke the PHI.
+ if (PHIUsers.empty())
+ return ReplaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType()));
+
+ // If this phi node is transformable, create new PHIs for all the pieces
+ // extracted out of it. First, sort the users by their offset and size.
+ array_pod_sort(PHIUsers.begin(), PHIUsers.end());
+
+ DEBUG(errs() << "SLICING UP PHI: " << FirstPhi << '\n';
+ for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+ errs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] <<'\n';
+ );
+
+ // PredValues - This is a temporary used when rewriting PHI nodes. It is
+ // hoisted out here to avoid construction/destruction thrashing.
+ DenseMap<BasicBlock*, Value*> PredValues;
+
+ // ExtractedVals - Each new PHI we introduce is saved here so we don't
+ // introduce redundant PHIs.
+ DenseMap<LoweredPHIRecord, PHINode*> ExtractedVals;
+
+ for (unsigned UserI = 0, UserE = PHIUsers.size(); UserI != UserE; ++UserI) {
+ unsigned PHIId = PHIUsers[UserI].PHIId;
+ PHINode *PN = PHIsToSlice[PHIId];
+ unsigned Offset = PHIUsers[UserI].Shift;
+ const Type *Ty = PHIUsers[UserI].Inst->getType();
+
+ PHINode *EltPHI;
+
+ // If we've already lowered a user like this, reuse the previously lowered
+ // value.
+ if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == 0) {
+
+      // Otherwise, create the new PHI node for this user.
+ EltPHI = PHINode::Create(Ty, PN->getName()+".off"+Twine(Offset), PN);
+ assert(EltPHI->getType() != PN->getType() &&
+ "Truncate didn't shrink phi?");
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *Pred = PN->getIncomingBlock(i);
+ Value *&PredVal = PredValues[Pred];
+
+ // If we already have a value for this predecessor, reuse it.
+ if (PredVal) {
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+ }
+
+ // Handle the PHI self-reuse case.
+ Value *InVal = PN->getIncomingValue(i);
+ if (InVal == PN) {
+ PredVal = EltPHI;
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+        } else if (PHINode *InPHI = dyn_cast<PHINode>(InVal)) {
+          // If the incoming value was a PHI, and if it was one of the PHIs we
+          // already rewrote, just use the lowered value.
+ if (Value *Res = ExtractedVals[LoweredPHIRecord(InPHI, Offset, Ty)]) {
+ PredVal = Res;
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+ }
+ }
+
+ // Otherwise, do an extract in the predecessor.
+ Builder->SetInsertPoint(Pred, Pred->getTerminator());
+ Value *Res = InVal;
+ if (Offset)
+ Res = Builder->CreateLShr(Res, ConstantInt::get(InVal->getType(),
+ Offset), "extract");
+ Res = Builder->CreateTrunc(Res, Ty, "extract.t");
+ PredVal = Res;
+ EltPHI->addIncoming(Res, Pred);
+
+ // If the incoming value was a PHI, and if it was one of the PHIs we are
+ // rewriting, we will ultimately delete the code we inserted. This
+ // means we need to revisit that PHI to make sure we extract out the
+ // needed piece.
+ if (PHINode *OldInVal = dyn_cast<PHINode>(PN->getIncomingValue(i)))
+ if (PHIsInspected.count(OldInVal)) {
+ unsigned RefPHIId = std::find(PHIsToSlice.begin(),PHIsToSlice.end(),
+ OldInVal)-PHIsToSlice.begin();
+ PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset,
+ cast<Instruction>(Res)));
+ ++UserE;
+ }
+ }
+ PredValues.clear();
+
+ DEBUG(errs() << " Made element PHI for offset " << Offset << ": "
+ << *EltPHI << '\n');
+ ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
+ }
+
+ // Replace the use of this piece with the PHI node.
+ ReplaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
+ }
+
+ // Replace all the remaining uses of the PHI nodes (self uses and the lshrs)
+ // with undefs.
+ Value *Undef = UndefValue::get(FirstPhi.getType());
+ for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+ ReplaceInstUsesWith(*PHIsToSlice[i], Undef);
+ return ReplaceInstUsesWith(FirstPhi, Undef);
+}
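// Each per-predecessor extract emitted above is just a logical shift right
// followed by a truncate; a standalone scalar model over uint64_t:
#include <cassert>
#include <cstdint>
// Extract 'width' bits of 'v' starting at bit 'offset' -- the scalar
// analogue of the CreateLShr + CreateTrunc pair built per predecessor.
static uint64_t extractPiece(uint64_t v, unsigned offset, unsigned width) {
  uint64_t shifted = v >> offset;                           // CreateLShr
  uint64_t mask = width >= 64 ? ~0ULL : ((1ULL << width) - 1);
  return shifted & mask;                                    // CreateTrunc
}
int main() {
  uint64_t v = 0x1122334455667788ULL;
  assert(extractPiece(v, 0, 32) == 0x55667788ULL);  // trunc to i32
  assert(extractPiece(v, 32, 32) == 0x11223344ULL); // lshr 32 + trunc to i32
  assert(extractPiece(v, 48, 16) == 0x1122ULL);     // lshr 48 + trunc to i16
  return 0;
}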
+
// PHINode simplification
//
Instruction *InstCombiner::visitPHINode(PHINode &PN) {
@@ -11117,6 +11269,15 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
}
}
+ // If this is an integer PHI and we know that it has an illegal type, see if
+ // it is only used by trunc or trunc(lshr) operations. If so, we split the
+ // PHI into the various pieces being extracted. This sort of thing is
+ // introduced when SROA promotes an aggregate to a single large integer type.
+ if (isa<IntegerType>(PN.getType()) && TD &&
+ !TD->isLegalInteger(PN.getType()->getPrimitiveSizeInBits()))
+ if (Instruction *Res = SliceUpIllegalIntegerPHI(PN))
+ return Res;
+
return 0;
}
@@ -12210,6 +12371,47 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
return ExtractValueInst::Create(IV->getInsertedValueOperand(),
exti, exte);
}
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Agg)) {
+    // We're extracting from an intrinsic; see if we're its only user, which
+    // allows us to simplify multiple-result intrinsics to simpler things that
+    // just produce one value.
+ if (II->hasOneUse()) {
+ // Check if we're grabbing the overflow bit or the result of a 'with
+ // overflow' intrinsic. If it's the latter we can remove the intrinsic
+ // and replace it with a traditional binary instruction.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::sadd_with_overflow:
+ if (*EV.idx_begin() == 0) { // Normal result.
+ Value *LHS = II->getOperand(1), *RHS = II->getOperand(2);
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ EraseInstFromFunction(*II);
+ return BinaryOperator::CreateAdd(LHS, RHS);
+ }
+ break;
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ if (*EV.idx_begin() == 0) { // Normal result.
+ Value *LHS = II->getOperand(1), *RHS = II->getOperand(2);
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ EraseInstFromFunction(*II);
+ return BinaryOperator::CreateSub(LHS, RHS);
+ }
+ break;
+ case Intrinsic::umul_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ if (*EV.idx_begin() == 0) { // Normal result.
+ Value *LHS = II->getOperand(1), *RHS = II->getOperand(2);
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ EraseInstFromFunction(*II);
+ return BinaryOperator::CreateMul(LHS, RHS);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
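// A standalone model of the fold (a std::pair stands in for the intrinsic's
// {iN, i1} aggregate; not part of the patch): when only element 0 of
// uadd.with.overflow is extracted, the whole computation degrades to an
// ordinary add, and the two must agree:
#include <cassert>
#include <cstdint>
#include <utility>
static std::pair<uint32_t, bool> uaddWithOverflow(uint32_t a, uint32_t b) {
  uint32_t r = a + b;              // wraps, like the intrinsic's first result
  return std::make_pair(r, r < a); // carry-out is the overflow bit
}
int main() {
  uint32_t a = 0xFFFFFFF0u, b = 0x20u;
  // Only the first element is used, so the intrinsic can be replaced by a
  // plain add -- the fold is sound exactly because these always match:
  assert(uaddWithOverflow(a, b).first == a + b);
  return 0;
}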
// Can't simplify extracts from other values. Note that nested extracts are
 // already simplified implicitly by the above (extract ( extract (insert) )
// will be translated into extract ( insert ( extract ) ) first and then just
@@ -12715,29 +12917,33 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
if (isa<UndefValue>(RHS)) {
std::vector<unsigned> LHSMask = getShuffleMask(LHSSVI);
- std::vector<unsigned> NewMask;
- for (unsigned i = 0, e = Mask.size(); i != e; ++i)
- if (Mask[i] >= 2*e)
- NewMask.push_back(2*e);
- else
- NewMask.push_back(LHSMask[Mask[i]]);
+ if (LHSMask.size() == Mask.size()) {
+ std::vector<unsigned> NewMask;
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i)
+ if (Mask[i] >= 2*e)
+ NewMask.push_back(2*e);
+ else
+ NewMask.push_back(LHSMask[Mask[i]]);
- // If the result mask is equal to the src shuffle or this shuffle mask, do
- // the replacement.
- if (NewMask == LHSMask || NewMask == Mask) {
- unsigned LHSInNElts =
- cast<VectorType>(LHSSVI->getOperand(0)->getType())->getNumElements();
- std::vector<Constant*> Elts;
- for (unsigned i = 0, e = NewMask.size(); i != e; ++i) {
- if (NewMask[i] >= LHSInNElts*2) {
- Elts.push_back(UndefValue::get(Type::getInt32Ty(*Context)));
- } else {
- Elts.push_back(ConstantInt::get(Type::getInt32Ty(*Context), NewMask[i]));
+ // If the result mask is equal to the src shuffle or this
+ // shuffle mask, do the replacement.
+ if (NewMask == LHSMask || NewMask == Mask) {
+ unsigned LHSInNElts =
+ cast<VectorType>(LHSSVI->getOperand(0)->getType())->
+ getNumElements();
+ std::vector<Constant*> Elts;
+ for (unsigned i = 0, e = NewMask.size(); i != e; ++i) {
+ if (NewMask[i] >= LHSInNElts*2) {
+ Elts.push_back(UndefValue::get(Type::getInt32Ty(*Context)));
+ } else {
+ Elts.push_back(ConstantInt::get(Type::getInt32Ty(*Context),
+ NewMask[i]));
+ }
}
+ return new ShuffleVectorInst(LHSSVI->getOperand(0),
+ LHSSVI->getOperand(1),
+ ConstantVector::get(Elts));
}
- return new ShuffleVectorInst(LHSSVI->getOperand(0),
- LHSSVI->getOperand(1),
- ConstantVector::get(Elts));
}
}
}
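// The mask composition itself is plain index arithmetic; a standalone sketch
// (2*e is the undef-lane sentinel, as above) also shows why the new
// LHSMask.size() == Mask.size() guard matters: without it, Mask entries could
// index LHSMask out of range.
#include <cassert>
#include <vector>
// NewMask[i] is either the undef sentinel or a lane looked up through LHSMask.
static std::vector<unsigned> compose(const std::vector<unsigned> &Mask,
                                     const std::vector<unsigned> &LHSMask) {
  unsigned e = Mask.size();
  std::vector<unsigned> NewMask;
  for (unsigned i = 0; i != e; ++i)
    NewMask.push_back(Mask[i] >= 2 * e ? 2 * e : LHSMask[Mask[i]]);
  return NewMask;
}
int main() {
  // shuffle(shuffle(x, undef, <2,3,0,1>), undef, <1,0,undef,2>)
  std::vector<unsigned> LHSMask = {2, 3, 0, 1};
  std::vector<unsigned> Mask = {1, 0, 8, 2}; // 8 == 2*e marks an undef lane
  assert((compose(Mask, LHSMask) == std::vector<unsigned>{3, 2, 8, 0}));
  return 0;
}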
@@ -12824,7 +13030,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
// ConstantProp instruction if trivially constant.
if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0)))
- if (Constant *C = ConstantFoldInstruction(Inst, BB->getContext(), TD)) {
+ if (Constant *C = ConstantFoldInstruction(Inst, TD)) {
DEBUG(errs() << "IC: ConstFold to: " << *C << " from: "
<< *Inst << '\n');
Inst->replaceAllUsesWith(C);
@@ -12846,8 +13052,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
if (!FoldedConstants.insert(CE))
continue;
- Constant *NewC =
- ConstantFoldConstantExpression(CE, BB->getContext(), TD);
+ Constant *NewC = ConstantFoldConstantExpression(CE, TD);
if (NewC && NewC != CE) {
*i = NewC;
MadeIRChange = true;
@@ -12954,7 +13159,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
// Instruction isn't dead, see if we can constant propagate it.
if (!I->use_empty() && isa<Constant>(I->getOperand(0)))
- if (Constant *C = ConstantFoldInstruction(I, F.getContext(), TD)) {
+ if (Constant *C = ConstantFoldInstruction(I, TD)) {
DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
// Add operands to the worklist.
@@ -13065,7 +13270,7 @@ bool InstCombiner::runOnFunction(Function &F) {
/// Builder - This is an IRBuilder that automatically inserts new
/// instructions into the worklist when they are created.
IRBuilder<true, TargetFolder, InstCombineIRInserter>
- TheBuilder(F.getContext(), TargetFolder(TD, F.getContext()),
+ TheBuilder(F.getContext(), TargetFolder(TD),
InstCombineIRInserter(Worklist));
Builder = &TheBuilder;
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 10c9ec6..5864113 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -16,7 +16,8 @@
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
#include "llvm/Pass.h"
-#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -40,6 +41,12 @@ Threshold("jump-threading-threshold",
cl::desc("Max block size to duplicate for jump threading"),
cl::init(6), cl::Hidden);
+// Turn on use of LazyValueInfo.
+static cl::opt<bool>
+EnableLVI("enable-jump-threading-lvi", cl::ReallyHidden);
+
+
+
namespace {
/// This pass performs 'jump threading', which looks at blocks that have
/// multiple predecessors and multiple successors. If one or more of the
@@ -59,6 +66,7 @@ namespace {
///
class JumpThreading : public FunctionPass {
TargetData *TD;
+ LazyValueInfo *LVI;
#ifdef NDEBUG
SmallPtrSet<BasicBlock*, 16> LoopHeaders;
#else
@@ -69,20 +77,31 @@ namespace {
JumpThreading() : FunctionPass(&ID) {}
bool runOnFunction(Function &F);
- void FindLoopHeaders(Function &F);
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ if (EnableLVI)
+ AU.addRequired<LazyValueInfo>();
+ }
+
+ void FindLoopHeaders(Function &F);
bool ProcessBlock(BasicBlock *BB);
- bool ThreadEdge(BasicBlock *BB, BasicBlock *PredBB, BasicBlock *SuccBB);
+ bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs,
+ BasicBlock *SuccBB);
bool DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
BasicBlock *PredBB);
-
- BasicBlock *FactorCommonPHIPreds(PHINode *PN, Value *Val);
+
+ typedef SmallVectorImpl<std::pair<ConstantInt*,
+ BasicBlock*> > PredValueInfo;
+
+ bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,
+ PredValueInfo &Result);
+ bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB);
+
+
bool ProcessBranchOnDuplicateCond(BasicBlock *PredBB, BasicBlock *DestBB);
bool ProcessSwitchOnDuplicateCond(BasicBlock *PredBB, BasicBlock *DestBB);
bool ProcessJumpOnPHI(PHINode *PN);
- bool ProcessBranchOnLogical(Value *V, BasicBlock *BB, bool isAnd);
- bool ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB);
bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
};
@@ -100,6 +119,7 @@ FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); }
bool JumpThreading::runOnFunction(Function &F) {
DEBUG(errs() << "Jump threading on function '" << F.getName() << "'\n");
TD = getAnalysisIfAvailable<TargetData>();
+ LVI = EnableLVI ? &getAnalysis<LazyValueInfo>() : 0;
FindLoopHeaders(F);
@@ -109,6 +129,7 @@ bool JumpThreading::runOnFunction(Function &F) {
bool Changed = false;
for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
BasicBlock *BB = I;
+ // Thread all of the branches we can over this block.
while (ProcessBlock(BB))
Changed = true;
@@ -123,6 +144,29 @@ bool JumpThreading::runOnFunction(Function &F) {
LoopHeaders.erase(BB);
DeleteDeadBlock(BB);
Changed = true;
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ // Can't thread an unconditional jump, but if the block is "almost
+ // empty", we can replace uses of it with uses of the successor and make
+ // this dead.
+ if (BI->isUnconditional() &&
+ BB != &BB->getParent()->getEntryBlock()) {
+ BasicBlock::iterator BBI = BB->getFirstNonPHI();
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(BBI))
+ ++BBI;
+ // If the terminator is the only non-phi instruction, try to nuke it.
+ if (BBI->isTerminator()) {
+ // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
+ // block, we have to make sure it isn't in the LoopHeaders set. We
+ // reinsert afterward in the rare case when the block isn't deleted.
+ bool ErasedFromLoopHeaders = LoopHeaders.erase(BB);
+
+ if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
+ Changed = true;
+ else if (ErasedFromLoopHeaders)
+ LoopHeaders.insert(BB);
+ }
+ }
}
}
AnotherIteration = Changed;
@@ -139,6 +183,10 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
/// Ignore PHI nodes, these will be flattened when duplication happens.
BasicBlock::const_iterator I = BB->getFirstNonPHI();
+ // FIXME: THREADING will delete values that are just used to compute the
+ // branch, so they shouldn't count against the duplication cost.
+
+
// Sum up the cost of each instruction until we get to the terminator. Don't
// include the terminator because the copy won't include it.
unsigned Size = 0;
@@ -173,8 +221,6 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
return Size;
}
-
-
/// FindLoopHeaders - We do not want jump threading to turn proper loop
/// structures into irreducible loops. Doing this breaks up the loop nesting
/// hierarchy and pessimizes later transformations. To prevent this from
@@ -198,29 +244,181 @@ void JumpThreading::FindLoopHeaders(Function &F) {
LoopHeaders.insert(const_cast<BasicBlock*>(Edges[i].second));
}
-
-/// FactorCommonPHIPreds - If there are multiple preds with the same incoming
-/// value for the PHI, factor them together so we get one block to thread for
-/// the whole group.
-/// This is important for things like "phi i1 [true, true, false, true, x]"
-/// where we only need to clone the block for the true blocks once.
+/// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see
+/// if we can infer that the value is a known ConstantInt in any of our
+/// predecessors. If so, return the list of known (value, predecessor) pairs
+/// in the result vector. If a value is known to be undef, it is returned as null.
+///
+/// This returns true if there were any known values.
///
-BasicBlock *JumpThreading::FactorCommonPHIPreds(PHINode *PN, Value *Val) {
- SmallVector<BasicBlock*, 16> CommonPreds;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == Val)
- CommonPreds.push_back(PN->getIncomingBlock(i));
-
- if (CommonPreds.size() == 1)
- return CommonPreds[0];
+bool JumpThreading::
+ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
+  // If V is a ConstantInt or undef, then it is known in all predecessors.
+ if (isa<ConstantInt>(V) || isa<UndefValue>(V)) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(V);
- DEBUG(errs() << " Factoring out " << CommonPreds.size()
- << " common predecessors.\n");
- return SplitBlockPredecessors(PN->getParent(),
- &CommonPreds[0], CommonPreds.size(),
- ".thr_comm", this);
-}
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ Result.push_back(std::make_pair(CI, *PI));
+ return true;
+ }
+
+ // If V is a non-instruction value, or an instruction in a different block,
+ // then it can't be derived from a PHI.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (I == 0 || I->getParent() != BB) {
+
+ // Okay, if this is a live-in value, see if it has a known value at the end
+ // of any of our predecessors.
+ //
+ // FIXME: This should be an edge property, not a block end property.
+ /// TODO: Per PR2563, we could infer value range information about a
+ /// predecessor based on its terminator.
+ //
+ if (LVI) {
+    // FIXME: change this to use the richer 'getPredicateOnEdge' method if
+ // "I" is a non-local compare-with-a-constant instruction. This would be
+ // able to handle value inequalities better, for example if the compare is
+ // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
+ // Perhaps getConstantOnEdge should be smart enough to do this?
+
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ Constant *PredCst = LVI->getConstantOnEdge(V, *PI, BB);
+ if (PredCst == 0 ||
+ (!isa<ConstantInt>(PredCst) && !isa<UndefValue>(PredCst)))
+ continue;
+
+ Result.push_back(std::make_pair(dyn_cast<ConstantInt>(PredCst), *PI));
+ }
+
+ return !Result.empty();
+ }
+
+ return false;
+ }
+
+  // If I is a PHI node, then we know the incoming values for any constants.
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
+ if (isa<ConstantInt>(InVal) || isa<UndefValue>(InVal)) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(InVal);
+ Result.push_back(std::make_pair(CI, PN->getIncomingBlock(i)));
+ }
+ }
+ return !Result.empty();
+ }
+
+ SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> LHSVals, RHSVals;
+
+ // Handle some boolean conditions.
+ if (I->getType()->getPrimitiveSizeInBits() == 1) {
+ // X | true -> true
+ // X & false -> false
+ if (I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::And) {
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals);
+ ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals);
+
+ if (LHSVals.empty() && RHSVals.empty())
+ return false;
+
+ ConstantInt *InterestingVal;
+ if (I->getOpcode() == Instruction::Or)
+ InterestingVal = ConstantInt::getTrue(I->getContext());
+ else
+ InterestingVal = ConstantInt::getFalse(I->getContext());
+
+ // Scan for the sentinel.
+ for (unsigned i = 0, e = LHSVals.size(); i != e; ++i)
+ if (LHSVals[i].first == InterestingVal || LHSVals[i].first == 0)
+ Result.push_back(LHSVals[i]);
+ for (unsigned i = 0, e = RHSVals.size(); i != e; ++i)
+ if (RHSVals[i].first == InterestingVal || RHSVals[i].first == 0)
+ Result.push_back(RHSVals[i]);
+ return !Result.empty();
+ }
+
+ // Handle the NOT form of XOR.
+ if (I->getOpcode() == Instruction::Xor &&
+ isa<ConstantInt>(I->getOperand(1)) &&
+ cast<ConstantInt>(I->getOperand(1))->isOne()) {
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result);
+ if (Result.empty())
+ return false;
+
+ // Invert the known values.
+ for (unsigned i = 0, e = Result.size(); i != e; ++i)
+ if (Result[i].first)
+ Result[i].first =
+ cast<ConstantInt>(ConstantExpr::getNot(Result[i].first));
+ return true;
+ }
+ }
+ // Handle compare with phi operand, where the PHI is defined in this block.
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+ PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0));
+ if (PN && PN->getParent() == BB) {
+ // We can do this simplification if any comparisons fold to true or false.
+ // See if any do.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ Value *LHS = PN->getIncomingValue(i);
+ Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB);
+
+ Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, TD);
+ if (Res == 0) {
+ if (!LVI || !isa<Constant>(RHS))
+ continue;
+
+ LazyValueInfo::Tristate
+ ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS,
+ cast<Constant>(RHS), PredBB, BB);
+ if (ResT == LazyValueInfo::Unknown)
+ continue;
+ Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
+ }
+
+ if (isa<UndefValue>(Res))
+ Result.push_back(std::make_pair((ConstantInt*)0, PredBB));
+ else if (ConstantInt *CI = dyn_cast<ConstantInt>(Res))
+ Result.push_back(std::make_pair(CI, PredBB));
+ }
+
+ return !Result.empty();
+ }
+
+
+ // If comparing a live-in value against a constant, see if we know the
+ // live-in value on any predecessors.
+ if (LVI && isa<Constant>(Cmp->getOperand(1)) &&
+ Cmp->getType()->isInteger() && // Not vector compare.
+ (!isa<Instruction>(Cmp->getOperand(0)) ||
+ cast<Instruction>(Cmp->getOperand(0))->getParent() != BB)) {
+ Constant *RHSCst = cast<Constant>(Cmp->getOperand(1));
+
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ LazyValueInfo::Tristate
+ Res = LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0),
+ RHSCst, *PI, BB);
+ if (Res == LazyValueInfo::Unknown)
+ continue;
+
+ Constant *ResC = ConstantInt::get(Cmp->getType(), Res);
+ Result.push_back(std::make_pair(cast<ConstantInt>(ResC), *PI));
+ }
+
+ return !Result.empty();
+ }
+ }
+ return false;
+}
+
+
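
// A sketch for intuition, not part of the LLVM sources: the boolean cases
// above follow ordinary three-valued logic. An Or with a known-true operand,
// an And with a known-false operand, or an Xor-with-one (logical not) of a
// known value pins the condition down for the predecessor supplying that
// value. A minimal standalone C++ model, where an empty optional stands for
// a value ComputeValueKnownInPredecessors could not determine:

#include <cassert>
#include <optional>

using Known = std::optional<bool>;

Known evalOr(Known A, Known B) {
  if ((A && *A) || (B && *B)) return true;     // X | true -> true
  if (A && B) return false;                    // false | false -> false
  return std::nullopt;
}

Known evalAnd(Known A, Known B) {
  if ((A && !*A) || (B && !*B)) return false;  // X & false -> false
  if (A && B) return true;                     // true & true -> true
  return std::nullopt;
}

Known evalNot(Known A) {                       // the 'xor X, 1' case
  if (A) return !*A;
  return std::nullopt;
}

int main() {
  assert(evalOr(std::nullopt, true) == Known(true));    // edge is threadable
  assert(evalAnd(false, std::nullopt) == Known(false)); // edge is threadable
  assert(!evalNot(std::nullopt).has_value());           // unknown stays unknown
}
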
/// GetBestDestForBranchOnUndef - If we determine that the specified block ends
/// in an undefined jump, decide which block is best to revector to.
@@ -251,7 +449,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
// successor, merge the blocks. This encourages recursive jump threading
// because now the condition in this block can be threaded through
// predecessors of our predecessor block.
- if (BasicBlock *SinglePred = BB->getSinglePredecessor())
+ if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
if (SinglePred->getTerminator()->getNumSuccessors() == 1 &&
SinglePred != BB) {
// If SinglePred was a loop header, BB becomes one.
@@ -267,10 +465,10 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
BB->moveBefore(&BB->getParent()->getEntryBlock());
return true;
}
-
- // See if this block ends with a branch or switch. If so, see if the
- // condition is a phi node. If so, and if an entry of the phi node is a
- // constant, we can thread the block.
+ }
+
+  // Look to see if the terminator is a branch or switch; if not, we can't
+  // thread it.
Value *Condition;
if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
// Can't thread an unconditional jump.
@@ -301,7 +499,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
TerminatorInst *BBTerm = BB->getTerminator();
for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
if (i == BestSucc) continue;
- BBTerm->getSuccessor(i)->removePredecessor(BB);
+ RemovePredecessorAndSimplify(BBTerm->getSuccessor(i), BB, TD);
}
DEBUG(errs() << " In block '" << BB->getName()
@@ -318,7 +516,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
// br COND, BBX, BBY
// BBX:
// br COND, BBZ, BBW
- if (!Condition->hasOneUse() && // Multiple uses.
+ if (!LVI &&
+ !Condition->hasOneUse() && // Multiple uses.
(CondInst == 0 || CondInst->getParent() != BB)) { // Non-local definition.
pred_iterator PI = pred_begin(BB), E = pred_end(BB);
if (isa<BranchInst>(BB->getTerminator())) {
@@ -338,52 +537,40 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
}
// All the rest of our checks depend on the condition being an instruction.
- if (CondInst == 0)
+ if (CondInst == 0) {
+ // FIXME: Unify this with code below.
+ if (LVI && ProcessThreadableEdges(Condition, BB))
+ return true;
return false;
+ }
+
// See if this is a phi node in the current block.
if (PHINode *PN = dyn_cast<PHINode>(CondInst))
if (PN->getParent() == BB)
return ProcessJumpOnPHI(PN);
- // If this is a conditional branch whose condition is and/or of a phi, try to
- // simplify it.
- if ((CondInst->getOpcode() == Instruction::And ||
- CondInst->getOpcode() == Instruction::Or) &&
- isa<BranchInst>(BB->getTerminator()) &&
- ProcessBranchOnLogical(CondInst, BB,
- CondInst->getOpcode() == Instruction::And))
- return true;
-
if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
- if (isa<PHINode>(CondCmp->getOperand(0))) {
- // If we have "br (phi != 42)" and the phi node has any constant values
- // as operands, we can thread through this block.
- //
- // If we have "br (cmp phi, x)" and the phi node contains x such that the
- // comparison uniquely identifies the branch target, we can thread
- // through this block.
-
- if (ProcessBranchOnCompare(CondCmp, BB))
- return true;
- }
-
- // If we have a comparison, loop over the predecessors to see if there is
- // a condition with the same value.
- pred_iterator PI = pred_begin(BB), E = pred_end(BB);
- for (; PI != E; ++PI)
- if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
- if (PBI->isConditional() && *PI != BB) {
- if (CmpInst *CI = dyn_cast<CmpInst>(PBI->getCondition())) {
- if (CI->getOperand(0) == CondCmp->getOperand(0) &&
- CI->getOperand(1) == CondCmp->getOperand(1) &&
- CI->getPredicate() == CondCmp->getPredicate()) {
- // TODO: Could handle things like (x != 4) --> (x == 17)
- if (ProcessBranchOnDuplicateCond(*PI, BB))
- return true;
+ if (!LVI &&
+ (!isa<PHINode>(CondCmp->getOperand(0)) ||
+ cast<PHINode>(CondCmp->getOperand(0))->getParent() != BB)) {
+ // If we have a comparison, loop over the predecessors to see if there is
+ // a condition with a lexically identical value.
+ pred_iterator PI = pred_begin(BB), E = pred_end(BB);
+ for (; PI != E; ++PI)
+ if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+ if (PBI->isConditional() && *PI != BB) {
+ if (CmpInst *CI = dyn_cast<CmpInst>(PBI->getCondition())) {
+ if (CI->getOperand(0) == CondCmp->getOperand(0) &&
+ CI->getOperand(1) == CondCmp->getOperand(1) &&
+ CI->getPredicate() == CondCmp->getPredicate()) {
+ // TODO: Could handle things like (x != 4) --> (x == 17)
+ if (ProcessBranchOnDuplicateCond(*PI, BB))
+ return true;
+ }
}
}
- }
+ }
}
// Check for some cases that are worth simplifying. Right now we want to look
@@ -398,10 +585,21 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
if (isa<Constant>(CondCmp->getOperand(1)))
SimplifyValue = CondCmp->getOperand(0);
+ // TODO: There are other places where load PRE would be profitable, such as
+ // more complex comparisons.
if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
if (SimplifyPartiallyRedundantLoad(LI))
return true;
+
+ // Handle a variety of cases where we are branching on something derived from
+ // a PHI node in the current block. If we can prove that any predecessors
+ // compute a predictable value based on a PHI node, thread those predecessors.
+ //
+ if (ProcessThreadableEdges(CondInst, BB))
+ return true;
+
+
// TODO: If we have: "br (X > 0)" and we have a predecessor where we know
// "(X == 4)" thread through this block.
@@ -459,8 +657,11 @@ bool JumpThreading::ProcessBranchOnDuplicateCond(BasicBlock *PredBB,
// Next, figure out which successor we are threading to.
BasicBlock *SuccBB = DestBI->getSuccessor(!BranchDir);
+ SmallVector<BasicBlock*, 2> Preds;
+ Preds.push_back(PredBB);
+
// Ok, try to thread it!
- return ThreadEdge(BB, PredBB, SuccBB);
+ return ThreadEdge(BB, Preds, SuccBB);
}
/// ProcessSwitchOnDuplicateCond - We found a block and a predecessor of that
@@ -553,7 +754,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
Value *LoadedPtr = LI->getOperand(0);
// If the loaded operand is defined in the LoadBB, it can't be available.
- // FIXME: Could do PHI translation, that would be fun :)
+ // TODO: Could do simple PHI translation, that would be fun :)
if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
if (PtrOp->getParent() == LoadBB)
return false;
@@ -562,8 +763,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// the entry to its block.
BasicBlock::iterator BBIt = LI;
- if (Value *AvailableVal = FindAvailableLoadedValue(LoadedPtr, LoadBB,
- BBIt, 6)) {
+ if (Value *AvailableVal =
+ FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
//cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
@@ -646,7 +847,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
// Split them out to their own block.
UnavailablePred =
SplitBlockPredecessors(LoadBB, &PredsToSplit[0], PredsToSplit.size(),
- "thread-split", this);
+ "thread-pre-split", this);
}
// If the value isn't available in all predecessors, then there will be
@@ -655,7 +856,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (UnavailablePred) {
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
"Can't handle critical edge here!");
- Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr",
+ Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false,
+ LI->getAlignment(),
UnavailablePred->getTerminator());
AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
}
@@ -690,55 +892,183 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
return true;
}
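
// For intuition, a source-level sketch (hand-written, not from this patch) of
// removing a partially redundant load: the load is copied onto the one edge
// where the value was unavailable, so the merge point sees it on every path.
// This is only sound when nothing can store to *p in between, which is what
// the availability scan above verifies.

int pre_before(bool flag, int *p) {
  int x = 0;
  if (flag)
    x = *p;        // load available on this edge only
  return x + *p;   // partially redundant load after the merge
}

int pre_after(bool flag, int *p) {
  int x = 0, avail;
  if (flag) {
    x = *p;
    avail = x;     // value was already loaded here
  } else {
    avail = *p;    // inserted load, akin to the ".pr" load created above
  }
  return x + avail;
}
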
-
-/// ProcessJumpOnPHI - We have a conditional branch or switch on a PHI node in
-/// the current block. See if there are any simplifications we can do based on
-/// inputs to the phi node.
-///
-bool JumpThreading::ProcessJumpOnPHI(PHINode *PN) {
- BasicBlock *BB = PN->getParent();
+/// FindMostPopularDest - The specified list contains multiple possible
+/// threadable destinations. Pick the one that occurs the most frequently in
+/// the list.
+static BasicBlock *
+FindMostPopularDest(BasicBlock *BB,
+ const SmallVectorImpl<std::pair<BasicBlock*,
+ BasicBlock*> > &PredToDestList) {
+ assert(!PredToDestList.empty());
+
+ // Determine popularity. If there are multiple possible destinations, we
+  // explicitly choose to ignore 'undef' destinations. We prefer to thread
+  // blocks with known and real destinations over threading undef. We'll handle
+ // them later if interesting.
+ DenseMap<BasicBlock*, unsigned> DestPopularity;
+ for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i)
+ if (PredToDestList[i].second)
+ DestPopularity[PredToDestList[i].second]++;
+
+ // Find the most popular dest.
+ DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin();
+ BasicBlock *MostPopularDest = DPI->first;
+ unsigned Popularity = DPI->second;
+ SmallVector<BasicBlock*, 4> SamePopularity;
+
+ for (++DPI; DPI != DestPopularity.end(); ++DPI) {
+ // If the popularity of this entry isn't higher than the popularity we've
+ // seen so far, ignore it.
+ if (DPI->second < Popularity)
+ ; // ignore.
+ else if (DPI->second == Popularity) {
+ // If it is the same as what we've seen so far, keep track of it.
+ SamePopularity.push_back(DPI->first);
+ } else {
+ // If it is more popular, remember it.
+ SamePopularity.clear();
+ MostPopularDest = DPI->first;
+ Popularity = DPI->second;
+ }
+ }
- // See if the phi node has any constant integer or undef values. If so, we
- // can determine where the corresponding predecessor will branch.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *PredVal = PN->getIncomingValue(i);
-
- // Check to see if this input is a constant integer. If so, the direction
- // of the branch is predictable.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(PredVal)) {
- // Merge any common predecessors that will act the same.
- BasicBlock *PredBB = FactorCommonPHIPreds(PN, CI);
+  // Okay, now we know the most popular destination. If there is more than one
+ // destination, we need to determine one. This is arbitrary, but we need
+ // to make a deterministic decision. Pick the first one that appears in the
+ // successor list.
+ if (!SamePopularity.empty()) {
+ SamePopularity.push_back(MostPopularDest);
+ TerminatorInst *TI = BB->getTerminator();
+ for (unsigned i = 0; ; ++i) {
+ assert(i != TI->getNumSuccessors() && "Didn't find any successor!");
- BasicBlock *SuccBB;
- if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
- SuccBB = BI->getSuccessor(CI->isZero());
- else {
- SwitchInst *SI = cast<SwitchInst>(BB->getTerminator());
- SuccBB = SI->getSuccessor(SI->findCaseValue(CI));
- }
+ if (std::find(SamePopularity.begin(), SamePopularity.end(),
+ TI->getSuccessor(i)) == SamePopularity.end())
+ continue;
- // Ok, try to thread it!
- return ThreadEdge(BB, PredBB, SuccBB);
+ MostPopularDest = TI->getSuccessor(i);
+ break;
}
+ }
+
+ // Okay, we have finally picked the most popular destination.
+ return MostPopularDest;
+}
+
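
// The popularity computation is easy to check in isolation. A standalone
// sketch (illustrative only; plain STL containers stand in for LLVM's
// DenseMap), including the deterministic tie-break against an ordered
// successor list:

#include <cassert>
#include <map>
#include <string>
#include <vector>

std::string mostPopularDest(const std::vector<std::string> &Dests,
                            const std::vector<std::string> &SuccOrder) {
  std::map<std::string, unsigned> Popularity;
  for (const std::string &D : Dests)
    ++Popularity[D];

  // Track the best count seen so far and every destination tied for it.
  unsigned Best = 0;
  std::vector<std::string> Tied;
  for (const auto &Entry : Popularity) {
    if (Entry.second < Best)
      continue;
    if (Entry.second > Best)
      Tied.clear();
    Best = Entry.second;
    Tied.push_back(Entry.first);
  }

  // Deterministic tie-break: first tied candidate in successor order.
  for (const std::string &S : SuccOrder)
    for (const std::string &T : Tied)
      if (S == T)
        return S;
  return Tied.front();
}

int main() {
  // "bb1" and "bb2" are tied 2-2; "bb1" wins because it appears first in
  // the successor list, matching FindMostPopularDest's behavior.
  std::vector<std::string> Dests = {"bb2", "bb1", "bb2", "bb1"};
  std::vector<std::string> Succs = {"bb1", "bb2", "bb3"};
  assert(mostPopularDest(Dests, Succs) == "bb1");
}
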
+bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB) {
+ // If threading this would thread across a loop header, don't even try to
+ // thread the edge.
+ if (LoopHeaders.count(BB))
+ return false;
+
+ SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> PredValues;
+ if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues))
+ return false;
+ assert(!PredValues.empty() &&
+ "ComputeValueKnownInPredecessors returned true with no values");
+
+ DEBUG(errs() << "IN BB: " << *BB;
+ for (unsigned i = 0, e = PredValues.size(); i != e; ++i) {
+ errs() << " BB '" << BB->getName() << "': FOUND condition = ";
+ if (PredValues[i].first)
+ errs() << *PredValues[i].first;
+ else
+ errs() << "UNDEF";
+ errs() << " for pred '" << PredValues[i].second->getName()
+ << "'.\n";
+ });
+
+ // Decide what we want to thread through. Convert our list of known values to
+ // a list of known destinations for each pred. This also discards duplicate
+ // predecessors and keeps track of the undefined inputs (which are represented
+ // as a null dest in the PredToDestList).
+ SmallPtrSet<BasicBlock*, 16> SeenPreds;
+ SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
+
+ BasicBlock *OnlyDest = 0;
+ BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
+
+ for (unsigned i = 0, e = PredValues.size(); i != e; ++i) {
+ BasicBlock *Pred = PredValues[i].second;
+ if (!SeenPreds.insert(Pred))
+ continue; // Duplicate predecessor entry.
- // If the input is an undef, then it doesn't matter which way it will go.
- // Pick an arbitrary dest and thread the edge.
- if (UndefValue *UV = dyn_cast<UndefValue>(PredVal)) {
- // Merge any common predecessors that will act the same.
- BasicBlock *PredBB = FactorCommonPHIPreds(PN, UV);
- BasicBlock *SuccBB =
- BB->getTerminator()->getSuccessor(GetBestDestForJumpOnUndef(BB));
-
- // Ok, try to thread it!
- return ThreadEdge(BB, PredBB, SuccBB);
+ // If the predecessor ends with an indirect goto, we can't change its
+ // destination.
+ if (isa<IndirectBrInst>(Pred->getTerminator()))
+ continue;
+
+ ConstantInt *Val = PredValues[i].first;
+
+ BasicBlock *DestBB;
+ if (Val == 0) // Undef.
+ DestBB = 0;
+ else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+ DestBB = BI->getSuccessor(Val->isZero());
+ else {
+ SwitchInst *SI = cast<SwitchInst>(BB->getTerminator());
+ DestBB = SI->getSuccessor(SI->findCaseValue(Val));
}
+
+ // If we have exactly one destination, remember it for efficiency below.
+ if (i == 0)
+ OnlyDest = DestBB;
+ else if (OnlyDest != DestBB)
+ OnlyDest = MultipleDestSentinel;
+
+ PredToDestList.push_back(std::make_pair(Pred, DestBB));
}
- // If the incoming values are all variables, we don't know the destination of
- // any predecessors. However, if any of the predecessor blocks end in an
- // unconditional branch, we can *duplicate* the jump into that block in order
- // to further encourage jump threading and to eliminate cases where we have
- // branch on a phi of an icmp (branch on icmp is much better).
+ // If all edges were unthreadable, we fail.
+ if (PredToDestList.empty())
+ return false;
+
+ // Determine which is the most common successor. If we have many inputs and
+ // this block is a switch, we want to start by threading the batch that goes
+ // to the most popular destination first. If we only know about one
+ // threadable destination (the common case) we can avoid this.
+ BasicBlock *MostPopularDest = OnlyDest;
+
+ if (MostPopularDest == MultipleDestSentinel)
+ MostPopularDest = FindMostPopularDest(BB, PredToDestList);
+
+ // Now that we know what the most popular destination is, factor all
+ // predecessors that will jump to it into a single predecessor.
+ SmallVector<BasicBlock*, 16> PredsToFactor;
+ for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i)
+ if (PredToDestList[i].second == MostPopularDest) {
+ BasicBlock *Pred = PredToDestList[i].first;
+
+ // This predecessor may be a switch or something else that has multiple
+ // edges to the block. Factor each of these edges by listing them
+ // according to # occurrences in PredsToFactor.
+ TerminatorInst *PredTI = Pred->getTerminator();
+ for (unsigned i = 0, e = PredTI->getNumSuccessors(); i != e; ++i)
+ if (PredTI->getSuccessor(i) == BB)
+ PredsToFactor.push_back(Pred);
+ }
+
+ // If the threadable edges are branching on an undefined value, we get to pick
+ // the destination that these predecessors should get to.
+ if (MostPopularDest == 0)
+ MostPopularDest = BB->getTerminator()->
+ getSuccessor(GetBestDestForJumpOnUndef(BB));
+
+ // Ok, try to thread it!
+ return ThreadEdge(BB, PredsToFactor, MostPopularDest);
+}
+
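
// Putting the pieces together, the observable effect on straight-line source
// looks roughly like this (a hand-written illustration, not compiler output):
// a predecessor that fixes the condition jumps directly to the matching
// successor and skips the re-test at the merge point.

int thread_before(bool external) {
  bool cond;
  if (external)
    cond = true;        // this predecessor pins cond to a constant
  else
    cond = external;    // unknown on this path
  if (cond)             // condition re-tested at the merge point
    return 1;
  return 0;
}

int thread_after(bool external) {
  if (external)
    return 1;           // edge threaded straight to the 'true' successor
  bool cond = external; // only the unknown path keeps the test
  if (cond)
    return 1;
  return 0;
}
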
+/// ProcessJumpOnPHI - We have a conditional branch or switch on a PHI node in
+/// the current block. See if there are any simplifications we can do based on
+/// inputs to the phi node.
+///
+bool JumpThreading::ProcessJumpOnPHI(PHINode *PN) {
+ BasicBlock *BB = PN->getParent();
+
+ // If any of the predecessor blocks end in an unconditional branch, we can
+ // *duplicate* the jump into that block in order to further encourage jump
+ // threading and to eliminate cases where we have branch on a phi of an icmp
+ // (branch on icmp is much better).
// We don't want to do this transformation for switches, because we don't
// really want to duplicate a switch.
@@ -759,137 +1089,6 @@ bool JumpThreading::ProcessJumpOnPHI(PHINode *PN) {
}
-/// ProcessJumpOnLogicalPHI - PN's basic block contains a conditional branch
-/// whose condition is an AND/OR where one side is PN. If PN has constant
-/// operands that permit us to evaluate the condition for some operand, thread
-/// through the block. For example with:
-/// br (and X, phi(Y, Z, false))
-/// the predecessor corresponding to the 'false' will always jump to the false
-/// destination of the branch.
-///
-bool JumpThreading::ProcessBranchOnLogical(Value *V, BasicBlock *BB,
- bool isAnd) {
- // If this is a binary operator tree of the same AND/OR opcode, check the
- // LHS/RHS.
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
- if ((isAnd && BO->getOpcode() == Instruction::And) ||
- (!isAnd && BO->getOpcode() == Instruction::Or)) {
- if (ProcessBranchOnLogical(BO->getOperand(0), BB, isAnd))
- return true;
- if (ProcessBranchOnLogical(BO->getOperand(1), BB, isAnd))
- return true;
- }
-
- // If this isn't a PHI node, we can't handle it.
- PHINode *PN = dyn_cast<PHINode>(V);
- if (!PN || PN->getParent() != BB) return false;
-
- // We can only do the simplification for phi nodes of 'false' with AND or
- // 'true' with OR. See if we have any entries in the phi for this.
- unsigned PredNo = ~0U;
- ConstantInt *PredCst = ConstantInt::get(Type::getInt1Ty(BB->getContext()),
- !isAnd);
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- if (PN->getIncomingValue(i) == PredCst) {
- PredNo = i;
- break;
- }
- }
-
- // If no match, bail out.
- if (PredNo == ~0U)
- return false;
-
- // If so, we can actually do this threading. Merge any common predecessors
- // that will act the same.
- BasicBlock *PredBB = FactorCommonPHIPreds(PN, PredCst);
-
- // Next, figure out which successor we are threading to. If this was an AND,
- // the constant must be FALSE, and we must be targeting the 'false' block.
- // If this is an OR, the constant must be TRUE, and we must be targeting the
- // 'true' block.
- BasicBlock *SuccBB = BB->getTerminator()->getSuccessor(isAnd);
-
- // Ok, try to thread it!
- return ThreadEdge(BB, PredBB, SuccBB);
-}
-
-/// GetResultOfComparison - Given an icmp/fcmp predicate and the left and right
-/// hand sides of the compare instruction, try to determine the result. If the
-/// result can not be determined, a null pointer is returned.
-static Constant *GetResultOfComparison(CmpInst::Predicate pred,
- Value *LHS, Value *RHS,
- LLVMContext &Context) {
- if (Constant *CLHS = dyn_cast<Constant>(LHS))
- if (Constant *CRHS = dyn_cast<Constant>(RHS))
- return ConstantExpr::getCompare(pred, CLHS, CRHS);
-
- if (LHS == RHS)
- if (isa<IntegerType>(LHS->getType()) || isa<PointerType>(LHS->getType()))
- return ICmpInst::isTrueWhenEqual(pred) ?
- ConstantInt::getTrue(Context) : ConstantInt::getFalse(Context);
-
- return 0;
-}
-
-/// ProcessBranchOnCompare - We found a branch on a comparison between a phi
-/// node and a value. If we can identify when the comparison is true between
-/// the phi inputs and the value, we can fold the compare for that edge and
-/// thread through it.
-bool JumpThreading::ProcessBranchOnCompare(CmpInst *Cmp, BasicBlock *BB) {
- PHINode *PN = cast<PHINode>(Cmp->getOperand(0));
- Value *RHS = Cmp->getOperand(1);
-
- // If the phi isn't in the current block, an incoming edge to this block
- // doesn't control the destination.
- if (PN->getParent() != BB)
- return false;
-
- // We can do this simplification if any comparisons fold to true or false.
- // See if any do.
- Value *PredVal = 0;
- bool TrueDirection = false;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- PredVal = PN->getIncomingValue(i);
-
- Constant *Res = GetResultOfComparison(Cmp->getPredicate(), PredVal,
- RHS, Cmp->getContext());
- if (!Res) {
- PredVal = 0;
- continue;
- }
-
- // If this folded to a constant expr, we can't do anything.
- if (ConstantInt *ResC = dyn_cast<ConstantInt>(Res)) {
- TrueDirection = ResC->getZExtValue();
- break;
- }
- // If this folded to undef, just go the false way.
- if (isa<UndefValue>(Res)) {
- TrueDirection = false;
- break;
- }
-
- // Otherwise, we can't fold this input.
- PredVal = 0;
- }
-
- // If no match, bail out.
- if (PredVal == 0)
- return false;
-
- // If so, we can actually do this threading. Merge any common predecessors
- // that will act the same.
- BasicBlock *PredBB = FactorCommonPHIPreds(PN, PredVal);
-
- // Next, get our successor.
- BasicBlock *SuccBB = BB->getTerminator()->getSuccessor(!TrueDirection);
-
- // Ok, try to thread it!
- return ThreadEdge(BB, PredBB, SuccBB);
-}
-
-
/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
/// predecessor to the PHIBB block. If it has PHI nodes, add entries for
/// NewPred using the entries from OldPred (suitably mapped).
@@ -914,10 +1113,11 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
}
}
-/// ThreadEdge - We have decided that it is safe and profitable to thread an
-/// edge from PredBB to SuccBB across BB. Transform the IR to reflect this
-/// change.
-bool JumpThreading::ThreadEdge(BasicBlock *BB, BasicBlock *PredBB,
+/// ThreadEdge - We have decided that it is safe and profitable to factor the
+/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
+/// across BB. Transform the IR to reflect this change.
+bool JumpThreading::ThreadEdge(BasicBlock *BB,
+ const SmallVectorImpl<BasicBlock*> &PredBBs,
BasicBlock *SuccBB) {
// If threading to the same block as we come from, we would infinite loop.
if (SuccBB == BB) {
@@ -929,8 +1129,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, BasicBlock *PredBB,
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB)) {
- DEBUG(errs() << " Not threading from '" << PredBB->getName()
- << "' across loop header BB '" << BB->getName()
+ DEBUG(errs() << " Not threading across loop header BB '" << BB->getName()
<< "' to dest BB '" << SuccBB->getName()
<< "' - it might create an irreducible loop!\n");
return false;
@@ -943,6 +1142,17 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, BasicBlock *PredBB,
return false;
}
+  // And finally, do it! Start by factoring the predecessors if needed.
+ BasicBlock *PredBB;
+ if (PredBBs.size() == 1)
+ PredBB = PredBBs[0];
+ else {
+ DEBUG(errs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
+ PredBB = SplitBlockPredecessors(BB, &PredBBs[0], PredBBs.size(),
+ ".thr_comm", this);
+ }
+
// And finally, do it!
DEBUG(errs() << " Threading edge from '" << PredBB->getName() << "' to '"
<< SuccBB->getName() << "' with cost: " << JumpThreadCost
@@ -1034,7 +1244,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, BasicBlock *PredBB,
TerminatorInst *PredTerm = PredBB->getTerminator();
for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
if (PredTerm->getSuccessor(i) == BB) {
- BB->removePredecessor(PredBB);
+ RemovePredecessorAndSimplify(BB, PredBB, TD);
PredTerm->setSuccessor(i, NewBB);
}
@@ -1044,9 +1254,12 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, BasicBlock *PredBB,
BI = NewBB->begin();
for (BasicBlock::iterator E = NewBB->end(); BI != E; ) {
Instruction *Inst = BI++;
- if (Constant *C = ConstantFoldInstruction(Inst, BB->getContext(), TD)) {
- Inst->replaceAllUsesWith(C);
- Inst->eraseFromParent();
+
+ if (Value *V = SimplifyInstruction(Inst, TD)) {
+ WeakVH BIHandle(BI);
+ ReplaceAndSimplifyAllUses(Inst, V, TD);
+ if (BIHandle == 0)
+ BI = NewBB->begin();
continue;
}
@@ -1164,7 +1377,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
// PredBB no longer jumps to BB, remove entries in the PHI node for the edge
// that we nuked.
- BB->removePredecessor(PredBB);
+ RemovePredecessorAndSimplify(BB, PredBB, TD);
// Remove the unconditional branch at the end of the PredBB block.
OldPredBranch->eraseFromParent();
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 756fbf3..104c873 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -263,7 +263,6 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// Get the preheader block to move instructions into...
Preheader = L->getLoopPreheader();
- assert(Preheader&&"Preheader insertion pass guarantees we have a preheader!");
// Loop over the body of this loop, looking for calls, invokes, and stores.
// Because subloops have already been incorporated into AST, we skip blocks in
@@ -286,12 +285,14 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// us to sink instructions in one pass, without iteration. After sinking
// instructions, we perform another pass to hoist them out of the loop.
//
- SinkRegion(DT->getNode(L->getHeader()));
- HoistRegion(DT->getNode(L->getHeader()));
+ if (L->hasDedicatedExits())
+ SinkRegion(DT->getNode(L->getHeader()));
+ if (Preheader)
+ HoistRegion(DT->getNode(L->getHeader()));
// Now that all loop invariants have been removed from the loop, promote any
// memory references to scalars that we can...
- if (!DisablePromotion)
+ if (!DisablePromotion && Preheader && L->hasDedicatedExits())
PromoteValuesInLoop();
// Clear out the loop's state information for the next iteration
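
// The guards above let LICM degrade gracefully instead of asserting when
// LoopSimplify form is unavailable: sinking needs dedicated exits, hoisting
// needs a preheader to hoist into, and promotion needs both. The hoisting
// transformation itself, in source terms (illustration only):

void scale_before(float *a, int n, float x, float y) {
  for (int i = 0; i < n; ++i)
    a[i] *= x * y;            // x * y is loop invariant
}

void scale_after(float *a, int n, float x, float y) {
  float xy = x * y;           // hoisted once; this needs somewhere to
  for (int i = 0; i < n; ++i) // live, hence the preheader requirement
    a[i] *= xy;
}
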
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 866d8b4..48817ab 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -115,6 +115,10 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
if (!preheader)
return false;
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->hasDedicatedExits())
+ return false;
+
// We can't remove loops that contain subloops. If the subloops were dead,
// they would already have been removed in earlier executions of this pass.
if (L->begin() != L->end())
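
// For reference (not from the patch), the shape of a loop this pass can
// delete: no side effects, nothing live-out, and a computable trip count.

int deletion_candidate(int n) {
  int acc = 0;
  for (int i = 0; i < 100; ++i)
    acc += i;   // acc is never read after the loop
  return n;     // so the whole loop is removable
}
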
diff --git a/lib/Transforms/Scalar/LoopIndexSplit.cpp b/lib/Transforms/Scalar/LoopIndexSplit.cpp
index 920d85c..8b6a233 100644
--- a/lib/Transforms/Scalar/LoopIndexSplit.cpp
+++ b/lib/Transforms/Scalar/LoopIndexSplit.cpp
@@ -209,6 +209,10 @@ bool LoopIndexSplit::runOnLoop(Loop *IncomingLoop, LPPassManager &LPM_Ref) {
L = IncomingLoop;
LPM = &LPM_Ref;
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
// FIXME - Nested loops make dominator info updates tricky.
if (!L->getSubLoops().empty())
return false;
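
// As a reminder of the transformation being guarded (sketch only), index
// splitting turns an iteration-space test inside the body into two loops
// over disjoint ranges:

void split_before(int *a, int n, int m) {
  for (int i = 0; i < n; ++i) {
    if (i < m)
      a[i] += 1;   // taken exactly for i in [0, min(m, n))
    else
      a[i] -= 1;
  }
}

void split_after(int *a, int n, int m) {
  int mid = m < n ? (m < 0 ? 0 : m) : n;  // clamp m into [0, n]
  for (int i = 0; i < mid; ++i)
    a[i] += 1;
  for (int i = mid; i < n; ++i)
    a[i] -= 1;
}
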
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 7a4bb35..5004483 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -15,7 +15,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Function.h"
#include "llvm/IntrinsicInst.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -49,6 +48,7 @@ namespace {
AU.addRequiredID(LCSSAID);
AU.addPreservedID(LCSSAID);
AU.addPreserved<ScalarEvolution>();
+ AU.addRequired<LoopInfo>();
AU.addPreserved<LoopInfo>();
AU.addPreserved<DominatorTree>();
AU.addPreserved<DominanceFrontier>();
@@ -104,17 +104,18 @@ bool LoopRotate::runOnLoop(Loop *Lp, LPPassManager &LPM) {
bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) {
L = Lp;
- OrigHeader = L->getHeader();
OrigPreHeader = L->getLoopPreheader();
+ if (!OrigPreHeader) return false;
+
OrigLatch = L->getLoopLatch();
+ if (!OrigLatch) return false;
+
+ OrigHeader = L->getHeader();
// If the loop has only one block then there is not much to rotate.
if (L->getBlocks().size() == 1)
return false;
- assert(OrigHeader && OrigLatch && OrigPreHeader &&
- "Loop is not in canonical form");
-
// If the loop header is not one of the loop exiting blocks then
// either this loop is already rotated or it is not
// suitable for loop rotation transformations.
@@ -287,7 +288,7 @@ void LoopRotate::preserveCanonicalLoopForm(LPPassManager &LPM) {
"bb.nph",
OrigHeader->getParent(),
NewHeader);
- LoopInfo &LI = LPM.getAnalysis<LoopInfo>();
+ LoopInfo &LI = getAnalysis<LoopInfo>();
if (Loop *PL = LI.getLoopFor(OrigPreHeader))
PL->addBasicBlockToLoop(NewPreHeader, LI.getBase());
BranchInst::Create(NewHeader, NewPreHeader);
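
// Loop rotation, in source terms, turns a top-tested loop into a guarded
// bottom-tested one, which is why the pass now simply bails out rather than
// asserting when there is no preheader or latch to anchor the rewrite. An
// illustration (not compiler output):

int sum_before(const int *a, int n) {
  int s = 0, i = 0;
  while (i < n) {      // exit test in the loop header
    s += a[i];
    ++i;
  }
  return s;
}

int sum_after(const int *a, int n) {
  int s = 0, i = 0;
  if (i < n) {         // guard, placed in the new preheader ("bb.nph")
    do {
      s += a[i];
      ++i;
    } while (i < n);   // rotated test in the latch
  }
  return s;
}
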
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index e20fb16..564c7ac 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -51,6 +51,7 @@ STATISTIC(NumEliminated, "Number of strides eliminated");
STATISTIC(NumShadow, "Number of Shadow IVs optimized");
STATISTIC(NumImmSunk, "Number of common expr immediates sunk into uses");
STATISTIC(NumLoopCond, "Number of loop terminating conds optimized");
+STATISTIC(NumCountZero, "Number of count iv optimized to count toward zero");
static cl::opt<bool> EnableFullLSRMode("enable-full-lsr",
cl::init(false),
@@ -107,7 +108,7 @@ namespace {
public:
static char ID; // Pass ID, replacement for typeid
- explicit LoopStrengthReduce(const TargetLowering *tli = NULL) :
+ explicit LoopStrengthReduce(const TargetLowering *tli = NULL) :
LoopPass(&ID), TLI(tli) {
}
@@ -131,12 +132,10 @@ namespace {
}
private:
- ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond,
- IVStrideUse* &CondUse,
- const SCEV *const * &CondStride);
-
void OptimizeIndvars(Loop *L);
- void OptimizeLoopCountIV(Loop *L);
+
+ /// OptimizeLoopTermCond - Change loop terminating condition to use the
+ /// postinc iv when possible.
void OptimizeLoopTermCond(Loop *L);
/// OptimizeShadowIV - If IV is used in a int-to-float cast
@@ -148,8 +147,28 @@ namespace {
ICmpInst *OptimizeMax(Loop *L, ICmpInst *Cond,
IVStrideUse* &CondUse);
+ /// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for
+ /// deciding when to exit the loop is used only for that purpose, try to
+ /// rearrange things so it counts down to a test against zero.
+ bool OptimizeLoopCountIV(Loop *L);
+ bool OptimizeLoopCountIVOfStride(const SCEV* &Stride,
+ IVStrideUse* &CondUse, Loop *L);
+
+ /// StrengthReduceIVUsersOfStride - Strength reduce all of the users of a
+ /// single stride of IV. All of the users may have different starting
+ /// values, and this may not be the only stride.
+ void StrengthReduceIVUsersOfStride(const SCEV *const &Stride,
+ IVUsersOfOneStride &Uses,
+ Loop *L);
+ void StrengthReduceIVUsers(Loop *L);
+
+ ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond,
+ IVStrideUse* &CondUse,
+ const SCEV* &CondStride,
+ bool PostPass = false);
+
bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
- const SCEV *const * &CondStride);
+ const SCEV* &CondStride);
bool RequiresTypeConversion(const Type *Ty, const Type *NewTy);
const SCEV *CheckForIVReuse(bool, bool, bool, const SCEV *const&,
IVExpr&, const Type*,
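
// The OptimizeLoopCountIV entry points declared above implement the classic
// count-down rewrite: when the induction variable's only remaining use is
// the exit test, counting toward zero lets the target compare against zero
// (often free off the decrement). In source form, a sketch rather than the
// pass's actual output:

void fill_before(char *p, int n, char v) {
  for (int i = 0; i < n; ++i)  // i is used only by the exit test
    *p++ = v;
}

void fill_after(char *p, int n, char v) {
  for (int i = n; i > 0; --i)  // test against zero instead of n
    *p++ = v;
}
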
@@ -164,6 +183,7 @@ namespace {
bool &AllUsesAreAddresses,
bool &AllUsesAreOutsideLoop,
std::vector<BasedUser> &UsersToProcess);
+ bool StrideMightBeShared(const SCEV *Stride, Loop *L, bool CheckPreInc);
bool ShouldUseFullStrengthReductionMode(
const std::vector<BasedUser> &UsersToProcess,
const Loop *L,
@@ -188,9 +208,7 @@ namespace {
Instruction *IVIncInsertPt,
const Loop *L,
SCEVExpander &PreheaderRewriter);
- void StrengthReduceStridedIVUsers(const SCEV *const &Stride,
- IVUsersOfOneStride &Uses,
- Loop *L);
+
void DeleteTriviallyDeadInstructions();
};
}
@@ -208,11 +226,11 @@ Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
/// their operands subsequently dead.
void LoopStrengthReduce::DeleteTriviallyDeadInstructions() {
if (DeadInsts.empty()) return;
-
+
while (!DeadInsts.empty()) {
Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.back());
DeadInsts.pop_back();
-
+
if (I == 0 || !isInstructionTriviallyDead(I))
continue;
@@ -223,14 +241,14 @@ void LoopStrengthReduce::DeleteTriviallyDeadInstructions() {
DeadInsts.push_back(U);
}
}
-
+
I->eraseFromParent();
Changed = true;
}
}
-/// containsAddRecFromDifferentLoop - Determine whether expression S involves a
-/// subexpression that is an AddRec from a loop other than L. An outer loop
+/// containsAddRecFromDifferentLoop - Determine whether expression S involves a
+/// subexpression that is an AddRec from a loop other than L. An outer loop
/// of L is OK, but not an inner loop nor a disjoint loop.
static bool containsAddRecFromDifferentLoop(const SCEV *S, Loop *L) {
// This is very common, put it first.
@@ -256,7 +274,7 @@ static bool containsAddRecFromDifferentLoop(const SCEV *S, Loop *L) {
return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
containsAddRecFromDifferentLoop(DE->getRHS(), L);
#if 0
- // SCEVSDivExpr has been backed out temporarily, but will be back; we'll
+ // SCEVSDivExpr has been backed out temporarily, but will be back; we'll
// need this when it is.
if (const SCEVSDivExpr *DE = dyn_cast<SCEVSDivExpr>(S))
return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
@@ -328,7 +346,7 @@ namespace {
/// field to the Imm field (below). BasedUser values are sorted by this
/// field.
const SCEV *Base;
-
+
/// Inst - The instruction using the induction variable.
Instruction *Inst;
@@ -352,11 +370,11 @@ namespace {
// instruction for a loop and uses outside the loop that are dominated by
// the loop.
bool isUseOfPostIncrementedValue;
-
+
BasedUser(IVStrideUse &IVSU, ScalarEvolution *se)
: SE(se), Base(IVSU.getOffset()), Inst(IVSU.getUser()),
OperandValToReplace(IVSU.getOperandValToReplace()),
- Imm(SE->getIntegerSCEV(0, Base->getType())),
+ Imm(SE->getIntegerSCEV(0, Base->getType())),
isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue()) {}
// Once we rewrite the code to insert the new IVs we want, update the
@@ -367,8 +385,8 @@ namespace {
SCEVExpander &Rewriter, Loop *L, Pass *P,
LoopInfo &LI,
SmallVectorImpl<WeakVH> &DeadInsts);
-
- Value *InsertCodeForBaseAtPosition(const SCEV *const &NewBase,
+
+ Value *InsertCodeForBaseAtPosition(const SCEV *const &NewBase,
const Type *Ty,
SCEVExpander &Rewriter,
Instruction *IP, Loop *L,
@@ -383,7 +401,7 @@ void BasedUser::dump() const {
errs() << " Inst: " << *Inst;
}
-Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *const &NewBase,
+Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *const &NewBase,
const Type *Ty,
SCEVExpander &Rewriter,
Instruction *IP, Loop *L,
@@ -393,10 +411,10 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *const &NewBase,
// want to insert this expression before the user, we'd rather pull it out as
// many loops as possible.
Instruction *BaseInsertPt = IP;
-
+
// Figure out the most-nested loop that IP is in.
Loop *InsertLoop = LI.getLoopFor(IP->getParent());
-
+
// If InsertLoop is not L, and InsertLoop is nested inside of L, figure out
// the preheader of the outer-most loop where NewBase is not loop invariant.
if (L->contains(IP->getParent()))
@@ -404,7 +422,7 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *const &NewBase,
BaseInsertPt = InsertLoop->getLoopPreheader()->getTerminator();
InsertLoop = InsertLoop->getParentLoop();
}
-
+
Value *Base = Rewriter.expandCodeFor(NewBase, 0, BaseInsertPt);
const SCEV *NewValSCEV = SE->getUnknown(Base);
@@ -430,7 +448,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase,
if (!isa<PHINode>(Inst)) {
// By default, insert code at the user instruction.
BasicBlock::iterator InsertPt = Inst;
-
+
// However, if the Operand is itself an instruction, the (potentially
// complex) inserted code may be shared by many users. Because of this, we
// want to emit code for the computation of the operand right before its old
@@ -442,7 +460,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase,
//
// If this is a use outside the loop (which means after, since it is based
// on a loop indvar) we use the post-incremented value, so that we don't
- // artificially make the preinc value live out the bottom of the loop.
+ // artificially make the preinc value live out the bottom of the loop.
if (!isUseOfPostIncrementedValue && L->contains(Inst->getParent())) {
if (NewBasePt && isa<PHINode>(OperandValToReplace)) {
InsertPt = NewBasePt;
@@ -477,7 +495,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase,
if (PN->getIncomingValue(i) == OperandValToReplace) {
// If the original expression is outside the loop, put the replacement
// code in the same place as the original expression,
- // which need not be an immediate predecessor of this PHI. This way we
+ // which need not be an immediate predecessor of this PHI. This way we
// need only one copy of it even if it is referenced multiple times in
// the PHI. We don't do this when the original expression is inside the
// loop because multiple copies sometimes do useful sinking of code in
@@ -490,6 +508,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase,
// is the canonical backedge for this loop, as this can make some
// inserted code be in an illegal position.
if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 &&
+ !isa<IndirectBrInst>(PHIPred->getTerminator()) &&
(PN->getParent() != L->getHeader() || !L->contains(PHIPred))) {
// First step, split the critical edge.
@@ -572,11 +591,11 @@ static bool fitsInAddressMode(const SCEV *const &V, const Type *AccessTy,
static void MoveLoopVariantsToImmediateField(const SCEV *&Val, const SCEV *&Imm,
Loop *L, ScalarEvolution *SE) {
if (Val->isLoopInvariant(L)) return; // Nothing to do.
-
+
if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
SmallVector<const SCEV *, 4> NewOps;
NewOps.reserve(SAE->getNumOperands());
-
+
for (unsigned i = 0; i != SAE->getNumOperands(); ++i)
if (!SAE->getOperand(i)->isLoopInvariant(L)) {
// If this is a loop-variant expression, it must stay in the immediate
@@ -594,7 +613,7 @@ static void MoveLoopVariantsToImmediateField(const SCEV *&Val, const SCEV *&Imm,
// Try to pull immediates out of the start value of nested addrec's.
const SCEV *Start = SARE->getStart();
MoveLoopVariantsToImmediateField(Start, Imm, L, SE);
-
+
SmallVector<const SCEV *, 4> Ops(SARE->op_begin(), SARE->op_end());
Ops[0] = Start;
Val = SE->getAddRecExpr(Ops, SARE->getLoop());
@@ -617,11 +636,11 @@ static void MoveImmediateValues(const TargetLowering *TLI,
if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
SmallVector<const SCEV *, 4> NewOps;
NewOps.reserve(SAE->getNumOperands());
-
+
for (unsigned i = 0; i != SAE->getNumOperands(); ++i) {
const SCEV *NewOp = SAE->getOperand(i);
MoveImmediateValues(TLI, AccessTy, NewOp, Imm, isAddress, L, SE);
-
+
if (!NewOp->isLoopInvariant(L)) {
// If this is a loop-variant expression, it must stay in the immediate
// field of the expression.
@@ -640,7 +659,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
// Try to pull immediates out of the start value of nested addrec's.
const SCEV *Start = SARE->getStart();
MoveImmediateValues(TLI, AccessTy, Start, Imm, isAddress, L, SE);
-
+
if (Start != SARE->getStart()) {
SmallVector<const SCEV *, 4> Ops(SARE->op_begin(), SARE->op_end());
Ops[0] = Start;
@@ -656,8 +675,8 @@ static void MoveImmediateValues(const TargetLowering *TLI,
const SCEV *SubImm = SE->getIntegerSCEV(0, Val->getType());
const SCEV *NewOp = SME->getOperand(1);
MoveImmediateValues(TLI, AccessTy, NewOp, SubImm, isAddress, L, SE);
-
- // If we extracted something out of the subexpressions, see if we can
+
+ // If we extracted something out of the subexpressions, see if we can
// simplify this!
if (NewOp != SME->getOperand(1)) {
// Scale SubImm up by "8". If the result is a target constant, we are
@@ -666,7 +685,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
if (fitsInAddressMode(SubImm, AccessTy, TLI, false)) {
// Accumulate the immediate.
Imm = SE->getAddExpr(Imm, SubImm);
-
+
// Update what is left of 'Val'.
Val = SE->getMulExpr(SME->getOperand(0), NewOp);
return;
@@ -714,7 +733,7 @@ static void SeparateSubExprs(SmallVector<const SCEV *, 16> &SubExprs,
SmallVector<const SCEV *, 4> Ops(SARE->op_begin(), SARE->op_end());
Ops[0] = Zero; // Start with zero base.
SubExprs.push_back(SE->getAddRecExpr(Ops, SARE->getLoop()));
-
+
SeparateSubExprs(SubExprs, SARE->getOperand(0), SE);
}
@@ -724,7 +743,7 @@ static void SeparateSubExprs(SmallVector<const SCEV *, 16> &SubExprs,
}
}
-// This is logically local to the following function, but C++ says we have
+// This is logically local to the following function, but C++ says we have
// to make it file scope.
struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
@@ -762,7 +781,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// an addressing mode "for free"; such expressions are left within the loop.
// struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
std::map<const SCEV *, SubExprUseData> SubExpressionUseData;
-
+
// UniqueSubExprs - Keep track of all of the subexpressions we see in the
// order we see them.
SmallVector<const SCEV *, 16> UniqueSubExprs;
@@ -779,7 +798,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
if (!L->contains(Uses[i].Inst->getParent()))
continue;
NumUsesInsideLoop++;
-
+
// If the base is zero (which is common), return zero now, there are no
// CSEs we can find.
if (Uses[i].Base == Zero) return Zero;
@@ -811,13 +830,13 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// Now that we know how many times each is used, build Result. Iterate over
// UniqueSubexprs so that we have a stable ordering.
for (unsigned i = 0, e = UniqueSubExprs.size(); i != e; ++i) {
- std::map<const SCEV *, SubExprUseData>::iterator I =
+ std::map<const SCEV *, SubExprUseData>::iterator I =
SubExpressionUseData.find(UniqueSubExprs[i]);
assert(I != SubExpressionUseData.end() && "Entry not found?");
- if (I->second.Count == NumUsesInsideLoop) { // Found CSE!
+ if (I->second.Count == NumUsesInsideLoop) { // Found CSE!
if (I->second.notAllUsesAreFree)
Result = SE->getAddExpr(Result, I->first);
- else
+ else
FreeResult = SE->getAddExpr(FreeResult, I->first);
} else
// Remove non-cse's from SubExpressionUseData.
@@ -849,13 +868,13 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// If we found no CSE's, return now.
if (Result == Zero) return Result;
-
+
// If we still have a FreeResult, remove its subexpressions from
// SubExpressionUseData. This means they will remain in the use Bases.
if (FreeResult != Zero) {
SeparateSubExprs(SubExprs, FreeResult, SE);
for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) {
- std::map<const SCEV *, SubExprUseData>::iterator I =
+ std::map<const SCEV *, SubExprUseData>::iterator I =
SubExpressionUseData.find(SubExprs[j]);
SubExpressionUseData.erase(I);
}
@@ -882,7 +901,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
SubExprs.erase(SubExprs.begin()+j);
--j; --e;
}
-
+
// Finally, add the non-shared expressions together.
if (SubExprs.empty())
Uses[i].Base = Zero;
@@ -890,11 +909,11 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
Uses[i].Base = SE->getAddExpr(SubExprs);
SubExprs.clear();
}
-
+
return Result;
}
-/// ValidScale - Check whether the given Scale is valid for all loads and
+/// ValidScale - Check whether the given Scale is valid for all loads and
/// stores in UsersToProcess.
///
bool LoopStrengthReduce::ValidScale(bool HasBaseReg, int64_t Scale,
@@ -911,7 +930,7 @@ bool LoopStrengthReduce::ValidScale(bool HasBaseReg, int64_t Scale,
AccessTy = getAccessType(UsersToProcess[i].Inst);
else if (isa<PHINode>(UsersToProcess[i].Inst))
continue;
-
+
TargetLowering::AddrMode AM;
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(UsersToProcess[i].Imm))
AM.BaseOffs = SC->getValue()->getSExtValue();
@@ -983,13 +1002,13 @@ bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1,
/// reuse is possible. Factors can be negative on same targets, e.g. ARM.
///
/// If all uses are outside the loop, we don't require that all multiplies
-/// be folded into the addressing mode, nor even that the factor be constant;
-/// a multiply (executed once) outside the loop is better than another IV
+/// be folded into the addressing mode, nor even that the factor be constant;
+/// a multiply (executed once) outside the loop is better than another IV
/// within. Well, usually.
const SCEV *LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
bool AllUsesAreAddresses,
bool AllUsesAreOutsideLoop,
- const SCEV *const &Stride,
+ const SCEV *const &Stride,
IVExpr &IV, const Type *Ty,
const std::vector<BasedUser>& UsersToProcess) {
if (StrideNoReuse.count(Stride))
@@ -999,11 +1018,16 @@ const SCEV *LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
int64_t SInt = SC->getValue()->getSExtValue();
for (unsigned NewStride = 0, e = IU->StrideOrder.size();
NewStride != e; ++NewStride) {
- std::map<const SCEV *, IVsOfOneStride>::iterator SI =
+ std::map<const SCEV *, IVsOfOneStride>::iterator SI =
IVsByStride.find(IU->StrideOrder[NewStride]);
if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first) ||
StrideNoReuse.count(SI->first))
continue;
+ // The other stride has no uses, don't reuse it.
+ std::map<const SCEV *, IVUsersOfOneStride *>::iterator UI =
+ IU->IVUsesByStride.find(IU->StrideOrder[NewStride]);
+ if (UI->second->Users.empty())
+ continue;
int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
if (SI->first != Stride &&
(unsigned(abs64(SInt)) < SSInt || (SInt % SSInt) != 0))
@@ -1052,7 +1076,7 @@ const SCEV *LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
// an existing IV if we can.
for (unsigned NewStride = 0, e = IU->StrideOrder.size();
NewStride != e; ++NewStride) {
- std::map<const SCEV *, IVsOfOneStride>::iterator SI =
+ std::map<const SCEV *, IVsOfOneStride>::iterator SI =
IVsByStride.find(IU->StrideOrder[NewStride]);
if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first))
continue;
@@ -1072,9 +1096,9 @@ const SCEV *LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
// -1*old.
for (unsigned NewStride = 0, e = IU->StrideOrder.size();
NewStride != e; ++NewStride) {
- std::map<const SCEV *, IVsOfOneStride>::iterator SI =
+ std::map<const SCEV *, IVsOfOneStride>::iterator SI =
IVsByStride.find(IU->StrideOrder[NewStride]);
- if (SI == IVsByStride.end())
+ if (SI == IVsByStride.end())
continue;
if (const SCEVMulExpr *ME = dyn_cast<SCEVMulExpr>(SI->first))
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(ME->getOperand(0)))
@@ -1104,18 +1128,18 @@ static bool PartitionByIsUseOfPostIncrementedValue(const BasedUser &Val) {
static bool isNonConstantNegative(const SCEV *const &Expr) {
const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Expr);
if (!Mul) return false;
-
+
// If there is a constant factor, it will be first.
const SCEVConstant *SC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
if (!SC) return false;
-
+
// Return true if the value is negative, this matches things like (-42 * V).
return SC->getValue()->getValue().isNegative();
}
/// CollectIVUsers - Transform our list of users and offsets to a bit more
-/// complex table. In this new vector, each 'BasedUser' contains 'Base', the base
-/// of the strided accesses, as well as the old information from Uses. We
+/// complex table. In this new vector, each 'BasedUser' contains 'Base', the
+/// base of the strided accesses, as well as the old information from Uses. We
/// progressively move information from the Base field to the Imm field, until
/// we eventually have the full access expression to rewrite the use.
const SCEV *LoopStrengthReduce::CollectIVUsers(const SCEV *const &Stride,
@@ -1145,7 +1169,7 @@ const SCEV *LoopStrengthReduce::CollectIVUsers(const SCEV *const &Stride,
// We now have a whole bunch of uses of like-strided induction variables, but
// they might all have different bases. We want to emit one PHI node for this
// stride which we fold as many common expressions (between the IVs) into as
- // possible. Start by identifying the common expressions in the base values
+ // possible. Start by identifying the common expressions in the base values
// for the strides (e.g. if we have "A+C+B" and "A+B+D" as our bases, find
// "A+B"), emit it to the preheader, then remove the expression from the
// UsersToProcess base values.
@@ -1165,11 +1189,11 @@ const SCEV *LoopStrengthReduce::CollectIVUsers(const SCEV *const &Stride,
if (!L->contains(UsersToProcess[i].Inst->getParent())) {
UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm,
UsersToProcess[i].Base);
- UsersToProcess[i].Base =
+ UsersToProcess[i].Base =
SE->getIntegerSCEV(0, UsersToProcess[i].Base->getType());
} else {
// Not all uses are outside the loop.
- AllUsesAreOutsideLoop = false;
+ AllUsesAreOutsideLoop = false;
// Addressing modes can be folded into loads and stores. Be careful that
       // the store is through the expression, not of the expression, though.
@@ -1183,11 +1207,11 @@ const SCEV *LoopStrengthReduce::CollectIVUsers(const SCEV *const &Stride,
if (isAddress)
HasAddress = true;
-
+
// If this use isn't an address, then not all uses are addresses.
if (!isAddress && !isPHI)
AllUsesAreAddresses = false;
-
+
MoveImmediateValues(TLI, UsersToProcess[i].Inst, UsersToProcess[i].Base,
UsersToProcess[i].Imm, isAddress, L, SE);
}
@@ -1198,7 +1222,7 @@ const SCEV *LoopStrengthReduce::CollectIVUsers(const SCEV *const &Stride,
// for one fewer iv.
if (NumPHI > 1)
AllUsesAreAddresses = false;
-
+
// There are no in-loop address uses.
if (AllUsesAreAddresses && (!HasAddress && !AllUsesAreOutsideLoop))
AllUsesAreAddresses = false;
@@ -1491,12 +1515,13 @@ static bool IsImmFoldedIntoAddrMode(GlobalValue *GV, int64_t Offset,
return true;
}
-/// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single
+/// StrengthReduceIVUsersOfStride - Strength reduce all of the users of a single
/// stride of IV. All of the users may have different starting values, and this
/// may not be the only stride.
-void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV *const &Stride,
- IVUsersOfOneStride &Uses,
- Loop *L) {
+void
+LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *const &Stride,
+ IVUsersOfOneStride &Uses,
+ Loop *L) {
// If all the users are moved to another stride, then there is nothing to do.
if (Uses.Users.empty())
return;
@@ -1518,8 +1543,8 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV *const &Stride,
// have the full access expression to rewrite the use.
std::vector<BasedUser> UsersToProcess;
const SCEV *CommonExprs = CollectIVUsers(Stride, Uses, L, AllUsesAreAddresses,
- AllUsesAreOutsideLoop,
- UsersToProcess);
+ AllUsesAreOutsideLoop,
+ UsersToProcess);
// Sort the UsersToProcess array so that users with common bases are
// next to each other.
@@ -1588,12 +1613,12 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV *const &Stride,
const SCEV *RewriteFactor = SE->getIntegerSCEV(0, ReplacedTy);
IVExpr ReuseIV(SE->getIntegerSCEV(0,
Type::getInt32Ty(Preheader->getContext())),
- SE->getIntegerSCEV(0,
+ SE->getIntegerSCEV(0,
Type::getInt32Ty(Preheader->getContext())),
0);
- /// Choose a strength-reduction strategy and prepare for it by creating
- /// the necessary PHIs and adjusting the bookkeeping.
+ // Choose a strength-reduction strategy and prepare for it by creating
+ // the necessary PHIs and adjusting the bookkeeping.
if (ShouldUseFullStrengthReductionMode(UsersToProcess, L,
AllUsesAreAddresses, Stride)) {
PrepareToStrengthReduceFully(UsersToProcess, Stride, CommonExprs, L,
@@ -1606,7 +1631,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV *const &Stride,
// If all uses are addresses, check if it is possible to reuse an IV. The
// new IV must have a stride that is a multiple of the old stride; the
// multiple must be a number that can be encoded in the scale field of the
- // target addressing mode; and we must have a valid instruction after this
+ // target addressing mode; and we must have a valid instruction after this
// substitution, including the immediate field, if any.
RewriteFactor = CheckForIVReuse(HaveCommonExprs, AllUsesAreAddresses,
AllUsesAreOutsideLoop,
@@ -1649,7 +1674,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV *const &Stride,
// We want this constant emitted into the preheader! This is just
// using cast as a copy so BitCast (no-op cast) is appropriate
BaseV = new BitCastInst(BaseV, BaseV->getType(), "preheaderinsert",
- PreInsertPt);
+ PreInsertPt);
}
}
@@ -1723,7 +1748,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV *const &Stride,
assert(SE->getTypeSizeInBits(RewriteExpr->getType()) <
SE->getTypeSizeInBits(ReuseIV.Base->getType()) &&
"Unexpected lengthening conversion!");
- typedBase = SE->getTruncateExpr(ReuseIV.Base,
+ typedBase = SE->getTruncateExpr(ReuseIV.Base,
RewriteExpr->getType());
}
RewriteExpr = SE->getMinusSCEV(RewriteExpr, typedBase);
@@ -1775,11 +1800,29 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV *const &Stride,
// different starting values, into different PHIs.
}
+void LoopStrengthReduce::StrengthReduceIVUsers(Loop *L) {
+ // Note: this processes each stride/type pair individually. All users
+ // passed into StrengthReduceIVUsersOfStride have the same type AND stride.
+ // Also, note that we iterate over IVUsesByStride indirectly by using
+ // StrideOrder. This extra layer of indirection makes the ordering of
+ // strides deterministic - not dependent on map order.
+ for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e; ++Stride) {
+ std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+ // FIXME: Generalize to non-affine IV's.
+ if (!SI->first->isLoopInvariant(L))
+ continue;
+ StrengthReduceIVUsersOfStride(SI->first, *SI->second, L);
+ }
+}
+
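The determinism comment above is the heart of this refactoring: a std::map keyed by SCEV pointers iterates in pointer order, which can differ from run to run, so the pass walks the insertion-ordered StrideOrder vector and looks each stride up in the map. A minimal standalone sketch of the same idiom, with hypothetical names and no LLVM types:

    #include <cstdio>
    #include <map>
    #include <vector>

    int main() {
      std::map<const int *, int> UsesByStride; // pointer-keyed: unstable order
      std::vector<const int *> StrideOrder;    // remembers insertion order

      static const int Strides[] = {4, 8, 1};
      for (const int &S : Strides) {
        UsesByStride[&S] = S;
        StrideOrder.push_back(&S);
      }

      // Deterministic: iterate in insertion order, not pointer order.
      for (const int *Key : StrideOrder)
        std::printf("stride %d\n", UsesByStride.find(Key)->second);
      return 0;
    }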
/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
/// set the IV user and stride information and return true, otherwise return
/// false.
-bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
- const SCEV *const * &CondStride) {
+bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond,
+ IVStrideUse *&CondUse,
+ const SCEV* &CondStride) {
for (unsigned Stride = 0, e = IU->StrideOrder.size();
Stride != e && !CondUse; ++Stride) {
std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
@@ -1793,12 +1836,12 @@ bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse
// InstCombine does it as well for simple uses, it's not clear that it
// occurs enough in real life to handle.
CondUse = UI;
- CondStride = &SI->first;
+ CondStride = SI->first;
return true;
}
}
return false;
-}
+}
namespace {
  // Constant strides come first, which in turn are sorted by their absolute
@@ -1851,8 +1894,9 @@ namespace {
/// v1 = v1 + 3
/// if (v1 < 30) goto loop
ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
- IVStrideUse* &CondUse,
- const SCEV *const* &CondStride) {
+ IVStrideUse* &CondUse,
+ const SCEV* &CondStride,
+ bool PostPass) {
// If there's only one stride in the loop, there's nothing to do here.
if (IU->StrideOrder.size() < 2)
return Cond;
@@ -1860,23 +1904,31 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
// trying to change the condition because the stride will still
// remain.
std::map<const SCEV *, IVUsersOfOneStride *>::iterator I =
- IU->IVUsesByStride.find(*CondStride);
- if (I == IU->IVUsesByStride.end() ||
- I->second->Users.size() != 1)
+ IU->IVUsesByStride.find(CondStride);
+ if (I == IU->IVUsesByStride.end())
return Cond;
+ if (I->second->Users.size() > 1) {
+ for (ilist<IVStrideUse>::iterator II = I->second->Users.begin(),
+ EE = I->second->Users.end(); II != EE; ++II) {
+ if (II->getUser() == Cond)
+ continue;
+ if (!isInstructionTriviallyDead(II->getUser()))
+ return Cond;
+ }
+ }
// Only handle constant strides for now.
- const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride);
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(CondStride);
if (!SC) return Cond;
ICmpInst::Predicate Predicate = Cond->getPredicate();
int64_t CmpSSInt = SC->getValue()->getSExtValue();
- unsigned BitWidth = SE->getTypeSizeInBits((*CondStride)->getType());
+ unsigned BitWidth = SE->getTypeSizeInBits(CondStride->getType());
uint64_t SignBit = 1ULL << (BitWidth-1);
const Type *CmpTy = Cond->getOperand(0)->getType();
const Type *NewCmpTy = NULL;
unsigned TyBits = SE->getTypeSizeInBits(CmpTy);
unsigned NewTyBits = 0;
- const SCEV **NewStride = NULL;
+ const SCEV *NewStride = NULL;
Value *NewCmpLHS = NULL;
Value *NewCmpRHS = NULL;
int64_t Scale = 1;
@@ -1885,16 +1937,31 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
if (ConstantInt *C = dyn_cast<ConstantInt>(Cond->getOperand(1))) {
int64_t CmpVal = C->getValue().getSExtValue();
+ // Check the relevant induction variable for conformance to
+ // the pattern.
+ const SCEV *IV = SE->getSCEV(Cond->getOperand(0));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+ if (!AR || !AR->isAffine())
+ return Cond;
+
+ const SCEVConstant *StartC = dyn_cast<SCEVConstant>(AR->getStart());
     // Check the stride constant and the comparison constant signs to detect
// overflow.
- if ((CmpVal & SignBit) != (CmpSSInt & SignBit))
- return Cond;
+ if (StartC) {
+ if ((StartC->getValue()->getSExtValue() < CmpVal && CmpSSInt < 0) ||
+ (StartC->getValue()->getSExtValue() > CmpVal && CmpSSInt > 0))
+ return Cond;
+ } else {
+ // More restrictive check for the other cases.
+ if ((CmpVal & SignBit) != (CmpSSInt & SignBit))
+ return Cond;
+ }
// Look for a suitable stride / iv as replacement.
for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[i]);
- if (!isa<SCEVConstant>(SI->first))
+ if (!isa<SCEVConstant>(SI->first) || SI->second->Users.empty())
continue;
int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
if (SSInt == CmpSSInt ||
@@ -1904,6 +1971,14 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
Scale = SSInt / CmpSSInt;
int64_t NewCmpVal = CmpVal * Scale;
+
+      // If the old icmp value fits in the icmp immediate field but the new
+      // one doesn't, try something else.
+ if (TLI &&
+ TLI->isLegalICmpImmediate(CmpVal) &&
+ !TLI->isLegalICmpImmediate(NewCmpVal))
+ continue;
+
APInt Mul = APInt(BitWidth*2, CmpVal, true);
Mul = Mul * APInt(BitWidth*2, Scale, true);
// Check for overflow.
@@ -1918,8 +1993,6 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
(CmpVal & SignBit) != (NewCmpVal & SignBit))
continue;
- if (NewCmpVal == CmpVal)
- continue;
       // Pick the best iv to use, trying to avoid a cast.
NewCmpLHS = NULL;
for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
@@ -1969,19 +2042,21 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
if (NewTyBits != TyBits && !isa<SCEVConstant>(CondUse->getOffset()))
continue;
- bool AllUsesAreAddresses = true;
- bool AllUsesAreOutsideLoop = true;
- std::vector<BasedUser> UsersToProcess;
- const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
- AllUsesAreAddresses,
- AllUsesAreOutsideLoop,
- UsersToProcess);
- // Avoid rewriting the compare instruction with an iv of new stride
- // if it's likely the new stride uses will be rewritten using the
- // stride of the compare instruction.
- if (AllUsesAreAddresses &&
- ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
- continue;
+ if (!PostPass) {
+ bool AllUsesAreAddresses = true;
+ bool AllUsesAreOutsideLoop = true;
+ std::vector<BasedUser> UsersToProcess;
+ const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
+ AllUsesAreAddresses,
+ AllUsesAreOutsideLoop,
+ UsersToProcess);
+ // Avoid rewriting the compare instruction with an iv of new stride
+ // if it's likely the new stride uses will be rewritten using the
+ // stride of the compare instruction.
+ if (AllUsesAreAddresses &&
+ ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
+ continue;
+ }
// Avoid rewriting the compare instruction with an iv which has
// implicit extension or truncation built into it.
@@ -1994,7 +2069,7 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
if (Scale < 0 && !Cond->isEquality())
Predicate = ICmpInst::getSwappedPredicate(Predicate);
- NewStride = &IU->StrideOrder[i];
+ NewStride = IU->StrideOrder[i];
if (!isa<PointerType>(NewCmpTy))
NewCmpRHS = ConstantInt::get(NewCmpTy, NewCmpVal);
else {
@@ -2031,13 +2106,16 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
Cond = new ICmpInst(OldCond, Predicate, NewCmpLHS, NewCmpRHS,
L->getHeader()->getName() + ".termcond");
+ DEBUG(errs() << " Change compare stride in Inst " << *OldCond);
+ DEBUG(errs() << " to " << *Cond << '\n');
+
// Remove the old compare instruction. The old indvar is probably dead too.
DeadInsts.push_back(CondUse->getOperandValToReplace());
OldCond->replaceAllUsesWith(Cond);
OldCond->eraseFromParent();
- IU->IVUsesByStride[*NewStride]->addUser(NewOffset, Cond, NewCmpLHS);
- CondUse = &IU->IVUsesByStride[*NewStride]->Users.back();
+ IU->IVUsesByStride[NewStride]->addUser(NewOffset, Cond, NewCmpLHS);
+ CondUse = &IU->IVUsesByStride[NewStride]->Users.back();
CondStride = NewStride;
++NumEliminated;
Changed = true;
@@ -2180,7 +2258,7 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
return;
-
+
for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e;
++Stride) {
std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
@@ -2199,13 +2277,13 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
       /* If the shadow use is an int->float cast then insert a second IV
to eliminate this cast.
- for (unsigned i = 0; i < n; ++i)
+ for (unsigned i = 0; i < n; ++i)
foo((double)i);
is transformed into
double d = 0.0;
- for (unsigned i = 0; i < n; ++i, ++d)
+ for (unsigned i = 0; i < n; ++i, ++d)
foo(d);
*/
if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
@@ -2227,7 +2305,7 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
const Type *SrcTy = PH->getType();
int Mantissa = DestTy->getFPMantissaWidth();
- if (Mantissa == -1) continue;
+ if (Mantissa == -1) continue;
if ((int)SE->getTypeSizeInBits(SrcTy) > Mantissa)
continue;
@@ -2239,12 +2317,12 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
Entry = 1;
Latch = 0;
}
-
+
ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
if (!Init) continue;
Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
- BinaryOperator *Incr =
+ BinaryOperator *Incr =
dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
if (!Incr) continue;
if (Incr->getOpcode() != Instruction::Add
@@ -2271,7 +2349,7 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
/* create new increment. '++d' in above example. */
Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
- BinaryOperator *NewIncr =
+ BinaryOperator *NewIncr =
BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
Instruction::FAdd : Instruction::FSub,
NewPH, CFP, "IV.S.next.", Incr);
@@ -2297,237 +2375,385 @@ void LoopStrengthReduce::OptimizeIndvars(Loop *L) {
OptimizeShadowIV(L);
}
-/// OptimizeLoopTermCond - Change loop terminating condition to use the
+bool LoopStrengthReduce::StrideMightBeShared(const SCEV* Stride, Loop *L,
+ bool CheckPreInc) {
+ int64_t SInt = cast<SCEVConstant>(Stride)->getValue()->getSExtValue();
+ for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+ std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[i]);
+ const SCEV *Share = SI->first;
+ if (!isa<SCEVConstant>(SI->first) || Share == Stride)
+ continue;
+ int64_t SSInt = cast<SCEVConstant>(Share)->getValue()->getSExtValue();
+ if (SSInt == SInt)
+ return true; // This can definitely be reused.
+ if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0)
+ continue;
+ int64_t Scale = SSInt / SInt;
+ bool AllUsesAreAddresses = true;
+ bool AllUsesAreOutsideLoop = true;
+ std::vector<BasedUser> UsersToProcess;
+ const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
+ AllUsesAreAddresses,
+ AllUsesAreOutsideLoop,
+ UsersToProcess);
+ if (AllUsesAreAddresses &&
+ ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess)) {
+ if (!CheckPreInc)
+ return true;
+ // Any pre-inc iv use?
+ IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[Share];
+ for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
+ E = StrideUses.Users.end(); I != E; ++I) {
+ if (!I->isUseOfPostIncrementedValue())
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
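For intuition about StrideMightBeShared: a constant stride may end up shared when another constant stride is an exact multiple of it and the multiple can be folded as an addressing-mode scale. A hedged source-level sketch; whether the scale actually folds is target-dependent, and the names are illustrative only:

    // Strides of 4 and 8 bytes per iteration: with a scaled addressing mode
    // such as [base + index*8], the second access can reuse the 4-byte IV
    // with scale 2, so the pass must assume these strides may be shared.
    void zero_both(int *a, long long *b, int n) {
      for (int i = 0; i != n; ++i) {
        a[i] = 0;  // address advances 4 bytes per iteration
        b[i] = 0;  // advances 8 bytes, an exact multiple of the other stride
      }
    }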
+/// isUsedByExitBranch - Return true if the icmp is used by a loop-terminating
+/// conditional branch, either directly or after being and'ed / or'ed with
+/// other conditions.
+static bool isUsedByExitBranch(ICmpInst *Cond, Loop *L) {
+ BasicBlock *CondBB = Cond->getParent();
+ if (!L->isLoopExiting(CondBB))
+ return false;
+ BranchInst *TermBr = dyn_cast<BranchInst>(CondBB->getTerminator());
+ if (!TermBr || !TermBr->isConditional())
+ return false;
+
+ Value *User = *Cond->use_begin();
+ Instruction *UserInst = dyn_cast<Instruction>(User);
+ while (UserInst &&
+ (UserInst->getOpcode() == Instruction::And ||
+ UserInst->getOpcode() == Instruction::Or)) {
+ if (!UserInst->hasOneUse() || UserInst->getParent() != CondBB)
+ return false;
+ User = *User->use_begin();
+ UserInst = dyn_cast<Instruction>(User);
+ }
+ return User == TermBr;
+}
+
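The single-use and/or walk above corresponds to loops whose exit test is a conjunction or disjunction that has been folded into one block. A hedged C-level example of the accepted shape, assuming SimplifyCFG has merged the short-circuit into a single 'and' in the exiting block:

    // The (i != n) icmp feeds an 'and' whose single use is the exiting
    // branch, so isUsedByExitBranch still treats it as the loop's
    // terminating compare.
    int find_zero(const int *p, int n) {
      int i = 0;
      while (i != n && p[i] != 0)
        ++i;
      return i;
    }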
+static bool ShouldCountToZero(ICmpInst *Cond, IVStrideUse* &CondUse,
+ ScalarEvolution *SE, Loop *L,
+ const TargetLowering *TLI = 0) {
+ if (!L->contains(Cond->getParent()))
+ return false;
+
+ if (!isa<SCEVConstant>(CondUse->getOffset()))
+ return false;
+
+ // Handle only tests for equality for the moment.
+ if (!Cond->isEquality() || !Cond->hasOneUse())
+ return false;
+ if (!isUsedByExitBranch(Cond, L))
+ return false;
+
+ Value *CondOp0 = Cond->getOperand(0);
+ const SCEV *IV = SE->getSCEV(CondOp0);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+ if (!AR || !AR->isAffine())
+ return false;
+
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ if (!SC || SC->getValue()->getSExtValue() < 0)
+ // If it's already counting down, don't do anything.
+ return false;
+
+  // If the RHS of the comparison is not loop invariant, the rewrite
+  // cannot be done. Also bail out if it's already comparing against zero,
+  // or, when we are checking this before the cmp stride optimization, if
+  // it's already comparing against a legal immediate.
+ Value *RHS = Cond->getOperand(1);
+ ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS);
+ if (!L->isLoopInvariant(RHS) ||
+ (RHSC && RHSC->isZero()) ||
+ (RHSC && TLI && TLI->isLegalICmpImmediate(RHSC->getSExtValue())))
+ return false;
+
+ // Make sure the IV is only used for counting. Value may be preinc or
+ // postinc; 2 uses in either case.
+ if (!CondOp0->hasNUses(2))
+ return false;
+
+ return true;
+}
+
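Concretely, ShouldCountToZero accepts an equality-tested IV with a positive constant step, a loop-invariant bound that is neither zero nor an already legal immediate, and no uses beyond the increment and the compare. A hedged example of a qualifying loop, with illustrative names:

    // 'i' exists only to count: its uses are the increment and the equality
    // test, the step (+1) is a positive constant, and 'n' is loop invariant.
    // This is the shape the count-down rewrite targets.
    void spin(volatile int *flag, int n) {
      for (int i = 0; i != n; ++i)
        *flag = 1;
    }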
+/// OptimizeLoopTermCond - Change loop terminating condition to use the
/// postinc iv when possible.
void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
- // Finally, get the terminating condition for the loop if possible. If we
- // can, we want to change it to use a post-incremented version of its
- // induction variable, to allow coalescing the live ranges for the IV into
- // one register value.
BasicBlock *LatchBlock = L->getLoopLatch();
- BasicBlock *ExitingBlock = L->getExitingBlock();
-
- if (!ExitingBlock)
- // Multiple exits, just look at the exit in the latch block if there is one.
- ExitingBlock = LatchBlock;
- BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
- if (!TermBr)
- return;
- if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
- return;
+ bool LatchExit = L->isLoopExiting(LatchBlock);
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
- // Search IVUsesByStride to find Cond's IVUse if there is one.
- IVStrideUse *CondUse = 0;
- const SCEV *const *CondStride = 0;
- ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
- if (!FindIVUserForCond(Cond, CondUse, CondStride))
- return; // setcc doesn't use the IV.
-
- if (ExitingBlock != LatchBlock) {
- if (!Cond->hasOneUse())
- // See below, we don't want the condition to be cloned.
- return;
-
- // If exiting block is the latch block, we know it's safe and profitable to
- // transform the icmp to use post-inc iv. Otherwise do so only if it would
- // not reuse another iv and its iv would be reused by other uses. We are
- // optimizing for the case where the icmp is the only use of the iv.
- IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[*CondStride];
- for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
- E = StrideUses.Users.end(); I != E; ++I) {
- if (I->getUser() == Cond)
- continue;
- if (!I->isUseOfPostIncrementedValue())
- return;
- }
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitingBlock = ExitingBlocks[i];
- // FIXME: This is expensive, and worse still ChangeCompareStride does a
- // similar check. Can we perform all the icmp related transformations after
- // StrengthReduceStridedIVUsers?
- if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride)) {
- int64_t SInt = SC->getValue()->getSExtValue();
- for (unsigned NewStride = 0, ee = IU->StrideOrder.size(); NewStride != ee;
- ++NewStride) {
- std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
- IU->IVUsesByStride.find(IU->StrideOrder[NewStride]);
- if (!isa<SCEVConstant>(SI->first) || SI->first == *CondStride)
- continue;
- int64_t SSInt =
- cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
- if (SSInt == SInt)
- return; // This can definitely be reused.
- if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0)
- continue;
- int64_t Scale = SSInt / SInt;
- bool AllUsesAreAddresses = true;
- bool AllUsesAreOutsideLoop = true;
- std::vector<BasedUser> UsersToProcess;
- const SCEV *CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
- AllUsesAreAddresses,
- AllUsesAreOutsideLoop,
- UsersToProcess);
- // Avoid rewriting the compare instruction with an iv of new stride
- // if it's likely the new stride uses will be rewritten using the
- // stride of the compare instruction.
- if (AllUsesAreAddresses &&
- ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
- return;
- }
- }
+ // Finally, get the terminating condition for the loop if possible. If we
+ // can, we want to change it to use a post-incremented version of its
+ // induction variable, to allow coalescing the live ranges for the IV into
+ // one register value.
- StrideNoReuse.insert(*CondStride);
- }
+ BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (!TermBr)
+ continue;
+    // FIXME: Overly conservative; termination condition could be an 'or', etc.
+ if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+ continue;
- // If the trip count is computed in terms of a max (due to ScalarEvolution
- // being unable to find a sufficient guard, for example), change the loop
- // comparison to use SLT or ULT instead of NE.
- Cond = OptimizeMax(L, Cond, CondUse);
-
- // If possible, change stride and operands of the compare instruction to
- // eliminate one stride.
- if (ExitingBlock == LatchBlock)
- Cond = ChangeCompareStride(L, Cond, CondUse, CondStride);
-
- // It's possible for the setcc instruction to be anywhere in the loop, and
- // possible for it to have multiple users. If it is not immediately before
- // the latch block branch, move it.
- if (&*++BasicBlock::iterator(Cond) != (Instruction*)TermBr) {
- if (Cond->hasOneUse()) { // Condition has a single use, just move it.
- Cond->moveBefore(TermBr);
- } else {
- // Otherwise, clone the terminating condition and insert into the loopend.
- Cond = cast<ICmpInst>(Cond->clone());
- Cond->setName(L->getHeader()->getName() + ".termcond");
- LatchBlock->getInstList().insert(TermBr, Cond);
-
- // Clone the IVUse, as the old use still exists!
- IU->IVUsesByStride[*CondStride]->addUser(CondUse->getOffset(), Cond,
- CondUse->getOperandValToReplace());
- CondUse = &IU->IVUsesByStride[*CondStride]->Users.back();
+ // Search IVUsesByStride to find Cond's IVUse if there is one.
+ IVStrideUse *CondUse = 0;
+ const SCEV *CondStride = 0;
+ ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+ if (!FindIVUserForCond(Cond, CondUse, CondStride))
+ continue;
+
+ // If the latch block is exiting and it's not a single block loop, it's
+ // not safe to use postinc iv in other exiting blocks. FIXME: overly
+ // conservative? How about icmp stride optimization?
+ bool UsePostInc = !(e > 1 && LatchExit && ExitingBlock != LatchBlock);
+ if (UsePostInc && ExitingBlock != LatchBlock) {
+ if (!Cond->hasOneUse())
+ // See below, we don't want the condition to be cloned.
+ UsePostInc = false;
+ else {
+ // If exiting block is the latch block, we know it's safe and profitable
+ // to transform the icmp to use post-inc iv. Otherwise do so only if it
+ // would not reuse another iv and its iv would be reused by other uses.
+ // We are optimizing for the case where the icmp is the only use of the
+ // iv.
+ IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[CondStride];
+ for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
+ E = StrideUses.Users.end(); I != E; ++I) {
+ if (I->getUser() == Cond)
+ continue;
+ if (!I->isUseOfPostIncrementedValue()) {
+ UsePostInc = false;
+ break;
+ }
+ }
+ }
+
+    // If the iv for the stride might be shared and any of its users might
+    // use the pre-inc iv, then it's not safe to use the post-inc iv.
+ if (UsePostInc &&
+ isa<SCEVConstant>(CondStride) &&
+ StrideMightBeShared(CondStride, L, true))
+ UsePostInc = false;
}
- }
- // If we get to here, we know that we can transform the setcc instruction to
- // use the post-incremented version of the IV, allowing us to coalesce the
- // live ranges for the IV correctly.
- CondUse->setOffset(SE->getMinusSCEV(CondUse->getOffset(), *CondStride));
- CondUse->setIsUseOfPostIncrementedValue(true);
- Changed = true;
+ // If the trip count is computed in terms of a max (due to ScalarEvolution
+ // being unable to find a sufficient guard, for example), change the loop
+ // comparison to use SLT or ULT instead of NE.
+ Cond = OptimizeMax(L, Cond, CondUse);
+
+ // If possible, change stride and operands of the compare instruction to
+ // eliminate one stride. However, avoid rewriting the compare instruction
+ // with an iv of new stride if it's likely the new stride uses will be
+ // rewritten using the stride of the compare instruction.
+ if (ExitingBlock == LatchBlock && isa<SCEVConstant>(CondStride)) {
+      // If the condition stride is a constant and the compare is its only
+      // use, we might want to optimize it first by making it count toward
+      // zero.
+ if (!StrideMightBeShared(CondStride, L, false) &&
+ !ShouldCountToZero(Cond, CondUse, SE, L, TLI))
+ Cond = ChangeCompareStride(L, Cond, CondUse, CondStride);
+ }
- ++NumLoopCond;
-}
+ if (!UsePostInc)
+ continue;
-/// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding
-/// when to exit the loop is used only for that purpose, try to rearrange things
-/// so it counts down to a test against zero.
-void LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) {
+ DEBUG(errs() << " Change loop exiting icmp to use postinc iv: "
+ << *Cond << '\n');
- // If the number of times the loop is executed isn't computable, give up.
- const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
- return;
+ // It's possible for the setcc instruction to be anywhere in the loop, and
+ // possible for it to have multiple users. If it is not immediately before
+ // the exiting block branch, move it.
+ if (&*++BasicBlock::iterator(Cond) != (Instruction*)TermBr) {
+ if (Cond->hasOneUse()) { // Condition has a single use, just move it.
+ Cond->moveBefore(TermBr);
+ } else {
+ // Otherwise, clone the terminating condition and insert into the
+ // loopend.
+ Cond = cast<ICmpInst>(Cond->clone());
+ Cond->setName(L->getHeader()->getName() + ".termcond");
+ ExitingBlock->getInstList().insert(TermBr, Cond);
+
+ // Clone the IVUse, as the old use still exists!
+ IU->IVUsesByStride[CondStride]->addUser(CondUse->getOffset(), Cond,
+ CondUse->getOperandValToReplace());
+ CondUse = &IU->IVUsesByStride[CondStride]->Users.back();
+ }
+ }
- // Get the terminating condition for the loop if possible (this isn't
- // necessarily in the latch, or a block that's a predecessor of the header).
- if (!L->getExitBlock())
- return; // More than one loop exit blocks.
+ // If we get to here, we know that we can transform the setcc instruction to
+ // use the post-incremented version of the IV, allowing us to coalesce the
+ // live ranges for the IV correctly.
+ CondUse->setOffset(SE->getMinusSCEV(CondUse->getOffset(), CondStride));
+ CondUse->setIsUseOfPostIncrementedValue(true);
+ Changed = true;
- // Okay, there is one exit block. Try to find the condition that causes the
- // loop to be exited.
- BasicBlock *ExitingBlock = L->getExitingBlock();
- if (!ExitingBlock)
- return; // More than one block exiting!
+ ++NumLoopCond;
+ }
+}
- // Okay, we've computed the exiting block. See what condition causes us to
- // exit.
- //
- // FIXME: we should be able to handle switch instructions (with a single exit)
- BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
- if (TermBr == 0) return;
- assert(TermBr->isConditional() && "If unconditional, it can't be in loop!");
- if (!isa<ICmpInst>(TermBr->getCondition()))
- return;
- ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+bool LoopStrengthReduce::OptimizeLoopCountIVOfStride(const SCEV* &Stride,
+ IVStrideUse* &CondUse,
+ Loop *L) {
+  // If the only use is an icmp feeding a loop-exiting conditional branch,
+  // then attempt the optimization.
+ BasedUser User = BasedUser(*CondUse, SE);
+ assert(isa<ICmpInst>(User.Inst) && "Expecting an ICMPInst!");
+ ICmpInst *Cond = cast<ICmpInst>(User.Inst);
+
+ // Less strict check now that compare stride optimization is done.
+ if (!ShouldCountToZero(Cond, CondUse, SE, L))
+ return false;
- // Handle only tests for equality for the moment, and only stride 1.
- if (Cond->getPredicate() != CmpInst::ICMP_EQ)
- return;
- const SCEV *IV = SE->getSCEV(Cond->getOperand(0));
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
- const SCEV *One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
- if (!AR || !AR->isAffine() || AR->getStepRecurrence(*SE) != One)
- return;
- // If the RHS of the comparison is defined inside the loop, the rewrite
- // cannot be done.
- if (Instruction *CR = dyn_cast<Instruction>(Cond->getOperand(1)))
- if (L->contains(CR->getParent()))
- return;
+ Value *CondOp0 = Cond->getOperand(0);
+ PHINode *PHIExpr = dyn_cast<PHINode>(CondOp0);
+ Instruction *Incr;
+ if (!PHIExpr) {
+ // Value tested is postinc. Find the phi node.
+ Incr = dyn_cast<BinaryOperator>(CondOp0);
+ // FIXME: Just use User.OperandValToReplace here?
+ if (!Incr || Incr->getOpcode() != Instruction::Add)
+ return false;
- // Make sure the IV is only used for counting. Value may be preinc or
- // postinc; 2 uses in either case.
- if (!Cond->getOperand(0)->hasNUses(2))
- return;
- PHINode *phi = dyn_cast<PHINode>(Cond->getOperand(0));
- Instruction *incr;
- if (phi && phi->getParent()==L->getHeader()) {
- // value tested is preinc. Find the increment.
- // A CmpInst is not a BinaryOperator; we depend on this.
- Instruction::use_iterator UI = phi->use_begin();
- incr = dyn_cast<BinaryOperator>(UI);
- if (!incr)
- incr = dyn_cast<BinaryOperator>(++UI);
- // 1 use for postinc value, the phi. Unnecessarily conservative?
- if (!incr || !incr->hasOneUse() || incr->getOpcode()!=Instruction::Add)
- return;
- } else {
- // Value tested is postinc. Find the phi node.
- incr = dyn_cast<BinaryOperator>(Cond->getOperand(0));
- if (!incr || incr->getOpcode()!=Instruction::Add)
- return;
-
- Instruction::use_iterator UI = Cond->getOperand(0)->use_begin();
- phi = dyn_cast<PHINode>(UI);
- if (!phi)
- phi = dyn_cast<PHINode>(++UI);
+ PHIExpr = dyn_cast<PHINode>(Incr->getOperand(0));
+ if (!PHIExpr)
+ return false;
// 1 use for preinc value, the increment.
- if (!phi || phi->getParent()!=L->getHeader() || !phi->hasOneUse())
- return;
+ if (!PHIExpr->hasOneUse())
+ return false;
+ } else {
+ assert(isa<PHINode>(CondOp0) &&
+ "Unexpected loop exiting counting instruction sequence!");
+ PHIExpr = cast<PHINode>(CondOp0);
+ // Value tested is preinc. Find the increment.
+ // A CmpInst is not a BinaryOperator; we depend on this.
+ Instruction::use_iterator UI = PHIExpr->use_begin();
+ Incr = dyn_cast<BinaryOperator>(UI);
+ if (!Incr)
+ Incr = dyn_cast<BinaryOperator>(++UI);
+ // One use for postinc value, the phi. Unnecessarily conservative?
+ if (!Incr || !Incr->hasOneUse() || Incr->getOpcode() != Instruction::Add)
+ return false;
}
// Replace the increment with a decrement.
- BinaryOperator *decr =
- BinaryOperator::Create(Instruction::Sub, incr->getOperand(0),
- incr->getOperand(1), "tmp", incr);
- incr->replaceAllUsesWith(decr);
- incr->eraseFromParent();
+ DEBUG(errs() << "LSR: Examining use ");
+ DEBUG(WriteAsOperand(errs(), CondOp0, /*PrintType=*/false));
+ DEBUG(errs() << " in Inst: " << *Cond << '\n');
+ BinaryOperator *Decr = BinaryOperator::Create(Instruction::Sub,
+ Incr->getOperand(0), Incr->getOperand(1), "tmp", Incr);
+ Incr->replaceAllUsesWith(Decr);
+ Incr->eraseFromParent();
// Substitute endval-startval for the original startval, and 0 for the
- // original endval. Since we're only testing for equality this is OK even
+ // original endval. Since we're only testing for equality this is OK even
// if the computation wraps around.
BasicBlock *Preheader = L->getLoopPreheader();
Instruction *PreInsertPt = Preheader->getTerminator();
- int inBlock = L->contains(phi->getIncomingBlock(0)) ? 1 : 0;
- Value *startVal = phi->getIncomingValue(inBlock);
- Value *endVal = Cond->getOperand(1);
- // FIXME check for case where both are constant
+ unsigned InBlock = L->contains(PHIExpr->getIncomingBlock(0)) ? 1 : 0;
+ Value *StartVal = PHIExpr->getIncomingValue(InBlock);
+ Value *EndVal = Cond->getOperand(1);
+ DEBUG(errs() << " Optimize loop counting iv to count down ["
+ << *EndVal << " .. " << *StartVal << "]\n");
+
+ // FIXME: check for case where both are constant.
Constant* Zero = ConstantInt::get(Cond->getOperand(1)->getType(), 0);
- BinaryOperator *NewStartVal =
- BinaryOperator::Create(Instruction::Sub, endVal, startVal,
- "tmp", PreInsertPt);
- phi->setIncomingValue(inBlock, NewStartVal);
+ BinaryOperator *NewStartVal = BinaryOperator::Create(Instruction::Sub,
+ EndVal, StartVal, "tmp", PreInsertPt);
+ PHIExpr->setIncomingValue(InBlock, NewStartVal);
Cond->setOperand(1, Zero);
+ DEBUG(errs() << " New icmp: " << *Cond << "\n");
+
+ int64_t SInt = cast<SCEVConstant>(Stride)->getValue()->getSExtValue();
+ const SCEV *NewStride = 0;
+ bool Found = false;
+ for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+ const SCEV *OldStride = IU->StrideOrder[i];
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(OldStride))
+ if (SC->getValue()->getSExtValue() == -SInt) {
+ Found = true;
+ NewStride = OldStride;
+ break;
+ }
+ }
+
+ if (!Found)
+ NewStride = SE->getIntegerSCEV(-SInt, Stride->getType());
+ IU->AddUser(NewStride, CondUse->getOffset(), Cond, Cond->getOperand(0));
+ IU->IVUsesByStride[Stride]->removeUser(CondUse);
+
+ CondUse = &IU->IVUsesByStride[NewStride]->Users.back();
+ Stride = NewStride;
- Changed = true;
+ ++NumCountZero;
+
+ return true;
}
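The substitution just performed (start becomes end - start, the bound becomes zero, and the increment is flipped to a decrement) amounts to the source-level rewrite below. Since only equality is tested, the trip count is preserved even if the arithmetic wraps. A small self-checking sketch under that assumption:

    #include <cassert>

    unsigned trips_up(unsigned start, unsigned end) {
      unsigned t = 0;
      for (unsigned i = start; i != end; ++i) ++t;  // original form
      return t;
    }

    unsigned trips_down(unsigned start, unsigned end) {
      unsigned t = 0;
      for (unsigned j = end - start; j != 0; --j) ++t;  // test against zero
      return t;
    }

    int main() {
      assert(trips_up(3, 11) == trips_down(3, 11));  // 8 iterations each
      return 0;
    }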
-bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
+/// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding
+/// when to exit the loop is used only for that purpose, try to rearrange things
+/// so it counts down to a test against zero.
+bool LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) {
+ bool ThisChanged = false;
+ for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+ const SCEV *Stride = IU->StrideOrder[i];
+ std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(Stride);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+ // FIXME: Generalize to non-affine IV's.
+ if (!SI->first->isLoopInvariant(L))
+ continue;
+ // If stride is a constant and it has an icmpinst use, check if we can
+ // optimize the loop to count down.
+ if (isa<SCEVConstant>(Stride) && SI->second->Users.size() == 1) {
+ Instruction *User = SI->second->Users.begin()->getUser();
+ if (!isa<ICmpInst>(User))
+ continue;
+ const SCEV *CondStride = Stride;
+ IVStrideUse *Use = &*SI->second->Users.begin();
+ if (!OptimizeLoopCountIVOfStride(CondStride, Use, L))
+ continue;
+ ThisChanged = true;
+ // Now check if it's possible to reuse this iv for other stride uses.
+ for (unsigned j = 0, ee = IU->StrideOrder.size(); j != ee; ++j) {
+ const SCEV *SStride = IU->StrideOrder[j];
+ if (SStride == CondStride)
+ continue;
+ std::map<const SCEV *, IVUsersOfOneStride *>::iterator SII =
+ IU->IVUsesByStride.find(SStride);
+ assert(SII != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+ // FIXME: Generalize to non-affine IV's.
+ if (!SII->first->isLoopInvariant(L))
+ continue;
+ // FIXME: Rewrite other stride using CondStride.
+ }
+ }
+ }
+
+ Changed |= ThisChanged;
+ return ThisChanged;
+}
+
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
IU = &getAnalysis<IVUsers>();
LI = &getAnalysis<LoopInfo>();
DT = &getAnalysis<DominatorTree>();
SE = &getAnalysis<ScalarEvolution>();
Changed = false;
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->getLoopPreheader() || !L->getLoopLatch())
+ return false;
+
if (!IU->IVUsesByStride.empty()) {
DEBUG(errs() << "\nLSR on \"" << L->getHeader()->getParent()->getName()
<< "\" ";
@@ -2545,7 +2771,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
// Change loop terminating condition to use the postinc iv when possible
// and optimize loop terminating compare. FIXME: Move this after
- // StrengthReduceStridedIVUsers?
+ // StrengthReduceIVUsersOfStride?
OptimizeLoopTermCond(L);
// FIXME: We can shrink overlarge IV's here. e.g. if the code has
@@ -2561,26 +2787,12 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
// IVsByStride keeps IVs for one particular loop.
assert(IVsByStride.empty() && "Stale entries in IVsByStride?");
- // Note: this processes each stride/type pair individually. All users
- // passed into StrengthReduceStridedIVUsers have the same type AND stride.
- // Also, note that we iterate over IVUsesByStride indirectly by using
- // StrideOrder. This extra layer of indirection makes the ordering of
- // strides deterministic - not dependent on map order.
- for (unsigned Stride = 0, e = IU->StrideOrder.size();
- Stride != e; ++Stride) {
- std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
- IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
- assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
- // FIXME: Generalize to non-affine IV's.
- if (!SI->first->isLoopInvariant(L))
- continue;
- StrengthReduceStridedIVUsers(SI->first, *SI->second, L);
- }
- }
+ StrengthReduceIVUsers(L);
- // After all sharing is done, see if we can adjust the loop to test against
- // zero instead of counting up to a maximum. This is usually faster.
- OptimizeLoopCountIV(L);
+ // After all sharing is done, see if we can adjust the loop to test against
+ // zero instead of counting up to a maximum. This is usually faster.
+ OptimizeLoopCountIV(L);
+ }
// We're done analyzing this loop; release all the state we built up for it.
IVsByStride.clear();
diff --git a/lib/Transforms/Scalar/LoopUnroll.cpp b/lib/Transforms/Scalar/LoopUnroll.cpp
deleted file mode 100644
index 837ec59..0000000
--- a/lib/Transforms/Scalar/LoopUnroll.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements a simple loop unroller. It works best when loops have
-// been canonicalized by the -indvars pass, allowing it to determine the trip
-// counts of loops easily.
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "loop-unroll"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include <climits>
-
-using namespace llvm;
-
-static cl::opt<unsigned>
-UnrollThreshold("unroll-threshold", cl::init(100), cl::Hidden,
- cl::desc("The cut-off point for automatic loop unrolling"));
-
-static cl::opt<unsigned>
-UnrollCount("unroll-count", cl::init(0), cl::Hidden,
- cl::desc("Use this unroll count for all loops, for testing purposes"));
-
-static cl::opt<bool>
-UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden,
- cl::desc("Allows loops to be partially unrolled until "
- "-unroll-threshold loop size is reached."));
-
-namespace {
- class LoopUnroll : public LoopPass {
- public:
- static char ID; // Pass ID, replacement for typeid
- LoopUnroll() : LoopPass(&ID) {}
-
- /// A magic value for use with the Threshold parameter to indicate
- /// that the loop unroll should be performed regardless of how much
- /// code expansion would result.
- static const unsigned NoThreshold = UINT_MAX;
-
- bool runOnLoop(Loop *L, LPPassManager &LPM);
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG...
- ///
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
- AU.addRequired<LoopInfo>();
- AU.addPreservedID(LCSSAID);
- AU.addPreserved<LoopInfo>();
- // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
- // If loop unroll does not preserve dom info then LCSSA pass on next
- // loop will receive invalid dom info.
- // For now, recreate dom info, if loop is unrolled.
- AU.addPreserved<DominatorTree>();
- AU.addPreserved<DominanceFrontier>();
- }
- };
-}
-
-char LoopUnroll::ID = 0;
-static RegisterPass<LoopUnroll> X("loop-unroll", "Unroll loops");
-
-Pass *llvm::createLoopUnrollPass() { return new LoopUnroll(); }
-
-/// ApproximateLoopSize - Approximate the size of the loop.
-static unsigned ApproximateLoopSize(const Loop *L) {
- unsigned Size = 0;
- for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
- I != E; ++I) {
- BasicBlock *BB = *I;
- Instruction *Term = BB->getTerminator();
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
- if (isa<PHINode>(I) && BB == L->getHeader()) {
- // Ignore PHI nodes in the header.
- } else if (I->hasOneUse() && I->use_back() == Term) {
- // Ignore instructions only used by the loop terminator.
- } else if (isa<DbgInfoIntrinsic>(I)) {
- // Ignore debug instructions
- } else if (isa<GetElementPtrInst>(I) && I->hasOneUse()) {
- // Ignore GEP as they generally are subsumed into a load or store.
- } else if (isa<CallInst>(I)) {
- // Estimate size overhead introduced by call instructions which
- // is higher than other instructions. Here 3 and 10 are magic
- // numbers that help one isolated test case from PR2067 without
- // negatively impacting measured benchmarks.
- Size += isa<IntrinsicInst>(I) ? 3 : 10;
- } else {
- ++Size;
- }
-
- // TODO: Ignore expressions derived from PHI and constants if inval of phi
- // is a constant, or if operation is associative. This will get induction
- // variables.
- }
- }
-
- return Size;
-}
-
-bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
- assert(L->isLCSSAForm());
- LoopInfo *LI = &getAnalysis<LoopInfo>();
-
- BasicBlock *Header = L->getHeader();
- DEBUG(errs() << "Loop Unroll: F[" << Header->getParent()->getName()
- << "] Loop %" << Header->getName() << "\n");
- (void)Header;
-
- // Find trip count
- unsigned TripCount = L->getSmallConstantTripCount();
- unsigned Count = UnrollCount;
-
- // Automatically select an unroll count.
- if (Count == 0) {
- // Conservative heuristic: if we know the trip count, see if we can
- // completely unroll (subject to the threshold, checked below); otherwise
- // try to find greatest modulo of the trip count which is still under
- // threshold value.
- if (TripCount == 0)
- return false;
- Count = TripCount;
- }
-
- // Enforce the threshold.
- if (UnrollThreshold != NoThreshold) {
- unsigned LoopSize = ApproximateLoopSize(L);
- DEBUG(errs() << " Loop Size = " << LoopSize << "\n");
- uint64_t Size = (uint64_t)LoopSize*Count;
- if (TripCount != 1 && Size > UnrollThreshold) {
- DEBUG(errs() << " Too large to fully unroll with count: " << Count
- << " because size: " << Size << ">" << UnrollThreshold << "\n");
- if (!UnrollAllowPartial) {
- DEBUG(errs() << " will not try to unroll partially because "
- << "-unroll-allow-partial not given\n");
- return false;
- }
- // Reduce unroll count to be modulo of TripCount for partial unrolling
- Count = UnrollThreshold / LoopSize;
- while (Count != 0 && TripCount%Count != 0) {
- Count--;
- }
- if (Count < 2) {
- DEBUG(errs() << " could not unroll partially\n");
- return false;
- }
- DEBUG(errs() << " partially unrolling with count: " << Count << "\n");
- }
- }
-
- // Unroll the loop.
- Function *F = L->getHeader()->getParent();
- if (!UnrollLoop(L, Count, LI, &LPM))
- return false;
-
- // FIXME: Reconstruct dom info, because it is not preserved properly.
- DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
- if (DT) {
- DT->runOnFunction(*F);
- DominanceFrontier *DF = getAnalysisIfAvailable<DominanceFrontier>();
- if (DF)
- DF->runOnFunction(*F);
- }
- return true;
-}
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index c7b00da..38d267a 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -32,7 +32,6 @@
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -407,6 +406,10 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val){
initLoopData();
Function *F = loopHeader->getParent();
+ // If LoopSimplify was unable to form a preheader, don't do any unswitching.
+ if (!loopPreheader)
+ return false;
+
// If the condition is trivial, always unswitch. There is no code growth for
// this case.
if (!IsTrivialUnswitchCondition(LoopCond)) {
@@ -957,7 +960,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
Worklist.pop_back();
// Simple constant folding.
- if (Constant *C = ConstantFoldInstruction(I, I->getContext())) {
+ if (Constant *C = ConstantFoldInstruction(I)) {
ReplaceUsesOfWith(I, C, Worklist, L, LPM);
continue;
}
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index af29f97..8466918 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -27,7 +27,6 @@
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/Support/CFG.h"
@@ -198,8 +197,7 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
/// LowerNegateToMultiply - Replace 0-X with X*-1.
///
static Instruction *LowerNegateToMultiply(Instruction *Neg,
- std::map<AssertingVH<>, unsigned> &ValueRankMap,
- LLVMContext &Context) {
+ std::map<AssertingVH<>, unsigned> &ValueRankMap) {
Constant *Cst = Constant::getAllOnesValue(Neg->getType());
Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg);
@@ -255,7 +253,6 @@ void Reassociate::LinearizeExprTree(BinaryOperator *I,
std::vector<ValueEntry> &Ops) {
Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
unsigned Opcode = I->getOpcode();
- LLVMContext &Context = I->getContext();
// First step, linearize the expression if it is in ((A+B)+(C+D)) form.
BinaryOperator *LHSBO = isReassociableOp(LHS, Opcode);
@@ -265,13 +262,11 @@ void Reassociate::LinearizeExprTree(BinaryOperator *I,
// transform them into multiplies by -1 so they can be reassociated.
if (I->getOpcode() == Instruction::Mul) {
if (!LHSBO && LHS->hasOneUse() && BinaryOperator::isNeg(LHS)) {
- LHS = LowerNegateToMultiply(cast<Instruction>(LHS),
- ValueRankMap, Context);
+ LHS = LowerNegateToMultiply(cast<Instruction>(LHS), ValueRankMap);
LHSBO = isReassociableOp(LHS, Opcode);
}
if (!RHSBO && RHS->hasOneUse() && BinaryOperator::isNeg(RHS)) {
- RHS = LowerNegateToMultiply(cast<Instruction>(RHS),
- ValueRankMap, Context);
+ RHS = LowerNegateToMultiply(cast<Instruction>(RHS), ValueRankMap);
RHSBO = isReassociableOp(RHS, Opcode);
}
}
@@ -373,7 +368,7 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
// version of the value is returned, and BI is left pointing at the instruction
// that should be processed next by the reassociation pass.
//
-static Value *NegateValue(LLVMContext &Context, Value *V, Instruction *BI) {
+static Value *NegateValue(Value *V, Instruction *BI) {
// We are trying to expose opportunity for reassociation. One of the things
// that we want to do to achieve this is to push a negation as deep into an
// expression chain as possible, to expose the add instructions. In practice,
@@ -386,8 +381,8 @@ static Value *NegateValue(LLVMContext &Context, Value *V, Instruction *BI) {
if (Instruction *I = dyn_cast<Instruction>(V))
if (I->getOpcode() == Instruction::Add && I->hasOneUse()) {
// Push the negates through the add.
- I->setOperand(0, NegateValue(Context, I->getOperand(0), BI));
- I->setOperand(1, NegateValue(Context, I->getOperand(1), BI));
+ I->setOperand(0, NegateValue(I->getOperand(0), BI));
+ I->setOperand(1, NegateValue(I->getOperand(1), BI));
// We must move the add instruction here, because the neg instructions do
// not dominate the old add instruction in general. By moving it, we are
@@ -407,7 +402,7 @@ static Value *NegateValue(LLVMContext &Context, Value *V, Instruction *BI) {
/// ShouldBreakUpSubtract - Return true if we should break up this subtract of
/// X-Y into (X + -Y).
-static bool ShouldBreakUpSubtract(LLVMContext &Context, Instruction *Sub) {
+static bool ShouldBreakUpSubtract(Instruction *Sub) {
// If this is a negation, we can't split it up!
if (BinaryOperator::isNeg(Sub))
return false;
@@ -431,7 +426,7 @@ static bool ShouldBreakUpSubtract(LLVMContext &Context, Instruction *Sub) {
/// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is
/// only used by an add, transform this into (X+(0-Y)) to promote better
/// reassociation.
-static Instruction *BreakUpSubtract(LLVMContext &Context, Instruction *Sub,
+static Instruction *BreakUpSubtract(Instruction *Sub,
std::map<AssertingVH<>, unsigned> &ValueRankMap) {
// Convert a subtract into an add and a neg instruction... so that sub
// instructions can be commuted with other add instructions...
@@ -439,7 +434,7 @@ static Instruction *BreakUpSubtract(LLVMContext &Context, Instruction *Sub,
// Calculate the negative value of Operand 1 of the sub instruction...
// and set it as the RHS of the add instruction we just made...
//
- Value *NegVal = NegateValue(Context, Sub->getOperand(1), Sub);
+ Value *NegVal = NegateValue(Sub->getOperand(1), Sub);
Instruction *New =
BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub);
New->takeName(Sub);
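For intuition: turning X - Y into X + (0 - Y) converts the expression into a pure add tree, so reassociation can commute and regroup the operands freely. A hedged example of the payoff, with illustrative names:

    // (x - y) + z cannot be reassociated directly, but after BreakUpSubtract
    // it becomes x + (-y) + z, an add tree whose operands can be reordered,
    // for example to expose (x + z) as a common subexpression.
    int combine(int x, int y, int z) {
      return (x - y) + z;
    }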
@@ -457,8 +452,7 @@ static Instruction *BreakUpSubtract(LLVMContext &Context, Instruction *Sub,
/// by one, change this into a multiply by a constant to assist with further
/// reassociation.
static Instruction *ConvertShiftToMul(Instruction *Shl,
- std::map<AssertingVH<>, unsigned> &ValueRankMap,
- LLVMContext &Context) {
+ std::map<AssertingVH<>, unsigned> &ValueRankMap) {
// If an operand of this shift is a reassociable multiply, or if the shift
// is used by a reassociable multiply or add, turn into a multiply.
if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) ||
@@ -781,13 +775,11 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
/// ReassociateBB - Inspect all of the instructions in this basic block,
/// reassociating them as we go.
void Reassociate::ReassociateBB(BasicBlock *BB) {
- LLVMContext &Context = BB->getContext();
-
for (BasicBlock::iterator BBI = BB->begin(); BBI != BB->end(); ) {
Instruction *BI = BBI++;
if (BI->getOpcode() == Instruction::Shl &&
isa<ConstantInt>(BI->getOperand(1)))
- if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap, Context)) {
+ if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) {
MadeChange = true;
BI = NI;
}
@@ -800,8 +792,8 @@ void Reassociate::ReassociateBB(BasicBlock *BB) {
// If this is a subtract instruction which is not already in negate form,
// see if we can convert it to X+-Y.
if (BI->getOpcode() == Instruction::Sub) {
- if (ShouldBreakUpSubtract(Context, BI)) {
- BI = BreakUpSubtract(Context, BI, ValueRankMap);
+ if (ShouldBreakUpSubtract(BI)) {
+ BI = BreakUpSubtract(BI, ValueRankMap);
MadeChange = true;
} else if (BinaryOperator::isNeg(BI)) {
// Otherwise, this is a negation. See if the operand is a multiply tree
@@ -809,7 +801,7 @@ void Reassociate::ReassociateBB(BasicBlock *BB) {
if (isReassociableOp(BI->getOperand(1), Instruction::Mul) &&
(!BI->hasOneUse() ||
!isReassociableOp(BI->use_back(), Instruction::Mul))) {
- BI = LowerNegateToMultiply(BI, ValueRankMap, Context);
+ BI = LowerNegateToMultiply(BI, ValueRankMap);
MadeChange = true;
}
}
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 509a6db..c202a2c 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -795,9 +795,14 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
return markOverdefined(&EVI);
Value *AggVal = EVI.getAggregateOperand();
- unsigned i = *EVI.idx_begin();
- LatticeVal EltVal = getStructValueState(AggVal, i);
- mergeInValue(getValueState(&EVI), &EVI, EltVal);
+ if (isa<StructType>(AggVal->getType())) {
+ unsigned i = *EVI.idx_begin();
+ LatticeVal EltVal = getStructValueState(AggVal, i);
+ mergeInValue(getValueState(&EVI), &EVI, EltVal);
+ } else {
+ // Otherwise, must be extracting from an array.
+ return markOverdefined(&EVI);
+ }
}
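Background for this change: extractvalue can index into an array aggregate as well as a struct, and the solver only keeps per-field lattice values for structs, so the array case must be conservatively marked overdefined. A hedged C++ illustration of the struct shape SCCP can still track (whether the front end emits an extractvalue here depends on the ABI lowering):

    // mk() returns a first-class struct; reading .a typically lowers to an
    // extractvalue with a struct operand, which SCCP tracks per field and
    // can fold to the constant 1. An array-typed aggregate would instead
    // take the new 'else' branch and go overdefined.
    struct Pair { int a, b; };
    static Pair mk() { return Pair{1, 2}; }
    int first() { return mk().a; }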
void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
diff --git a/lib/Transforms/Scalar/SCCVN.cpp b/lib/Transforms/Scalar/SCCVN.cpp
index c047fca..001267a 100644
--- a/lib/Transforms/Scalar/SCCVN.cpp
+++ b/lib/Transforms/Scalar/SCCVN.cpp
@@ -507,7 +507,7 @@ void ValueTable::erase(Value *V) {
/// verifyRemoved - Verify that the value is removed from all internal data
/// structures.
void ValueTable::verifyRemoved(const Value *V) const {
- for (DenseMap<Value*, uint32_t>::iterator
+ for (DenseMap<Value*, uint32_t>::const_iterator
I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
assert(I->first != V && "Inst still occurs in value numbering map!");
}
@@ -629,9 +629,6 @@ bool SCCVN::runOnFunction(Function& F) {
}
}
- // FIXME: This code is commented out for now, because it can lead to the
- // insertion of a lot of redundant PHIs being inserted by SSAUpdater.
-#if 0
// Perform a forward data-flow to compute availability at all points on
// the CFG.
do {
@@ -709,7 +706,6 @@ bool SCCVN::runOnFunction(Function& F) {
CurInst->eraseFromParent();
}
}
-#endif
VT.clear();
for (DenseMap<BasicBlock*, ValueNumberScope*>::iterator
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 5669da0..b54565c 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -26,10 +26,6 @@ void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createCFGSimplificationPass());
}
-void LLVMAddCondPropagationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createCondPropagationPass());
-}
-
void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createDeadStoreEliminationPass());
}
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index 575c93b..611505e 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -100,7 +100,7 @@ public:
/// EmitPutChar - Emit a call to the putchar function. This assumes that Char
/// is an integer.
- void EmitPutChar(Value *Char, IRBuilder<> &B);
+ Value *EmitPutChar(Value *Char, IRBuilder<> &B);
/// EmitPutS - Emit a call to the puts function. This assumes that Str is
/// some pointer.
@@ -252,18 +252,20 @@ Value *LibCallOptimization::EmitUnaryFloatFnCall(Value *Op, const char *Name,
/// EmitPutChar - Emit a call to the putchar function. This assumes that Char
/// is an integer.
-void LibCallOptimization::EmitPutChar(Value *Char, IRBuilder<> &B) {
+Value *LibCallOptimization::EmitPutChar(Value *Char, IRBuilder<> &B) {
Module *M = Caller->getParent();
Value *PutChar = M->getOrInsertFunction("putchar", Type::getInt32Ty(*Context),
Type::getInt32Ty(*Context), NULL);
CallInst *CI = B.CreateCall(PutChar,
B.CreateIntCast(Char,
Type::getInt32Ty(*Context),
+ /*isSigned*/true,
"chari"),
"putchar");
if (const Function *F = dyn_cast<Function>(PutChar->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
+ return CI;
}
/// EmitPutS - Emit a call to the puts function. This assumes that Str is
@@ -302,7 +304,8 @@ void LibCallOptimization::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B) {
Type::getInt32Ty(*Context),
Type::getInt32Ty(*Context),
File->getType(), NULL);
- Char = B.CreateIntCast(Char, Type::getInt32Ty(*Context), "chari");
+ Char = B.CreateIntCast(Char, Type::getInt32Ty(*Context), /*isSigned*/true,
+ "chari");
CallInst *CI = B.CreateCall2(F, Char, File, "fputc");
if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
@@ -955,6 +958,17 @@ struct MemCmpOpt : public LibCallOptimization {
return B.CreateZExt(B.CreateXor(LHSV, RHSV, "shortdiff"), CI->getType());
}
+ // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
+ std::string LHSStr, RHSStr;
+ if (GetConstantStringInfo(LHS, LHSStr) &&
+ GetConstantStringInfo(RHS, RHSStr)) {
+ // Make sure we're not reading out-of-bounds memory.
+ if (Len > LHSStr.length() || Len > RHSStr.length())
+ return 0;
+ uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len);
+ return ConstantInt::get(CI->getType(), Ret);
+ }
+
return 0;
}
};
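The folded value is whatever the host memcmp returns for the two constant strings, guarded by the bounds check above. A minimal standalone illustration of when the fold applies (hypothetical calls, standard headers only):

#include <cassert>
#include <cstring>

int main() {
  // All operands constant and Len within both strings: the call may be
  // replaced by the compile-time result (here, something negative).
  assert(std::memcmp("abc", "abd", 3) < 0);
  // With Len = 4 exceeding strlen("abc"), the new code declines to fold
  // rather than read past the constant data.
  return 0;
}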
@@ -1314,11 +1328,13 @@ struct PrintFOpt : public LibCallOptimization {
return CI->use_empty() ? (Value*)CI :
ConstantInt::get(CI->getType(), 0);
- // printf("x") -> putchar('x'), even for '%'.
+ // printf("x") -> putchar('x'), even for '%'. Return the result of putchar
+ // in case there is an error writing to stdout.
if (FormatStr.size() == 1) {
- EmitPutChar(ConstantInt::get(Type::getInt32Ty(*Context), FormatStr[0]), B);
- return CI->use_empty() ? (Value*)CI :
- ConstantInt::get(CI->getType(), 1);
+ Value *Res = EmitPutChar(ConstantInt::get(Type::getInt32Ty(*Context),
+ FormatStr[0]), B);
+ if (CI->use_empty()) return CI;
+ return B.CreateIntCast(Res, CI->getType(), true);
}
// printf("foo\n") --> puts("foo")
@@ -1339,9 +1355,10 @@ struct PrintFOpt : public LibCallOptimization {
// printf("%c", chr) --> putchar(*(i8*)dst)
if (FormatStr == "%c" && CI->getNumOperands() > 2 &&
isa<IntegerType>(CI->getOperand(2)->getType())) {
- EmitPutChar(CI->getOperand(2), B);
- return CI->use_empty() ? (Value*)CI :
- ConstantInt::get(CI->getType(), 1);
+ Value *Res = EmitPutChar(CI->getOperand(2), B);
+
+ if (CI->use_empty()) return CI;
+ return B.CreateIntCast(Res, CI->getType(), true);
}
// printf("%s\n", str) --> puts(str)
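Source-level shape of the rewrite (illustrative only): since putchar can fail, its result, cast to printf's return type, now flows to any user of the printf result instead of a hard-coded 1.

#include <cstdio>

// What printf("x") lowers to after this change when its result is used:
// the putchar return value (EOF on a write error) is forwarded.
int emitX() {
  return std::putchar('x');
}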
@@ -2479,10 +2496,6 @@ bool SimplifyLibCalls::doInitialization(Module &M) {
// lround, lroundf, lroundl:
// * lround(cnst) -> cnst'
//
-// memcmp:
-// * memcmp(x,y,l) -> cnst
-// (if all arguments are constant and strlen(x) <= l and strlen(y) <= l)
-//
// pow, powf, powl:
// * pow(exp(x),y) -> exp(x*y)
// * pow(sqrt(x),y) -> pow(x,y*0.5)
diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp
index 4864e23..b06ae3d 100644
--- a/lib/Transforms/Scalar/TailDuplication.cpp
+++ b/lib/Transforms/Scalar/TailDuplication.cpp
@@ -359,8 +359,7 @@ void TailDup::eliminateUnconditionalBranch(BranchInst *Branch) {
Instruction *Inst = BI++;
if (isInstructionTriviallyDead(Inst))
Inst->eraseFromParent();
- else if (Constant *C = ConstantFoldInstruction(Inst,
- Inst->getContext())) {
+ else if (Constant *C = ConstantFoldInstruction(Inst)) {
Inst->replaceAllUsesWith(C);
Inst->eraseFromParent();
}
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index b56e170..4119cb9 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -25,7 +25,7 @@
// unlikely, that the return returns something else (like constant 0), and
// can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
// the function return the exact same value.
-// 4. If it can prove that callees do not access theier caller stack frame,
+// 4. If it can prove that callees do not access their caller stack frame,
// they are marked as eligible for tail call elimination (by the code
// generator).
//
@@ -58,6 +58,7 @@
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Pass.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Support/CFG.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
@@ -75,7 +76,7 @@ namespace {
private:
bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry,
bool &TailCallsAreMarkedTail,
- std::vector<PHINode*> &ArgumentPHIs,
+ SmallVector<PHINode*, 8> &ArgumentPHIs,
bool CannotTailCallElimCallsMarkedTail);
bool CanMoveAboveCall(Instruction *I, CallInst *CI);
Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI);
@@ -90,7 +91,6 @@ FunctionPass *llvm::createTailCallEliminationPass() {
return new TailCallElim();
}
-
/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by
/// callees of this function. We only do very simple analysis right now, this
/// could be expanded in the future to use mod/ref information for particular
@@ -100,7 +100,7 @@ static bool AllocaMightEscapeToCalls(AllocaInst *AI) {
return true;
}
-/// FunctionContainsAllocas - Scan the specified basic block for alloca
+/// CheckForEscapingAllocas - Scan the specified basic block for alloca
/// instructions. If it contains any that might be accessed by calls, return
/// true.
static bool CheckForEscapingAllocas(BasicBlock *BB,
@@ -127,7 +127,7 @@ bool TailCallElim::runOnFunction(Function &F) {
BasicBlock *OldEntry = 0;
bool TailCallsAreMarkedTail = false;
- std::vector<PHINode*> ArgumentPHIs;
+ SmallVector<PHINode*, 8> ArgumentPHIs;
bool MadeChange = false;
bool FunctionContainsEscapingAllocas = false;
@@ -154,7 +154,6 @@ bool TailCallElim::runOnFunction(Function &F) {
/// happen. This bug is PR962.
if (FunctionContainsEscapingAllocas)
return false;
-
// Second pass, change any tail calls to loops.
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
@@ -204,7 +203,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
if (I->mayHaveSideEffects()) // This also handles volatile loads.
return false;
- if (LoadInst* L = dyn_cast<LoadInst>(I)) {
+ if (LoadInst *L = dyn_cast<LoadInst>(I)) {
// Loads may always be moved above calls without side effects.
if (CI->mayHaveSideEffects()) {
// Non-volatile loads may be moved above a call with side effects if it
@@ -235,7 +234,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
// We currently handle static constants and arguments that are not modified as
// part of the recursion.
//
-static bool isDynamicConstant(Value *V, CallInst *CI) {
+static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
if (isa<Constant>(V)) return true; // Static constants are always dyn consts
// Check to see if this is an immutable argument, if so, the value
@@ -253,6 +252,15 @@ static bool isDynamicConstant(Value *V, CallInst *CI) {
if (CI->getOperand(ArgNo+1) == Arg)
return true;
}
+
+ // Switch cases are always constant integers. If the value is being switched
+ // on and the return is only reachable from one of its cases, it's
+ // effectively constant.
+ if (BasicBlock *UniquePred = RI->getParent()->getUniquePredecessor())
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(UniquePred->getTerminator()))
+ if (SI->getCondition() == V)
+ return SI->getDefaultDest() != RI->getParent();
+
// Not a constant or immutable argument, we can't safely transform.
return false;
}
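A sketch of the kind of function the new check enables (not taken from the LLVM test suite; assumes callers pass n >= 1): the 'return n' below sits in a block reachable only from the 'case 1' arm of a switch on n, so it is effectively constant there and no longer blocks the transformation.

// 'return n' executes only when n == 1, so its value can be proven
// constant at entry and accumulator-style elimination can still fire
// on the recursive return.
long factorial(long n) {
  switch (n) {
  case 1:
    return n;
  default:
    return n * factorial(n - 1);
  }
}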
@@ -265,10 +273,6 @@ static Value *getCommonReturnValue(ReturnInst *TheRI, CallInst *CI) {
Function *F = TheRI->getParent()->getParent();
Value *ReturnedValue = 0;
- // TODO: Handle multiple value ret instructions;
- if (isa<StructType>(F->getReturnType()))
- return 0;
-
for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI)
if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()))
if (RI != TheRI) {
@@ -278,7 +282,7 @@ static Value *getCommonReturnValue(ReturnInst *TheRI, CallInst *CI) {
// evaluatable at the start of the initial invocation of the function,
// instead of at the end of the evaluation.
//
- if (!isDynamicConstant(RetOp, CI))
+ if (!isDynamicConstant(RetOp, CI, RI))
return 0;
if (ReturnedValue && RetOp != ReturnedValue)
@@ -315,7 +319,7 @@ Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
bool &TailCallsAreMarkedTail,
- std::vector<PHINode*> &ArgumentPHIs,
+ SmallVector<PHINode*, 8> &ArgumentPHIs,
bool CannotTailCallElimCallsMarkedTail) {
BasicBlock *BB = Ret->getParent();
Function *F = BB->getParent();
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index c728c0b..2974592 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -275,8 +275,6 @@ void llvm::RemoveSuccessor(TerminatorInst *TI, unsigned SuccNum) {
/// SplitEdge - Split the edge connecting specified block. Pass P must
/// not be NULL.
BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
- assert(!isa<IndirectBrInst>(BB->getTerminator()) &&
- "Cannot split an edge from an IndirectBrInst");
TerminatorInst *LatchTerm = BB->getTerminator();
unsigned SuccNum = 0;
#ifndef NDEBUG
@@ -386,6 +384,12 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
bool IsLoopEntry = !!L;
bool SplitMakesNewLoopHeader = false;
for (unsigned i = 0; i != NumPreds; ++i) {
+ // This is slightly more strict than necessary; the minimum requirement
+ // is that there be no more than one indirectbr branching to BB. And
+ // all BlockAddress uses would need to be updated.
+ assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+
Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
if (LI) {
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index fd8862c..162d7b3 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -20,6 +20,7 @@
#include "llvm/IntrinsicInst.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
#include "llvm/Support/CFG.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include "llvm/Analysis/ConstantFolding.h"
@@ -322,8 +323,6 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
/// mapping its operands through ValueMap if they are available.
Constant *PruningFunctionCloner::
ConstantFoldMappedInstruction(const Instruction *I) {
- LLVMContext &Context = I->getContext();
-
SmallVector<Constant*, 8> Ops;
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
if (Constant *Op = dyn_cast_or_null<Constant>(MapValue(I->getOperand(i),
@@ -333,9 +332,8 @@ ConstantFoldMappedInstruction(const Instruction *I) {
return 0; // All operands not constant!
if (const CmpInst *CI = dyn_cast<CmpInst>(I))
- return ConstantFoldCompareInstOperands(CI->getPredicate(),
- &Ops[0], Ops.size(),
- Context, TD);
+ return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1],
+ TD);
if (const LoadInst *LI = dyn_cast<LoadInst>(I))
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0]))
@@ -346,7 +344,28 @@ ConstantFoldMappedInstruction(const Instruction *I) {
CE);
return ConstantFoldInstOperands(I->getOpcode(), I->getType(), &Ops[0],
- Ops.size(), Context, TD);
+ Ops.size(), TD);
+}
+
+static MDNode *UpdateInlinedAtInfo(MDNode *InsnMD, MDNode *TheCallMD,
+ LLVMContext &Context) {
+ DILocation ILoc(InsnMD);
+ if (ILoc.isNull()) return InsnMD;
+
+ DILocation CallLoc(TheCallMD);
+ if (CallLoc.isNull()) return InsnMD;
+
+ DILocation OrigLocation = ILoc.getOrigLocation();
+ MDNode *NewLoc = TheCallMD;
+ if (!OrigLocation.isNull())
+ NewLoc = UpdateInlinedAtInfo(OrigLocation.getNode(), TheCallMD, Context);
+
+ SmallVector<Value *, 4> MDVs;
+ MDVs.push_back(InsnMD->getElement(0)); // Line
+ MDVs.push_back(InsnMD->getElement(1)); // Col
+ MDVs.push_back(InsnMD->getElement(2)); // Scope
+ MDVs.push_back(NewLoc);
+ return MDNode::get(Context, MDVs.data(), MDVs.size());
}
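The recursion above splices the call site's location onto the end of an instruction's inlined-at chain. A toy model of that walk over a plain struct (the real code rebuilds "dbg" MDNodes of the form {line, col, scope, inlined-at}; everything here is invented for illustration):

#include <deque>

struct Loc {
  int Line, Col;
  const Loc *InlinedAt;   // null when the location is not inlined
};

// Rebuild L with CallLoc appended at the end of its inlined-at chain.
// Pool is a std::deque so existing nodes stay pinned as it grows.
const Loc *updateInlinedAt(const Loc &L, const Loc &CallLoc,
                           std::deque<Loc> &Pool) {
  const Loc *Tail = L.InlinedAt
                        ? updateInlinedAt(*L.InlinedAt, CallLoc, Pool)
                        : &CallLoc;
  Pool.push_back(Loc{L.Line, L.Col, Tail});
  return &Pool.back();
}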
/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
@@ -361,7 +380,8 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
SmallVectorImpl<ReturnInst*> &Returns,
const char *NameSuffix,
ClonedCodeInfo *CodeInfo,
- const TargetData *TD) {
+ const TargetData *TD,
+ Instruction *TheCall) {
assert(NameSuffix && "NameSuffix cannot be null!");
#ifndef NDEBUG
@@ -400,19 +420,52 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
// references as we go. This uses ValueMap to do all the hard work.
//
BasicBlock::iterator I = NewBB->begin();
+
+ LLVMContext &Context = OldFunc->getContext();
+ unsigned DbgKind = Context.getMetadata().getMDKind("dbg");
+ MDNode *TheCallMD = NULL;
+ SmallVector<Value *, 4> MDVs;
+ if (TheCall && TheCall->hasMetadata())
+ TheCallMD = Context.getMetadata().getMD(DbgKind, TheCall);
// Handle PHI nodes specially, as we have to remove references to dead
// blocks.
if (PHINode *PN = dyn_cast<PHINode>(I)) {
// Skip over all PHI nodes, remembering them for later.
BasicBlock::const_iterator OldI = BI->begin();
- for (; (PN = dyn_cast<PHINode>(I)); ++I, ++OldI)
+ for (; (PN = dyn_cast<PHINode>(I)); ++I, ++OldI) {
+ if (I->hasMetadata()) {
+ if (TheCallMD) {
+ if (MDNode *IMD = Context.getMetadata().getMD(DbgKind, I)) {
+ MDNode *NewMD = UpdateInlinedAtInfo(IMD, TheCallMD, Context);
+ Context.getMetadata().addMD(DbgKind, NewMD, I);
+ }
+ } else {
+ // The cloned instruction has dbg info but the call instruction
+ // does not have dbg info. Remove dbg info from cloned instruction.
+ Context.getMetadata().removeMD(DbgKind, I);
+ }
+ }
PHIToResolve.push_back(cast<PHINode>(OldI));
+ }
}
// Otherwise, remap the rest of the instructions normally.
- for (; I != NewBB->end(); ++I)
+ for (; I != NewBB->end(); ++I) {
+ if (I->hasMetadata()) {
+ if (TheCallMD) {
+ if (MDNode *IMD = Context.getMetadata().getMD(DbgKind, I)) {
+ MDNode *NewMD = UpdateInlinedAtInfo(IMD, TheCallMD, Context);
+ Context.getMetadata().addMD(DbgKind, NewMD, I);
+ }
+ } else {
+ // The cloned instruction has dbg info but the call instruction
+ // does not have dbg info. Remove dbg info from cloned instruction.
+ Context.getMetadata().removeMD(DbgKind, I);
+ }
+ }
RemapInstruction(I, ValueMap);
+ }
}
// Defer PHI resolution until rest of function is resolved, PHI resolution
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 20f5a4a..043046c 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -386,7 +386,7 @@ bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD,
// (which can happen, e.g., because an argument was constant), but we'll be
// happy with whatever the cloner can do.
CloneAndPruneFunctionInto(Caller, CalledFunc, ValueMap, Returns, ".i",
- &InlinedFunctionInfo, TD);
+ &InlinedFunctionInfo, TD, TheCall);
// Remember the first block that is newly cloned over.
FirstNewBlock = LastBlock; ++FirstNewBlock;
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index 56e662e..590d667 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -50,7 +50,6 @@ namespace {
LCSSA() : LoopPass(&ID) {}
// Cached analysis information for the current function.
- LoopInfo *LI;
DominatorTree *DT;
std::vector<BasicBlock*> LoopBlocks;
PredIteratorCache PredCache;
@@ -64,6 +63,9 @@ namespace {
///
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
+
+ // LCSSA doesn't actually require LoopSimplify, but the PassManager
+ // doesn't know how to schedule LoopSimplify by itself.
AU.addRequiredID(LoopSimplifyID);
AU.addPreservedID(LoopSimplifyID);
AU.addRequiredTransitive<LoopInfo>();
@@ -121,7 +123,6 @@ static bool BlockDominatesAnExit(BasicBlock *BB,
bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) {
L = TheLoop;
- LI = &LPM.getAnalysis<LoopInfo>();
DT = &getAnalysis<DominatorTree>();
// Get the set of exiting blocks.
@@ -216,7 +217,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
SSAUpdate.Initialize(Inst);
// Insert the LCSSA phi's into all of the exit blocks dominated by the
- // value., and add them to the Phi's map.
+ // value, and add them to the Phi's map.
for (SmallVectorImpl<BasicBlock*>::const_iterator BBI = ExitBlocks.begin(),
BBE = ExitBlocks.end(); BBI != BBE; ++BBI) {
BasicBlock *ExitBB = *BBI;
@@ -230,8 +231,17 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
PN->reserveOperandSpace(PredCache.GetNumPreds(ExitBB));
// Add inputs from inside the loop for this PHI.
- for (BasicBlock **PI = PredCache.GetPreds(ExitBB); *PI; ++PI)
+ for (BasicBlock **PI = PredCache.GetPreds(ExitBB); *PI; ++PI) {
PN->addIncoming(Inst, *PI);
+
+ // If the exit block has a predecessor not within the loop, arrange for
+ // the incoming value use corresponding to that predecessor to be
+ // rewritten in terms of a different LCSSA PHI.
+ if (!inLoop(*PI))
+ UsesToRewrite.push_back(
+ &PN->getOperandUse(
+ PN->getOperandNumForIncomingValue(PN->getNumIncomingValues()-1)));
+ }
// Remember that this phi makes the value alive in this block.
SSAUpdate.AddAvailableValue(ExitBB, PN);
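For orientation, the invariant LCSSA maintains, sketched at the source level (names invented; 'v_out' stands in for the exit-block PHI). The hunk above covers the extra wrinkle where an exit block also has predecessors outside the loop, whose incoming values must be rewritten through a different LCSSA PHI.

int lcssaShape(int n) {
  int v = 0;
  for (int i = 0; i < n; ++i)
    v = i * 2;            // 'v' is defined inside the loop
  int v_out = v;          // models the LCSSA PHI in the exit block
  return v_out + 1;       // out-of-loop uses go through that PHI
}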
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 543ddf1..aef0f5f 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -24,10 +24,14 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -236,7 +240,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
//===----------------------------------------------------------------------===//
-// Local dead code elimination...
+// Local dead code elimination.
//
/// isInstructionTriviallyDead - Return true if the result produced by the
@@ -248,6 +252,9 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) {
// We don't want debug info removed by anything this general.
if (isa<DbgInfoIntrinsic>(I)) return false;
+ // Likewise for memory use markers.
+ if (isa<MemoryUseIntrinsic>(I)) return false;
+
if (!I->mayHaveSideEffects()) return true;
// Special case intrinsics that "may have side effects" but can be deleted
@@ -323,9 +330,53 @@ llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) {
}
//===----------------------------------------------------------------------===//
-// Control Flow Graph Restructuring...
+// Control Flow Graph Restructuring.
//
+
+/// RemovePredecessorAndSimplify - Like BasicBlock::removePredecessor, this
+/// method is called when we're about to delete Pred as a predecessor of BB. If
+/// BB contains any PHI nodes, this drops the entries in the PHI nodes for Pred.
+///
+/// Unlike the removePredecessor method, this attempts to simplify uses of PHI
+/// nodes that collapse into identity values. For example, if we have:
+/// x = phi(1, 0, 0, 0)
+/// y = and x, z
+///
+/// .. and delete the predecessor corresponding to the '1', this will attempt to
+/// recursively fold the and to 0.
+void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
+ TargetData *TD) {
+ // This only adjusts blocks with PHI nodes.
+ if (!isa<PHINode>(BB->begin()))
+ return;
+
+ // Remove the entries for Pred from the PHI nodes in BB, but do not simplify
+ // them down. This will leave us with single entry phi nodes and other phis
+ // that can be removed.
+ BB->removePredecessor(Pred, true);
+
+ WeakVH PhiIt = &BB->front();
+ while (PHINode *PN = dyn_cast<PHINode>(PhiIt)) {
+ PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt));
+
+ Value *PNV = PN->hasConstantValue();
+ if (PNV == 0) continue;
+
+ // If we're able to simplify the phi to a single value, substitute the new
+ // value into all of its uses.
+ assert(PNV != PN && "hasConstantValue broken");
+
+ ReplaceAndSimplifyAllUses(PN, PNV, TD);
+
+ // If recursive simplification ended up deleting the next PHI node we would
+ // iterate to, then our iterator is invalid, restart scanning from the top
+ // of the block.
+ if (PhiIt == 0) PhiIt = &BB->front();
+ }
+}
+
+
/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its
/// predecessor is known to have one successor (DestBB!). Eliminate the edge
/// between them, moving the instructions in the predecessor into DestBB and
@@ -362,6 +413,174 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) {
PredBB->eraseFromParent();
}
+/// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
+/// almost-empty BB ending in an unconditional branch to Succ, into Succ.
+///
+/// Assumption: Succ is the single successor for BB.
+///
+static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
+ assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
+
+ DEBUG(errs() << "Looking to fold " << BB->getName() << " into "
+ << Succ->getName() << "\n");
+ // Shortcut, if there is only a single predecessor it must be BB and merging
+ // is always safe
+ if (Succ->getSinglePredecessor()) return true;
+
+ // Make a list of the predecessors of BB
+ typedef SmallPtrSet<BasicBlock*, 16> BlockSet;
+ BlockSet BBPreds(pred_begin(BB), pred_end(BB));
+
+ // Use that list to make another list of common predecessors of BB and Succ
+ BlockSet CommonPreds;
+ for (pred_iterator PI = pred_begin(Succ), PE = pred_end(Succ);
+ PI != PE; ++PI)
+ if (BBPreds.count(*PI))
+ CommonPreds.insert(*PI);
+
+ // Shortcut, if there are no common predecessors, merging is always safe
+ if (CommonPreds.empty())
+ return true;
+
+ // Look at all the phi nodes in Succ, to see if they present a conflict when
+ // merging these blocks
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // If the incoming value from BB is again a PHINode in
+ // BB which has the same incoming value for *PI as PN does, we can
+ // merge the phi nodes and then the blocks can still be merged
+ PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB));
+ if (BBPN && BBPN->getParent() == BB) {
+ for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
+ PI != PE; PI++) {
+ if (BBPN->getIncomingValueForBlock(*PI)
+ != PN->getIncomingValueForBlock(*PI)) {
+ DEBUG(errs() << "Can't fold, phi node " << PN->getName() << " in "
+ << Succ->getName() << " is conflicting with "
+ << BBPN->getName() << " with regard to common predecessor "
+ << (*PI)->getName() << "\n");
+ return false;
+ }
+ }
+ } else {
+ Value* Val = PN->getIncomingValueForBlock(BB);
+ for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
+ PI != PE; PI++) {
+ // See if the incoming value for the common predecessor is equal to the
+ // one for BB, in which case this phi node will not prevent the merging
+ // of the block.
+ if (Val != PN->getIncomingValueForBlock(*PI)) {
+ DEBUG(errs() << "Can't fold, phi node " << PN->getName() << " in "
+ << Succ->getName() << " is conflicting with regard to common "
+ << "predecessor " << (*PI)->getName() << "\n");
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+/// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an
+/// unconditional branch, and contains no instructions other than PHI nodes,
+/// potential debug intrinsics and the branch. If possible, eliminate BB by
+/// rewriting all the predecessors to branch to the successor block and return
+/// true. If we can't transform, return false.
+bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
+ // We can't eliminate infinite loops.
+ BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0);
+ if (BB == Succ) return false;
+
+ // Check to see if merging these blocks would cause conflicts for any of the
+ // phi nodes in BB or Succ. If not, we can safely merge.
+ if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
+
+ // Check for cases where Succ has multiple predecessors and a PHI node in BB
+ // has uses which will not disappear when the PHI nodes are merged. It is
+ // possible to handle such cases, but difficult: it requires checking whether
+ // BB dominates Succ, which is non-trivial to calculate in the case where
+ // Succ has multiple predecessors. Also, it requires checking whether
+ // constructing the necessary self-referential PHI node doesn't introduce any
+ // conflicts; this isn't too difficult, but the previous code for doing this
+ // was incorrect.
+ //
+ // Note that if this check finds a live use, BB dominates Succ, so BB is
+ // something like a loop pre-header (or rarely, a part of an irreducible CFG);
+ // folding the branch isn't profitable in that case anyway.
+ if (!Succ->getSinglePredecessor()) {
+ BasicBlock::iterator BBI = BB->begin();
+ while (isa<PHINode>(*BBI)) {
+ for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end();
+ UI != E; ++UI) {
+ if (PHINode* PN = dyn_cast<PHINode>(*UI)) {
+ if (PN->getIncomingBlock(UI) != BB)
+ return false;
+ } else {
+ return false;
+ }
+ }
+ ++BBI;
+ }
+ }
+
+ DEBUG(errs() << "Killing Trivial BB: \n" << *BB);
+
+ if (isa<PHINode>(Succ->begin())) {
+ // If there is more than one pred of succ, and there are PHI nodes in
+ // the successor, then we need to add incoming edges for the PHI nodes
+ //
+ const SmallVector<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
+
+ // Loop over all of the PHI nodes in the successor of BB.
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ Value *OldVal = PN->removeIncomingValue(BB, false);
+ assert(OldVal && "No entry in PHI for Pred BB!");
+
+ // If this incoming value is one of the PHI nodes in BB, the new entries
+ // in the PHI node are the entries from the old PHI.
+ if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
+ PHINode *OldValPN = cast<PHINode>(OldVal);
+ for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i)
+ // Note that, since we are merging phi nodes and BB and Succ might
+ // have common predecessors, we could end up with a phi node with
+ // identical incoming branches. This will be cleaned up later (and
+ // will trigger asserts if we try to clean it up now, without also
+ // simplifying the corresponding conditional branch).
+ PN->addIncoming(OldValPN->getIncomingValue(i),
+ OldValPN->getIncomingBlock(i));
+ } else {
+ // Add an incoming value for each of the new incoming values.
+ for (unsigned i = 0, e = BBPreds.size(); i != e; ++i)
+ PN->addIncoming(OldVal, BBPreds[i]);
+ }
+ }
+ }
+
+ while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
+ if (Succ->getSinglePredecessor()) {
+ // BB is the only predecessor of Succ, so Succ will end up with exactly
+ // the same predecessors BB had.
+ Succ->getInstList().splice(Succ->begin(),
+ BB->getInstList(), BB->begin());
+ } else {
+ // We explicitly check for such uses in CanPropagatePredecessorsForPHIs.
+ assert(PN->use_empty() && "There shouldn't be any uses here!");
+ PN->eraseFromParent();
+ }
+ }
+
+ // Everything that jumped to BB now goes to Succ.
+ BB->replaceAllUsesWith(Succ);
+ if (!Succ->hasName()) Succ->takeName(BB);
+ BB->eraseFromParent(); // Delete the old basic block.
+ return true;
+}
+
+
+
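Source-level picture of the fold (hypothetical control flow; 'fwd' is the almost-empty block holding nothing but an unconditional branch):

int foldExample(bool c, int a) {
  int r = 0;
  if (c) goto fwd;    // predecessor 1 of 'fwd'
  r = a;              // falls through: predecessor 2 of 'fwd'
fwd:
  goto merge;         // 'fwd' is only this branch, so both predecessors
merge:                // are retargeted at 'merge' and 'fwd' is deleted
  return r;
}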
/// OnlyUsedByDbgIntrinsics - Return true if the instruction I is only used
/// by DbgIntrinsics. If DbgInUses is specified then the vector is filled
/// with the DbgInfoIntrinsic that use the instruction I.
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index cd8d952..2ab0972 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -23,6 +23,11 @@
//
// This pass also guarantees that loops will have exactly one backedge.
//
+// Indirectbr instructions introduce several complications. If the loop
+// contains or is entered by an indirectbr instruction, it may not be possible
+// to transform the loop and make these guarantees. Client code should check
+// that these conditions are true before relying on them.
+//
// Note that the simplifycfg pass will clean up blocks which are split out but
// end up being unnecessary, so usage of this pass should not pessimize
// generated code.
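A minimal sketch of the client-side check this comment calls for, using the Loop interface from llvm/Analysis/LoopInfo.h (both accessors return null when the corresponding guarantee could not be established):

#include "llvm/Analysis/LoopInfo.h"

// True only when LoopSimplify managed to give L a preheader and a single
// backedge; with indirectbr in play, either insertion can fail.
static bool hasSimplifiedForm(llvm::Loop *L) {
  return L->getLoopPreheader() && L->getLoopLatch();
}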
@@ -81,17 +86,15 @@ namespace {
AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
}
- /// verifyAnalysis() - Verify loop nest.
- void verifyAnalysis() const {
- assert(L->isLoopSimplifyForm() && "LoopSimplify form not preserved!");
- }
+ /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
+ void verifyAnalysis() const;
private:
bool ProcessLoop(Loop *L, LPPassManager &LPM);
BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit);
BasicBlock *InsertPreheaderForLoop(Loop *L);
Loop *SeparateNestedLoop(Loop *L, LPPassManager &LPM);
- void InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader);
+ BasicBlock *InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader);
void PlaceSplitBlockCarefully(BasicBlock *NewBB,
SmallVectorImpl<BasicBlock*> &SplitPreds,
Loop *L);
@@ -160,8 +163,10 @@ ReprocessLoop:
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) {
Preheader = InsertPreheaderForLoop(L);
- NumInserted++;
- Changed = true;
+ if (Preheader) {
+ NumInserted++;
+ Changed = true;
+ }
}
// Next, check to make sure that all exit nodes of the loop only have
@@ -180,21 +185,22 @@ ReprocessLoop:
// Must be exactly this loop: no subloops, parent loops, or non-loop preds
// allowed.
if (!L->contains(*PI)) {
- RewriteLoopExitBlock(L, ExitBlock);
- NumInserted++;
- Changed = true;
+ if (RewriteLoopExitBlock(L, ExitBlock)) {
+ NumInserted++;
+ Changed = true;
+ }
break;
}
}
// If the header has more than two predecessors at this point (from the
// preheader and from multiple backedges), we must adjust the loop.
- unsigned NumBackedges = L->getNumBackEdges();
- if (NumBackedges != 1) {
+ BasicBlock *LoopLatch = L->getLoopLatch();
+ if (!LoopLatch) {
// If this is really a nested loop, rip it out into a child loop. Don't do
// this for loops with a giant number of backedges, just factor them into a
// common backedge instead.
- if (NumBackedges < 8) {
+ if (L->getNumBackEdges() < 8) {
if (SeparateNestedLoop(L, LPM)) {
++NumNested;
// This is a big restructuring change, reprocess the whole loop.
@@ -207,9 +213,11 @@ ReprocessLoop:
// If we either couldn't, or didn't want to, identify nesting of the loops,
// insert a new block that all backedges target, then make it jump to the
// loop header.
- InsertUniqueBackedgeBlock(L, Preheader);
- NumInserted++;
- Changed = true;
+ LoopLatch = InsertUniqueBackedgeBlock(L, Preheader);
+ if (LoopLatch) {
+ NumInserted++;
+ Changed = true;
+ }
}
// Scan over the PHI nodes in the loop header. Since they now have only two
@@ -233,7 +241,14 @@ ReprocessLoop:
// loop-invariant instructions out of the way to open up more
// opportunities, and the disadvantage of having the responsibility
// to preserve dominator information.
- if (ExitBlocks.size() > 1 && L->getUniqueExitBlock()) {
+ bool UniqueExit = true;
+ if (!ExitBlocks.empty())
+ for (unsigned i = 1, e = ExitBlocks.size(); i != e; ++i)
+ if (ExitBlocks[i] != ExitBlocks[0]) {
+ UniqueExit = false;
+ break;
+ }
+ if (UniqueExit) {
SmallVector<BasicBlock*, 8> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
@@ -251,7 +266,8 @@ ReprocessLoop:
Instruction *Inst = I++;
if (Inst == CI)
continue;
- if (!L->makeLoopInvariant(Inst, Changed, Preheader->getTerminator())) {
+ if (!L->makeLoopInvariant(Inst, Changed,
+ Preheader ? Preheader->getTerminator() : 0)) {
AllInvariant = false;
break;
}
@@ -303,8 +319,15 @@ BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) {
SmallVector<BasicBlock*, 8> OutsideBlocks;
for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
PI != PE; ++PI)
- if (!L->contains(*PI)) // Coming in from outside the loop?
- OutsideBlocks.push_back(*PI); // Keep track of it...
+ if (!L->contains(*PI)) { // Coming in from outside the loop?
+ // If the loop is branched to from an indirect branch, we won't
+ // be able to fully transform the loop, because it prohibits
+ // edge splitting.
+ if (isa<IndirectBrInst>((*PI)->getTerminator())) return 0;
+
+ // Keep track of it.
+ OutsideBlocks.push_back(*PI);
+ }
// Split out the loop pre-header.
BasicBlock *NewBB =
@@ -324,8 +347,12 @@ BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) {
BasicBlock *LoopSimplify::RewriteLoopExitBlock(Loop *L, BasicBlock *Exit) {
SmallVector<BasicBlock*, 8> LoopBlocks;
for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I)
- if (L->contains(*I))
+ if (L->contains(*I)) {
+ // Don't do this if the loop is exited via an indirect branch.
+ if (isa<IndirectBrInst>((*I)->getTerminator())) return 0;
+
LoopBlocks.push_back(*I);
+ }
assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?");
BasicBlock *NewBB = SplitBlockPredecessors(Exit, &LoopBlocks[0],
@@ -519,13 +546,18 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM) {
/// backedges to target a new basic block and have that block branch to the loop
/// header. This ensures that loops have exactly one backedge.
///
-void LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) {
+BasicBlock *
+LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) {
assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!");
// Get information about the loop
BasicBlock *Header = L->getHeader();
Function *F = Header->getParent();
+ // Unique backedge insertion currently depends on having a preheader.
+ if (!Preheader)
+ return 0;
+
// Figure out which basic blocks contain back-edges to the loop header.
std::vector<BasicBlock*> BackedgeBlocks;
for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I)
@@ -612,4 +644,40 @@ void LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) {
DT->splitBlock(BEBlock);
if (DominanceFrontier *DF = getAnalysisIfAvailable<DominanceFrontier>())
DF->splitBlock(BEBlock);
+
+ return BEBlock;
+}
+
+void LoopSimplify::verifyAnalysis() const {
+ // It used to be possible to just assert L->isLoopSimplifyForm(), however
+ // with the introduction of indirectbr, there are now cases where it's
+ // not possible to transform a loop as necessary. We can at least check
+ // that an indirectbr is nearby whenever there's trouble.
+
+ // Indirectbr can interfere with preheader and unique backedge insertion.
+ if (!L->getLoopPreheader() || !L->getLoopLatch()) {
+ bool HasIndBrPred = false;
+ for (pred_iterator PI = pred_begin(L->getHeader()),
+ PE = pred_end(L->getHeader()); PI != PE; ++PI)
+ if (isa<IndirectBrInst>((*PI)->getTerminator())) {
+ HasIndBrPred = true;
+ break;
+ }
+ assert(HasIndBrPred &&
+ "LoopSimplify has no excuse for missing loop header info!");
+ }
+
+ // Indirectbr can interfere with exit block canonicalization.
+ if (!L->hasDedicatedExits()) {
+ bool HasIndBrExiting = false;
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i)
+ if (isa<IndirectBrInst>((ExitingBlocks[i])->getTerminator())) {
+ HasIndBrExiting = true;
+ break;
+ }
+ assert(HasIndBrExiting &&
+ "LoopSimplify has no excuse for missing exit block info!");
+ }
}
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index d68427a..6232f32 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -108,8 +108,19 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI) {
bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) {
assert(L->isLCSSAForm());
- BasicBlock *Header = L->getHeader();
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) {
+ DEBUG(errs() << " Can't unroll; loop preheader-insertion failed.\n");
+ return false;
+ }
+
BasicBlock *LatchBlock = L->getLoopLatch();
+ if (!LatchBlock) {
+ DEBUG(errs() << " Can't unroll; loop exit-block-insertion failed.\n");
+ return false;
+ }
+
+ BasicBlock *Header = L->getHeader();
BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
if (!BI || BI->isUnconditional()) {
@@ -351,8 +362,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM)
if (isInstructionTriviallyDead(Inst))
(*BB)->getInstList().erase(Inst);
- else if (Constant *C = ConstantFoldInstruction(Inst,
- Header->getContext())) {
+ else if (Constant *C = ConstantFoldInstruction(Inst)) {
Inst->replaceAllUsesWith(C);
(*BB)->getInstList().erase(Inst);
}
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 8e1fb98..8dbc808 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -78,166 +78,6 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred);
}
-/// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
-/// almost-empty BB ending in an unconditional branch to Succ, into succ.
-///
-/// Assumption: Succ is the single successor for BB.
-///
-static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
- assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
-
- DEBUG(errs() << "Looking to fold " << BB->getName() << " into "
- << Succ->getName() << "\n");
- // Shortcut, if there is only a single predecessor it must be BB and merging
- // is always safe
- if (Succ->getSinglePredecessor()) return true;
-
- // Make a list of the predecessors of BB
- typedef SmallPtrSet<BasicBlock*, 16> BlockSet;
- BlockSet BBPreds(pred_begin(BB), pred_end(BB));
-
- // Use that list to make another list of common predecessors of BB and Succ
- BlockSet CommonPreds;
- for (pred_iterator PI = pred_begin(Succ), PE = pred_end(Succ);
- PI != PE; ++PI)
- if (BBPreds.count(*PI))
- CommonPreds.insert(*PI);
-
- // Shortcut, if there are no common predecessors, merging is always safe
- if (CommonPreds.empty())
- return true;
-
- // Look at all the phi nodes in Succ, to see if they present a conflict when
- // merging these blocks
- for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
-
- // If the incoming value from BB is again a PHINode in
- // BB which has the same incoming value for *PI as PN does, we can
- // merge the phi nodes and then the blocks can still be merged
- PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB));
- if (BBPN && BBPN->getParent() == BB) {
- for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
- PI != PE; PI++) {
- if (BBPN->getIncomingValueForBlock(*PI)
- != PN->getIncomingValueForBlock(*PI)) {
- DEBUG(errs() << "Can't fold, phi node " << PN->getName() << " in "
- << Succ->getName() << " is conflicting with "
- << BBPN->getName() << " with regard to common predecessor "
- << (*PI)->getName() << "\n");
- return false;
- }
- }
- } else {
- Value* Val = PN->getIncomingValueForBlock(BB);
- for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
- PI != PE; PI++) {
- // See if the incoming value for the common predecessor is equal to the
- // one for BB, in which case this phi node will not prevent the merging
- // of the block.
- if (Val != PN->getIncomingValueForBlock(*PI)) {
- DEBUG(errs() << "Can't fold, phi node " << PN->getName() << " in "
- << Succ->getName() << " is conflicting with regard to common "
- << "predecessor " << (*PI)->getName() << "\n");
- return false;
- }
- }
- }
- }
-
- return true;
-}
-
-/// TryToSimplifyUncondBranchFromEmptyBlock - BB contains an unconditional
-/// branch to Succ, and contains no instructions other than PHI nodes and the
-/// branch. If possible, eliminate BB.
-static bool TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
- BasicBlock *Succ) {
- // Check to see if merging these blocks would cause conflicts for any of the
- // phi nodes in BB or Succ. If not, we can safely merge.
- if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
-
- // Check for cases where Succ has multiple predecessors and a PHI node in BB
- // has uses which will not disappear when the PHI nodes are merged. It is
- // possible to handle such cases, but difficult: it requires checking whether
- // BB dominates Succ, which is non-trivial to calculate in the case where
- // Succ has multiple predecessors. Also, it requires checking whether
- // constructing the necessary self-referential PHI node doesn't intoduce any
- // conflicts; this isn't too difficult, but the previous code for doing this
- // was incorrect.
- //
- // Note that if this check finds a live use, BB dominates Succ, so BB is
- // something like a loop pre-header (or rarely, a part of an irreducible CFG);
- // folding the branch isn't profitable in that case anyway.
- if (!Succ->getSinglePredecessor()) {
- BasicBlock::iterator BBI = BB->begin();
- while (isa<PHINode>(*BBI)) {
- for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end();
- UI != E; ++UI) {
- if (PHINode* PN = dyn_cast<PHINode>(*UI)) {
- if (PN->getIncomingBlock(UI) != BB)
- return false;
- } else {
- return false;
- }
- }
- ++BBI;
- }
- }
-
- DEBUG(errs() << "Killing Trivial BB: \n" << *BB);
-
- if (isa<PHINode>(Succ->begin())) {
- // If there is more than one pred of succ, and there are PHI nodes in
- // the successor, then we need to add incoming edges for the PHI nodes
- //
- const SmallVector<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
-
- // Loop over all of the PHI nodes in the successor of BB.
- for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
- Value *OldVal = PN->removeIncomingValue(BB, false);
- assert(OldVal && "No entry in PHI for Pred BB!");
-
- // If this incoming value is one of the PHI nodes in BB, the new entries
- // in the PHI node are the entries from the old PHI.
- if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
- PHINode *OldValPN = cast<PHINode>(OldVal);
- for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i)
- // Note that, since we are merging phi nodes and BB and Succ might
- // have common predecessors, we could end up with a phi node with
- // identical incoming branches. This will be cleaned up later (and
- // will trigger asserts if we try to clean it up now, without also
- // simplifying the corresponding conditional branch).
- PN->addIncoming(OldValPN->getIncomingValue(i),
- OldValPN->getIncomingBlock(i));
- } else {
- // Add an incoming value for each of the new incoming values.
- for (unsigned i = 0, e = BBPreds.size(); i != e; ++i)
- PN->addIncoming(OldVal, BBPreds[i]);
- }
- }
- }
-
- while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
- if (Succ->getSinglePredecessor()) {
- // BB is the only predecessor of Succ, so Succ will end up with exactly
- // the same predecessors BB had.
- Succ->getInstList().splice(Succ->begin(),
- BB->getInstList(), BB->begin());
- } else {
- // We explicitly check for such uses in CanPropagatePredecessorsForPHIs.
- assert(PN->use_empty() && "There shouldn't be any uses here!");
- PN->eraseFromParent();
- }
- }
-
- // Everything that jumped to BB now goes to Succ.
- BB->replaceAllUsesWith(Succ);
- if (!Succ->hasName()) Succ->takeName(BB);
- BB->eraseFromParent(); // Delete the old basic block.
- return true;
-}
/// GetIfCondition - Given a basic block (BB) with two predecessors (and
/// presumably PHI nodes in it), check to see if the merge at this block is due
@@ -1217,7 +1057,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI) {
}
// Check for trivial simplification.
- if (Constant *C = ConstantFoldInstruction(N, BB->getContext())) {
+ if (Constant *C = ConstantFoldInstruction(N)) {
TranslateMap[BBI] = C;
delete N; // Constant folded away, don't need actual inst
} else {
@@ -1983,13 +1823,11 @@ bool llvm::SimplifyCFG(BasicBlock *BB) {
if (BI->isUnconditional()) {
BasicBlock::iterator BBI = BB->getFirstNonPHI();
- BasicBlock *Succ = BI->getSuccessor(0);
// Ignore dbg intrinsics.
while (isa<DbgInfoIntrinsic>(BBI))
++BBI;
- if (BBI->isTerminator() && // Terminator is the only non-phi instruction!
- Succ != BB) // Don't hurt infinite loops!
- if (TryToSimplifyUncondBranchFromEmptyBlock(BB, Succ))
+ if (BBI->isTerminator()) // Terminator is the only non-phi instruction!
+ if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
return true;
} else { // Conditional branch