| author | rdivacky <rdivacky@FreeBSD.org> | 2010-01-01 10:31:22 +0000 |
|---|---|---|
| committer | rdivacky <rdivacky@FreeBSD.org> | 2010-01-01 10:31:22 +0000 |
| commit | a16c51cee9225a354c999dd1076d5dba2aa79807 (patch) | |
| tree | dba00119388b84f9f44e6ec5e9129f807fd79ca3 /lib/Transforms | |
| parent | 40a6fcdb85efd93fe0e36c9552cfb0b18b5eacd6 (diff) | |
| download | FreeBSD-src-a16c51cee9225a354c999dd1076d5dba2aa79807.zip, FreeBSD-src-a16c51cee9225a354c999dd1076d5dba2aa79807.tar.gz | |
Update LLVM to 92395.
Diffstat (limited to 'lib/Transforms')
22 files changed, 1403 insertions, 1000 deletions
diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp index 91534a7..eac4e17 100644 --- a/lib/Transforms/Hello/Hello.cpp +++ b/lib/Transforms/Hello/Hello.cpp @@ -56,7 +56,7 @@ namespace { // We don't modify the program, so we preserve all analyses virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - }; + } }; } diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 0b5e007..ae88d9e 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -24,7 +24,6 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" #include "llvm/Analysis/DebugInfo.h" @@ -220,17 +219,14 @@ static bool StripDebugInfo(Module &M) { Changed = true; NMD->eraseFromParent(); } - MetadataContext &TheMetadata = M.getContext().getMetadata(); - unsigned MDDbgKind = TheMetadata.getMDKind("dbg"); - if (!MDDbgKind) - return Changed; - + + unsigned MDDbgKind = M.getMDKindID("dbg"); for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI) for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE; ++FI) for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) - TheMetadata.removeMD(MDDbgKind, BI); + BI->setMetadata(MDDbgKind, 0); return true; } diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index e4c4ae5..372616c 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -48,7 +48,7 @@ namespace { /// TLI - Keep a pointer of a TargetLowering to consult for determining /// transformation profitability. const TargetLowering *TLI; - ProfileInfo *PI; + ProfileInfo *PFI; /// BackEdges - Keep a set of all the loop back edges. /// @@ -99,7 +99,7 @@ void CodeGenPrepare::findLoopBackEdges(const Function &F) { bool CodeGenPrepare::runOnFunction(Function &F) { bool EverMadeChange = false; - PI = getAnalysisIfAvailable<ProfileInfo>(); + PFI = getAnalysisIfAvailable<ProfileInfo>(); // First pass, eliminate blocks that contain only PHI nodes and an // unconditional branch. EverMadeChange |= EliminateMostlyEmptyBlocks(F); @@ -288,9 +288,9 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { // The PHIs are now updated, change everything that refers to BB to use // DestBB and remove BB. BB->replaceAllUsesWith(DestBB); - if (PI) { - PI->replaceAllUses(BB, DestBB); - PI->removeEdge(ProfileInfo::getEdge(BB, DestBB)); + if (PFI) { + PFI->replaceAllUses(BB, DestBB); + PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB)); } BB->eraseFromParent(); @@ -368,9 +368,9 @@ static void SplitEdgeNicely(TerminatorInst *TI, unsigned SuccNum, // If we found a workable predecessor, change TI to branch to Succ. 
if (FoundMatch) { - ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>(); - if (PI) - PI->splitEdge(TIBB, Dest, Pred); + ProfileInfo *PFI = P->getAnalysisIfAvailable<ProfileInfo>(); + if (PFI) + PFI->splitEdge(TIBB, Dest, Pred); Dest->removePredecessor(TIBB); TI->setSuccessor(SuccNum, Pred); return; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 222792b..612b415 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -20,6 +20,7 @@ #include "llvm/BasicBlock.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" +#include "llvm/GlobalVariable.h" #include "llvm/Function.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" @@ -48,7 +49,6 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" -#include <cstdio> using namespace llvm; STATISTIC(NumGVNInstr, "Number of instructions deleted"); @@ -733,13 +733,13 @@ static RegisterPass<GVN> X("gvn", "Global Value Numbering"); void GVN::dump(DenseMap<uint32_t, Value*>& d) { - printf("{\n"); + errs() << "{\n"; for (DenseMap<uint32_t, Value*>::iterator I = d.begin(), E = d.end(); I != E; ++I) { - printf("%d\n", I->first); + errs() << I->first << "\n"; I->second->dump(); } - printf("}\n"); + errs() << "}\n"; } static bool isSafeReplacement(PHINode* p, Instruction *inst) { @@ -1278,6 +1278,32 @@ struct AvailableValueInBlock { assert(!isSimpleValue() && "Wrong accessor"); return cast<MemIntrinsic>(Val.getPointer()); } + + /// MaterializeAdjustedValue - Emit code into this block to adjust the value + /// defined here to the specified type. This handles various coercion cases. + Value *MaterializeAdjustedValue(const Type *LoadTy, + const TargetData *TD) const { + Value *Res; + if (isSimpleValue()) { + Res = getSimpleValue(); + if (Res->getType() != LoadTy) { + assert(TD && "Need target data to handle type mismatch case"); + Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), + *TD); + + DEBUG(errs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " + << *getSimpleValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + } else { + Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, + LoadTy, BB->getTerminator(), *TD); + DEBUG(errs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset + << " " << *getMemIntrinValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + return Res; + } }; /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, @@ -1286,7 +1312,15 @@ struct AvailableValueInBlock { static Value *ConstructSSAForLoadSet(LoadInst *LI, SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock, const TargetData *TD, + const DominatorTree &DT, AliasAnalysis *AA) { + // Check for the fully redundant, dominating load case. In this case, we can + // just use the dominating value directly. + if (ValuesPerBlock.size() == 1 && + DT.properlyDominates(ValuesPerBlock[0].BB, LI->getParent())) + return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), TD); + + // Otherwise, we have to construct SSA form. 
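(The ConstructSSAForLoadSet hunk resumes below. The `MaterializeAdjustedValue` helper factored out above leans on `GetStoreValueForLoad` to coerce a wider stored value into the loaded type at a byte offset. A minimal standalone sketch of that coercion, assuming a little-endian layout; `getStoreValueForLoad` here is a hypothetical stand-in, not the LLVM function:)

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Toy model of the coercion GVN performs when a load is fully satisfied
// by a wider store: reinterpret the relevant bytes of the stored value as
// the loaded type. Little-endian layout is assumed; the real
// GetStoreValueForLoad consults TargetData instead of hard-coding this.
template <typename LoadT, typename StoreT>
LoadT getStoreValueForLoad(StoreT Stored, unsigned Offset) {
  assert(Offset + sizeof(LoadT) <= sizeof(StoreT) && "load reads past store");
  LoadT Res;
  std::memcpy(&Res, reinterpret_cast<const char *>(&Stored) + Offset,
              sizeof(Res));
  return Res;
}

int main() {
  uint32_t Stored = 0x11223344;
  // Byte 1 of the little-endian store 44 33 22 11 is 0x33.
  return getStoreValueForLoad<uint8_t>(Stored, 1) == 0x33 ? 0 : 1;
}
```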
SmallVector<PHINode*, 8> NewPHIs; SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI); @@ -1300,28 +1334,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, if (SSAUpdate.HasValueForBlock(BB)) continue; - unsigned Offset = AV.Offset; - - Value *AvailableVal; - if (AV.isSimpleValue()) { - AvailableVal = AV.getSimpleValue(); - if (AvailableVal->getType() != LoadTy) { - assert(TD && "Need target data to handle type mismatch case"); - AvailableVal = GetStoreValueForLoad(AvailableVal, Offset, LoadTy, - BB->getTerminator(), *TD); - - DEBUG(errs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " - << *AV.getSimpleValue() << '\n' - << *AvailableVal << '\n' << "\n\n\n"); - } - } else { - AvailableVal = GetMemInstValueForLoad(AV.getMemIntrinValue(), Offset, - LoadTy, BB->getTerminator(), *TD); - DEBUG(errs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset - << " " << *AV.getMemIntrinValue() << '\n' - << *AvailableVal << '\n' << "\n\n\n"); - } - SSAUpdate.AddAvailableValue(BB, AvailableVal); + SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, TD)); } // Perform PHI construction. @@ -1346,7 +1359,7 @@ static bool isLifetimeStart(Instruction *Inst) { bool GVN::processNonLocalLoad(LoadInst *LI, SmallVectorImpl<Instruction*> &toErase) { // Find the non-local dependencies of the load. - SmallVector<NonLocalDepEntry, 64> Deps; + SmallVector<NonLocalDepResult, 64> Deps; MD->getNonLocalPointerDependency(LI->getOperand(0), true, LI->getParent(), Deps); //DEBUG(errs() << "INVESTIGATING NONLOCAL LOAD: " @@ -1490,7 +1503,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI, DEBUG(errs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); // Perform PHI construction. - Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, TD, + Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, TD, *DT, VN.getAliasAnalysis()); LI->replaceAllUsesWith(V); @@ -1679,7 +1692,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI, ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,NewLoad)); // Perform PHI construction. - Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, TD, + Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, TD, *DT, VN.getAliasAnalysis()); LI->replaceAllUsesWith(V); if (isa<PHINode>(V)) diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 2912421..3aa4fd3 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -258,7 +258,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, // Check that InVal is defined in the loop. Instruction *Inst = cast<Instruction>(InVal); - if (!L->contains(Inst->getParent())) + if (!L->contains(Inst)) continue; // Okay, this instruction has a user outside of the current loop diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp index b9c536f..516d72e 100644 --- a/lib/Transforms/Scalar/InstructionCombining.cpp +++ b/lib/Transforms/Scalar/InstructionCombining.cpp @@ -75,6 +75,15 @@ STATISTIC(NumDeadInst , "Number of dead inst eliminated"); STATISTIC(NumDeadStore, "Number of dead stores eliminated"); STATISTIC(NumSunkInst , "Number of instructions sunk"); +/// SelectPatternFlavor - We can match a variety of different patterns for +/// select operations. +enum SelectPatternFlavor { + SPF_UNKNOWN = 0, + SPF_SMIN, SPF_UMIN, + SPF_SMAX, SPF_UMAX + //SPF_ABS - TODO. +}; + namespace { /// InstCombineWorklist - This is the worklist management logic for /// InstCombine. 
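(Aside: the `SelectPatternFlavor` enum added above feeds `MatchSelectPattern`, which appears in a later hunk. A self-contained sketch of the classification it performs — `classify` is an illustrative stand-in, with the compare reduced to a predicate plus a flag saying whether the select's true arm is the compare's LHS:)

```cpp
#include <cstdio>

enum SelectPatternFlavor { SPF_UNKNOWN, SPF_SMIN, SPF_UMIN, SPF_SMAX, SPF_UMAX };
enum Pred { SLT, SLE, SGT, SGE, ULT, ULE, UGT, UGE };

// "cmp(L, R) ? T : F" is a min/max when {T, F} == {L, R}. TrueIsLHS says
// the true arm is the compare's LHS; swapping the arms flips min and max.
SelectPatternFlavor classify(Pred P, bool TrueIsLHS) {
  switch (P) {
  case SGT: case SGE: return TrueIsLHS ? SPF_SMAX : SPF_SMIN;
  case SLT: case SLE: return TrueIsLHS ? SPF_SMIN : SPF_SMAX;
  case UGT: case UGE: return TrueIsLHS ? SPF_UMAX : SPF_UMIN;
  case ULT: case ULE: return TrueIsLHS ? SPF_UMIN : SPF_UMAX;
  }
  return SPF_UNKNOWN;
}

int main() {
  // (a > b) ? a : b is a signed max; (a > b) ? b : a is a signed min.
  std::printf("%d %d\n", classify(SGT, true) == SPF_SMAX,
              classify(SGT, false) == SPF_SMIN);
}
```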
@@ -257,7 +266,8 @@ namespace { ConstantInt *RHS); Instruction *FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, ConstantInt *DivRHS); - + Instruction *FoldICmpAddOpCst(ICmpInst &ICI, Value *X, ConstantInt *CI, + ICmpInst::Predicate Pred, Value *TheAdd); Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I); Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1, @@ -280,6 +290,9 @@ namespace { Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI); Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*); + Instruction *FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1, + Value *A, Value *B, Instruction &Outer, + SelectPatternFlavor SPF2, Value *C); Instruction *visitSelectInst(SelectInst &SI); Instruction *visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI); Instruction *visitCallInst(CallInst &CI); @@ -648,6 +661,57 @@ static inline Value *dyn_castFNegVal(Value *V) { return 0; } +/// MatchSelectPattern - Pattern match integer [SU]MIN, [SU]MAX, and ABS idioms, +/// returning the kind and providing the out parameter results if we +/// successfully match. +static SelectPatternFlavor +MatchSelectPattern(Value *V, Value *&LHS, Value *&RHS) { + SelectInst *SI = dyn_cast<SelectInst>(V); + if (SI == 0) return SPF_UNKNOWN; + + ICmpInst *ICI = dyn_cast<ICmpInst>(SI->getCondition()); + if (ICI == 0) return SPF_UNKNOWN; + + LHS = ICI->getOperand(0); + RHS = ICI->getOperand(1); + + // (icmp X, Y) ? X : Y + if (SI->getTrueValue() == ICI->getOperand(0) && + SI->getFalseValue() == ICI->getOperand(1)) { + switch (ICI->getPredicate()) { + default: return SPF_UNKNOWN; // Equality. + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: return SPF_UMAX; + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SGE: return SPF_SMAX; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: return SPF_UMIN; + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_SLE: return SPF_SMIN; + } + } + + // (icmp X, Y) ? Y : X + if (SI->getTrueValue() == ICI->getOperand(1) && + SI->getFalseValue() == ICI->getOperand(0)) { + switch (ICI->getPredicate()) { + default: return SPF_UNKNOWN; // Equality. + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: return SPF_UMIN; + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SGE: return SPF_SMIN; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: return SPF_UMAX; + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_SLE: return SPF_SMAX; + } + } + + // TODO: (X > 4) ? X : 5 --> (X >= 5) ? X : 5 --> MAX(X, 5) + + return SPF_UNKNOWN; +} + /// isFreeToInvert - Return true if the specified value is free to invert (apply /// ~ to). This happens in cases where the ~ can be eliminated. static inline bool isFreeToInvert(Value *V) { @@ -732,12 +796,12 @@ static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) { APInt MulExt = LHSExt * RHSExt; - if (sign) { - APInt Min = APInt::getSignedMinValue(W).sext(W * 2); - APInt Max = APInt::getSignedMaxValue(W).sext(W * 2); - return MulExt.slt(Min) || MulExt.sgt(Max); - } else + if (!sign) return MulExt.ugt(APInt::getLowBitsSet(W * 2, W)); + + APInt Min = APInt::getSignedMinValue(W).sext(W * 2); + APInt Max = APInt::getSignedMaxValue(W).sext(W * 2); + return MulExt.slt(Min) || MulExt.sgt(Max); } @@ -2736,9 +2800,13 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { if (Op0 == Op1) // sub X, X -> 0 return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); - // If this is a 'B = x-(-A)', change to B = x+A. 
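(Aside before the visitSub hunk resumes: the `MultiplyOverflows` cleanup above reorders the early-exit but keeps the algorithm — extend both operands to twice the bit width, multiply exactly, and test whether the product still fits. A standalone miniature for the signed case, with `mulOverflowsS8` as a hypothetical name:)

```cpp
#include <cstdint>
#include <cstdio>

// MultiplyOverflows in miniature: sign-extend both operands to twice the
// width, multiply exactly, and test the result against the narrow bounds.
bool mulOverflowsS8(int8_t A, int8_t B) {
  int16_t Wide = int16_t(A) * int16_t(B);
  return Wide < INT8_MIN || Wide > INT8_MAX;
}

int main() {
  std::printf("%d %d\n", mulOverflowsS8(16, 8),  // 128 overflows int8_t
              mulOverflowsS8(-8, 16));           // -128 still fits
}
```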
- if (Value *V = dyn_castNegVal(Op1)) - return BinaryOperator::CreateAdd(Op0, V); + // If this is a 'B = x-(-A)', change to B = x+A. This preserves NSW/NUW. + if (Value *V = dyn_castNegVal(Op1)) { + BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V); + Res->setHasNoSignedWrap(I.hasNoSignedWrap()); + Res->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); + return Res; + } if (isa<UndefValue>(Op0)) return ReplaceInstUsesWith(I, Op0); // undef - X -> undef @@ -6356,24 +6424,26 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // comparison into the select arms, which will cause one to be // constant folded and the select turned into a bitwise or. Value *Op1 = 0, *Op2 = 0; - if (LHSI->hasOneUse()) { - if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) { - // Fold the known value into the constant operand. - Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); - // Insert a new ICmp of the other select operand. - Op2 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(2), - RHSC, I.getName()); - } else if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) { - // Fold the known value into the constant operand. - Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); - // Insert a new ICmp of the other select operand. + if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) + Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); + if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) + Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); + + // We only want to perform this transformation if it will not lead to + // additional code. This is true if either both sides of the select + // fold to a constant (in which case the icmp is replaced with a select + // which will usually simplify) or this is the only user of the + // select (in which case we are trading a select+icmp for a simpler + // select+icmp). + if ((Op1 && Op2) || (LHSI->hasOneUse() && (Op1 || Op2))) { + if (!Op1) Op1 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC, I.getName()); - } - } - - if (Op1) + if (!Op2) + Op2 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(2), + RHSC, I.getName()); return SelectInst::Create(LHSI->getOperand(0), Op1, Op2); + } break; } case Instruction::Call: @@ -6452,7 +6522,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // if (X) ... // For generality, we handle any zero-extension of any operand comparison // with a constant or another cast from the same type. - if (isa<ConstantInt>(Op1) || isa<CastInst>(Op1)) + if (isa<Constant>(Op1) || isa<CastInst>(Op1)) if (Instruction *R = visitICmpInstWithCastAndCast(I)) return R; } @@ -6598,9 +6668,112 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } } } + + { + Value *X; ConstantInt *Cst; + // icmp X+Cst, X + if (match(Op0, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op1 == X) + return FoldICmpAddOpCst(I, X, Cst, I.getPredicate(), Op0); + + // icmp X, X+Cst + if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X) + return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate(), Op1); + } return Changed ? &I : 0; } +/// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X". +Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, + Value *X, ConstantInt *CI, + ICmpInst::Predicate Pred, + Value *TheAdd) { + // If we have X+0, exit early (simplifying logic below) and let it get folded + // elsewhere. 
icmp X+0, X -> icmp X, X + if (CI->isZero()) { + bool isTrue = ICmpInst::isTrueWhenEqual(Pred); + return ReplaceInstUsesWith(ICI, ConstantInt::get(ICI.getType(), isTrue)); + } + + // (X+4) == X -> false. + if (Pred == ICmpInst::ICMP_EQ) + return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(X->getContext())); + + // (X+4) != X -> true. + if (Pred == ICmpInst::ICMP_NE) + return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(X->getContext())); + + // If this is an instruction (as opposed to constantexpr) get NUW/NSW info. + bool isNUW = false, isNSW = false; + if (BinaryOperator *Add = dyn_cast<BinaryOperator>(TheAdd)) { + isNUW = Add->hasNoUnsignedWrap(); + isNSW = Add->hasNoSignedWrap(); + } + + // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0, + // so the values can never be equal. Similiarly for all other "or equals" + // operators. + + // (X+1) <u X --> X >u (MAXUINT-1) --> X != 255 + // (X+2) <u X --> X >u (MAXUINT-2) --> X > 253 + // (X+MAXUINT) <u X --> X >u (MAXUINT-MAXUINT) --> X != 0 + if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) { + // If this is an NUW add, then this is always false. + if (isNUW) + return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(X->getContext())); + + Value *R = ConstantExpr::getSub(ConstantInt::get(CI->getType(), -1ULL), CI); + return new ICmpInst(ICmpInst::ICMP_UGT, X, R); + } + + // (X+1) >u X --> X <u (0-1) --> X != 255 + // (X+2) >u X --> X <u (0-2) --> X <u 254 + // (X+MAXUINT) >u X --> X <u (0-MAXUINT) --> X <u 1 --> X == 0 + if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) { + // If this is an NUW add, then this is always true. + if (isNUW) + return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(X->getContext())); + return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantExpr::getNeg(CI)); + } + + unsigned BitWidth = CI->getType()->getPrimitiveSizeInBits(); + ConstantInt *SMax = ConstantInt::get(X->getContext(), + APInt::getSignedMaxValue(BitWidth)); + + // (X+ 1) <s X --> X >s (MAXSINT-1) --> X == 127 + // (X+ 2) <s X --> X >s (MAXSINT-2) --> X >s 125 + // (X+MAXSINT) <s X --> X >s (MAXSINT-MAXSINT) --> X >s 0 + // (X+MINSINT) <s X --> X >s (MAXSINT-MINSINT) --> X >s -1 + // (X+ -2) <s X --> X >s (MAXSINT- -2) --> X >s 126 + // (X+ -1) <s X --> X >s (MAXSINT- -1) --> X != 127 + if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) { + // If this is an NSW add, then we have two cases: if the constant is + // positive, then this is always false, if negative, this is always true. + if (isNSW) { + bool isTrue = CI->getValue().isNegative(); + return ReplaceInstUsesWith(ICI, ConstantInt::get(ICI.getType(), isTrue)); + } + + return new ICmpInst(ICmpInst::ICMP_SGT, X, ConstantExpr::getSub(SMax, CI)); + } + + // (X+ 1) >s X --> X <s (MAXSINT-(1-1)) --> X != 127 + // (X+ 2) >s X --> X <s (MAXSINT-(2-1)) --> X <s 126 + // (X+MAXSINT) >s X --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1 + // (X+MINSINT) >s X --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2 + // (X+ -2) >s X --> X <s (MAXSINT-(-2-1)) --> X <s -126 + // (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128 + + // If this is an NSW add, then we have two cases: if the constant is + // positive, then this is always true, if negative, this is always false. 
+ if (isNSW) { + bool isTrue = !CI->getValue().isNegative(); + return ReplaceInstUsesWith(ICI, ConstantInt::get(ICI.getType(), isTrue)); + } + + assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE); + Constant *C = ConstantInt::get(X->getContext(), CI->getValue()-1); + return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C)); +} /// FoldICmpDivCst - Fold "icmp pred, ([su]div X, DivRHS), CmpRHS" where DivRHS /// and CmpRHS are both known to be integer constants. @@ -7075,8 +7248,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, break; case Instruction::Add: - // Fold: icmp pred (add, X, C1), C2 - + // Fold: icmp pred (add X, C1), C2 if (!ICI.isEquality()) { ConstantInt *LHSC = dyn_cast<ConstantInt>(LHSI->getOperand(1)); if (!LHSC) break; @@ -7299,19 +7471,17 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // If the re-extended constant didn't change... if (Res2 == CI) { - // Make sure that sign of the Cmp and the sign of the Cast are the same. - // For example, we might have: - // %A = sext i16 %X to i32 - // %B = icmp ugt i32 %A, 1330 - // It is incorrect to transform this into - // %B = icmp ugt i16 %X, 1330 - // because %A may have negative value. - // - // However, we allow this when the compare is EQ/NE, because they are - // signless. - if (isSignedExt == isSignedCmp || ICI.isEquality()) + // Deal with equality cases early. + if (ICI.isEquality()) return new ICmpInst(ICI.getPredicate(), LHSCIOp, Res1); - return 0; + + // A signed comparison of sign extended values simplifies into a + // signed comparison. + if (isSignedExt && isSignedCmp) + return new ICmpInst(ICI.getPredicate(), LHSCIOp, Res1); + + // The other three cases all fold into an unsigned comparison. + return new ICmpInst(ICI.getUnsignedPredicate(), LHSCIOp, Res1); } // The re-extended constant changed so the constant cannot be represented @@ -9372,9 +9542,6 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, return ReplaceInstUsesWith(SI, TrueVal); /// NOTE: if we wanted to, this is where to detect integer MIN/MAX } - - /// NOTE: if we wanted to, this is where to detect integer ABS - return Changed ? &SI : 0; } @@ -9416,6 +9583,35 @@ static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V, return false; } +/// FoldSPFofSPF - We have an SPF (e.g. a min or max) of an SPF of the form: +/// SPF2(SPF1(A, B), C) +Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, + SelectPatternFlavor SPF1, + Value *A, Value *B, + Instruction &Outer, + SelectPatternFlavor SPF2, Value *C) { + if (C == A || C == B) { + // MAX(MAX(A, B), B) -> MAX(A, B) + // MIN(MIN(a, b), a) -> MIN(a, b) + if (SPF1 == SPF2) + return ReplaceInstUsesWith(Outer, Inner); + + // MAX(MIN(a, b), a) -> a + // MIN(MAX(a, b), a) -> a + if ((SPF1 == SPF_SMIN && SPF2 == SPF_SMAX) || + (SPF1 == SPF_SMAX && SPF2 == SPF_SMIN) || + (SPF1 == SPF_UMIN && SPF2 == SPF_UMAX) || + (SPF1 == SPF_UMAX && SPF2 == SPF_UMIN)) + return ReplaceInstUsesWith(Outer, C); + } + + // TODO: MIN(MIN(A, 23), 97) + return 0; +} + + + + Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { Value *CondVal = SI.getCondition(); Value *TrueVal = SI.getTrueValue(); @@ -9622,9 +9818,28 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // See if we can fold the select into one of our operands. 
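(The visitSelectInst hunks continue below. First, a brute-force check of the unsigned fold performed by `FoldICmpAddOpCst` above, over all of `i8` — a sketch in plain C++, not LLVM code:)

```cpp
#include <cstdint>
#include <cstdio>

// Exhaustively verify the fold "(X + C) <u X  -->  X >u (MAXUINT - C)"
// for every i8 value of X and every nonzero constant C.
int main() {
  for (unsigned C = 1; C < 256; ++C)
    for (unsigned X = 0; X < 256; ++X) {
      bool Original = uint8_t(X + C) < uint8_t(X);
      bool Folded   = uint8_t(X) > uint8_t(0xFFu - C);
      if (Original != Folded) {
        std::puts("mismatch");
        return 1;
      }
    }
  std::puts("ok");
  return 0;
}
```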
if (SI.getType()->isInteger()) { - Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal); - if (FoldI) + if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal)) return FoldI; + + // MAX(MAX(a, b), a) -> MAX(a, b) + // MIN(MIN(a, b), a) -> MIN(a, b) + // MAX(MIN(a, b), a) -> a + // MIN(MAX(a, b), a) -> a + Value *LHS, *RHS, *LHS2, *RHS2; + if (SelectPatternFlavor SPF = MatchSelectPattern(&SI, LHS, RHS)) { + if (SelectPatternFlavor SPF2 = MatchSelectPattern(LHS, LHS2, RHS2)) + if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2, + SI, SPF, RHS)) + return R; + if (SelectPatternFlavor SPF2 = MatchSelectPattern(RHS, LHS2, RHS2)) + if (Instruction *R = FoldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2, + SI, SPF, LHS)) + return R; + } + + // TODO. + // ABS(-X) -> ABS(X) + // ABS(ABS(X)) -> ABS(X) } // See if we can fold the select into a phi node if the condition is a select. @@ -9896,9 +10111,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Intrinsic::getDeclaration(M, MemCpyID, Tys, 1)); Changed = true; } + } + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove(x,x,size) -> noop. - if (MMI->getSource() == MMI->getDest()) + if (MTI->getSource() == MTI->getDest()) return EraseInstFromFunction(CI); } @@ -9923,6 +10140,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Operand->getIntrinsicID() == Intrinsic::bswap) return ReplaceInstUsesWith(CI, Operand->getOperand(1)); break; + case Intrinsic::powi: + if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getOperand(2))) { + // powi(x, 0) -> 1.0 + if (Power->isZero()) + return ReplaceInstUsesWith(CI, ConstantFP::get(CI.getType(), 1.0)); + // powi(x, 1) -> x + if (Power->isOne()) + return ReplaceInstUsesWith(CI, II->getOperand(1)); + // powi(x, -1) -> 1/x + if (Power->isAllOnesValue()) + return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0), + II->getOperand(1)); + } + break; + case Intrinsic::uadd_with_overflow: { Value *LHS = II->getOperand(1), *RHS = II->getOperand(2); const IntegerType *IT = cast<IntegerType>(II->getOperand(1)->getType()); @@ -11232,6 +11464,23 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) { PHINode *PN = PHIsToSlice[PHIId]; + // Scan the input list of the PHI. If any input is an invoke, and if the + // input is defined in the predecessor, then we won't be split the critical + // edge which is required to insert a truncate. Because of this, we have to + // bail out. + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + InvokeInst *II = dyn_cast<InvokeInst>(PN->getIncomingValue(i)); + if (II == 0) continue; + if (II->getParent() != PN->getIncomingBlock(i)) + continue; + + // If we have a phi, and if it's directly in the predecessor, then we have + // a critical edge where we need to put the truncate. Since we can't + // split the edge in instcombine, we have to bail out. + return 0; + } + + for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ++UI) { Instruction *User = cast<Instruction>(*UI); @@ -11314,7 +11563,9 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { PredVal = EltPHI; EltPHI->addIncoming(PredVal, Pred); continue; - } else if (PHINode *InPHI = dyn_cast<PHINode>(PN)) { + } + + if (PHINode *InPHI = dyn_cast<PHINode>(PN)) { // If the incoming value was a PHI, and if it was one of the PHIs we // already rewrote it, just use the lowered value. 
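(The SliceUpIllegalIntegerPHI hunk resumes below. The min/max identities folded by `FoldSPFofSPF` above are easy to sanity-check with the standard `std::min`/`std::max` over a small range:)

```cpp
#include <algorithm>
#include <cassert>

// Check the FoldSPFofSPF identities on a small range:
//   MAX(MAX(a, b), b) == MAX(a, b)   (same flavor collapses)
//   MIN(MAX(a, b), a) == a           (opposite flavors yield the
//   MAX(MIN(a, b), a) == a            repeated operand)
int main() {
  for (int A = -3; A <= 3; ++A)
    for (int B = -3; B <= 3; ++B) {
      assert(std::max(std::max(A, B), B) == std::max(A, B));
      assert(std::min(std::max(A, B), A) == A);
      assert(std::max(std::min(A, B), A) == A);
    }
  return 0;
}
```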
if (Value *Res = ExtractedVals[LoweredPHIRecord(InPHI, Offset, Ty)]) { diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index d58b9c9..7e6cf79 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 42a8fdc..99f3ae0 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -433,7 +433,7 @@ bool LICM::isNotUsedInLoop(Instruction &I) { if (PN->getIncomingValue(i) == &I) if (CurLoop->contains(PN->getIncomingBlock(i))) return false; - } else if (CurLoop->contains(User->getParent())) { + } else if (CurLoop->contains(User)) { return false; } } @@ -831,7 +831,7 @@ void LICM::FindPromotableValuesInLoop( UI != UE; ++UI) { // Ignore instructions not in this loop. Instruction *Use = dyn_cast<Instruction>(*UI); - if (!Use || !CurLoop->contains(Use->getParent())) + if (!Use || !CurLoop->contains(Use)) continue; if (!isa<LoadInst>(Use) && !isa<StoreInst>(Use)) { diff --git a/lib/Transforms/Scalar/LoopIndexSplit.cpp b/lib/Transforms/Scalar/LoopIndexSplit.cpp index 8b6a233..1d9dd68 100644 --- a/lib/Transforms/Scalar/LoopIndexSplit.cpp +++ b/lib/Transforms/Scalar/LoopIndexSplit.cpp @@ -288,7 +288,7 @@ bool LoopIndexSplit::runOnLoop(Loop *IncomingLoop, LPPassManager &LPM_Ref) { // isUsedOutsideLoop - Returns true iff V is used outside the loop L. static bool isUsedOutsideLoop(Value *V, Loop *L) { for(Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI) - if (!L->contains(cast<Instruction>(*UI)->getParent())) + if (!L->contains(cast<Instruction>(*UI))) return true; return false; } @@ -842,7 +842,7 @@ void LoopIndexSplit::updatePHINodes(BasicBlock *ExitBB, BasicBlock *Latch, for (Value::use_iterator UI = PHV->use_begin(), E = PHV->use_end(); UI != E; ++UI) if (PHINode *U = dyn_cast<PHINode>(*UI)) - if (LP->contains(U->getParent())) { + if (LP->contains(U)) { NewV = U; break; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 85cc712..85f7368 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -144,7 +144,7 @@ namespace { /// StrengthReduceIVUsersOfStride - Strength reduce all of the users of a /// single stride of IV. All of the users may have different starting /// values, and this may not be the only stride. 
- void StrengthReduceIVUsersOfStride(const SCEV *const &Stride, + void StrengthReduceIVUsersOfStride(const SCEV *Stride, IVUsersOfOneStride &Uses, Loop *L); void StrengthReduceIVUsers(Loop *L); @@ -157,14 +157,14 @@ namespace { bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse, const SCEV* &CondStride); bool RequiresTypeConversion(const Type *Ty, const Type *NewTy); - const SCEV *CheckForIVReuse(bool, bool, bool, const SCEV *const&, + const SCEV *CheckForIVReuse(bool, bool, bool, const SCEV *, IVExpr&, const Type*, const std::vector<BasedUser>& UsersToProcess); bool ValidScale(bool, int64_t, const std::vector<BasedUser>& UsersToProcess); bool ValidOffset(bool, int64_t, int64_t, const std::vector<BasedUser>& UsersToProcess); - const SCEV *CollectIVUsers(const SCEV *const &Stride, + const SCEV *CollectIVUsers(const SCEV *Stride, IVUsersOfOneStride &Uses, Loop *L, bool &AllUsesAreAddresses, @@ -212,8 +212,6 @@ Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { /// specified set are trivially dead, delete them and see if this makes any of /// their operands subsequently dead. void LoopStrengthReduce::DeleteTriviallyDeadInstructions() { - if (DeadInsts.empty()) return; - while (!DeadInsts.empty()) { Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()); @@ -232,44 +230,6 @@ void LoopStrengthReduce::DeleteTriviallyDeadInstructions() { } } -/// containsAddRecFromDifferentLoop - Determine whether expression S involves a -/// subexpression that is an AddRec from a loop other than L. An outer loop -/// of L is OK, but not an inner loop nor a disjoint loop. -static bool containsAddRecFromDifferentLoop(const SCEV *S, Loop *L) { - // This is very common, put it first. - if (isa<SCEVConstant>(S)) - return false; - if (const SCEVCommutativeExpr *AE = dyn_cast<SCEVCommutativeExpr>(S)) { - for (unsigned int i=0; i< AE->getNumOperands(); i++) - if (containsAddRecFromDifferentLoop(AE->getOperand(i), L)) - return true; - return false; - } - if (const SCEVAddRecExpr *AE = dyn_cast<SCEVAddRecExpr>(S)) { - if (const Loop *newLoop = AE->getLoop()) { - if (newLoop == L) - return false; - // if newLoop is an outer loop of L, this is OK. - if (newLoop->contains(L->getHeader())) - return false; - } - return true; - } - if (const SCEVUDivExpr *DE = dyn_cast<SCEVUDivExpr>(S)) - return containsAddRecFromDifferentLoop(DE->getLHS(), L) || - containsAddRecFromDifferentLoop(DE->getRHS(), L); -#if 0 - // SCEVSDivExpr has been backed out temporarily, but will be back; we'll - // need this when it is. - if (const SCEVSDivExpr *DE = dyn_cast<SCEVSDivExpr>(S)) - return containsAddRecFromDifferentLoop(DE->getLHS(), L) || - containsAddRecFromDifferentLoop(DE->getRHS(), L); -#endif - if (const SCEVCastExpr *CE = dyn_cast<SCEVCastExpr>(S)) - return containsAddRecFromDifferentLoop(CE->getOperand(), L); - return false; -} - /// isAddressUse - Returns true if the specified instruction is using the /// specified value as an address. static bool isAddressUse(Instruction *Inst, Value *OperandVal) { @@ -362,13 +322,13 @@ namespace { // Once we rewrite the code to insert the new IVs we want, update the // operands of Inst to use the new expression 'NewBase', with 'Imm' added // to it. 
- void RewriteInstructionToUseNewBase(const SCEV *const &NewBase, + void RewriteInstructionToUseNewBase(const SCEV *NewBase, Instruction *InsertPt, SCEVExpander &Rewriter, Loop *L, Pass *P, SmallVectorImpl<WeakVH> &DeadInsts, ScalarEvolution *SE); - Value *InsertCodeForBaseAtPosition(const SCEV *const &NewBase, + Value *InsertCodeForBaseAtPosition(const SCEV *NewBase, const Type *Ty, SCEVExpander &Rewriter, Instruction *IP, @@ -378,12 +338,12 @@ namespace { } void BasedUser::dump() const { - errs() << " Base=" << *Base; - errs() << " Imm=" << *Imm; - errs() << " Inst: " << *Inst; + dbgs() << " Base=" << *Base; + dbgs() << " Imm=" << *Imm; + dbgs() << " Inst: " << *Inst; } -Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *const &NewBase, +Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *NewBase, const Type *Ty, SCEVExpander &Rewriter, Instruction *IP, @@ -407,7 +367,7 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *const &NewBase, // value of NewBase in the case that it's a diffferent instruction from // the PHI that NewBase is computed from, or null otherwise. // -void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase, +void BasedUser::RewriteInstructionToUseNewBase(const SCEV *NewBase, Instruction *NewBasePt, SCEVExpander &Rewriter, Loop *L, Pass *P, SmallVectorImpl<WeakVH> &DeadInsts, @@ -428,7 +388,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase, // If this is a use outside the loop (which means after, since it is based // on a loop indvar) we use the post-incremented value, so that we don't // artificially make the preinc value live out the bottom of the loop. - if (!isUseOfPostIncrementedValue && L->contains(Inst->getParent())) { + if (!isUseOfPostIncrementedValue && L->contains(Inst)) { if (NewBasePt && isa<PHINode>(OperandValToReplace)) { InsertPt = NewBasePt; ++InsertPt; @@ -444,9 +404,9 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase, // Replace the use of the operand Value with the new Phi we just created. Inst->replaceUsesOfWith(OperandValToReplace, NewVal); - DEBUG(errs() << " Replacing with "); - DEBUG(WriteAsOperand(errs(), NewVal, /*PrintType=*/false)); - DEBUG(errs() << ", which has value " << *NewBase << " plus IMM " + DEBUG(dbgs() << " Replacing with "); + DEBUG(WriteAsOperand(dbgs(), NewVal, /*PrintType=*/false)); + DEBUG(dbgs() << ", which has value " << *NewBase << " plus IMM " << *Imm << "\n"); return; } @@ -469,7 +429,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase, // that case(?). Instruction *OldLoc = dyn_cast<Instruction>(OperandValToReplace); BasicBlock *PHIPred = PN->getIncomingBlock(i); - if (L->contains(OldLoc->getParent())) { + if (L->contains(OldLoc)) { // If this is a critical edge, split the edge so that we do not insert // the code on all predecessor/successor paths. We do this unless this // is the canonical backedge for this loop, as this can make some @@ -486,7 +446,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase, // is outside of the loop, and PredTI is in the loop, we want to // move the block to be immediately before the PHI block, not // immediately after PredTI. - if (L->contains(PHIPred) && !L->contains(PN->getParent())) + if (L->contains(PHIPred) && !L->contains(PN)) NewBB->moveBefore(PN->getParent()); // Splitting the edge can reduce the number of PHI entries we have. 
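(A note on the signature cleanup threaded through this file: `const SCEV *const &` becomes plain `const SCEV *`. Passing a pointer by const reference protects only the pointer itself, never the pointee, and adds an indirection; copying the pointer is as cheap and reads better. A tiny illustration:)

```cpp
struct SCEV {};  // stand-in for llvm::SCEV

// Both signatures prevent reseating the caller's pointer, and neither
// protects the pointee any further; the reference only adds indirection.
int useByRef(const SCEV *const &S) { return S != nullptr; }
int useByVal(const SCEV *S)        { return S != nullptr; }

int main() {
  SCEV X;
  const SCEV *P = &X;
  return useByRef(P) == useByVal(P) ? 0 : 1;
}
```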
@@ -498,15 +458,15 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase, Value *&Code = InsertedCode[PHIPred]; if (!Code) { // Insert the code into the end of the predecessor block. - Instruction *InsertPt = (L->contains(OldLoc->getParent())) ? + Instruction *InsertPt = (L->contains(OldLoc)) ? PHIPred->getTerminator() : OldLoc->getParent()->getTerminator(); Code = InsertCodeForBaseAtPosition(NewBase, PN->getType(), Rewriter, InsertPt, SE); - DEBUG(errs() << " Changing PHI use to "); - DEBUG(WriteAsOperand(errs(), Code, /*PrintType=*/false)); - DEBUG(errs() << ", which has value " << *NewBase << " plus IMM " + DEBUG(dbgs() << " Changing PHI use to "); + DEBUG(WriteAsOperand(dbgs(), Code, /*PrintType=*/false)); + DEBUG(dbgs() << ", which has value " << *NewBase << " plus IMM " << *Imm << "\n"); } @@ -523,7 +483,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase, /// fitsInAddressMode - Return true if V can be subsumed within an addressing /// mode, and does not need to be put in a register first. -static bool fitsInAddressMode(const SCEV *const &V, const Type *AccessTy, +static bool fitsInAddressMode(const SCEV *V, const Type *AccessTy, const TargetLowering *TLI, bool HasBaseReg) { if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(V)) { int64_t VC = SC->getValue()->getSExtValue(); @@ -737,7 +697,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses, // it is clearly shared across all the IV's. If the use is outside the loop // (which means after it) we don't want to factor anything *into* the loop, // so just use 0 as the base. - if (L->contains(Uses[0].Inst->getParent())) + if (L->contains(Uses[0].Inst)) std::swap(Result, Uses[0].Base); return Result; } @@ -762,7 +722,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses, // after the loop to affect base computation of values *inside* the loop, // because we can always add their offsets to the result IV after the loop // is done, ensuring we get good code inside the loop. - if (!L->contains(Uses[i].Inst->getParent())) + if (!L->contains(Uses[i].Inst)) continue; NumUsesInsideLoop++; @@ -818,7 +778,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses, // and a Result in the same instruction (for example because it would // require too many registers). Check this. for (unsigned i=0; i<NumUses; ++i) { - if (!L->contains(Uses[i].Inst->getParent())) + if (!L->contains(Uses[i].Inst)) continue; // We know this is an addressing mode use; if there are any uses that // are not, FreeResult would be Zero. @@ -854,7 +814,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses, // the final IV value coming into those uses does. Instead of trying to // remove the pieces of the common base, which might not be there, // subtract off the base to compensate for this. 
- if (!L->contains(Uses[i].Inst->getParent())) { + if (!L->contains(Uses[i].Inst)) { Uses[i].Base = SE->getMinusSCEV(Uses[i].Base, Result); continue; } @@ -975,7 +935,7 @@ bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1, const SCEV *LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg, bool AllUsesAreAddresses, bool AllUsesAreOutsideLoop, - const SCEV *const &Stride, + const SCEV *Stride, IVExpr &IV, const Type *Ty, const std::vector<BasedUser>& UsersToProcess) { if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Stride)) { @@ -1088,7 +1048,7 @@ static bool PartitionByIsUseOfPostIncrementedValue(const BasedUser &Val) { /// isNonConstantNegative - Return true if the specified scev is negated, but /// not a constant. -static bool isNonConstantNegative(const SCEV *const &Expr) { +static bool isNonConstantNegative(const SCEV *Expr) { const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Expr); if (!Mul) return false; @@ -1105,7 +1065,7 @@ static bool isNonConstantNegative(const SCEV *const &Expr) { /// base of the strided accesses, as well as the old information from Uses. We /// progressively move information from the Base field to the Imm field, until /// we eventually have the full access expression to rewrite the use. -const SCEV *LoopStrengthReduce::CollectIVUsers(const SCEV *const &Stride, +const SCEV *LoopStrengthReduce::CollectIVUsers(const SCEV *Stride, IVUsersOfOneStride &Uses, Loop *L, bool &AllUsesAreAddresses, @@ -1149,7 +1109,7 @@ const SCEV *LoopStrengthReduce::CollectIVUsers(const SCEV *const &Stride, // If the user is not in the current loop, this means it is using the exit // value of the IV. Do not put anything in the base, make sure it's all in // the immediate field to allow as much factoring as possible. - if (!L->contains(UsersToProcess[i].Inst->getParent())) { + if (!L->contains(UsersToProcess[i].Inst)) { UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm, UsersToProcess[i].Base); UsersToProcess[i].Base = @@ -1361,7 +1321,7 @@ LoopStrengthReduce::PrepareToStrengthReduceFully( const SCEV *CommonExprs, const Loop *L, SCEVExpander &PreheaderRewriter) { - DEBUG(errs() << " Fully reducing all users\n"); + DEBUG(dbgs() << " Fully reducing all users\n"); // Rewrite the UsersToProcess records, creating a separate PHI for each // unique Base value. 
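(For readers new to this pass, a sketch of what loop strength reduction buys, written in plain C++ rather than LLVM IR: a per-iteration multiply on the induction variable is replaced by a second, "reduced" induction variable bumped by the stride each trip.)

```cpp
#include <cassert>

// Strength reduction by hand: the "before" loop recomputes I * Stride
// every iteration; the "after" loop carries a reduced IV instead.
int sumBefore(const int *Base, int N, int Stride) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += Base[I * Stride];        // multiply inside the loop
  return Sum;
}

int sumAfter(const int *Base, int N, int Stride) {
  int Sum = 0;
  const int *P = Base;              // reduced induction variable
  for (int I = 0; I < N; ++I, P += Stride)
    Sum += *P;                      // no multiply
  return Sum;
}

int main() {
  int Data[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
  assert(sumBefore(Data, 4, 3) == sumAfter(Data, 4, 3));
  return 0;
}
```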
@@ -1393,7 +1353,7 @@ static Instruction *FindIVIncInsertPt(std::vector<BasedUser> &UsersToProcess, const Loop *L) { if (UsersToProcess.size() == 1 && UsersToProcess[0].isUseOfPostIncrementedValue && - L->contains(UsersToProcess[0].Inst->getParent())) + L->contains(UsersToProcess[0].Inst)) return UsersToProcess[0].Inst; return L->getLoopLatch()->getTerminator(); } @@ -1410,7 +1370,7 @@ LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi( Instruction *IVIncInsertPt, const Loop *L, SCEVExpander &PreheaderRewriter) { - DEBUG(errs() << " Inserting new PHI:\n"); + DEBUG(dbgs() << " Inserting new PHI:\n"); PHINode *Phi = InsertAffinePhi(SE->getUnknown(CommonBaseV), Stride, IVIncInsertPt, L, @@ -1423,9 +1383,9 @@ LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi( for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) UsersToProcess[i].Phi = Phi; - DEBUG(errs() << " IV="); - DEBUG(WriteAsOperand(errs(), Phi, /*PrintType=*/false)); - DEBUG(errs() << "\n"); + DEBUG(dbgs() << " IV="); + DEBUG(WriteAsOperand(dbgs(), Phi, /*PrintType=*/false)); + DEBUG(dbgs() << "\n"); } /// PrepareToStrengthReduceFromSmallerStride - Prepare for the given users to @@ -1438,7 +1398,7 @@ LoopStrengthReduce::PrepareToStrengthReduceFromSmallerStride( Value *CommonBaseV, const IVExpr &ReuseIV, Instruction *PreInsertPt) { - DEBUG(errs() << " Rewriting in terms of existing IV of STRIDE " + DEBUG(dbgs() << " Rewriting in terms of existing IV of STRIDE " << *ReuseIV.Stride << " and BASE " << *ReuseIV.Base << "\n"); // All the users will share the reused IV. @@ -1482,7 +1442,7 @@ static bool IsImmFoldedIntoAddrMode(GlobalValue *GV, int64_t Offset, /// stride of IV. All of the users may have different starting values, and this /// may not be the only stride. void -LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *const &Stride, +LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *Stride, IVUsersOfOneStride &Uses, Loop *L) { // If all the users are moved to another stride, then there is nothing to do. @@ -1547,7 +1507,7 @@ LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *const &Stride, UsersToProcess, TLI); if (DoSink) { - DEBUG(errs() << " Sinking " << *Imm << " back down into uses\n"); + DEBUG(dbgs() << " Sinking " << *Imm << " back down into uses\n"); for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) UsersToProcess[i].Imm = SE->getAddExpr(UsersToProcess[i].Imm, Imm); CommonExprs = NewCommon; @@ -1559,7 +1519,7 @@ LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *const &Stride, // Now that we know what we need to do, insert the PHI node itself. // - DEBUG(errs() << "LSR: Examining IVs of TYPE " << *ReplacedTy << " of STRIDE " + DEBUG(dbgs() << "LSR: Examining IVs of TYPE " << *ReplacedTy << " of STRIDE " << *Stride << ":\n" << " Common base: " << *CommonExprs << "\n"); @@ -1623,10 +1583,10 @@ LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *const &Stride, if (!Base->isZero()) { BaseV = PreheaderRewriter.expandCodeFor(Base, 0, PreInsertPt); - DEBUG(errs() << " INSERTING code for BASE = " << *Base << ":"); + DEBUG(dbgs() << " INSERTING code for BASE = " << *Base << ":"); if (BaseV->hasName()) - DEBUG(errs() << " Result value name = %" << BaseV->getName()); - DEBUG(errs() << "\n"); + DEBUG(dbgs() << " Result value name = %" << BaseV->getName()); + DEBUG(dbgs() << "\n"); // If BaseV is a non-zero constant, make sure that it gets inserted into // the preheader, instead of being forward substituted into the uses. 
We @@ -1647,15 +1607,15 @@ LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *const &Stride, // FIXME: Use emitted users to emit other users. BasedUser &User = UsersToProcess.back(); - DEBUG(errs() << " Examining "); + DEBUG(dbgs() << " Examining "); if (User.isUseOfPostIncrementedValue) - DEBUG(errs() << "postinc"); + DEBUG(dbgs() << "postinc"); else - DEBUG(errs() << "preinc"); - DEBUG(errs() << " use "); - DEBUG(WriteAsOperand(errs(), UsersToProcess.back().OperandValToReplace, + DEBUG(dbgs() << "preinc"); + DEBUG(dbgs() << " use "); + DEBUG(WriteAsOperand(dbgs(), UsersToProcess.back().OperandValToReplace, /*PrintType=*/false)); - DEBUG(errs() << " in Inst: " << *User.Inst); + DEBUG(dbgs() << " in Inst: " << *User.Inst); // If this instruction wants to use the post-incremented value, move it // after the post-inc and use its value instead of the PHI. @@ -1666,7 +1626,7 @@ LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *const &Stride, // loop to ensure it is dominated by the increment. In case it's the // only use of the iv, the increment instruction is already before the // use. - if (L->contains(User.Inst->getParent()) && User.Inst != IVIncInsertPt) + if (L->contains(User.Inst) && User.Inst != IVIncInsertPt) User.Inst->moveBefore(IVIncInsertPt); } @@ -1728,7 +1688,7 @@ LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *const &Stride, // common base, and are adding it back here. Use the same expression // as before, rather than CommonBaseV, so DAGCombiner will zap it. if (!CommonExprs->isZero()) { - if (L->contains(User.Inst->getParent())) + if (L->contains(User.Inst)) RewriteExpr = SE->getAddExpr(RewriteExpr, SE->getUnknown(CommonBaseV)); else @@ -1815,7 +1775,7 @@ namespace { const ScalarEvolution *SE; explicit StrideCompare(const ScalarEvolution *se) : SE(se) {} - bool operator()(const SCEV *const &LHS, const SCEV *const &RHS) { + bool operator()(const SCEV *LHS, const SCEV *RHS) { const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS); const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS); if (LHSC && RHSC) { @@ -2069,8 +2029,8 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond, Cond = new ICmpInst(OldCond, Predicate, NewCmpLHS, NewCmpRHS, L->getHeader()->getName() + ".termcond"); - DEBUG(errs() << " Change compare stride in Inst " << *OldCond); - DEBUG(errs() << " to " << *Cond << '\n'); + DEBUG(dbgs() << " Change compare stride in Inst " << *OldCond); + DEBUG(dbgs() << " to " << *Cond << '\n'); // Remove the old compare instruction. The old indvar is probably dead too. DeadInsts.push_back(CondUse->getOperandValToReplace()); @@ -2403,7 +2363,7 @@ static bool isUsedByExitBranch(ICmpInst *Cond, Loop *L) { static bool ShouldCountToZero(ICmpInst *Cond, IVStrideUse* &CondUse, ScalarEvolution *SE, Loop *L, const TargetLowering *TLI = 0) { - if (!L->contains(Cond->getParent())) + if (!L->contains(Cond)) return false; if (!isa<SCEVConstant>(CondUse->getOffset())) @@ -2529,7 +2489,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) { if (!UsePostInc) continue; - DEBUG(errs() << " Change loop exiting icmp to use postinc iv: " + DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: " << *Cond << '\n'); // It's possible for the setcc instruction to be anywhere in the loop, and @@ -2608,9 +2568,9 @@ bool LoopStrengthReduce::OptimizeLoopCountIVOfStride(const SCEV* &Stride, } // Replace the increment with a decrement. 
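(The OptimizeLoopCountIVOfStride hunk resumes below. The rewrite it performs turns a counting-up exit test into a count-down-to-zero test — comparing against the constant zero typically needs no extra register for the loop bound. In source terms, as a sketch only:)

```cpp
#include <cassert>

// Count-up vs. count-down: same trip count, but the rewritten loop's
// exit test compares against the constant zero instead of N.
int tripsUp(int N) {
  int Trips = 0;
  for (int I = 0; I != N; ++I) ++Trips;   // before
  return Trips;
}

int tripsDown(int N) {
  int Trips = 0;
  for (int I = N; I != 0; --I) ++Trips;   // after
  return Trips;
}

int main() {
  assert(tripsUp(7) == tripsDown(7));
  return 0;
}
```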
- DEBUG(errs() << "LSR: Examining use "); - DEBUG(WriteAsOperand(errs(), CondOp0, /*PrintType=*/false)); - DEBUG(errs() << " in Inst: " << *Cond << '\n'); + DEBUG(dbgs() << "LSR: Examining use "); + DEBUG(WriteAsOperand(dbgs(), CondOp0, /*PrintType=*/false)); + DEBUG(dbgs() << " in Inst: " << *Cond << '\n'); BinaryOperator *Decr = BinaryOperator::Create(Instruction::Sub, Incr->getOperand(0), Incr->getOperand(1), "tmp", Incr); Incr->replaceAllUsesWith(Decr); @@ -2624,7 +2584,7 @@ bool LoopStrengthReduce::OptimizeLoopCountIVOfStride(const SCEV* &Stride, unsigned InBlock = L->contains(PHIExpr->getIncomingBlock(0)) ? 1 : 0; Value *StartVal = PHIExpr->getIncomingValue(InBlock); Value *EndVal = Cond->getOperand(1); - DEBUG(errs() << " Optimize loop counting iv to count down [" + DEBUG(dbgs() << " Optimize loop counting iv to count down [" << *EndVal << " .. " << *StartVal << "]\n"); // FIXME: check for case where both are constant. @@ -2633,7 +2593,7 @@ bool LoopStrengthReduce::OptimizeLoopCountIVOfStride(const SCEV* &Stride, EndVal, StartVal, "tmp", PreInsertPt); PHIExpr->setIncomingValue(InBlock, NewStartVal); Cond->setOperand(1, Zero); - DEBUG(errs() << " New icmp: " << *Cond << "\n"); + DEBUG(dbgs() << " New icmp: " << *Cond << "\n"); int64_t SInt = cast<SCEVConstant>(Stride)->getValue()->getSExtValue(); const SCEV *NewStride = 0; @@ -2716,9 +2676,9 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) { return false; if (!IU->IVUsesByStride.empty()) { - DEBUG(errs() << "\nLSR on \"" << L->getHeader()->getParent()->getName() + DEBUG(dbgs() << "\nLSR on \"" << L->getHeader()->getParent()->getName() << "\" "; - L->dump()); + L->print(dbgs())); // Sort the StrideOrder so we process larger strides first. std::stable_sort(IU->StrideOrder.begin(), IU->StrideOrder.end(), @@ -2758,8 +2718,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) { IVsByStride.clear(); // Clean up after ourselves - if (!DeadInsts.empty()) - DeleteTriviallyDeadInstructions(); + DeleteTriviallyDeadInstructions(); } // At this point, it is worth checking to see if any recurrence PHIs are also diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index b7adfdc..0c19133 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -877,7 +877,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, for (unsigned i = 0, e = Users.size(); i != e; ++i) if (Instruction *U = cast<Instruction>(Users[i])) { - if (!L->contains(U->getParent())) + if (!L->contains(U)) continue; U->replaceUsesOfWith(LIC, Replacement); Worklist.push_back(U); @@ -888,7 +888,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // can. This case occurs when we unswitch switch statements. for (unsigned i = 0, e = Users.size(); i != e; ++i) if (Instruction *U = cast<Instruction>(Users[i])) { - if (!L->contains(U->getParent())) + if (!L->contains(U)) continue; Worklist.push_back(U); diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 8466918..827b47d 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // This pass reassociates commutative expressions in an order that is designed -// to promote better constant propagation, GCSE, LICM, PRE... +// to promote better constant propagation, GCSE, LICM, PRE, etc. 
// // For example: 4 + (x + 5) -> x + (4 + 5) // @@ -35,8 +35,8 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/DenseMap.h" #include <algorithm> -#include <map> using namespace llvm; STATISTIC(NumLinear , "Number of insts linearized"); @@ -58,21 +58,22 @@ namespace { #ifndef NDEBUG /// PrintOps - Print out the expression identified in the Ops list. /// -static void PrintOps(Instruction *I, const std::vector<ValueEntry> &Ops) { +static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { Module *M = I->getParent()->getParent()->getParent(); errs() << Instruction::getOpcodeName(I->getOpcode()) << " " - << *Ops[0].Op->getType(); + << *Ops[0].Op->getType() << '\t'; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - WriteAsOperand(errs() << " ", Ops[i].Op, false, M); - errs() << "," << Ops[i].Rank; + errs() << "[ "; + WriteAsOperand(errs(), Ops[i].Op, false, M); + errs() << ", #" << Ops[i].Rank << "] "; } } #endif namespace { class Reassociate : public FunctionPass { - std::map<BasicBlock*, unsigned> RankMap; - std::map<AssertingVH<>, unsigned> ValueRankMap; + DenseMap<BasicBlock*, unsigned> RankMap; + DenseMap<AssertingVH<>, unsigned> ValueRankMap; bool MadeChange; public: static char ID; // Pass identification, replacement for typeid @@ -86,11 +87,13 @@ namespace { private: void BuildRankMap(Function &F); unsigned getRank(Value *V); - void ReassociateExpression(BinaryOperator *I); - void RewriteExprTree(BinaryOperator *I, std::vector<ValueEntry> &Ops, + Value *ReassociateExpression(BinaryOperator *I); + void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops, unsigned Idx = 0); - Value *OptimizeExpression(BinaryOperator *I, std::vector<ValueEntry> &Ops); - void LinearizeExprTree(BinaryOperator *I, std::vector<ValueEntry> &Ops); + Value *OptimizeExpression(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops); + Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops); + void LinearizeExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); void LinearizeExpr(BinaryOperator *I); Value *RemoveFactorFromExpression(Value *V, Value *Factor); void ReassociateBB(BasicBlock *BB); @@ -107,10 +110,13 @@ FunctionPass *llvm::createReassociatePass() { return new Reassociate(); } void Reassociate::RemoveDeadBinaryOp(Value *V) { Instruction *Op = dyn_cast<Instruction>(V); - if (!Op || !isa<BinaryOperator>(Op) || !isa<CmpInst>(Op) || !Op->use_empty()) + if (!Op || !isa<BinaryOperator>(Op) || !Op->use_empty()) return; Value *LHS = Op->getOperand(0), *RHS = Op->getOperand(1); + + ValueRankMap.erase(Op); + Op->eraseFromParent(); RemoveDeadBinaryOp(LHS); RemoveDeadBinaryOp(RHS); } @@ -156,13 +162,14 @@ void Reassociate::BuildRankMap(Function &F) { } unsigned Reassociate::getRank(Value *V) { - if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument... - Instruction *I = dyn_cast<Instruction>(V); - if (I == 0) return 0; // Otherwise it's a global or constant, rank 0. + if (I == 0) { + if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument. + return 0; // Otherwise it's a global or constant, rank 0. + } - unsigned &CachedRank = ValueRankMap[I]; - if (CachedRank) return CachedRank; // Rank already known? + if (unsigned Rank = ValueRankMap[I]) + return Rank; // Rank already known? // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that // we can reassociate expressions for code motion! 
Since we do not recurse @@ -182,7 +189,7 @@ unsigned Reassociate::getRank(Value *V) { //DEBUG(errs() << "Calculated Rank[" << V->getName() << "] = " // << Rank << "\n"); - return CachedRank = Rank; + return ValueRankMap[I] = Rank; } /// isReassociableOp - Return true if V is an instruction of the specified @@ -197,7 +204,7 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { /// LowerNegateToMultiply - Replace 0-X with X*-1. /// static Instruction *LowerNegateToMultiply(Instruction *Neg, - std::map<AssertingVH<>, unsigned> &ValueRankMap) { + DenseMap<AssertingVH<>, unsigned> &ValueRankMap) { Constant *Cst = Constant::getAllOnesValue(Neg->getType()); Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); @@ -250,7 +257,7 @@ void Reassociate::LinearizeExpr(BinaryOperator *I) { /// caller MUST use something like RewriteExprTree to put the values back in. /// void Reassociate::LinearizeExprTree(BinaryOperator *I, - std::vector<ValueEntry> &Ops) { + SmallVectorImpl<ValueEntry> &Ops) { Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); unsigned Opcode = I->getOpcode(); @@ -282,15 +289,15 @@ void Reassociate::LinearizeExprTree(BinaryOperator *I, I->setOperand(0, UndefValue::get(I->getType())); I->setOperand(1, UndefValue::get(I->getType())); return; - } else { - // Turn X+(Y+Z) -> (Y+Z)+X - std::swap(LHSBO, RHSBO); - std::swap(LHS, RHS); - bool Success = !I->swapOperands(); - assert(Success && "swapOperands failed"); - Success = false; - MadeChange = true; } + + // Turn X+(Y+Z) -> (Y+Z)+X + std::swap(LHSBO, RHSBO); + std::swap(LHS, RHS); + bool Success = !I->swapOperands(); + assert(Success && "swapOperands failed"); + Success = false; + MadeChange = true; } else if (RHSBO) { // Turn (A+B)+(C+D) -> (((A+B)+C)+D). This guarantees the the RHS is not // part of the expression tree. @@ -322,7 +329,7 @@ void Reassociate::LinearizeExprTree(BinaryOperator *I, // linearized and optimized, emit them in-order. This function is written to be // tail recursive. void Reassociate::RewriteExprTree(BinaryOperator *I, - std::vector<ValueEntry> &Ops, + SmallVectorImpl<ValueEntry> &Ops, unsigned i) { if (i+2 == Ops.size()) { if (I->getOperand(0) != Ops[i].Op || @@ -369,6 +376,9 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, // that should be processed next by the reassociation pass. // static Value *NegateValue(Value *V, Instruction *BI) { + if (Constant *C = dyn_cast<Constant>(V)) + return ConstantExpr::getNeg(C); + // We are trying to expose opportunity for reassociation. One of the things // that we want to do to achieve this is to push a negation as deep into an // expression chain as possible, to expose the add instructions. In practice, @@ -376,7 +386,7 @@ static Value *NegateValue(Value *V, Instruction *BI) { // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D // so that later, a: Y = 12+X could get reassociated with the -12 to eliminate // the constants. We assume that instcombine will clean up the mess later if - // we introduce tons of unnecessary negation instructions... + // we introduce tons of unnecessary negation instructions. // if (Instruction *I = dyn_cast<Instruction>(V)) if (I->getOpcode() == Instruction::Add && I->hasOneUse()) { @@ -393,10 +403,36 @@ static Value *NegateValue(Value *V, Instruction *BI) { I->setName(I->getName()+".neg"); return I; } + + // Okay, we need to materialize a negated version of V with an instruction. + // Scan the use lists of V to see if we have one already. 
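(The NegateValue hunk resumes below with the new use-list scan. The rewrite it enables — pushing a negation through an add chain, as in X = -(A+12+C+D) becoming X = -A + -12 + -C + -D — rests on negation distributing over addition in two's-complement arithmetic. A quick numeric check, using unsigned types so the wraparound is well defined:)

```cpp
#include <cstdint>
#include <cstdio>

// -(A + B + C) == (-A) + (-B) + (-C) under modular (wraparound) arithmetic.
int main() {
  uint32_t A = 0x12345678u, B = 0xDEADBEEFu, C = 12u;
  uint32_t Lhs = 0u - (A + B + C);
  uint32_t Rhs = (0u - A) + (0u - B) + (0u - C);
  std::printf("%s\n", Lhs == Rhs ? "equal" : "different");
  return Lhs == Rhs ? 0 : 1;
}
```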
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ + if (!BinaryOperator::isNeg(*UI)) continue; + + // We found one! Now we have to make sure that the definition dominates + // this use. We do this by moving it to the entry block (if it is a + // non-instruction value) or right after the definition. These negates will + // be zapped by reassociate later, so we don't need much finesse here. + BinaryOperator *TheNeg = cast<BinaryOperator>(*UI); + + BasicBlock::iterator InsertPt; + if (Instruction *InstInput = dyn_cast<Instruction>(V)) { + if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) { + InsertPt = II->getNormalDest()->begin(); + } else { + InsertPt = InstInput; + ++InsertPt; + } + while (isa<PHINode>(InsertPt)) ++InsertPt; + } else { + InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin(); + } + TheNeg->moveBefore(InsertPt); + return TheNeg; + } // Insert a 'neg' instruction that subtracts the value from zero to get the // negation. - // return BinaryOperator::CreateNeg(V, V->getName() + ".neg", BI); } @@ -427,12 +463,12 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { /// only used by an add, transform this into (X+(0-Y)) to promote better /// reassociation. static Instruction *BreakUpSubtract(Instruction *Sub, - std::map<AssertingVH<>, unsigned> &ValueRankMap) { - // Convert a subtract into an add and a neg instruction... so that sub - // instructions can be commuted with other add instructions... + DenseMap<AssertingVH<>, unsigned> &ValueRankMap) { + // Convert a subtract into an add and a neg instruction. This allows sub + // instructions to be commuted with other add instructions. // - // Calculate the negative value of Operand 1 of the sub instruction... - // and set it as the RHS of the add instruction we just made... + // Calculate the negative value of Operand 1 of the sub instruction, + // and set it as the RHS of the add instruction we just made. // Value *NegVal = NegateValue(Sub->getOperand(1), Sub); Instruction *New = @@ -452,7 +488,7 @@ static Instruction *BreakUpSubtract(Instruction *Sub, /// by one, change this into a multiply by a constant to assist with further /// reassociation. static Instruction *ConvertShiftToMul(Instruction *Shl, - std::map<AssertingVH<>, unsigned> &ValueRankMap) { + DenseMap<AssertingVH<>, unsigned> &ValueRankMap) { // If an operand of this shift is a reassociable multiply, or if the shift // is used by a reassociable multiply or add, turn into a multiply. if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) || @@ -460,11 +496,10 @@ static Instruction *ConvertShiftToMul(Instruction *Shl, (isReassociableOp(Shl->use_back(), Instruction::Mul) || isReassociableOp(Shl->use_back(), Instruction::Add)))) { Constant *MulCst = ConstantInt::get(Shl->getType(), 1); - MulCst = - ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1))); + MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1))); - Instruction *Mul = BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, - "", Shl); + Instruction *Mul = + BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); ValueRankMap.erase(Shl); Mul->takeName(Shl); Shl->replaceAllUsesWith(Mul); @@ -475,15 +510,16 @@ static Instruction *ConvertShiftToMul(Instruction *Shl, } // Scan backwards and forwards among values with the same rank as element i to -// see if X exists. If X does not exist, return i. -static unsigned FindInOperandList(std::vector<ValueEntry> &Ops, unsigned i, +// see if X exists. If X does not exist, return i. 
This is useful when +// scanning for 'x' when we see '-x' because they both get the same rank. +static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, Value *X) { unsigned XRank = Ops[i].Rank; unsigned e = Ops.size(); for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) if (Ops[j].Op == X) return j; - // Scan backwards + // Scan backwards. for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) if (Ops[j].Op == X) return j; @@ -492,7 +528,7 @@ static unsigned FindInOperandList(std::vector<ValueEntry> &Ops, unsigned i, /// EmitAddTreeOfValues - Emit a tree of add instructions, summing Ops together /// and returning the result. Insert the tree before I. -static Value *EmitAddTreeOfValues(Instruction *I, std::vector<Value*> &Ops) { +static Value *EmitAddTreeOfValues(Instruction *I, SmallVectorImpl<Value*> &Ops){ if (Ops.size() == 1) return Ops.back(); Value *V1 = Ops.back(); @@ -508,32 +544,57 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); if (!BO) return 0; - std::vector<ValueEntry> Factors; + SmallVector<ValueEntry, 8> Factors; LinearizeExprTree(BO, Factors); bool FoundFactor = false; - for (unsigned i = 0, e = Factors.size(); i != e; ++i) + bool NeedsNegate = false; + for (unsigned i = 0, e = Factors.size(); i != e; ++i) { if (Factors[i].Op == Factor) { FoundFactor = true; Factors.erase(Factors.begin()+i); break; } + + // If this is a negative version of this factor, remove it. + if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) + if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op)) + if (FC1->getValue() == -FC2->getValue()) { + FoundFactor = NeedsNegate = true; + Factors.erase(Factors.begin()+i); + break; + } + } + if (!FoundFactor) { // Make sure to restore the operands to the expression tree. RewriteExprTree(BO, Factors); return 0; } - if (Factors.size() == 1) return Factors[0].Op; + BasicBlock::iterator InsertPt = BO; ++InsertPt; + + // If this was just a single multiply, remove the multiply and return the only + // remaining operand. + if (Factors.size() == 1) { + ValueRankMap.erase(BO); + BO->eraseFromParent(); + V = Factors[0].Op; + } else { + RewriteExprTree(BO, Factors); + V = BO; + } - RewriteExprTree(BO, Factors); - return BO; + if (NeedsNegate) + V = BinaryOperator::CreateNeg(V, "neg", InsertPt); + + return V; } /// FindSingleUseMultiplyFactors - If V is a single-use multiply, recursively /// add its operands as factors, otherwise add V to the list of factors. static void FindSingleUseMultiplyFactors(Value *V, - std::vector<Value*> &Factors) { + SmallVectorImpl<Value*> &Factors) { BinaryOperator *BO; if ((!V->hasOneUse() && !V->use_empty()) || !(BO = dyn_cast<BinaryOperator>(V)) || @@ -547,10 +608,228 @@ static void FindSingleUseMultiplyFactors(Value *V, FindSingleUseMultiplyFactors(BO->getOperand(0), Factors); } +/// OptimizeAndOrXor - Optimize a series of operands to an 'and', 'or', or 'xor' +/// instruction. This optimizes based on identities. If it can be reduced to +/// a single Value, it is returned, otherwise the Ops list is mutated as +/// necessary. +static Value *OptimizeAndOrXor(unsigned Opcode, + SmallVectorImpl<ValueEntry> &Ops) { + // Scan the operand lists looking for X and ~X pairs, along with X,X pairs. + // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1. + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + // First, check for X and ~X in the operand list. 
+ assert(i < Ops.size()); + if (BinaryOperator::isNot(Ops[i].Op)) { // Cannot occur for ^. + Value *X = BinaryOperator::getNotArgument(Ops[i].Op); + unsigned FoundX = FindInOperandList(Ops, i, X); + if (FoundX != i) { + if (Opcode == Instruction::And) // ...&X&~X = 0 + return Constant::getNullValue(X->getType()); + + if (Opcode == Instruction::Or) // ...|X|~X = -1 + return Constant::getAllOnesValue(X->getType()); + } + } + + // Next, check for duplicate pairs of values, which we assume are next to + // each other, due to our sorting criteria. + assert(i < Ops.size()); + if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) { + if (Opcode == Instruction::And || Opcode == Instruction::Or) { + // Drop duplicate values for And and Or. + Ops.erase(Ops.begin()+i); + --i; --e; + ++NumAnnihil; + continue; + } + + // Drop pairs of values for Xor. + assert(Opcode == Instruction::Xor); + if (e == 2) + return Constant::getNullValue(Ops[0].Op->getType()); + + // Y ^ X^X -> Y + Ops.erase(Ops.begin()+i, Ops.begin()+i+2); + i -= 1; e -= 2; + ++NumAnnihil; + } + } + return 0; +} +/// OptimizeAdd - Optimize a series of operands to an 'add' instruction. This +/// optimizes based on identities. If it can be reduced to a single Value, it +/// is returned, otherwise the Ops list is mutated as necessary. +Value *Reassociate::OptimizeAdd(Instruction *I, + SmallVectorImpl<ValueEntry> &Ops) { + // Scan the operand lists looking for X and -X pairs. If we find any, we + // can simplify the expression. X+-X == 0. While we're at it, scan for any + // duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z. + // + // TODO: We could handle "X + ~X" -> "-1" if we wanted, since "-X = ~X+1". + // + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + Value *TheOp = Ops[i].Op; + // Check to see if we've seen this operand before. If so, we factor all + // instances of the operand together. Due to our sorting criteria, we know + // that these need to be next to each other in the vector. + if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) { + // Rescan the list, remove all instances of this operand from the expr. + unsigned NumFound = 0; + do { + Ops.erase(Ops.begin()+i); + ++NumFound; + } while (i != Ops.size() && Ops[i].Op == TheOp); + + DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); + ++NumFactor; + + // Insert a new multiply. + Value *Mul = ConstantInt::get(cast<IntegerType>(I->getType()), NumFound); + Mul = BinaryOperator::CreateMul(TheOp, Mul, "factor", I); + + // Now that we have inserted a multiply, optimize it. This allows us to + // handle cases that require multiple factoring steps, such as this: + // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6 + Mul = ReassociateExpression(cast<BinaryOperator>(Mul)); + + // If every add operand was a duplicate, return the multiply. + if (Ops.empty()) + return Mul; + + // Otherwise, we had some input that didn't have the dupe, such as + // "A + A + B" -> "A*2 + B". Add the new multiply to the list of + // things being added by this operation. + Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul)); + + --i; + e = Ops.size(); + continue; + } + + // Check for X and -X in the operand list. + if (!BinaryOperator::isNeg(TheOp)) + continue; + + Value *X = BinaryOperator::getNegArgument(TheOp); + unsigned FoundX = FindInOperandList(Ops, i, X); + if (FoundX == i) + continue; + + // Remove X and -X from the operand list. 
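The identities OptimizeAndOrXor leans on (X & ~X == 0, X | ~X == -1, X & X == X, X | X == X, X ^ X == 0) hold for any operand list; the pass only has to look at equal-rank neighbors because the list is already sorted. A standalone sketch of the complement-pair scan, with a toy Op type in place of IR values (the O(n^2) loop is for clarity; the real code scans within a rank band via FindInOperandList):

#include <cstdio>
#include <vector>

struct Op { int Id; bool Inverted; };   // Inverted == true models "~X"

// If the operand list holds both X and ~X, an 'and' of everything is 0
// and an 'or' of everything is all-ones; no other operand matters.
static bool hasComplementPair(const std::vector<Op> &Ops) {
  for (size_t i = 0; i != Ops.size(); ++i)
    for (size_t j = i + 1; j != Ops.size(); ++j)
      if (Ops[i].Id == Ops[j].Id && Ops[i].Inverted != Ops[j].Inverted)
        return true;
  return false;
}

int main() {
  // Operand list of (a & b & ~a): annihilates to 0 (as an 'or', to -1).
  Op Ops_[] = { {1, false}, {2, false}, {1, true} };
  std::vector<Op> Ops(Ops_, Ops_ + 3);

  std::printf(hasComplementPair(Ops) ? "and -> 0, or -> -1\n"
                                     : "no simplification\n");
  return 0;
}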
+ if (Ops.size() == 2) + return Constant::getNullValue(X->getType()); + + Ops.erase(Ops.begin()+i); + if (i < FoundX) + --FoundX; + else + --i; // Need to back up an extra one. + Ops.erase(Ops.begin()+FoundX); + ++NumAnnihil; + --i; // Revisit element. + e -= 2; // Removed two elements. + } + + // Scan the operand list, checking to see if there are any common factors + // between operands. Consider something like A*A+A*B*C+D. We would like to + // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies. + // To efficiently find this, we count the number of times a factor occurs + // for any ADD operands that are MULs. + DenseMap<Value*, unsigned> FactorOccurrences; + + // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4) + // where they are actually the same multiply. + unsigned MaxOcc = 0; + Value *MaxOccVal = 0; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op); + if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty()) + continue; + + // Compute all of the factors of this added value. + SmallVector<Value*, 8> Factors; + FindSingleUseMultiplyFactors(BOp, Factors); + assert(Factors.size() > 1 && "Bad linearize!"); + + // Add one to FactorOccurrences for each unique factor in this op. + SmallPtrSet<Value*, 8> Duplicates; + for (unsigned i = 0, e = Factors.size(); i != e; ++i) { + Value *Factor = Factors[i]; + if (!Duplicates.insert(Factor)) continue; + + unsigned Occ = ++FactorOccurrences[Factor]; + if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } + + // If Factor is a negative constant, add the negated value as a factor + // because we can percolate the negate out. Watch for minint, which + // cannot be positivified. + if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) + if (CI->getValue().isNegative() && !CI->getValue().isMinSignedValue()) { + Factor = ConstantInt::get(CI->getContext(), -CI->getValue()); + assert(!Duplicates.count(Factor) && + "Shouldn't have two constant factors, missed a canonicalize"); + + unsigned Occ = ++FactorOccurrences[Factor]; + if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } + } + } + } + + // If any factor occurred more than one time, we can pull it out. + if (MaxOcc > 1) { + DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); + ++NumFactor; + + // Create a new instruction that uses the MaxOccVal twice. If we don't do + // this, we could otherwise run into situations where removing a factor + // from an expression will drop a use of maxocc, and this can cause + // RemoveFactorFromExpression on successive values to behave differently. + Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal); + SmallVector<Value*, 4> NewMulOps; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) { + NewMulOps.push_back(V); + Ops.erase(Ops.begin()+i); + --i; --e; + } + } + + // No need for extra uses anymore. + delete DummyInst; + + unsigned NumAddedValues = NewMulOps.size(); + Value *V = EmitAddTreeOfValues(I, NewMulOps); + + // Now that we have inserted the add tree, optimize it. This allows us to + // handle cases that require multiple factoring steps, such as this: + // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C)) + assert(NumAddedValues > 1 && "Each occurrence should contribute a value"); + V = ReassociateExpression(cast<BinaryOperator>(V)); + + // Create the multiply. 
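The factoring step that follows boils down to a counting problem: for every added term that is a multiply, count each unique factor once, then pull out the factor with the highest count if it appears in more than one term. A self-contained sketch with strings as factor names (the pass uses a DenseMap over Value* and a SmallPtrSet for the per-term Duplicates set):

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // A*A + A*B*C + D, written as the factor list of each multiply term
  // (D is not a multiply, so it contributes nothing here).
  std::vector<std::vector<std::string> > Terms(2);
  Terms[0].push_back("A"); Terms[0].push_back("A");
  Terms[1].push_back("A"); Terms[1].push_back("B"); Terms[1].push_back("C");

  // Count each unique factor once per term, so A*A does not count A twice.
  std::map<std::string, unsigned> FactorOccurrences;
  std::string MaxOccVal;
  unsigned MaxOcc = 0;
  for (size_t t = 0; t != Terms.size(); ++t) {
    std::set<std::string> Duplicates;
    for (size_t f = 0; f != Terms[t].size(); ++f) {
      if (!Duplicates.insert(Terms[t][f]).second)
        continue;
      unsigned Occ = ++FactorOccurrences[Terms[t][f]];
      if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Terms[t][f]; }
    }
  }

  if (MaxOcc > 1)   // worth factoring: A*A + A*B*C -> A*(A + B*C)
    std::printf("pull '%s' out of %u terms\n", MaxOccVal.c_str(), MaxOcc);
  return 0;
}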
+ Value *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I); + + // Rerun associate on the multiply in case the inner expression turned into + // a multiply. We want to make sure that we keep things in canonical form. + V2 = ReassociateExpression(cast<BinaryOperator>(V2)); + + // If every add operand included the factor (e.g. "A*B + A*C"), then the + // entire result expression is just the multiply "A*(B+C)". + if (Ops.empty()) + return V2; + + // Otherwise, we had some input that didn't have the factor, such as + // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of + // things being added by this operation. + Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2)); + } + + return 0; +} Value *Reassociate::OptimizeExpression(BinaryOperator *I, - std::vector<ValueEntry> &Ops) { + SmallVectorImpl<ValueEntry> &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. bool IterateOptimization = false; @@ -570,198 +849,53 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, switch (Opcode) { default: break; case Instruction::And: - if (CstVal->isZero()) { // ... & 0 -> 0 - ++NumAnnihil; + if (CstVal->isZero()) // X & 0 -> 0 return CstVal; - } else if (CstVal->isAllOnesValue()) { // ... & -1 -> ... + if (CstVal->isAllOnesValue()) // X & -1 -> X Ops.pop_back(); - } break; case Instruction::Mul: - if (CstVal->isZero()) { // ... * 0 -> 0 + if (CstVal->isZero()) { // X * 0 -> 0 ++NumAnnihil; return CstVal; - } else if (cast<ConstantInt>(CstVal)->isOne()) { - Ops.pop_back(); // ... * 1 -> ... } + + if (cast<ConstantInt>(CstVal)->isOne()) + Ops.pop_back(); // X * 1 -> X break; case Instruction::Or: - if (CstVal->isAllOnesValue()) { // ... | -1 -> -1 - ++NumAnnihil; + if (CstVal->isAllOnesValue()) // X | -1 -> -1 return CstVal; - } // FALLTHROUGH! case Instruction::Add: case Instruction::Xor: - if (CstVal->isZero()) // ... [|^+] 0 -> ... + if (CstVal->isZero()) // X [|^+] 0 -> X Ops.pop_back(); break; } if (Ops.size() == 1) return Ops[0].Op; - // Handle destructive annihilation do to identities between elements in the + // Handle destructive annihilation due to identities between elements in the // argument list here. switch (Opcode) { default: break; case Instruction::And: case Instruction::Or: - case Instruction::Xor: - // Scan the operand lists looking for X and ~X pairs, along with X,X pairs. - // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1. - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - // First, check for X and ~X in the operand list. - assert(i < Ops.size()); - if (BinaryOperator::isNot(Ops[i].Op)) { // Cannot occur for ^. - Value *X = BinaryOperator::getNotArgument(Ops[i].Op); - unsigned FoundX = FindInOperandList(Ops, i, X); - if (FoundX != i) { - if (Opcode == Instruction::And) { // ...&X&~X = 0 - ++NumAnnihil; - return Constant::getNullValue(X->getType()); - } else if (Opcode == Instruction::Or) { // ...|X|~X = -1 - ++NumAnnihil; - return Constant::getAllOnesValue(X->getType()); - } - } - } - - // Next, check for duplicate pairs of values, which we assume are next to - // each other, due to our sorting criteria. - assert(i < Ops.size()); - if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) { - if (Opcode == Instruction::And || Opcode == Instruction::Or) { - // Drop duplicate values. 
- Ops.erase(Ops.begin()+i); - --i; --e; - IterateOptimization = true; - ++NumAnnihil; - } else { - assert(Opcode == Instruction::Xor); - if (e == 2) { - ++NumAnnihil; - return Constant::getNullValue(Ops[0].Op->getType()); - } - // ... X^X -> ... - Ops.erase(Ops.begin()+i, Ops.begin()+i+2); - i -= 1; e -= 2; - IterateOptimization = true; - ++NumAnnihil; - } - } - } + case Instruction::Xor: { + unsigned NumOps = Ops.size(); + if (Value *Result = OptimizeAndOrXor(Opcode, Ops)) + return Result; + IterateOptimization |= Ops.size() != NumOps; break; + } - case Instruction::Add: - // Scan the operand lists looking for X and -X pairs. If we find any, we - // can simplify the expression. X+-X == 0. - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - assert(i < Ops.size()); - // Check for X and -X in the operand list. - if (BinaryOperator::isNeg(Ops[i].Op)) { - Value *X = BinaryOperator::getNegArgument(Ops[i].Op); - unsigned FoundX = FindInOperandList(Ops, i, X); - if (FoundX != i) { - // Remove X and -X from the operand list. - if (Ops.size() == 2) { - ++NumAnnihil; - return Constant::getNullValue(X->getType()); - } else { - Ops.erase(Ops.begin()+i); - if (i < FoundX) - --FoundX; - else - --i; // Need to back up an extra one. - Ops.erase(Ops.begin()+FoundX); - IterateOptimization = true; - ++NumAnnihil; - --i; // Revisit element. - e -= 2; // Removed two elements. - } - } - } - } - - - // Scan the operand list, checking to see if there are any common factors - // between operands. Consider something like A*A+A*B*C+D. We would like to - // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies. - // To efficiently find this, we count the number of times a factor occurs - // for any ADD operands that are MULs. - std::map<Value*, unsigned> FactorOccurrences; - unsigned MaxOcc = 0; - Value *MaxOccVal = 0; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op)) { - if (BOp->getOpcode() == Instruction::Mul && BOp->use_empty()) { - // Compute all of the factors of this added value. - std::vector<Value*> Factors; - FindSingleUseMultiplyFactors(BOp, Factors); - assert(Factors.size() > 1 && "Bad linearize!"); - - // Add one to FactorOccurrences for each unique factor in this op. - if (Factors.size() == 2) { - unsigned Occ = ++FactorOccurrences[Factors[0]]; - if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[0]; } - if (Factors[0] != Factors[1]) { // Don't double count A*A. - Occ = ++FactorOccurrences[Factors[1]]; - if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[1]; } - } - } else { - std::set<Value*> Duplicates; - for (unsigned i = 0, e = Factors.size(); i != e; ++i) { - if (Duplicates.insert(Factors[i]).second) { - unsigned Occ = ++FactorOccurrences[Factors[i]]; - if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[i]; } - } - } - } - } - } - } - - // If any factor occurred more than one time, we can pull it out. - if (MaxOcc > 1) { - DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << "\n"); - - // Create a new instruction that uses the MaxOccVal twice. If we don't do - // this, we could otherwise run into situations where removing a factor - // from an expression will drop a use of maxocc, and this can cause - // RemoveFactorFromExpression on successive values to behave differently. 
- Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal); - std::vector<Value*> NewMulOps; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) { - NewMulOps.push_back(V); - Ops.erase(Ops.begin()+i); - --i; --e; - } - } - - // No need for extra uses anymore. - delete DummyInst; - - unsigned NumAddedValues = NewMulOps.size(); - Value *V = EmitAddTreeOfValues(I, NewMulOps); - Value *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I); - - // Now that we have inserted V and its sole use, optimize it. This allows - // us to handle cases that require multiple factoring steps, such as this: - // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C)) - if (NumAddedValues > 1) - ReassociateExpression(cast<BinaryOperator>(V)); - - ++NumFactor; - - if (Ops.empty()) - return V2; + case Instruction::Add: { + unsigned NumOps = Ops.size(); + if (Value *Result = OptimizeAdd(I, Ops)) + return Result; + IterateOptimization |= Ops.size() != NumOps; + } - // Add the new value to the list of things being added. - Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2)); - - // Rewrite the tree so that there is now a use of V. - RewriteExprTree(I, Ops); - return OptimizeExpression(I, Ops); - } break; //case Instruction::Mul: } @@ -826,13 +960,14 @@ void Reassociate::ReassociateBB(BasicBlock *BB) { } } -void Reassociate::ReassociateExpression(BinaryOperator *I) { +Value *Reassociate::ReassociateExpression(BinaryOperator *I) { - // First, walk the expression tree, linearizing the tree, collecting - std::vector<ValueEntry> Ops; + // First, walk the expression tree, linearizing the tree, collecting the + // operand information. + SmallVector<ValueEntry, 8> Ops; LinearizeExprTree(I, Ops); - DEBUG(errs() << "RAIn:\t"; PrintOps(I, Ops); errs() << "\n"); + DEBUG(errs() << "RAIn:\t"; PrintOps(I, Ops); errs() << '\n'); // Now that we have linearized the tree to a list and have gathered all of // the operands and their ranks, sort the operands by their rank. Use a @@ -847,10 +982,11 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) { if (Value *V = OptimizeExpression(I, Ops)) { // This expression tree simplified to something that isn't a tree, // eliminate it. - DEBUG(errs() << "Reassoc to scalar: " << *V << "\n"); + DEBUG(errs() << "Reassoc to scalar: " << *V << '\n'); I->replaceAllUsesWith(V); RemoveDeadBinaryOp(I); - return; + ++NumAnnihil; + return V; } // We want to sink immediates as deeply as possible except in the case where @@ -861,22 +997,24 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) { cast<Instruction>(I->use_back())->getOpcode() == Instruction::Add && isa<ConstantInt>(Ops.back().Op) && cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) { - Ops.insert(Ops.begin(), Ops.back()); - Ops.pop_back(); + ValueEntry Tmp = Ops.pop_back_val(); + Ops.insert(Ops.begin(), Tmp); } - DEBUG(errs() << "RAOut:\t"; PrintOps(I, Ops); errs() << "\n"); + DEBUG(errs() << "RAOut:\t"; PrintOps(I, Ops); errs() << '\n'); if (Ops.size() == 1) { // This expression tree simplified to something that isn't a tree, // eliminate it. I->replaceAllUsesWith(Ops[0].Op); RemoveDeadBinaryOp(I); - } else { - // Now that we ordered and optimized the expressions, splat them back into - // the expression tree, removing any unneeded nodes. - RewriteExprTree(I, Ops); + return Ops[0].Op; } + + // Now that we ordered and optimized the expressions, splat them back into + // the expression tree, removing any unneeded nodes. 
+ RewriteExprTree(I, Ops); + return I; } @@ -888,7 +1026,7 @@ bool Reassociate::runOnFunction(Function &F) { for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) ReassociateBB(FI); - // We are done with the rank map... + // We are done with the rank map. RankMap.clear(); ValueRankMap.clear(); return MadeChange; diff --git a/lib/Transforms/Scalar/SCCVN.cpp b/lib/Transforms/Scalar/SCCVN.cpp index dbc82e1..f91fbda 100644 --- a/lib/Transforms/Scalar/SCCVN.cpp +++ b/lib/Transforms/Scalar/SCCVN.cpp @@ -34,7 +34,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Utils/SSAUpdater.h" -#include <cstdio> using namespace llvm; STATISTIC(NumSCCVNInstr, "Number of instructions deleted by SCCVN"); diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index b040a27..79bb7c5 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -74,6 +74,10 @@ namespace { private: TargetData *TD; + /// DeadInsts - Keep track of instructions we have made dead, so that + /// we can remove them after we are done working. + SmallVector<Value*, 32> DeadInsts; + /// AllocaInfo - When analyzing uses of an alloca instruction, this captures /// information about the uses. All these fields are initialized to false /// and set to true when something is learned. @@ -102,25 +106,29 @@ namespace { int isSafeAllocaToScalarRepl(AllocaInst *AI); - void isSafeUseOfAllocation(Instruction *User, AllocaInst *AI, - AllocaInfo &Info); - void isSafeElementUse(Value *Ptr, bool isFirstElt, AllocaInst *AI, - AllocaInfo &Info); - void isSafeMemIntrinsicOnAllocation(MemIntrinsic *MI, AllocaInst *AI, - unsigned OpNo, AllocaInfo &Info); - void isSafeUseOfBitCastedAllocation(BitCastInst *User, AllocaInst *AI, - AllocaInfo &Info); + void isSafeForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, + AllocaInfo &Info); + void isSafeGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t &Offset, + AllocaInfo &Info); + void isSafeMemAccess(AllocaInst *AI, uint64_t Offset, uint64_t MemSize, + const Type *MemOpType, bool isStore, AllocaInfo &Info); + bool TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size); + uint64_t FindElementAndOffset(const Type *&T, uint64_t &Offset, + const Type *&IdxTy); void DoScalarReplacement(AllocaInst *AI, std::vector<AllocaInst*> &WorkList); - void CleanupGEP(GetElementPtrInst *GEP); - void CleanupAllocaUsers(AllocaInst *AI); + void DeleteDeadInstructions(); + void CleanupAllocaUsers(Value *V); AllocaInst *AddNewAlloca(Function &F, const Type *Ty, AllocaInst *Base); - void RewriteBitCastUserOfAlloca(Instruction *BCInst, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts); - - void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst, + void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts); + void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts); + void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts); + void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, @@ -360,399 +368,350 @@ void SROA::DoScalarReplacement(AllocaInst *AI, } } - // Now that we have created the alloca instructions that we want to 
use, - // expand the getelementptr instructions to use them. - while (!AI->use_empty()) { - Instruction *User = cast<Instruction>(AI->use_back()); - if (BitCastInst *BCInst = dyn_cast<BitCastInst>(User)) { - RewriteBitCastUserOfAlloca(BCInst, AI, ElementAllocas); - BCInst->eraseFromParent(); - continue; - } - - // Replace: - // %res = load { i32, i32 }* %alloc - // with: - // %load.0 = load i32* %alloc.0 - // %insert.0 insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0 - // %load.1 = load i32* %alloc.1 - // %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1 - // (Also works for arrays instead of structs) - if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - Value *Insert = UndefValue::get(LI->getType()); - for (unsigned i = 0, e = ElementAllocas.size(); i != e; ++i) { - Value *Load = new LoadInst(ElementAllocas[i], "load", LI); - Insert = InsertValueInst::Create(Insert, Load, i, "insert", LI); - } - LI->replaceAllUsesWith(Insert); - LI->eraseFromParent(); - continue; - } - - // Replace: - // store { i32, i32 } %val, { i32, i32 }* %alloc - // with: - // %val.0 = extractvalue { i32, i32 } %val, 0 - // store i32 %val.0, i32* %alloc.0 - // %val.1 = extractvalue { i32, i32 } %val, 1 - // store i32 %val.1, i32* %alloc.1 - // (Also works for arrays instead of structs) - if (StoreInst *SI = dyn_cast<StoreInst>(User)) { - Value *Val = SI->getOperand(0); - for (unsigned i = 0, e = ElementAllocas.size(); i != e; ++i) { - Value *Extract = ExtractValueInst::Create(Val, i, Val->getName(), SI); - new StoreInst(Extract, ElementAllocas[i], SI); - } - SI->eraseFromParent(); - continue; - } - - GetElementPtrInst *GEPI = cast<GetElementPtrInst>(User); - // We now know that the GEP is of the form: GEP <ptr>, 0, <cst> - unsigned Idx = - (unsigned)cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue(); - - assert(Idx < ElementAllocas.size() && "Index out of range?"); - AllocaInst *AllocaToUse = ElementAllocas[Idx]; - - Value *RepValue; - if (GEPI->getNumOperands() == 3) { - // Do not insert a new getelementptr instruction with zero indices, only - // to have it optimized out later. - RepValue = AllocaToUse; - } else { - // We are indexing deeply into the structure, so we still need a - // getelement ptr instruction to finish the indexing. This may be - // expanded itself once the worklist is rerun. - // - SmallVector<Value*, 8> NewArgs; - NewArgs.push_back(Constant::getNullValue( - Type::getInt32Ty(AI->getContext()))); - NewArgs.append(GEPI->op_begin()+3, GEPI->op_end()); - RepValue = GetElementPtrInst::Create(AllocaToUse, NewArgs.begin(), - NewArgs.end(), "", GEPI); - RepValue->takeName(GEPI); - } - - // If this GEP is to the start of the aggregate, check for memcpys. - if (Idx == 0 && GEPI->hasAllZeroIndices()) - RewriteBitCastUserOfAlloca(GEPI, AI, ElementAllocas); - - // Move all of the users over to the new GEP. - GEPI->replaceAllUsesWith(RepValue); - // Delete the old GEP - GEPI->eraseFromParent(); - } + // Now that we have created the new alloca instructions, rewrite all the + // uses of the old alloca. + RewriteForScalarRepl(AI, AI, 0, ElementAllocas); - // Finally, delete the Alloca instruction + // Now erase any instructions that were made dead while rewriting the alloca. + DeleteDeadInstructions(); AI->eraseFromParent(); + NumReplaced++; } -/// isSafeElementUse - Check to see if this use is an allowed use for a -/// getelementptr instruction of an array aggregate allocation. isFirstElt -/// indicates whether Ptr is known to the start of the aggregate. 
-void SROA::isSafeElementUse(Value *Ptr, bool isFirstElt, AllocaInst *AI, - AllocaInfo &Info) { - for (Value::use_iterator I = Ptr->use_begin(), E = Ptr->use_end(); - I != E; ++I) { - Instruction *User = cast<Instruction>(*I); - switch (User->getOpcode()) { - case Instruction::Load: break; - case Instruction::Store: - // Store is ok if storing INTO the pointer, not storing the pointer - if (User->getOperand(0) == Ptr) return MarkUnsafe(Info); - break; - case Instruction::GetElementPtr: { - GetElementPtrInst *GEP = cast<GetElementPtrInst>(User); - bool AreAllZeroIndices = isFirstElt; - if (GEP->getNumOperands() > 1 && - (!isa<ConstantInt>(GEP->getOperand(1)) || - !cast<ConstantInt>(GEP->getOperand(1))->isZero())) - // Using pointer arithmetic to navigate the array. - return MarkUnsafe(Info); - - // Verify that any array subscripts are in range. - for (gep_type_iterator GEPIt = gep_type_begin(GEP), - E = gep_type_end(GEP); GEPIt != E; ++GEPIt) { - // Ignore struct elements, no extra checking needed for these. - if (isa<StructType>(*GEPIt)) - continue; - - // This GEP indexes an array. Verify that this is an in-range - // constant integer. Specifically, consider A[0][i]. We cannot know that - // the user isn't doing invalid things like allowing i to index an - // out-of-range subscript that accesses A[1]. Because of this, we have - // to reject SROA of any accesses into structs where any of the - // components are variables. - ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand()); - if (!IdxVal) return MarkUnsafe(Info); - - // Are all indices still zero? - AreAllZeroIndices &= IdxVal->isZero(); - - if (const ArrayType *AT = dyn_cast<ArrayType>(*GEPIt)) { - if (IdxVal->getZExtValue() >= AT->getNumElements()) - return MarkUnsafe(Info); - } else if (const VectorType *VT = dyn_cast<VectorType>(*GEPIt)) { - if (IdxVal->getZExtValue() >= VT->getNumElements()) - return MarkUnsafe(Info); - } +/// DeleteDeadInstructions - Erase instructions on the DeadInstrs list, +/// recursively including all their operands that become trivially dead. +void SROA::DeleteDeadInstructions() { + while (!DeadInsts.empty()) { + Instruction *I = cast<Instruction>(DeadInsts.pop_back_val()); + + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) + if (Instruction *U = dyn_cast<Instruction>(*OI)) { + // Zero out the operand and see if it becomes trivially dead. + // (But, don't add allocas to the dead instruction list -- they are + // already on the worklist and will be deleted separately.) + *OI = 0; + if (isInstructionTriviallyDead(U) && !isa<AllocaInst>(U)) + DeadInsts.push_back(U); } - - isSafeElementUse(GEP, AreAllZeroIndices, AI, Info); - if (Info.isUnsafe) return; - break; - } - case Instruction::BitCast: - if (isFirstElt) { - isSafeUseOfBitCastedAllocation(cast<BitCastInst>(User), AI, Info); - if (Info.isUnsafe) return; - break; - } - DEBUG(errs() << " Transformation preventing inst: " << *User << '\n'); - return MarkUnsafe(Info); - case Instruction::Call: - if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { - if (isFirstElt) { - isSafeMemIntrinsicOnAllocation(MI, AI, I.getOperandNo(), Info); - if (Info.isUnsafe) return; - break; - } - } - DEBUG(errs() << " Transformation preventing inst: " << *User << '\n'); - return MarkUnsafe(Info); - default: - DEBUG(errs() << " Transformation preventing inst: " << *User << '\n'); - return MarkUnsafe(Info); - } - } - return; // All users look ok :) -} -/// AllUsersAreLoads - Return true if all users of this value are loads. 
-static bool AllUsersAreLoads(Value *Ptr) { - for (Value::use_iterator I = Ptr->use_begin(), E = Ptr->use_end(); - I != E; ++I) - if (cast<Instruction>(*I)->getOpcode() != Instruction::Load) - return false; - return true; -} - -/// isSafeUseOfAllocation - Check if this user is an allowed use for an -/// aggregate allocation. -void SROA::isSafeUseOfAllocation(Instruction *User, AllocaInst *AI, - AllocaInfo &Info) { - if (BitCastInst *C = dyn_cast<BitCastInst>(User)) - return isSafeUseOfBitCastedAllocation(C, AI, Info); - - if (LoadInst *LI = dyn_cast<LoadInst>(User)) - if (!LI->isVolatile()) - return;// Loads (returning a first class aggregrate) are always rewritable - - if (StoreInst *SI = dyn_cast<StoreInst>(User)) - if (!SI->isVolatile() && SI->getOperand(0) != AI) - return;// Store is ok if storing INTO the pointer, not storing the pointer - - GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User); - if (GEPI == 0) - return MarkUnsafe(Info); - - gep_type_iterator I = gep_type_begin(GEPI), E = gep_type_end(GEPI); - - // The GEP is not safe to transform if not of the form "GEP <ptr>, 0, <cst>". - if (I == E || - I.getOperand() != Constant::getNullValue(I.getOperand()->getType())) { - return MarkUnsafe(Info); + I->eraseFromParent(); } +} + +/// isSafeForScalarRepl - Check if instruction I is a safe use with regard to +/// performing scalar replacement of alloca AI. The results are flagged in +/// the Info parameter. Offset indicates the position within AI that is +/// referenced by this instruction. +void SROA::isSafeForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, + AllocaInfo &Info) { + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { + Instruction *User = cast<Instruction>(*UI); - ++I; - if (I == E) return MarkUnsafe(Info); // ran out of GEP indices?? - - bool IsAllZeroIndices = true; - - // If the first index is a non-constant index into an array, see if we can - // handle it as a special case. - if (const ArrayType *AT = dyn_cast<ArrayType>(*I)) { - if (!isa<ConstantInt>(I.getOperand())) { - IsAllZeroIndices = 0; - uint64_t NumElements = AT->getNumElements(); - - // If this is an array index and the index is not constant, we cannot - // promote... that is unless the array has exactly one or two elements in - // it, in which case we CAN promote it, but we have to canonicalize this - // out if this is the only problem. 
- if ((NumElements == 1 || NumElements == 2) && - AllUsersAreLoads(GEPI)) { + if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { + isSafeForScalarRepl(BC, AI, Offset, Info); + } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { + uint64_t GEPOffset = Offset; + isSafeGEP(GEPI, AI, GEPOffset, Info); + if (!Info.isUnsafe) + isSafeForScalarRepl(GEPI, AI, GEPOffset, Info); + } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(UI)) { + ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); + if (Length) + isSafeMemAccess(AI, Offset, Length->getZExtValue(), 0, + UI.getOperandNo() == 1, Info); + else + MarkUnsafe(Info); + } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + if (!LI->isVolatile()) { + const Type *LIType = LI->getType(); + isSafeMemAccess(AI, Offset, TD->getTypeAllocSize(LIType), + LIType, false, Info); + } else + MarkUnsafe(Info); + } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + // Store is ok if storing INTO the pointer, not storing the pointer + if (!SI->isVolatile() && SI->getOperand(0) != I) { + const Type *SIType = SI->getOperand(0)->getType(); + isSafeMemAccess(AI, Offset, TD->getTypeAllocSize(SIType), + SIType, true, Info); + } else + MarkUnsafe(Info); + } else if (isa<DbgInfoIntrinsic>(UI)) { + // If one user is DbgInfoIntrinsic then check if all users are + // DbgInfoIntrinsics. + if (OnlyUsedByDbgInfoIntrinsics(I)) { Info.needsCleanup = true; - return; // Canonicalization required! + return; } - return MarkUnsafe(Info); + MarkUnsafe(Info); + } else { + DEBUG(errs() << " Transformation preventing inst: " << *User << '\n'); + MarkUnsafe(Info); } + if (Info.isUnsafe) return; } - +} + +/// isSafeGEP - Check if a GEP instruction can be handled for scalar +/// replacement. It is safe when all the indices are constant, in-bounds +/// references, and when the resulting offset corresponds to an element within +/// the alloca type. The results are flagged in the Info parameter. Upon +/// return, Offset is adjusted as specified by the GEP indices. +void SROA::isSafeGEP(GetElementPtrInst *GEPI, AllocaInst *AI, + uint64_t &Offset, AllocaInfo &Info) { + gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI); + if (GEPIt == E) + return; + // Walk through the GEP type indices, checking the types that this indexes // into. - for (; I != E; ++I) { + for (; GEPIt != E; ++GEPIt) { // Ignore struct elements, no extra checking needed for these. - if (isa<StructType>(*I)) + if (isa<StructType>(*GEPIt)) continue; - - ConstantInt *IdxVal = dyn_cast<ConstantInt>(I.getOperand()); - if (!IdxVal) return MarkUnsafe(Info); - // Are all indices still zero? - IsAllZeroIndices &= IdxVal->isZero(); - - if (const ArrayType *AT = dyn_cast<ArrayType>(*I)) { - // This GEP indexes an array. Verify that this is an in-range constant - // integer. Specifically, consider A[0][i]. We cannot know that the user - // isn't doing invalid things like allowing i to index an out-of-range - // subscript that accesses A[1]. Because of this, we have to reject SROA - // of any accesses into structs where any of the components are variables. 
- if (IdxVal->getZExtValue() >= AT->getNumElements()) - return MarkUnsafe(Info); - } else if (const VectorType *VT = dyn_cast<VectorType>(*I)) { - if (IdxVal->getZExtValue() >= VT->getNumElements()) - return MarkUnsafe(Info); + ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand()); + if (!IdxVal) + return MarkUnsafe(Info); + } + + // Compute the offset due to this GEP and check if the alloca has a + // component element at that offset. + SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); + Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), + &Indices[0], Indices.size()); + if (!TypeHasComponent(AI->getAllocatedType(), Offset, 0)) + MarkUnsafe(Info); +} + +/// isSafeMemAccess - Check if a load/store/memcpy operates on the entire AI +/// alloca or has an offset and size that corresponds to a component element +/// within it. The offset checked here may have been formed from a GEP with a +/// pointer bitcasted to a different type. +void SROA::isSafeMemAccess(AllocaInst *AI, uint64_t Offset, uint64_t MemSize, + const Type *MemOpType, bool isStore, + AllocaInfo &Info) { + // Check if this is a load/store of the entire alloca. + if (Offset == 0 && MemSize == TD->getTypeAllocSize(AI->getAllocatedType())) { + bool UsesAggregateType = (MemOpType == AI->getAllocatedType()); + // This is safe for MemIntrinsics (where MemOpType is 0), integer types + // (which are essentially the same as the MemIntrinsics, especially with + // regard to copying padding between elements), or references using the + // aggregate type of the alloca. + if (!MemOpType || isa<IntegerType>(MemOpType) || UsesAggregateType) { + if (!UsesAggregateType) { + if (isStore) + Info.isMemCpyDst = true; + else + Info.isMemCpySrc = true; + } + return; } } - - // If there are any non-simple uses of this getelementptr, make sure to reject - // them. - return isSafeElementUse(GEPI, IsAllZeroIndices, AI, Info); + // Check if the offset/size correspond to a component within the alloca type. + const Type *T = AI->getAllocatedType(); + if (TypeHasComponent(T, Offset, MemSize)) + return; + + return MarkUnsafe(Info); } -/// isSafeMemIntrinsicOnAllocation - Check if the specified memory -/// intrinsic can be promoted by SROA. At this point, we know that the operand -/// of the memintrinsic is a pointer to the beginning of the allocation. -void SROA::isSafeMemIntrinsicOnAllocation(MemIntrinsic *MI, AllocaInst *AI, - unsigned OpNo, AllocaInfo &Info) { - // If not constant length, give up. - ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); - if (!Length) return MarkUnsafe(Info); - - // If not the whole aggregate, give up. - if (Length->getZExtValue() != - TD->getTypeAllocSize(AI->getType()->getElementType())) - return MarkUnsafe(Info); - - // We only know about memcpy/memset/memmove. - if (!isa<MemIntrinsic>(MI)) - return MarkUnsafe(Info); - - // Otherwise, we can transform it. Determine whether this is a memcpy/set - // into or out of the aggregate. - if (OpNo == 1) - Info.isMemCpyDst = true; - else { - assert(OpNo == 2); - Info.isMemCpySrc = true; +/// TypeHasComponent - Return true if T has a component type with the +/// specified offset and size. If Size is zero, do not check the size. 
+bool SROA::TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size) { + const Type *EltTy; + uint64_t EltSize; + if (const StructType *ST = dyn_cast<StructType>(T)) { + const StructLayout *Layout = TD->getStructLayout(ST); + unsigned EltIdx = Layout->getElementContainingOffset(Offset); + EltTy = ST->getContainedType(EltIdx); + EltSize = TD->getTypeAllocSize(EltTy); + Offset -= Layout->getElementOffset(EltIdx); + } else if (const ArrayType *AT = dyn_cast<ArrayType>(T)) { + EltTy = AT->getElementType(); + EltSize = TD->getTypeAllocSize(EltTy); + if (Offset >= AT->getNumElements() * EltSize) + return false; + Offset %= EltSize; + } else { + return false; } + if (Offset == 0 && (Size == 0 || EltSize == Size)) + return true; + // Check if the component spans multiple elements. + if (Offset + Size > EltSize) + return false; + return TypeHasComponent(EltTy, Offset, Size); } -/// isSafeUseOfBitCastedAllocation - Check if all users of this bitcast -/// from an alloca are safe for SROA of that alloca. -void SROA::isSafeUseOfBitCastedAllocation(BitCastInst *BC, AllocaInst *AI, - AllocaInfo &Info) { - for (Value::use_iterator UI = BC->use_begin(), E = BC->use_end(); - UI != E; ++UI) { - if (BitCastInst *BCU = dyn_cast<BitCastInst>(UI)) { - isSafeUseOfBitCastedAllocation(BCU, AI, Info); - } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(UI)) { - isSafeMemIntrinsicOnAllocation(MI, AI, UI.getOperandNo(), Info); - } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) { - if (SI->isVolatile()) - return MarkUnsafe(Info); - - // If storing the entire alloca in one chunk through a bitcasted pointer - // to integer, we can transform it. This happens (for example) when you - // cast a {i32,i32}* to i64* and store through it. This is similar to the - // memcpy case and occurs in various "byval" cases and emulated memcpys. - if (isa<IntegerType>(SI->getOperand(0)->getType()) && - TD->getTypeAllocSize(SI->getOperand(0)->getType()) == - TD->getTypeAllocSize(AI->getType()->getElementType())) { - Info.isMemCpyDst = true; - continue; - } - return MarkUnsafe(Info); - } else if (LoadInst *LI = dyn_cast<LoadInst>(UI)) { - if (LI->isVolatile()) - return MarkUnsafe(Info); - - // If loading the entire alloca in one chunk through a bitcasted pointer - // to integer, we can transform it. This happens (for example) when you - // cast a {i32,i32}* to i64* and load through it. This is similar to the - // memcpy case and occurs in various "byval" cases and emulated memcpys. - if (isa<IntegerType>(LI->getType()) && - TD->getTypeAllocSize(LI->getType()) == - TD->getTypeAllocSize(AI->getType()->getElementType())) { - Info.isMemCpySrc = true; - continue; +/// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite +/// the instruction I, which references it, to use the separate elements. +/// Offset indicates the position within AI that is referenced by this +/// instruction. 
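TypeHasComponent above is a recursive descent over the type layout: peel off the struct member or array element containing Offset, rebase Offset into it, and succeed only when the access lands exactly on a component boundary of the right size. A standalone sketch against a toy type model, where explicit sizes and member offsets stand in for TargetData and StructLayout:

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy type: a scalar of a given size, an array of NumElts elements, or a
// struct with explicit member offsets (so padding is representable).
struct Ty {
  enum { Scalar, Array, Struct } Kind;
  uint64_t Size;                   // total allocated size in bytes
  std::vector<const Ty*> Members;  // struct members, or one array elt type
  std::vector<uint64_t> Offsets;   // member offsets (structs only)
  uint64_t NumElts;                // arrays only
};

// Mirrors the shape of TypeHasComponent: peel one level per call, adjust
// Offset into the containing element, and recurse.
static bool typeHasComponent(const Ty *T, uint64_t Offset, uint64_t Size) {
  const Ty *EltTy;
  uint64_t EltSize;
  if (T->Kind == Ty::Struct) {
    // Last member starting at or before Offset (offsets sorted ascending;
    // the real code uses StructLayout::getElementContainingOffset).
    size_t Idx = 0;
    for (size_t i = 0; i != T->Offsets.size(); ++i)
      if (T->Offsets[i] <= Offset) Idx = i;
    EltTy = T->Members[Idx];
    EltSize = EltTy->Size;
    Offset -= T->Offsets[Idx];
  } else if (T->Kind == Ty::Array) {
    EltTy = T->Members[0];
    EltSize = EltTy->Size;
    if (Offset >= T->NumElts * EltSize) return false;
    Offset %= EltSize;
  } else {
    return false;                  // scalars have no sub-components
  }
  if (Offset == 0 && (Size == 0 || EltSize == Size)) return true;
  if (Offset + Size > EltSize) return false;  // access straddles elements
  return typeHasComponent(EltTy, Offset, Size);
}

int main() {
  Ty I32 = { Ty::Scalar, 4, std::vector<const Ty*>(),
             std::vector<uint64_t>(), 0 };
  // struct { i32 a; i32 b; } -- members at offsets 0 and 4.
  Ty Pair = { Ty::Struct, 8, std::vector<const Ty*>(2, &I32),
              std::vector<uint64_t>(), 0 };
  Pair.Offsets.push_back(0); Pair.Offsets.push_back(4);

  std::printf("%d %d\n", typeHasComponent(&Pair, 4, 4),   // b: yes -> 1
                         typeHasComponent(&Pair, 2, 4));  // straddles -> 0
  return 0;
}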
+void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts) { + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { + Instruction *User = cast<Instruction>(*UI); + + if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { + RewriteBitCast(BC, AI, Offset, NewElts); + } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { + RewriteGEP(GEPI, AI, Offset, NewElts); + } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { + ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); + uint64_t MemSize = Length->getZExtValue(); + if (Offset == 0 && + MemSize == TD->getTypeAllocSize(AI->getAllocatedType())) + RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts); + // Otherwise the intrinsic can only touch a single element and the + // address operand will be updated, so nothing else needs to be done. + } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + const Type *LIType = LI->getType(); + if (LIType == AI->getAllocatedType()) { + // Replace: + // %res = load { i32, i32 }* %alloc + // with: + // %load.0 = load i32* %alloc.0 + // %insert.0 insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0 + // %load.1 = load i32* %alloc.1 + // %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1 + // (Also works for arrays instead of structs) + Value *Insert = UndefValue::get(LIType); + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { + Value *Load = new LoadInst(NewElts[i], "load", LI); + Insert = InsertValueInst::Create(Insert, Load, i, "insert", LI); + } + LI->replaceAllUsesWith(Insert); + DeadInsts.push_back(LI); + } else if (isa<IntegerType>(LIType) && + TD->getTypeAllocSize(LIType) == + TD->getTypeAllocSize(AI->getAllocatedType())) { + // If this is a load of the entire alloca to an integer, rewrite it. + RewriteLoadUserOfWholeAlloca(LI, AI, NewElts); } - return MarkUnsafe(Info); - } else if (isa<DbgInfoIntrinsic>(UI)) { - // If one user is DbgInfoIntrinsic then check if all users are - // DbgInfoIntrinsics. - if (OnlyUsedByDbgInfoIntrinsics(BC)) { - Info.needsCleanup = true; - return; + } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + Value *Val = SI->getOperand(0); + const Type *SIType = Val->getType(); + if (SIType == AI->getAllocatedType()) { + // Replace: + // store { i32, i32 } %val, { i32, i32 }* %alloc + // with: + // %val.0 = extractvalue { i32, i32 } %val, 0 + // store i32 %val.0, i32* %alloc.0 + // %val.1 = extractvalue { i32, i32 } %val, 1 + // store i32 %val.1, i32* %alloc.1 + // (Also works for arrays instead of structs) + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { + Value *Extract = ExtractValueInst::Create(Val, i, Val->getName(), SI); + new StoreInst(Extract, NewElts[i], SI); + } + DeadInsts.push_back(SI); + } else if (isa<IntegerType>(SIType) && + TD->getTypeAllocSize(SIType) == + TD->getTypeAllocSize(AI->getAllocatedType())) { + // If this is a store of the entire alloca from an integer, rewrite it. + RewriteStoreUserOfWholeAlloca(SI, AI, NewElts); } - else - MarkUnsafe(Info); } - else { - return MarkUnsafe(Info); - } - if (Info.isUnsafe) return; } } -/// RewriteBitCastUserOfAlloca - BCInst (transitively) bitcasts AI, or indexes -/// to its first element. Transform users of the cast to use the new values -/// instead. 
-void SROA::RewriteBitCastUserOfAlloca(Instruction *BCInst, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts) { - Value::use_iterator UI = BCInst->use_begin(), UE = BCInst->use_end(); - while (UI != UE) { - Instruction *User = cast<Instruction>(*UI++); - if (BitCastInst *BCU = dyn_cast<BitCastInst>(User)) { - RewriteBitCastUserOfAlloca(BCU, AI, NewElts); - if (BCU->use_empty()) BCU->eraseFromParent(); - continue; - } +/// RewriteBitCast - Update a bitcast reference to the alloca being replaced +/// and recursively continue updating all of its uses. +void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts) { + RewriteForScalarRepl(BC, AI, Offset, NewElts); + if (BC->getOperand(0) != AI) + return; - if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { - // This must be memcpy/memmove/memset of the entire aggregate. - // Split into one per element. - RewriteMemIntrinUserOfAlloca(MI, BCInst, AI, NewElts); - continue; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(User)) { - // If this is a store of the entire alloca from an integer, rewrite it. - RewriteStoreUserOfWholeAlloca(SI, AI, NewElts); - continue; - } + // The bitcast references the original alloca. Replace its uses with + // references to the first new element alloca. + Instruction *Val = NewElts[0]; + if (Val->getType() != BC->getDestTy()) { + Val = new BitCastInst(Val, BC->getDestTy(), "", BC); + Val->takeName(BC); + } + BC->replaceAllUsesWith(Val); + DeadInsts.push_back(BC); +} - if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - // If this is a load of the entire alloca to an integer, rewrite it. - RewriteLoadUserOfWholeAlloca(LI, AI, NewElts); - continue; - } - - // Otherwise it must be some other user of a gep of the first pointer. Just - // leave these alone. - continue; +/// FindElementAndOffset - Return the index of the element containing Offset +/// within the specified type, which must be either a struct or an array. +/// Sets T to the type of the element and Offset to the offset within that +/// element. IdxTy is set to the type of the index result to be used in a +/// GEP instruction. +uint64_t SROA::FindElementAndOffset(const Type *&T, uint64_t &Offset, + const Type *&IdxTy) { + uint64_t Idx = 0; + if (const StructType *ST = dyn_cast<StructType>(T)) { + const StructLayout *Layout = TD->getStructLayout(ST); + Idx = Layout->getElementContainingOffset(Offset); + T = ST->getContainedType(Idx); + Offset -= Layout->getElementOffset(Idx); + IdxTy = Type::getInt32Ty(T->getContext()); + return Idx; } + const ArrayType *AT = cast<ArrayType>(T); + T = AT->getElementType(); + uint64_t EltSize = TD->getTypeAllocSize(T); + Idx = Offset / EltSize; + Offset -= Idx * EltSize; + IdxTy = Type::getInt64Ty(T->getContext()); + return Idx; +} + +/// RewriteGEP - Check if this GEP instruction moves the pointer across +/// elements of the alloca that are being split apart, and if so, rewrite +/// the GEP to be relative to the new element. 
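FindElementAndOffset maps a byte offset to an (element index, offset within that element) pair one level at a time; RewriteGEP then calls it in a loop to rebuild the index list of a GEP against the new per-element alloca. A sketch of that drill-down for nested arrays only (the struct case would consult the layout table instead of dividing; the type model and the findIndices name are mine):

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy layout: an array of NumElts elements of EltSize bytes each, with an
// optional nested element type. Sketches only the array half of
// FindElementAndOffset; structs use explicit member offsets instead.
struct ArrayTy {
  uint64_t NumElts;
  uint64_t EltSize;          // allocated size of one element
  const ArrayTy *EltTy;      // nested array, or 0 for a scalar element
};

// Emit one index per nesting level until the byte offset is consumed --
// the same drill-down RewriteGEP performs when rebuilding a GEP.
static void findIndices(const ArrayTy *T, uint64_t Offset,
                        std::vector<uint64_t> &Indices) {
  while (T && Offset != 0) {
    uint64_t Idx = Offset / T->EltSize;  // element containing Offset
    Offset -= Idx * T->EltSize;          // remaining offset inside it
    Indices.push_back(Idx);
    T = T->EltTy;
  }
}

int main() {
  // [4 x [8 x i32]]: inner rows are 32 bytes, i32 elements are 4 bytes.
  ArrayTy Inner = { 8, 4, 0 };
  ArrayTy Outer = { 4, 32, &Inner };

  std::vector<uint64_t> Indices;
  findIndices(&Outer, 44, Indices);      // byte 44 == 1*32 + 3*4 -> [1][3]
  for (size_t i = 0; i != Indices.size(); ++i)
    std::printf("[%llu]", (unsigned long long)Indices[i]);
  std::printf("\n");
  return 0;
}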
+void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts) { + uint64_t OldOffset = Offset; + SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); + Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), + &Indices[0], Indices.size()); + + RewriteForScalarRepl(GEPI, AI, Offset, NewElts); + + const Type *T = AI->getAllocatedType(); + const Type *IdxTy; + uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy); + if (GEPI->getOperand(0) == AI) + OldIdx = ~0ULL; // Force the GEP to be rewritten. + + T = AI->getAllocatedType(); + uint64_t EltOffset = Offset; + uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy); + + // If this GEP does not move the pointer across elements of the alloca + // being split, then it does not need to be rewritten. + if (Idx == OldIdx) + return; + + const Type *i32Ty = Type::getInt32Ty(AI->getContext()); + SmallVector<Value*, 8> NewArgs; + NewArgs.push_back(Constant::getNullValue(i32Ty)); + while (EltOffset != 0) { + uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy); + NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx)); + } + Instruction *Val = NewElts[Idx]; + if (NewArgs.size() > 1) { + Val = GetElementPtrInst::CreateInBounds(Val, NewArgs.begin(), + NewArgs.end(), "", GEPI); + Val->takeName(GEPI); + } + if (Val->getType() != GEPI->getType()) + Val = new BitCastInst(Val, GEPI->getType(), Val->getNameStr(), GEPI); + GEPI->replaceAllUsesWith(Val); + DeadInsts.push_back(GEPI); } /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. /// Rewrite it to copy or set the elements of the scalarized memory. -void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst, +void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts) { - // If this is a memcpy/memmove, construct the other pointer as the // appropriate type. The "Other" pointer is the pointer that goes to memory // that doesn't have anything to do with the alloca that we are promoting. For @@ -761,28 +720,41 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst, LLVMContext &Context = MI->getContext(); unsigned MemAlignment = MI->getAlignment(); if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcopy - if (BCInst == MTI->getRawDest()) + if (Inst == MTI->getRawDest()) OtherPtr = MTI->getRawSource(); else { - assert(BCInst == MTI->getRawSource()); + assert(Inst == MTI->getRawSource()); OtherPtr = MTI->getRawDest(); } } - // Keep track of the other intrinsic argument, so it can be removed if it - // is dead when the intrinsic is replaced. - Value *PossiblyDead = OtherPtr; - // If there is an other pointer, we want to convert it to the same pointer // type as AI has, so we can GEP through it safely. if (OtherPtr) { - // It is likely that OtherPtr is a bitcast, if so, remove it. - if (BitCastInst *BC = dyn_cast<BitCastInst>(OtherPtr)) - OtherPtr = BC->getOperand(0); - // All zero GEPs are effectively bitcasts. - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(OtherPtr)) - if (GEP->hasAllZeroIndices()) - OtherPtr = GEP->getOperand(0); + + // Remove bitcasts and all-zero GEPs from OtherPtr. This is an + // optimization, but it's also required to detect the corner case where + // both pointer operands are referencing the same memory, and where + // OtherPtr may be a bitcast or GEP that is currently being rewritten.
(This + // function is only called for mem intrinsics that access the whole + // aggregate, so non-zero GEPs are not an issue here.) + while (1) { + if (BitCastInst *BC = dyn_cast<BitCastInst>(OtherPtr)) { + OtherPtr = BC->getOperand(0); + continue; + } + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(OtherPtr)) { + // All zero GEPs are effectively bitcasts. + if (GEP->hasAllZeroIndices()) { + OtherPtr = GEP->getOperand(0); + continue; + } + } + break; + } + // If OtherPtr has already been rewritten, this intrinsic will be dead. + if (OtherPtr == NewElts[0]) + return; if (ConstantExpr *BCE = dyn_cast<ConstantExpr>(OtherPtr)) if (BCE->getOpcode() == Instruction::BitCast) @@ -798,7 +770,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst, // Process each element of the aggregate. Value *TheFn = MI->getOperand(0); const Type *BytePtrTy = MI->getRawDest()->getType(); - bool SROADest = MI->getRawDest() == BCInst; + bool SROADest = MI->getRawDest() == Inst; Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext())); @@ -807,12 +779,15 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst, Value *OtherElt = 0; unsigned OtherEltAlign = MemAlignment; - if (OtherPtr) { + if (OtherPtr == AI) { + OtherElt = NewElts[i]; + OtherEltAlign = 0; + } else if (OtherPtr) { Value *Idx[2] = { Zero, ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) }; - OtherElt = GetElementPtrInst::Create(OtherPtr, Idx, Idx + 2, + OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, Idx + 2, OtherPtr->getNameStr()+"."+Twine(i), - MI); + MI); uint64_t EltOffset; const PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); if (const StructType *ST = @@ -924,9 +899,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst, CallInst::Create(TheFn, Ops, Ops + 4, "", MI); } } - MI->eraseFromParent(); - if (PossiblyDead) - RecursivelyDeleteTriviallyDeadInstructions(PossiblyDead); + DeadInsts.push_back(MI); } /// RewriteStoreUserOfWholeAlloca - We found a store of an integer that @@ -937,15 +910,9 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // Extract each element out of the integer according to its structure offset // and store the element value to the individual alloca. Value *SrcVal = SI->getOperand(0); - const Type *AllocaEltTy = AI->getType()->getElementType(); + const Type *AllocaEltTy = AI->getAllocatedType(); uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); - // If this isn't a store of an integer to the whole alloca, it may be a store - // to the first element. Just ignore the store in this case and normal SROA - // will handle it. - if (!isa<IntegerType>(SrcVal->getType()) || - TD->getTypeAllocSizeInBits(SrcVal->getType()) != AllocaSizeBits) - return; // Handle tail padding by extending the operand if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) SrcVal = new ZExtInst(SrcVal, @@ -1050,7 +1017,7 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } } - SI->eraseFromParent(); + DeadInsts.push_back(SI); } /// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to @@ -1059,16 +1026,9 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts) { // Extract each element out of the NewElts according to its structure offset // and form the result value. 
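When the intrinsic covers the whole aggregate and the other pointer is unrelated memory, RewriteMemIntrinUserOfAlloca replaces the single call with one copy per new element alloca, each reading or writing at that element's offset within the old aggregate. A runnable miniature with raw byte buffers in place of allocas (the layout values are made up for the example):

#include <cstdio>
#include <cstring>

// One element of the made-up aggregate layout: byte offset plus size.
struct Elt { unsigned Offset; unsigned Size; };

int main() {
  // struct { i32 a; i32 b; } worth of bytes, about to be scalarized.
  unsigned char Src[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  unsigned char A0[4], A1[4];            // the new per-element "allocas"
  unsigned char *Elts[2] = { A0, A1 };
  Elt Layout[2] = { {0, 4}, {4, 4} };

  // One memcpy(dst, src, 8) becomes one copy per scalarized element.
  for (int i = 0; i != 2; ++i)
    std::memcpy(Elts[i], Src + Layout[i].Offset, Layout[i].Size);

  std::printf("%d %d\n", A1[0], A1[3]);  // prints "5 8"
  return 0;
}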
- const Type *AllocaEltTy = AI->getType()->getElementType(); + const Type *AllocaEltTy = AI->getAllocatedType(); uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); - // If this isn't a load of the whole alloca to an integer, it may be a load - // of the first element. Just ignore the load in this case and normal SROA - // will handle it. - if (!isa<IntegerType>(LI->getType()) || - TD->getTypeAllocSizeInBits(LI->getType()) != AllocaSizeBits) - return; - DEBUG(errs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI << '\n'); @@ -1139,10 +1099,9 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI); LI->replaceAllUsesWith(ResultVal); - LI->eraseFromParent(); + DeadInsts.push_back(LI); } - /// HasPadding - Return true if the specified type has any structure or /// alignment padding, false otherwise. static bool HasPadding(const Type *Ty, const TargetData &TD) { @@ -1192,14 +1151,10 @@ int SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { // the users are safe to transform. AllocaInfo Info; - for (Value::use_iterator I = AI->use_begin(), E = AI->use_end(); - I != E; ++I) { - isSafeUseOfAllocation(cast<Instruction>(*I), AI, Info); - if (Info.isUnsafe) { - DEBUG(errs() << "Cannot transform: " << *AI << "\n due to user: " - << **I << '\n'); - return 0; - } + isSafeForScalarRepl(AI, AI, 0, Info); + if (Info.isUnsafe) { + DEBUG(errs() << "Cannot transform: " << *AI << '\n'); + return 0; } // Okay, we know all the users are promotable. If the aggregate is a memcpy @@ -1208,88 +1163,28 @@ int SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { // types, but may actually be used. In these cases, we refuse to promote the // struct. if (Info.isMemCpySrc && Info.isMemCpyDst && - HasPadding(AI->getType()->getElementType(), *TD)) + HasPadding(AI->getAllocatedType(), *TD)) return 0; // If we require cleanup, return 1, otherwise return 3. return Info.needsCleanup ? 1 : 3; } -/// CleanupGEP - GEP is used by an Alloca, which can be promoted after the GEP -/// is canonicalized here. -void SROA::CleanupGEP(GetElementPtrInst *GEPI) { - gep_type_iterator I = gep_type_begin(GEPI); - ++I; - - const ArrayType *AT = dyn_cast<ArrayType>(*I); - if (!AT) - return; - - uint64_t NumElements = AT->getNumElements(); - - if (isa<ConstantInt>(I.getOperand())) - return; - - if (NumElements == 1) { - GEPI->setOperand(2, - Constant::getNullValue(Type::getInt32Ty(GEPI->getContext()))); - return; - } - - assert(NumElements == 2 && "Unhandled case!"); - // All users of the GEP must be loads. At each use of the GEP, insert - // two loads of the appropriate indexed GEP and select between them. - Value *IsOne = new ICmpInst(GEPI, ICmpInst::ICMP_NE, I.getOperand(), - Constant::getNullValue(I.getOperand()->getType()), - "isone"); - // Insert the new GEP instructions, which are properly indexed. - SmallVector<Value*, 8> Indices(GEPI->op_begin()+1, GEPI->op_end()); - Indices[1] = Constant::getNullValue(Type::getInt32Ty(GEPI->getContext())); - Value *ZeroIdx = GetElementPtrInst::Create(GEPI->getOperand(0), - Indices.begin(), - Indices.end(), - GEPI->getName()+".0", GEPI); - Indices[1] = ConstantInt::get(Type::getInt32Ty(GEPI->getContext()), 1); - Value *OneIdx = GetElementPtrInst::Create(GEPI->getOperand(0), - Indices.begin(), - Indices.end(), - GEPI->getName()+".1", GEPI); - // Replace all loads of the variable index GEP with loads from both - // indexes and a select. 
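The deleted CleanupGEP canonicalized a variable index into a two-element array as two constant-index loads plus a select, which keeps the array splittable by SROA. A small semantic sketch of the before/after shapes, in plain C++ and under the same assumption the pass made (the index can only be 0 or 1):

#include <cstdio>

// Before: a single load at a variable index into a 2-element array.
static int loadVariable(const int A[2], long I) {
  return A[I];
}

// After CleanupGEP: loads of both fixed indices, selected by "isone",
// so each array slot can become an independent scalar.
static int loadSelected(const int A[2], long I) {
  int V0 = A[0];           // the ".0" GEP
  int V1 = A[1];           // the ".1" GEP
  return I != 0 ? V1 : V0; // the select on "isone"
}

int main() {
  int A[2] = {10, 20};
  for (long I = 0; I != 2; ++I)
    printf("%d %d\n", loadVariable(A, I), loadSelected(A, I));
  return 0;
}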
- while (!GEPI->use_empty()) { - LoadInst *LI = cast<LoadInst>(GEPI->use_back()); - Value *Zero = new LoadInst(ZeroIdx, LI->getName()+".0", LI); - Value *One = new LoadInst(OneIdx , LI->getName()+".1", LI); - Value *R = SelectInst::Create(IsOne, One, Zero, LI->getName(), LI); - LI->replaceAllUsesWith(R); - LI->eraseFromParent(); - } - GEPI->eraseFromParent(); -} - - /// CleanupAllocaUsers - If SROA reported that it can promote the specified /// allocation, but only if cleaned up, perform the cleanups required. -void SROA::CleanupAllocaUsers(AllocaInst *AI) { - // At this point, we know that the end result will be SROA'd and promoted, so - // we can insert ugly code if required so long as sroa+mem2reg will clean it - // up. - for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); +void SROA::CleanupAllocaUsers(Value *V) { + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ) { User *U = *UI++; - if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) - CleanupGEP(GEPI); - else { - Instruction *I = cast<Instruction>(U); - SmallVector<DbgInfoIntrinsic *, 2> DbgInUses; - if (!isa<StoreInst>(I) && OnlyUsedByDbgInfoIntrinsics(I, &DbgInUses)) { - // Safe to remove debug info uses. - while (!DbgInUses.empty()) { - DbgInfoIntrinsic *DI = DbgInUses.back(); DbgInUses.pop_back(); - DI->eraseFromParent(); - } - I->eraseFromParent(); + Instruction *I = cast<Instruction>(U); + SmallVector<DbgInfoIntrinsic *, 2> DbgInUses; + if (!isa<StoreInst>(I) && OnlyUsedByDbgInfoIntrinsics(I, &DbgInUses)) { + // Safe to remove debug info uses. + while (!DbgInUses.empty()) { + DbgInfoIntrinsic *DI = DbgInUses.back(); DbgInUses.pop_back(); + DI->eraseFromParent(); } + I->eraseFromParent(); } } } @@ -1395,7 +1290,7 @@ bool SROA::CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&VecTy, // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - uint64_t GEPOffset = TD->getIndexedOffset(GEP->getOperand(0)->getType(), + uint64_t GEPOffset = TD->getIndexedOffset(GEP->getPointerOperandType(), &Indices[0], Indices.size()); // See if all uses can be converted. if (!CanConvertToScalar(GEP, IsNotTrivial, VecTy, SawVec,Offset+GEPOffset, @@ -1457,7 +1352,7 @@ void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset) { if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - uint64_t GEPOffset = TD->getIndexedOffset(GEP->getOperand(0)->getType(), + uint64_t GEPOffset = TD->getIndexedOffset(GEP->getPointerOperandType(), &Indices[0], Indices.size()); ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8); GEP->eraseFromParent(); @@ -1478,13 +1373,16 @@ void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset) { if (StoreInst *SI = dyn_cast<StoreInst>(User)) { assert(SI->getOperand(0) != Ptr && "Consistency error!"); - // FIXME: Remove once builder has Twine API. - Value *Old = Builder.CreateLoad(NewAI, - (NewAI->getName()+".in").str().c_str()); + Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset, Builder); Builder.CreateStore(New, NewAI); SI->eraseFromParent(); + + // If the load we just inserted is now dead, then the inserted store + // overwrote the entire thing. 
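The store and memset paths of ConvertUsesToScalar follow a read-modify-write shape: load the whole scalarized value, splice the new bits in with ConvertScalar_InsertValue, store it back, and (per the hunks around this point) erase the load when it ends up unused. A standalone sketch of the splice arithmetic, assuming a 64-bit value, little-endian bit numbering, and illustrative names:

#include <cstdint>
#include <cstdio>

// Replace BitWidth bits of Old at BitOffset with New, as the emitted
// load / insert / store sequence does at the IR level.
static uint64_t insertBits(uint64_t Old, uint64_t New,
                           unsigned BitOffset, unsigned BitWidth) {
  uint64_t Mask = BitWidth == 64 ? ~0ULL
                                 : ((1ULL << BitWidth) - 1) << BitOffset;
  return (Old & ~Mask) | ((New << BitOffset) & Mask);
}

int main() {
  uint64_t Whole = 0x1122334455667788ULL;
  // Overwrite the 16-bit field at bit offset 32.
  printf("0x%016llx\n",
         (unsigned long long)insertBits(Whole, 0xBEEF, 32, 16));
  // With BitOffset == 0 and BitWidth == 64 nothing of Old survives, which
  // mirrors the case where the freshly inserted load is dead and erased.
  return 0;
}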
+      if (Old->use_empty())
+        Old->eraseFromParent();
      continue;
    }
@@ -1504,13 +1402,16 @@ void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset) {
      for (unsigned i = 1; i != NumBytes; ++i)
        APVal |= APVal << 8;

-      // FIXME: Remove once builder has Twine API.
-      Value *Old = Builder.CreateLoad(NewAI,
-                                      (NewAI->getName()+".in").str().c_str());
+      Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
      Value *New = ConvertScalar_InsertValue(
                      ConstantInt::get(User->getContext(), APVal),
                      Old, Offset, Builder);
      Builder.CreateStore(New, NewAI);
+
+      // If the load we just inserted is now dead, then the memset overwrote
+      // the entire thing.
+      if (Old->use_empty())
+        Old->eraseFromParent();
    }
    MSI->eraseFromParent();
    continue;
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index e905952..a36da78 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -189,6 +189,77 @@ static bool RemoveUnreachableBlocksFromFn(Function &F) {
  return true;
}

+/// MergeEmptyReturnBlocks - If we have more than one empty (other than phi
+/// node) return block, merge them together to promote recursive block merging.
+static bool MergeEmptyReturnBlocks(Function &F) {
+  bool Changed = false;
+
+  BasicBlock *RetBlock = 0;
+
+  // Scan all the blocks in the function, looking for empty return blocks.
+  for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
+    BasicBlock &BB = *BBI++;
+
+    // Only look at return blocks.
+    ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
+    if (Ret == 0) continue;
+
+    // Only look at the block if it is empty or the only other thing in it is a
+    // single PHI node that is the operand to the return.
+    if (Ret != &BB.front()) {
+      // Check for something else in the block.
+      BasicBlock::iterator I = Ret;
+      --I;
+      if (!isa<PHINode>(I) || I != BB.begin() ||
+          Ret->getNumOperands() == 0 ||
+          Ret->getOperand(0) != I)
+        continue;
+    }
+
+    // If this is the first returning block, remember it and keep going.
+    if (RetBlock == 0) {
+      RetBlock = &BB;
+      continue;
+    }
+
+    // Otherwise, we found a duplicate return block.  Merge the two.
+    Changed = true;
+
+    // The case where there is no input to the return, or where the returned
+    // values agree, is trivial.  Note that they can't agree if there are phis
+    // in the blocks.
+    if (Ret->getNumOperands() == 0 ||
+        Ret->getOperand(0) ==
+          cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
+      BB.replaceAllUsesWith(RetBlock);
+      BB.eraseFromParent();
+      continue;
+    }
+
+    // If the canonical return block has no PHI node, create one now.
+    PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
+    if (RetBlockPHI == 0) {
+      Value *InVal = cast<ReturnInst>(RetBlock->begin())->getOperand(0);
+      RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(), "merge",
+                                    &RetBlock->front());
+
+      for (pred_iterator PI = pred_begin(RetBlock), E = pred_end(RetBlock);
+           PI != E; ++PI)
+        RetBlockPHI->addIncoming(InVal, *PI);
+      RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
+    }
+
+    // Turn BB into a block that just unconditionally branches to the return
+    // block.  This handles the case when the two return blocks have a common
+    // predecessor but return different things.
+ RetBlockPHI->addIncoming(Ret->getOperand(0), &BB); + BB.getTerminator()->eraseFromParent(); + BranchInst::Create(RetBlock, &BB); + } + + return Changed; +} + /// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. static bool IterativeSimplifyCFG(Function &F) { @@ -216,6 +287,7 @@ static bool IterativeSimplifyCFG(Function &F) { // bool CFGSimplifyPass::runOnFunction(Function &F) { bool EverChanged = RemoveUnreachableBlocksFromFn(F); + EverChanged |= MergeEmptyReturnBlocks(F); EverChanged |= IterativeSimplifyCFG(F); // If neither pass changed anything, we're done. diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 6fd884b..3c28ad2 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -76,6 +76,11 @@ public: /// return value has 'intptr_t' type. Value *EmitStrLen(Value *Ptr, IRBuilder<> &B); + /// EmitStrChr - Emit a call to the strchr function to the builder, for the + /// specified pointer and character. Ptr is required to be some pointer type, + /// and the return value has 'i8*' type. + Value *EmitStrChr(Value *Ptr, char C, IRBuilder<> &B); + /// EmitMemCpy - Emit a call to the memcpy function to the builder. This /// always expects that the size has type 'intptr_t' and Dst/Src are pointers. Value *EmitMemCpy(Value *Dst, Value *Src, Value *Len, @@ -151,6 +156,26 @@ Value *LibCallOptimization::EmitStrLen(Value *Ptr, IRBuilder<> &B) { return CI; } +/// EmitStrChr - Emit a call to the strchr function to the builder, for the +/// specified pointer and character. Ptr is required to be some pointer type, +/// and the return value has 'i8*' type. +Value *LibCallOptimization::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B) { + Module *M = Caller->getParent(); + AttributeWithIndex AWI = + AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind); + + const Type *I8Ptr = Type::getInt8PtrTy(*Context); + const Type *I32Ty = Type::getInt32Ty(*Context); + Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(&AWI, 1), + I8Ptr, I8Ptr, I32Ty, NULL); + CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B), + ConstantInt::get(I32Ty, C), "strchr"); + if (const Function *F = dyn_cast<Function>(StrChr->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + return CI; +} + + /// EmitMemCpy - Emit a call to the memcpy function to the builder. This always /// expects that the size has type 'intptr_t' and Dst/Src are pointers. Value *LibCallOptimization::EmitMemCpy(Value *Dst, Value *Src, Value *Len, @@ -880,17 +905,16 @@ struct StrLenOpt : public LibCallOptimization { if (uint64_t Len = GetStringLength(Src)) return ConstantInt::get(CI->getType(), Len-1); - // Handle strlen(p) != 0. - if (!IsOnlyUsedInZeroEqualityComparison(CI)) return 0; - // strlen(x) != 0 --> *x != 0 // strlen(x) == 0 --> *x == 0 - return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); + if (IsOnlyUsedInZeroEqualityComparison(CI)) + return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); + return 0; } }; //===---------------------------------------===// -// 'strto*' Optimizations +// 'strto*' Optimizations. This handles strtol, strtod, strtof, strtoul, etc. 
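The StrLenOpt hunk above now performs the load fold only when the strlen result feeds a comparison against zero. The identity it relies on, checked in plain C++: a C string is empty exactly when its first byte is NUL, so the O(n) library call can become a single one-byte load.

#include <cstdio>
#include <cstring>

static bool emptyViaStrlen(const char *S) { return strlen(S) == 0; }
static bool emptyViaLoad(const char *S) { return *S == 0; } // folded form

int main() {
  const char *Tests[] = {"", "a", "hello"};
  for (unsigned I = 0; I != 3; ++I)
    printf("%d %d\n", emptyViaStrlen(Tests[I]), emptyViaLoad(Tests[I]));
  return 0;
}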
struct StrToOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { @@ -910,6 +934,52 @@ struct StrToOpt : public LibCallOptimization { } }; +//===---------------------------------------===// +// 'strstr' Optimizations + +struct StrStrOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + !isa<PointerType>(FT->getParamType(0)) || + !isa<PointerType>(FT->getParamType(1)) || + !isa<PointerType>(FT->getReturnType())) + return 0; + + // fold strstr(x, x) -> x. + if (CI->getOperand(1) == CI->getOperand(2)) + return B.CreateBitCast(CI->getOperand(1), CI->getType()); + + // See if either input string is a constant string. + std::string SearchStr, ToFindStr; + bool HasStr1 = GetConstantStringInfo(CI->getOperand(1), SearchStr); + bool HasStr2 = GetConstantStringInfo(CI->getOperand(2), ToFindStr); + + // fold strstr(x, "") -> x. + if (HasStr2 && ToFindStr.empty()) + return B.CreateBitCast(CI->getOperand(1), CI->getType()); + + // If both strings are known, constant fold it. + if (HasStr1 && HasStr2) { + std::string::size_type Offset = SearchStr.find(ToFindStr); + + if (Offset == std::string::npos) // strstr("foo", "bar") -> null + return Constant::getNullValue(CI->getType()); + + // strstr("abcd", "bc") -> gep((char*)"abcd", 1) + Value *Result = CastToCStr(CI->getOperand(1), B); + Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr"); + return B.CreateBitCast(Result, CI->getType()); + } + + // fold strstr(x, "y") -> strchr(x, 'y'). + if (HasStr2 && ToFindStr.size() == 1) + return B.CreateBitCast(EmitStrChr(CI->getOperand(1), ToFindStr[0], B), + CI->getType()); + return 0; + } +}; + //===---------------------------------------===// // 'memcmp' Optimizations @@ -941,19 +1011,6 @@ struct MemCmpOpt : public LibCallOptimization { return B.CreateSExt(B.CreateSub(LHSV, RHSV, "chardiff"), CI->getType()); } - // memcmp(S1,S2,2) != 0 -> (*(short*)LHS ^ *(short*)RHS) != 0 - // memcmp(S1,S2,4) != 0 -> (*(int*)LHS ^ *(int*)RHS) != 0 - if ((Len == 2 || Len == 4) && IsOnlyUsedInZeroEqualityComparison(CI)) { - const Type *PTy = PointerType::getUnqual(Len == 2 ? - Type::getInt16Ty(*Context) : Type::getInt32Ty(*Context)); - LHS = B.CreateBitCast(LHS, PTy, "tmp"); - RHS = B.CreateBitCast(RHS, PTy, "tmp"); - LoadInst *LHSV = B.CreateLoad(LHS, "lhsv"); - LoadInst *RHSV = B.CreateLoad(RHS, "rhsv"); - LHSV->setAlignment(1); RHSV->setAlignment(1); // Unaligned loads. 
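The strstr folds added above lean on simple libc identities; the following plain C++ snippet checks each of them (the values are only there to exercise the rewrites, the pass itself works on IR):

#include <cstdio>
#include <cstring>
#include <string>

int main() {
  const char *X = "abcd";

  printf("%d\n", strstr(X, X) == X);                // strstr(x, x) -> x
  printf("%d\n", strstr(X, "") == X);               // strstr(x, "") -> x
  printf("%d\n", strstr(X, "c") == strchr(X, 'c')); // one-char needle -> strchr

  // Both operands constant: fold to an offset into the searched string,
  // computed the same way the pass does (std::string::find), or to null
  // when the needle is absent.
  std::string::size_type Off = std::string("abcd").find("bc");
  printf("%d\n", strstr(X, "bc") == X + Off);
  printf("%d\n", strstr(X, "zz") == 0);
  return 0;
}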
- return B.CreateZExt(B.CreateXor(LHSV, RHSV, "shortdiff"), CI->getType()); - } - // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) std::string LHSStr, RHSStr; if (GetConstantStringInfo(LHS, LHSStr) && @@ -1051,7 +1108,7 @@ struct SizeOpt : public LibCallOptimization { const Type *Ty = Callee->getFunctionType()->getReturnType(); - if (Const->getZExtValue() < 2) + if (Const->getZExtValue() == 0) return Constant::getAllOnesValue(Ty); else return ConstantInt::get(Ty, 0); @@ -1071,8 +1128,8 @@ struct MemCpyChkOpt : public LibCallOptimization { if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !isa<PointerType>(FT->getParamType(0)) || !isa<PointerType>(FT->getParamType(1)) || - !isa<IntegerType>(FT->getParamType(3)) || - FT->getParamType(2) != TD->getIntPtrType(*Context)) + !isa<IntegerType>(FT->getParamType(3)) || + FT->getParamType(2) != TD->getIntPtrType(*Context)) return 0; ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getOperand(4)); @@ -1099,7 +1156,7 @@ struct MemSetChkOpt : public LibCallOptimization { if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !isa<PointerType>(FT->getParamType(0)) || !isa<IntegerType>(FT->getParamType(1)) || - !isa<IntegerType>(FT->getParamType(3)) || + !isa<IntegerType>(FT->getParamType(3)) || FT->getParamType(2) != TD->getIntPtrType(*Context)) return 0; @@ -1129,7 +1186,7 @@ struct MemMoveChkOpt : public LibCallOptimization { if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !isa<PointerType>(FT->getParamType(0)) || !isa<PointerType>(FT->getParamType(1)) || - !isa<IntegerType>(FT->getParamType(3)) || + !isa<IntegerType>(FT->getParamType(3)) || FT->getParamType(2) != TD->getIntPtrType(*Context)) return 0; @@ -1675,8 +1732,8 @@ namespace { // String and Memory LibCall Optimizations StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrNCpyOpt StrNCpy; StrLenOpt StrLen; - StrToOpt StrTo; MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; - MemSetOpt MemSet; + StrToOpt StrTo; StrStrOpt StrStr; + MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet; // Math Library Optimizations PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP; // Integer Optimizations @@ -1738,6 +1795,7 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["strtoll"] = &StrTo; Optimizations["strtold"] = &StrTo; Optimizations["strtoull"] = &StrTo; + Optimizations["strstr"] = &StrStr; Optimizations["memcmp"] = &MemCmp; Optimizations["memcpy"] = &MemCpy; Optimizations["memmove"] = &MemMove; @@ -2644,12 +2702,6 @@ bool SimplifyLibCalls::doInitialization(Module &M) { // * strcspn("",a) -> 0 // * strcspn(s,"") -> strlen(a) // -// strstr: (PR5783) -// * strstr(x,x) -> x -// * strstr(x, "") -> x -// * strstr(x, "a") -> strchr(x, 'a') -// * strstr(s1,s2) -> result (if s1 and s2 are constant strings) -// // tan, tanf, tanl: // * tan(atan(x)) -> x // diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index ccd97c8..19c7206 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -309,10 +309,10 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, if (TIL == DestLoop) { // Both in the same loop, the NewBB joins loop. 
DestLoop->addBasicBlockToLoop(NewBB, LI->getBase()); - } else if (TIL->contains(DestLoop->getHeader())) { + } else if (TIL->contains(DestLoop)) { // Edge from an outer loop to an inner loop. Add to the outer loop. TIL->addBasicBlockToLoop(NewBB, LI->getBase()); - } else if (DestLoop->contains(TIL->getHeader())) { + } else if (DestLoop->contains(TIL)) { // Edge from an inner loop to an outer loop. Add to the outer loop. DestLoop->addBasicBlockToLoop(NewBB, LI->getBase()); } else { diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 162d7b3..c287747 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -21,6 +21,7 @@ #include "llvm/GlobalVariable.h" #include "llvm/Function.h" #include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" #include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Analysis/ConstantFolding.h" @@ -347,8 +348,7 @@ ConstantFoldMappedInstruction(const Instruction *I) { Ops.size(), TD); } -static MDNode *UpdateInlinedAtInfo(MDNode *InsnMD, MDNode *TheCallMD, - LLVMContext &Context) { +static MDNode *UpdateInlinedAtInfo(MDNode *InsnMD, MDNode *TheCallMD) { DILocation ILoc(InsnMD); if (ILoc.isNull()) return InsnMD; @@ -358,14 +358,15 @@ static MDNode *UpdateInlinedAtInfo(MDNode *InsnMD, MDNode *TheCallMD, DILocation OrigLocation = ILoc.getOrigLocation(); MDNode *NewLoc = TheCallMD; if (!OrigLocation.isNull()) - NewLoc = UpdateInlinedAtInfo(OrigLocation.getNode(), TheCallMD, Context); - - SmallVector<Value *, 4> MDVs; - MDVs.push_back(InsnMD->getElement(0)); // Line - MDVs.push_back(InsnMD->getElement(1)); // Col - MDVs.push_back(InsnMD->getElement(2)); // Scope - MDVs.push_back(NewLoc); - return MDNode::get(Context, MDVs.data(), MDVs.size()); + NewLoc = UpdateInlinedAtInfo(OrigLocation.getNode(), TheCallMD); + + Value *MDVs[] = { + InsnMD->getOperand(0), // Line + InsnMD->getOperand(1), // Col + InsnMD->getOperand(2), // Scope + NewLoc + }; + return MDNode::get(InsnMD->getContext(), MDVs, 4); } /// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto, @@ -421,12 +422,10 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // BasicBlock::iterator I = NewBB->begin(); - LLVMContext &Context = OldFunc->getContext(); - unsigned DbgKind = Context.getMetadata().getMDKind("dbg"); + unsigned DbgKind = OldFunc->getContext().getMDKindID("dbg"); MDNode *TheCallMD = NULL; - SmallVector<Value *, 4> MDVs; if (TheCall && TheCall->hasMetadata()) - TheCallMD = Context.getMetadata().getMD(DbgKind, TheCall); + TheCallMD = TheCall->getMetadata(DbgKind); // Handle PHI nodes specially, as we have to remove references to dead // blocks. @@ -436,32 +435,38 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, for (; (PN = dyn_cast<PHINode>(I)); ++I, ++OldI) { if (I->hasMetadata()) { if (TheCallMD) { - if (MDNode *IMD = Context.getMetadata().getMD(DbgKind, I)) { - MDNode *NewMD = UpdateInlinedAtInfo(IMD, TheCallMD, Context); - Context.getMetadata().addMD(DbgKind, NewMD, I); + if (MDNode *IMD = I->getMetadata(DbgKind)) { + MDNode *NewMD = UpdateInlinedAtInfo(IMD, TheCallMD); + I->setMetadata(DbgKind, NewMD); } } else { // The cloned instruction has dbg info but the call instruction // does not have dbg info. Remove dbg info from cloned instruction. 
- Context.getMetadata().removeMD(DbgKind, I); + I->setMetadata(DbgKind, 0); } } PHIToResolve.push_back(cast<PHINode>(OldI)); } } + // FIXME: + // FIXME: + // FIXME: Unclone all this metadata stuff. + // FIXME: + // FIXME: + // Otherwise, remap the rest of the instructions normally. for (; I != NewBB->end(); ++I) { if (I->hasMetadata()) { if (TheCallMD) { - if (MDNode *IMD = Context.getMetadata().getMD(DbgKind, I)) { - MDNode *NewMD = UpdateInlinedAtInfo(IMD, TheCallMD, Context); - Context.getMetadata().addMD(DbgKind, NewMD, I); + if (MDNode *IMD = I->getMetadata(DbgKind)) { + MDNode *NewMD = UpdateInlinedAtInfo(IMD, TheCallMD); + I->setMetadata(DbgKind, NewMD); } } else { // The cloned instruction has dbg info but the call instruction // does not have dbg info. Remove dbg info from cloned instruction. - Context.getMetadata().removeMD(DbgKind, I); + I->setMetadata(DbgKind, 0); } } RemapInstruction(I, ValueMap); diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 7a37aa3..2426e3e 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -20,10 +20,9 @@ #include "llvm/Instructions.h" #include "llvm/Intrinsics.h" #include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Target/TargetData.h" @@ -31,6 +30,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index 690972d..7fcc5f7 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -109,7 +109,7 @@ X("loopsimplify", "Canonicalize natural loops", true); const PassInfo *const llvm::LoopSimplifyID = &X; Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } -/// runOnFunction - Run down all loops in the CFG (recursively, but we could do +/// runOnLoop - Run down all loops in the CFG (recursively, but we could do /// it in any convenient order) inserting preheaders... /// bool LoopSimplify::runOnLoop(Loop *l, LPPassManager &LPM) { @@ -305,12 +305,6 @@ ReprocessLoop: } } - // If there are duplicate phi nodes (for example, from loop rotation), - // get rid of them. 
- for (Loop::block_iterator BB = L->block_begin(), E = L->block_end(); - BB != E; ++BB) - EliminateDuplicatePHINodes(*BB); - return Changed; } diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index 6232f32..6b2c591 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -194,7 +194,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) OrigPHINode.push_back(PN); if (Instruction *I = dyn_cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock))) - if (L->contains(I->getParent())) + if (L->contains(I)) LastValueMap[I] = I; } @@ -222,7 +222,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) PHINode *NewPHI = cast<PHINode>(ValueMap[OrigPHINode[i]]); Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock); if (Instruction *InValI = dyn_cast<Instruction>(InVal)) - if (It > 1 && L->contains(InValI->getParent())) + if (It > 1 && L->contains(InValI)) InVal = LastValueMap[InValI]; ValueMap[OrigPHINode[i]] = InVal; New->getInstList().erase(NewPHI); @@ -244,7 +244,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) UI != UE;) { Instruction *UseInst = cast<Instruction>(*UI); ++UI; - if (isa<PHINode>(UseInst) && !L->contains(UseInst->getParent())) { + if (isa<PHINode>(UseInst) && !L->contains(UseInst)) { PHINode *phi = cast<PHINode>(UseInst); Value *Incoming = phi->getIncomingValueForBlock(*BB); phi->addIncoming(Incoming, New); @@ -295,7 +295,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, LoopInfo* LI, LPPassManager* LPM) // If this value was defined in the loop, take the value defined by the // last iteration of the loop. if (Instruction *InValI = dyn_cast<Instruction>(InVal)) { - if (L->contains(InValI->getParent())) + if (L->contains(InValI)) InVal = LastValueMap[InVal]; } PN->addIncoming(InVal, LastIterationBB); diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index ba41bf9..9881b3c 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -149,7 +149,29 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { if (SingularValue != 0) return SingularValue; - // Otherwise, we do need a PHI: insert one now. + // Otherwise, we do need a PHI: check to see if we already have one available + // in this block that produces the right value. + if (isa<PHINode>(BB->begin())) { + DenseMap<BasicBlock*, Value*> ValueMapping(PredValues.begin(), + PredValues.end()); + PHINode *SomePHI; + for (BasicBlock::iterator It = BB->begin(); + (SomePHI = dyn_cast<PHINode>(It)); ++It) { + // Scan this phi to see if it is what we need. + bool Equal = true; + for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) + if (ValueMapping[SomePHI->getIncomingBlock(i)] != + SomePHI->getIncomingValue(i)) { + Equal = false; + break; + } + + if (Equal) + return SomePHI; + } + } + + // Ok, we have no way out, insert a new one now. PHINode *InsertedPHI = PHINode::Create(PrototypeValue->getType(), PrototypeValue->getName(), &BB->front()); @@ -198,7 +220,7 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) { // Query AvailableVals by doing an insertion of null. std::pair<AvailableValsTy::iterator, bool> InsertRes = - AvailableVals.insert(std::make_pair(BB, WeakVH())); + AvailableVals.insert(std::make_pair(BB, TrackingVH<Value>())); // Handle the case when the insertion fails because we have already seen BB. 
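The GetValueInMiddleOfBlock change above avoids inserting a duplicate PHI by first scanning the block's existing PHIs for one whose incoming (predecessor, value) pairs already match what would be inserted. A toy sketch of that scan in plain C++, with integer ids standing in for BasicBlock* and Value* (names are illustrative):

#include <cstdio>
#include <map>
#include <vector>

typedef std::vector<std::pair<int, int> > ToyPHI; // (pred id, value id)

// Return an existing PHI whose incoming pairs all agree with PredValues,
// or null so the caller creates a fresh one, as the code above does.
static const ToyPHI *findReusablePHI(const std::vector<ToyPHI> &BlockPHIs,
                                     const std::map<int, int> &PredValues) {
  for (size_t P = 0; P != BlockPHIs.size(); ++P) {
    bool Equal = true;
    for (size_t I = 0; I != BlockPHIs[P].size(); ++I) {
      std::map<int, int>::const_iterator It =
          PredValues.find(BlockPHIs[P][I].first);
      if (It == PredValues.end() || It->second != BlockPHIs[P][I].second) {
        Equal = false;
        break;
      }
    }
    if (Equal)
      return &BlockPHIs[P];
  }
  return 0;
}

int main() {
  ToyPHI Existing;
  Existing.push_back(std::make_pair(1, 10));
  Existing.push_back(std::make_pair(2, 20));
  std::vector<ToyPHI> BlockPHIs(1, Existing);

  std::map<int, int> PredValues;
  PredValues[1] = 10;
  PredValues[2] = 20;
  printf("reused: %d\n", findReusablePHI(BlockPHIs, PredValues) != 0);
  return 0;
}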
if (!InsertRes.second) {
@@ -214,8 +236,8 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
  // it.  When we get back to the first instance of the recursion we will fill
  // in the PHI node.
  return InsertRes.first->second =
-    PHINode::Create(PrototypeValue->getType(), PrototypeValue->getName(),
-                    &BB->front());
+      PHINode::Create(PrototypeValue->getType(), PrototypeValue->getName(),
+                      &BB->front());
}

// Okay, the value isn't in the map and we just inserted a null in the entry
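GetValueAtEndOfBlockInternal guards against CFG cycles by inserting a placeholder into the availability map before recursing, so a query that loops back to the same block finds the placeholder instead of recursing forever; in the real code the placeholder is a PHI node that gets filled in afterwards. A toy sketch of the same memoize-with-placeholder pattern in plain C++ (the graph, the values, and the names are all illustrative):

#include <cstdio>
#include <map>
#include <vector>

static const int Placeholder = -1;

// Cycle-safe memoized walk over predecessors. The insert-before-recurse
// step mirrors the AvailableVals.insert(...) above; if the insertion fails,
// the answer (or the placeholder) is already in the map.
static int valueAtEnd(int BB, const std::vector<std::vector<int> > &Preds,
                      std::map<int, int> &Avail) {
  std::pair<std::map<int, int>::iterator, bool> Ins =
      Avail.insert(std::make_pair(BB, Placeholder));
  if (!Ins.second)
    return Ins.first->second;

  // Toy "computation": the value of a block is the smallest reachable id.
  int V = BB;
  for (size_t I = 0; I != Preds[BB].size(); ++I) {
    int PV = valueAtEnd(Preds[BB][I], Preds, Avail);
    if (PV != Placeholder && PV < V)
      V = PV;
  }
  return Avail[BB] = V;
}

int main() {
  // Blocks 0 and 1 form a cycle; block 2 feeds block 0.
  std::vector<std::vector<int> > Preds(3);
  Preds[0].push_back(1); Preds[0].push_back(2);
  Preds[1].push_back(0);
  std::map<int, int> Avail;
  printf("%d\n", valueAtEnd(0, Preds, Avail));
  return 0;
}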