Diffstat (limited to 'lib/Transforms/Scalar')
-rw-r--r--   lib/Transforms/Scalar/ABCD.cpp                     |    6
-rw-r--r--   lib/Transforms/Scalar/ADCE.cpp                     |    2
-rw-r--r--   lib/Transforms/Scalar/CodeGenPrepare.cpp           |    8
-rw-r--r--   lib/Transforms/Scalar/DeadStoreElimination.cpp     |   59
-rw-r--r--   lib/Transforms/Scalar/GVN.cpp                      |   61
-rw-r--r--   lib/Transforms/Scalar/IndVarSimplify.cpp           |   16
-rw-r--r--   lib/Transforms/Scalar/JumpThreading.cpp            |   73
-rw-r--r--   lib/Transforms/Scalar/LoopDeletion.cpp             |    6
-rw-r--r--   lib/Transforms/Scalar/LoopIndexSplit.cpp           |   28
-rw-r--r--   lib/Transforms/Scalar/LoopRotation.cpp             |    4
-rw-r--r--   lib/Transforms/Scalar/LoopStrengthReduce.cpp       |  143
-rw-r--r--   lib/Transforms/Scalar/LoopUnswitch.cpp             |   28
-rw-r--r--   lib/Transforms/Scalar/MemCpyOptimizer.cpp          |    4
-rw-r--r--   lib/Transforms/Scalar/Reassociate.cpp              |    5
-rw-r--r--   lib/Transforms/Scalar/ScalarReplAggregates.cpp     |   53
-rw-r--r--   lib/Transforms/Scalar/SimplifyCFGPass.cpp          |    3
-rw-r--r--   lib/Transforms/Scalar/SimplifyLibCalls.cpp         |  213
-rw-r--r--   lib/Transforms/Scalar/TailDuplication.cpp          |    5
-rw-r--r--   lib/Transforms/Scalar/TailRecursionElimination.cpp |   96
19 files changed, 495 insertions, 318 deletions
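A large share of the hunks below replace `CI->getOperand(i)` with `CI->getArgOperand(i - 1)` (and `getNumOperands()` with `getNumArgOperands()`). A CallInst keeps its callee in the same operand list as its arguments, so raw operand indices depend on that layout; `CallInst::ArgOffset` names the offset and the argument accessors hide it. The standalone sketch below models the idea with a toy class — it is an illustration, not LLVM's actual CallInst:

```cpp
#include <cassert>
#include <vector>

// Toy model of the operand layout behind the getOperand(i) ->
// getArgOperand(i) migration in this commit. The real llvm::CallInst has
// the same shape but lives in llvm/Instructions.h.
struct ModelCallInst {
  // One slot is reserved for the callee (this is what CallInst::ArgOffset
  // abstracts); the arguments follow it.
  static const unsigned ArgOffset = 1;
  std::vector<int> Operands;

  int getOperand(unsigned i) const { return Operands[i]; }
  // Argument accessors hide the layout, so passes no longer hard-code the
  // "+1" that calls like getOperand(SizeCIOp) used to rely on.
  int getArgOperand(unsigned i) const { return Operands[ArgOffset + i]; }
  unsigned getNumArgOperands() const { return Operands.size() - ArgOffset; }
};

int main() {
  ModelCallInst CI;
  CI.Operands = {/*callee*/ -1, /*arg0*/ 10, /*arg1*/ 20};
  // Old-style and new-style indexing agree only while the layout is fixed:
  assert(CI.getOperand(1) == CI.getArgOperand(0));
  assert(CI.getNumArgOperands() == 2);
  return 0;
}
```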
diff --git a/lib/Transforms/Scalar/ABCD.cpp b/lib/Transforms/Scalar/ABCD.cpp
index 6135992..dcf14a6 100644
--- a/lib/Transforms/Scalar/ABCD.cpp
+++ b/lib/Transforms/Scalar/ABCD.cpp
@@ -230,7 +230,7 @@ class ABCD : public FunctionPass {
     DenseMapIterator<Value*, MemoizedResultChart> begin = map.begin();
     DenseMapIterator<Value*, MemoizedResultChart> end = map.end();
     for (; begin != end; ++begin) {
-      begin->second.clear();
+      begin->second.clear();
     }
     map.clear();
   }
@@ -396,8 +396,8 @@ class ABCD : public FunctionPass {
   /// this case the method returns true, otherwise false. It also obtains the
   /// Instruction and ConstantInt from the BinaryOperator and returns it.
   bool createBinaryOperatorInfo(BinaryOperator *BO, Instruction **I1,
-                                Instruction **I2, ConstantInt **C1,
-                                ConstantInt **C2);
+                                Instruction **I2, ConstantInt **C1,
+                                ConstantInt **C2);

   /// This method creates a constraint between a Sigma and an Instruction.
   /// These constraints are created as soon as we find a comparator that uses a
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index 5a49841..2d19467 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -83,7 +83,7 @@ bool ADCE::runOnFunction(Function& F) {
   for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(),
        E = worklist.end(); I != E; ++I) {
-    NumRemoved++;
+    ++NumRemoved;
     (*I)->eraseFromParent();
   }
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 93e9bfb..272066c 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -548,7 +548,8 @@ protected:
     CI->eraseFromParent();
   }
   bool isFoldable(unsigned SizeCIOp, unsigned, bool) const {
-    if (ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getOperand(SizeCIOp)))
+    if (ConstantInt *SizeCI = dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp
+                                                    - CallInst::ArgOffset)))
       return SizeCI->isAllOnesValue();
     return false;
   }
@@ -559,7 +560,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
   // Lower all uses of llvm.objectsize.*
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
   if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
-    bool Min = (cast<ConstantInt>(II->getOperand(2))->getZExtValue() == 1);
+    bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
     const Type *ReturnTy = CI->getType();
     Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
     CI->replaceAllUsesWith(RetVal);
@@ -759,8 +760,7 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(Instruction *I, CallSite CS,
   }

   // Compute the constraint code and ConstraintType to use.
-  TLI->ComputeConstraintToUse(OpInfo, SDValue(),
-                           OpInfo.ConstraintType == TargetLowering::C_Memory);
+  TLI->ComputeConstraintToUse(OpInfo, SDValue());

   if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
       OpInfo.isIndirect) {
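The CodeGenPrepare hunk lowers `llvm.objectsize.*` calls whose answer cannot be computed: the intrinsic's second argument selects minimum-vs-maximum semantics, and the unknown answer is 0 or -1 respectively. A tiny standalone model of just that fold (the helper name is hypothetical, not an LLVM API):

```cpp
#include <cstdint>
#include <iostream>

// Models the constant chosen by "Min ? 0 : -1ULL" in the hunk above:
// objectsize reports the minimum guaranteed size (unknown -> 0) or the
// maximum possible size (unknown -> all-ones).
uint64_t objectSizeWhenUnknown(bool Min) {
  return Min ? 0 : ~0ULL;
}

int main() {
  std::cout << objectSizeWhenUnknown(true)  << "\n"; // 0
  std::cout << objectSizeWhenUnknown(false) << "\n"; // 18446744073709551615
  return 0;
}
```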
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 09c01d3..e047e4f 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -56,7 +56,8 @@ namespace {
     }

     bool runOnBasicBlock(BasicBlock &BB);
-    bool handleFreeWithNonTrivialDependency(Instruction *F, MemDepResult Dep);
+    bool handleFreeWithNonTrivialDependency(const CallInst *F,
+                                            MemDepResult Dep);
     bool handleEndBlock(BasicBlock &BB);
     bool RemoveUndeadPointers(Value *Ptr, uint64_t killPointerSize,
                               BasicBlock::iterator &BBI,
@@ -73,7 +74,6 @@ namespace {
       AU.addRequired<AliasAnalysis>();
       AU.addRequired<MemoryDependenceAnalysis>();
       AU.addPreserved<DominatorTree>();
-      AU.addPreserved<AliasAnalysis>();
       AU.addPreserved<MemoryDependenceAnalysis>();
     }
@@ -123,14 +123,15 @@ static Value *getPointerOperand(Instruction *I) {
   if (StoreInst *SI = dyn_cast<StoreInst>(I))
     return SI->getPointerOperand();
   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
-    return MI->getOperand(1);
-
-  switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
+    return MI->getArgOperand(0);
+
+  IntrinsicInst *II = cast<IntrinsicInst>(I);
+  switch (II->getIntrinsicID()) {
   default: assert(false && "Unexpected intrinsic!");
   case Intrinsic::init_trampoline:
-    return I->getOperand(1);
+    return II->getArgOperand(0);
   case Intrinsic::lifetime_end:
-    return I->getOperand(2);
+    return II->getArgOperand(1);
   }
 }
@@ -147,12 +148,13 @@ static unsigned getStoreSize(Instruction *I, const TargetData *TD) {
   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
     Len = MI->getLength();
   } else {
-    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
+    IntrinsicInst *II = cast<IntrinsicInst>(I);
+    switch (II->getIntrinsicID()) {
     default: assert(false && "Unexpected intrinsic!");
     case Intrinsic::init_trampoline:
       return -1u;
     case Intrinsic::lifetime_end:
-      Len = I->getOperand(1);
+      Len = II->getArgOperand(0);
       break;
     }
   }
@@ -201,8 +203,8 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
       if (InstDep.isNonLocal()) continue;

       // Handle frees whose dependencies are non-trivial.
-      if (isFreeCall(Inst)) {
-        MadeChange |= handleFreeWithNonTrivialDependency(Inst, InstDep);
+      if (const CallInst *F = isFreeCall(Inst)) {
+        MadeChange |= handleFreeWithNonTrivialDependency(F, InstDep);
         continue;
       }
@@ -218,7 +220,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
           isElidable(DepStore)) {
         // Delete the store and now-dead instructions that feed it.
         DeleteDeadInstruction(DepStore);
-        NumFastStores++;
+        ++NumFastStores;
         MadeChange = true;

         // DeleteDeadInstruction can delete the current instruction in loop
@@ -249,7 +251,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
           BBI = BB.begin();
         else if (BBI != BB.begin())  // Revisit this instruction if possible.
           --BBI;
-        NumFastStores++;
+        ++NumFastStores;
         MadeChange = true;
         continue;
       }
@@ -270,7 +272,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
         BBI = BB.begin();
       else if (BBI != BB.begin())  // Revisit this instruction if possible.
         --BBI;
-      NumFastStores++;
+      ++NumFastStores;
       MadeChange = true;
       continue;
     }
@@ -287,7 +289,8 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {

 /// handleFreeWithNonTrivialDependency - Handle frees of entire structures whose
 /// dependency is a store to a field of that structure.
-bool DSE::handleFreeWithNonTrivialDependency(Instruction *F, MemDepResult Dep) {
+bool DSE::handleFreeWithNonTrivialDependency(const CallInst *F,
+                                             MemDepResult Dep) {
   AliasAnalysis &AA = getAnalysis<AliasAnalysis>();

   Instruction *Dependency = Dep.getInst();
@@ -297,13 +300,13 @@ bool DSE::handleFreeWithNonTrivialDependency(Instruction *F, MemDepResult Dep) {
   Value *DepPointer = getPointerOperand(Dependency)->getUnderlyingObject();

   // Check for aliasing.
-  if (AA.alias(F->getOperand(1), 1, DepPointer, 1) !=
+  if (AA.alias(F->getArgOperand(0), 1, DepPointer, 1) !=
          AliasAnalysis::MustAlias)
     return false;

   // DCE instructions only used to calculate that store
   DeleteDeadInstruction(Dependency);
-  NumFastStores++;
+  ++NumFastStores;
   return true;
 }
@@ -349,9 +352,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
       if (deadPointers.count(pointerOperand)) {
         // DCE instructions only used to calculate that store.
         Instruction *Dead = BBI;
-        BBI++;
+        ++BBI;
         DeleteDeadInstruction(Dead, &deadPointers);
-        NumFastStores++;
+        ++NumFastStores;
         MadeChange = true;
         continue;
       }
@@ -371,9 +374,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
       // However, if this load is unused and not volatile, we can go ahead and
      // remove it, and not have to worry about it making our pointer undead!
       if (L->use_empty() && !L->isVolatile()) {
-        BBI++;
+        ++BBI;
         DeleteDeadInstruction(L, &deadPointers);
-        NumFastOther++;
+        ++NumFastOther;
         MadeChange = true;
         continue;
       }
@@ -391,9 +394,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) {

       // Dead alloca's can be DCE'd when we reach them
       if (A->use_empty()) {
-        BBI++;
+        ++BBI;
         DeleteDeadInstruction(A, &deadPointers);
-        NumFastOther++;
+        ++NumFastOther;
         MadeChange = true;
       }
@@ -426,9 +429,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
                                       getPointerSize(*I));

         if (A == AliasAnalysis::ModRef)
-          modRef++;
+          ++modRef;
         else
-          other++;
+          ++other;

         if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref)
           dead.push_back(*I);
@@ -442,9 +445,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
     } else if (isInstructionTriviallyDead(BBI)) {
       // For any non-memory-affecting non-terminators, DCE them as we reach them
       Instruction *Inst = BBI;
-      BBI++;
+      ++BBI;
       DeleteDeadInstruction(Inst, &deadPointers);
-      NumFastOther++;
+      ++NumFastOther;
       MadeChange = true;
       continue;
     }
@@ -497,7 +500,7 @@ bool DSE::RemoveUndeadPointers(Value *killPointer, uint64_t killPointerSize,
       // Remove it!
       ++BBI;
       DeleteDeadInstruction(S, &deadPointers);
-      NumFastStores++;
+      ++NumFastStores;
       MadeChange = true;

       continue;
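The DeadStoreElimination changes ride on an API change: `isFreeCall` now hands back the matched call (null otherwise) rather than a bare bool, so the caller can bind the typed `CallInst` and read the freed pointer via `getArgOperand(0)`. A standalone analogue of that shape, with toy types standing in for the LLVM ones:

```cpp
#include <string>

// Sketch of returning the matched object instead of bool: callers can bind
// and test in one step, and get typed access afterwards. Toy types, not the
// LLVM declaration of isFreeCall.
struct Call { std::string Callee; std::string Arg0; };

const Call *isFreeCall(const Call *C) {
  return (C && C->Callee == "free") ? C : nullptr;
}

void visit(const Call *Inst) {
  if (const Call *F = isFreeCall(Inst)) {
    // Typed access to the freed pointer, cf. F->getArgOperand(0) above.
    (void)F->Arg0;
  }
}

int main() { Call C{"free", "p"}; visit(&C); return 0; }
```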
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index ca8ab49..88b6776 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/PHITransAddr.h"
@@ -271,7 +272,8 @@ Expression ValueTable::create_expression(CallInst* C) {
   e.function = C->getCalledFunction();
   e.opcode = Expression::CALL;

-  for (CallInst::op_iterator I = C->op_begin()+1, E = C->op_end();
+  CallSite CS(C);
+  for (CallInst::op_iterator I = CS.arg_begin(), E = CS.arg_end();
        I != E; ++I)
     e.varargs.push_back(lookup_or_add(*I));
@@ -447,14 +449,14 @@ uint32_t ValueTable::lookup_or_add_call(CallInst* C) {
     if (local_dep.isDef()) {
       CallInst* local_cdep = cast<CallInst>(local_dep.getInst());

-      if (local_cdep->getNumOperands() != C->getNumOperands()) {
+      if (local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
         valueNumbering[C] = nextValueNumber;
         return nextValueNumber++;
       }

-      for (unsigned i = 1; i < C->getNumOperands(); ++i) {
-        uint32_t c_vn = lookup_or_add(C->getOperand(i));
-        uint32_t cd_vn = lookup_or_add(local_cdep->getOperand(i));
+      for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+        uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
+        uint32_t cd_vn = lookup_or_add(local_cdep->getArgOperand(i));
         if (c_vn != cd_vn) {
           valueNumbering[C] = nextValueNumber;
           return nextValueNumber++;
@@ -504,13 +506,13 @@ uint32_t ValueTable::lookup_or_add_call(CallInst* C) {
       return nextValueNumber++;
     }

-    if (cdep->getNumOperands() != C->getNumOperands()) {
+    if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
       valueNumbering[C] = nextValueNumber;
       return nextValueNumber++;
     }
-    for (unsigned i = 1; i < C->getNumOperands(); ++i) {
-      uint32_t c_vn = lookup_or_add(C->getOperand(i));
-      uint32_t cd_vn = lookup_or_add(cdep->getOperand(i));
+    for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+      uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
+      uint32_t cd_vn = lookup_or_add(cdep->getArgOperand(i));
       if (c_vn != cd_vn) {
         valueNumbering[C] = nextValueNumber;
         return nextValueNumber++;
@@ -1500,7 +1502,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
       MD->invalidateCachedPointerInfo(V);
     VN.erase(LI);
     toErase.push_back(LI);
-    NumGVNLoad++;
+    ++NumGVNLoad;
     return true;
   }
@@ -1723,7 +1725,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
     MD->invalidateCachedPointerInfo(V);
   VN.erase(LI);
   toErase.push_back(LI);
-  NumPRELoad++;
+  ++NumPRELoad;
   return true;
 }
@@ -1784,7 +1786,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
         MD->invalidateCachedPointerInfo(AvailVal);
       VN.erase(L);
       toErase.push_back(L);
-      NumGVNLoad++;
+      ++NumGVNLoad;
       return true;
     }
@@ -1830,7 +1832,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
         MD->invalidateCachedPointerInfo(StoredVal);
       VN.erase(L);
       toErase.push_back(L);
-      NumGVNLoad++;
+      ++NumGVNLoad;
       return true;
     }
@@ -1860,7 +1862,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
         MD->invalidateCachedPointerInfo(DepLI);
       VN.erase(L);
       toErase.push_back(L);
-      NumGVNLoad++;
+      ++NumGVNLoad;
       return true;
     }
@@ -1871,7 +1873,7 @@ bool GVN::processLoad(LoadInst *L,
                                  SmallVectorImpl<Instruction*> &toErase) {
     L->replaceAllUsesWith(UndefValue::get(L->getType()));
     VN.erase(L);
     toErase.push_back(L);
-    NumGVNLoad++;
+    ++NumGVNLoad;
     return true;
   }
@@ -1882,7 +1884,7 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
       L->replaceAllUsesWith(UndefValue::get(L->getType()));
       VN.erase(L);
       toErase.push_back(L);
-      NumGVNLoad++;
+      ++NumGVNLoad;
       return true;
     }
   }
@@ -2014,7 +2016,7 @@ bool GVN::runOnFunction(Function& F) {
     BasicBlock *BB = FI;
     ++FI;
     bool removedBlock = MergeBlockIntoPredecessor(BB, this);
-    if (removedBlock) NumGVNBlocks++;
+    if (removedBlock) ++NumGVNBlocks;

     Changed |= removedBlock;
   }
@@ -2126,27 +2128,28 @@ bool GVN::performPRE(Function &F) {
       for (pred_iterator PI = pred_begin(CurrentBlock),
            PE = pred_end(CurrentBlock); PI != PE; ++PI) {
+        BasicBlock *P = *PI;
         // We're not interested in PRE where the block is its
         // own predecessor, or in blocks with predecessors
         // that are not reachable.
-        if (*PI == CurrentBlock) {
+        if (P == CurrentBlock) {
           NumWithout = 2;
           break;
-        } else if (!localAvail.count(*PI)) {
+        } else if (!localAvail.count(P)) {
           NumWithout = 2;
           break;
         }

         DenseMap<uint32_t, Value*>::iterator predV =
-          localAvail[*PI]->table.find(ValNo);
-        if (predV == localAvail[*PI]->table.end()) {
-          PREPred = *PI;
-          NumWithout++;
+          localAvail[P]->table.find(ValNo);
+        if (predV == localAvail[P]->table.end()) {
+          PREPred = P;
+          ++NumWithout;
         } else if (predV->second == CurInst) {
           NumWithout = 2;
         } else {
-          predMap[*PI] = predV->second;
-          NumWith++;
+          predMap[P] = predV->second;
+          ++NumWith;
         }
       }
@@ -2201,7 +2204,7 @@ bool GVN::performPRE(Function &F) {
     PREInstr->setName(CurInst->getName() + ".pre");
     predMap[PREPred] = PREInstr;
     VN.add(PREInstr, ValNo);
-    NumGVNPRE++;
+    ++NumGVNPRE;

     // Update the availability map to include the new instruction.
     localAvail[PREPred]->table.insert(std::make_pair(ValNo, PREInstr));
@@ -2211,8 +2214,10 @@ bool GVN::performPRE(Function &F) {
                                    CurInst->getName() + ".pre-phi",
                                    CurrentBlock->begin());
     for (pred_iterator PI = pred_begin(CurrentBlock),
-         PE = pred_end(CurrentBlock); PI != PE; ++PI)
-      Phi->addIncoming(predMap[*PI], *PI);
+         PE = pred_end(CurrentBlock); PI != PE; ++PI) {
+      BasicBlock *P = *PI;
+      Phi->addIncoming(predMap[P], P);
+    }

     VN.add(Phi, ValNo);
     localAvail[CurrentBlock]->table[ValNo] = Phi;
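In the GVN hunks above, two calls receive the same value number only when they invoke the same function and each pair of arguments has equal value numbers — the loops now walk `getArgOperand(0..N)` instead of raw operands starting at 1. A toy table capturing just that rule; the real pass additionally consults MemoryDependenceAnalysis before reusing a number:

```cpp
#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Toy illustration of what lookup_or_add_call checks before reusing a value
// number: same callee plus pairwise-equal argument value numbers.
struct ToyValueTable {
  std::map<std::string, uint32_t> Numbers;
  uint32_t Next = 0;

  uint32_t lookupOrAdd(const std::string &Callee,
                       const std::vector<uint32_t> &ArgVNs) {
    std::string Key = Callee;
    for (uint32_t VN : ArgVNs)
      Key += "," + std::to_string(VN);
    std::map<std::string, uint32_t>::iterator It = Numbers.find(Key);
    if (It != Numbers.end())
      return It->second;        // redundant call: reuse the number
    return Numbers[Key] = Next++; // otherwise mint a fresh one
  }
};

int main() {
  ToyValueTable VT;
  uint32_t A = VT.lookupOrAdd("f", {1, 2});
  uint32_t B = VT.lookupOrAdd("f", {1, 2});
  return (A == B) ? 0 : 1; // same expression -> same value number
}
```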
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 36bea67..b5c9dd8 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -467,6 +467,17 @@ void IndVarSimplify::EliminateIVRemainders() {
 }

 bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+  // If LoopSimplify form is not available, stay out of trouble. Some notes:
+  //  - LSR currently only supports LoopSimplify-form loops. Indvars'
+  //    canonicalization can be a pessimization without LSR to "clean up"
+  //    afterwards.
+  //  - We depend on having a preheader; in particular,
+  //    Loop::getCanonicalInductionVariable only supports loops with preheaders,
+  //    and we're in trouble if we can't find the induction variable even when
+  //    we've manually inserted one.
+  if (!L->isLoopSimplifyForm())
+    return false;
+
   IU = &getAnalysis<IVUsers>();
   LI = &getAnalysis<LoopInfo>();
   SE = &getAnalysis<ScalarEvolution>();
@@ -760,8 +771,9 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) {
     bool UsedInLoop = false;
     for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
          UI != UE; ++UI) {
-      BasicBlock *UseBB = cast<Instruction>(UI)->getParent();
-      if (PHINode *P = dyn_cast<PHINode>(UI)) {
+      User *U = *UI;
+      BasicBlock *UseBB = cast<Instruction>(U)->getParent();
+      if (PHINode *P = dyn_cast<PHINode>(U)) {
         unsigned i =
           PHINode::getIncomingValueNumForOperand(UI.getOperandNo());
         UseBB = P->getIncomingBlock(i);
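The IndVarSimplify hunk above adds an early bail-out when the loop is not in LoopSimplify form. A minimal standalone sketch of the pattern, assuming a toy loop type (the real predicate also checks details such as a single backedge):

```cpp
// Sketch of the early-out added above: run the rewrite only when the loop
// is in LoopSimplify form. Toy stand-in for llvm::Loop.
struct ToyLoop {
  bool HasPreheader;
  bool HasDedicatedExits;
  // LoopSimplify form guarantees a preheader, which (as the new comment
  // explains) getCanonicalInductionVariable relies on.
  bool isLoopSimplifyForm() const {
    return HasPreheader && HasDedicatedExits;
  }
};

bool runOnLoop(const ToyLoop &L) {
  if (!L.isLoopSimplifyForm())
    return false; // stay out of trouble, exactly as the hunk does
  // ... canonicalize induction variables here ...
  return true;
}

int main() { return runOnLoop(ToyLoop{true, true}) ? 0 : 1; }
```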
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index df05b71..edce14c 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -288,14 +289,15 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
     // Perhaps getConstantOnEdge should be smart enough to do this?
     for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+      BasicBlock *P = *PI;
       // If the value is known by LazyValueInfo to be a constant in a
       // predecessor, use that information to try to thread this block.
-      Constant *PredCst = LVI->getConstantOnEdge(V, *PI, BB);
+      Constant *PredCst = LVI->getConstantOnEdge(V, P, BB);
       if (PredCst == 0 ||
           (!isa<ConstantInt>(PredCst) && !isa<UndefValue>(PredCst)))
         continue;

-      Result.push_back(std::make_pair(dyn_cast<ConstantInt>(PredCst), *PI));
+      Result.push_back(std::make_pair(dyn_cast<ConstantInt>(PredCst), P));
     }

     return !Result.empty();
@@ -345,8 +347,19 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
     }
     for (unsigned i = 0, e = RHSVals.size(); i != e; ++i)
       if (RHSVals[i].first == InterestingVal || RHSVals[i].first == 0) {
-        Result.push_back(RHSVals[i]);
-        Result.back().first = InterestingVal;
+        // If we already inferred a value for this block on the LHS, don't
+        // re-add it.
+        bool HasValue = false;
+        for (unsigned r = 0, e = Result.size(); r != e; ++r)
+          if (Result[r].second == RHSVals[i].second) {
+            HasValue = true;
+            break;
+          }
+
+        if (!HasValue) {
+          Result.push_back(RHSVals[i]);
+          Result.back().first = InterestingVal;
+        }
       }
     return !Result.empty();
   }
@@ -409,20 +422,21 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
       (!isa<Instruction>(Cmp->getOperand(0)) ||
        cast<Instruction>(Cmp->getOperand(0))->getParent() != BB)) {
     Constant *RHSCst = cast<Constant>(Cmp->getOperand(1));
-
+
     for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+      BasicBlock *P = *PI;
       // If the value is known by LazyValueInfo to be a constant in a
       // predecessor, use that information to try to thread this block.
       LazyValueInfo::Tristate
         Res = LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0),
-                                      RHSCst, *PI, BB);
+                                      RHSCst, P, BB);
       if (Res == LazyValueInfo::Unknown)
         continue;

       Constant *ResC = ConstantInt::get(Cmp->getType(), Res);
-      Result.push_back(std::make_pair(cast<ConstantInt>(ResC), *PI));
+      Result.push_back(std::make_pair(cast<ConstantInt>(ResC), P));
     }
-
+
     return !Result.empty();
   }
 }
@@ -538,18 +552,22 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
       (CondInst == 0 || CondInst->getParent() != BB)) { // Non-local definition.
     pred_iterator PI = pred_begin(BB), E = pred_end(BB);
     if (isa<BranchInst>(BB->getTerminator())) {
-      for (; PI != E; ++PI)
-        if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+      for (; PI != E; ++PI) {
+        BasicBlock *P = *PI;
+        if (BranchInst *PBI = dyn_cast<BranchInst>(P->getTerminator()))
           if (PBI->isConditional() && PBI->getCondition() == Condition &&
-              ProcessBranchOnDuplicateCond(*PI, BB))
+              ProcessBranchOnDuplicateCond(P, BB))
             return true;
+      }
     } else {
       assert(isa<SwitchInst>(BB->getTerminator()) && "Unknown jump terminator");
-      for (; PI != E; ++PI)
-        if (SwitchInst *PSI = dyn_cast<SwitchInst>((*PI)->getTerminator()))
+      for (; PI != E; ++PI) {
+        BasicBlock *P = *PI;
+        if (SwitchInst *PSI = dyn_cast<SwitchInst>(P->getTerminator()))
           if (PSI->getCondition() == Condition &&
-              ProcessSwitchOnDuplicateCond(*PI, BB))
+              ProcessSwitchOnDuplicateCond(P, BB))
             return true;
+      }
     }
   }
@@ -569,19 +587,21 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
     // If we have a comparison, loop over the predecessors to see if there is
     // a condition with a lexically identical value.
     pred_iterator PI = pred_begin(BB), E = pred_end(BB);
-    for (; PI != E; ++PI)
-      if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
-        if (PBI->isConditional() && *PI != BB) {
+    for (; PI != E; ++PI) {
+      BasicBlock *P = *PI;
+      if (BranchInst *PBI = dyn_cast<BranchInst>(P->getTerminator()))
+        if (PBI->isConditional() && P != BB) {
          if (CmpInst *CI = dyn_cast<CmpInst>(PBI->getCondition())) {
            if (CI->getOperand(0) == CondCmp->getOperand(0) &&
                CI->getOperand(1) == CondCmp->getOperand(1) &&
                CI->getPredicate() == CondCmp->getPredicate()) {
              // TODO: Could handle things like (x != 4) --> (x == 17)
-              if (ProcessBranchOnDuplicateCond(*PI, BB))
+              if (ProcessBranchOnDuplicateCond(P, BB))
                return true;
            }
          }
        }
+    }
  }
@@ -869,9 +889,15 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
   // Add all the unavailable predecessors to the PredsToSplit list.
   for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
-       PI != PE; ++PI)
-    if (!AvailablePredSet.count(*PI))
-      PredsToSplit.push_back(*PI);
+       PI != PE; ++PI) {
+    BasicBlock *P = *PI;
+    // If the predecessor is an indirect goto, we can't split the edge.
+    if (isa<IndirectBrInst>(P->getTerminator()))
+      return false;
+
+    if (!AvailablePredSet.count(P))
+      PredsToSplit.push_back(P);
+  }

   // Split them out to their own block.
   UnavailablePred =
@@ -903,11 +929,12 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
   // have multiple entries here.
   for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E;
        ++PI) {
+    BasicBlock *P = *PI;
     AvailablePredsTy::iterator I =
       std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
-                       std::make_pair(*PI, (Value*)0));
+                       std::make_pair(P, (Value*)0));

-    assert(I != AvailablePreds.end() && I->first == *PI &&
+    assert(I != AvailablePreds.end() && I->first == P &&
           "Didn't find entry for predecessor!");

     PN->addIncoming(I->second, I->first);
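Two defensive fixes appear in the JumpThreading hunks above: predecessors already assigned a value by the LHS of an OR are not re-added for the RHS, and edges from indirectbr terminators are refused because they cannot be split. The de-duplication loop is easy to check in isolation, with ints standing in for ConstantInt*/BasicBlock*:

```cpp
#include <utility>
#include <vector>

// Standalone sketch of the de-duplication added above: before recording a
// (value, predecessor) pair from the RHS of an OR, skip predecessors the
// LHS already produced, so the result list never names a block twice.
typedef std::pair<int, int> ValuePred; // (known value, predecessor id)

void addUnique(std::vector<ValuePred> &Result, ValuePred VP) {
  for (size_t r = 0, e = Result.size(); r != e; ++r)
    if (Result[r].second == VP.second)
      return; // this predecessor already has an inferred value
  Result.push_back(VP);
}

int main() {
  std::vector<ValuePred> Result;
  addUnique(Result, ValuePred(1, 7));
  addUnique(Result, ValuePred(1, 7)); // duplicate predecessor: ignored
  return (Result.size() == 1) ? 0 : 1;
}
```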
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 48817ab..e4894e9 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -83,7 +83,7 @@ bool LoopDeletion::IsLoopDead(Loop* L,
       if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator()))
         return false;

-      BI++;
+      ++BI;
     }

   // Make sure that no instructions in the block have potential side-effects.
@@ -176,7 +176,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
   BasicBlock::iterator BI = exitBlock->begin();
   while (PHINode* P = dyn_cast<PHINode>(BI)) {
     P->replaceUsesOfWith(exitingBlock, preheader);
-    BI++;
+    ++BI;
   }

   // Update the dominator tree and remove the instructions and blocks that will
@@ -226,7 +226,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
   LPM.deleteLoopFromQueue(L);

   Changed = true;
-  NumDeleted++;
+  ++NumDeleted;

   return Changed;
 }
diff --git a/lib/Transforms/Scalar/LoopIndexSplit.cpp b/lib/Transforms/Scalar/LoopIndexSplit.cpp
index 101ff5b..31058e5 100644
--- a/lib/Transforms/Scalar/LoopIndexSplit.cpp
+++ b/lib/Transforms/Scalar/LoopIndexSplit.cpp
@@ -649,7 +649,7 @@ bool LoopIndexSplit::updateLoopIterationSpace() {
       }
     }
   }
-  NumRestrictBounds++;
+  ++NumRestrictBounds;
   return true;
 }
@@ -958,11 +958,11 @@ bool LoopIndexSplit::splitLoop() {
         continue;

       for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
-          BI != BE; ++BI) {
+           BI != BE; ++BI) {
         Instruction *Inst = BI;

         if (!Inst->isSafeToSpeculativelyExecute() && !isa<PHINode>(Inst)
-            && !isa<BranchInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst))
+             && !isa<BranchInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst))
           return false;
       }
     }
@@ -1016,13 +1016,13 @@ bool LoopIndexSplit::splitLoop() {
     BSV = getMax(BSV, IVStartValue, Sign, PHTerm);

   // [*] Clone Loop
-  DenseMap<const Value *, Value *> ValueMap;
-  Loop *BLoop = CloneLoop(L, LPM, LI, ValueMap, this);
+  ValueMap<const Value *, Value *> VMap;
+  Loop *BLoop = CloneLoop(L, LPM, LI, VMap, this);
   Loop *ALoop = L;

   // [*] ALoop's exiting edge enters BLoop's header.
   //     ALoop's original exit block becomes BLoop's exit block.
-  PHINode *B_IndVar = cast<PHINode>(ValueMap[IndVar]);
+  PHINode *B_IndVar = cast<PHINode>(VMap[IndVar]);
   BasicBlock *A_ExitingBlock = ExitCondition->getParent();
   BranchInst *A_ExitInsn =
     dyn_cast<BranchInst>(A_ExitingBlock->getTerminator());
@@ -1047,7 +1047,7 @@ bool LoopIndexSplit::splitLoop() {
   for (BasicBlock::iterator BI = ALoop->getHeader()->begin(),
        BE = ALoop->getHeader()->end(); BI != BE; ++BI) {
     if (PHINode *PN = dyn_cast<PHINode>(BI)) {
-      PHINode *PNClone = cast<PHINode>(ValueMap[PN]);
+      PHINode *PNClone = cast<PHINode>(VMap[PN]);
       InverseMap[PNClone] = PN;
     } else
       break;
@@ -1085,11 +1085,11 @@ bool LoopIndexSplit::splitLoop() {
   //    block. Remove incoming PHINode values from ALoop's exiting block.
   //    Add new incoming values from BLoop's incoming exiting value.
   //    Update BLoop exit block's dominator info..
-  BasicBlock *B_ExitingBlock = cast<BasicBlock>(ValueMap[A_ExitingBlock]);
+  BasicBlock *B_ExitingBlock = cast<BasicBlock>(VMap[A_ExitingBlock]);
   for (BasicBlock::iterator BI = B_ExitBlock->begin(), BE = B_ExitBlock->end();
        BI != BE; ++BI) {
     if (PHINode *PN = dyn_cast<PHINode>(BI)) {
-      PN->addIncoming(ValueMap[PN->getIncomingValueForBlock(A_ExitingBlock)],
+      PN->addIncoming(VMap[PN->getIncomingValueForBlock(A_ExitingBlock)],
                       B_ExitingBlock);
       PN->removeIncomingValue(A_ExitingBlock);
     } else
@@ -1131,7 +1131,7 @@ bool LoopIndexSplit::splitLoop() {
   removeBlocks(A_InactiveBranch, L, A_ActiveBranch);

   //[*] Eliminate split condition's inactive branch in from BLoop.
-  BasicBlock *B_SplitCondBlock = cast<BasicBlock>(ValueMap[A_SplitCondBlock]);
+  BasicBlock *B_SplitCondBlock = cast<BasicBlock>(VMap[A_SplitCondBlock]);
   BranchInst *B_BR = cast<BranchInst>(B_SplitCondBlock->getTerminator());
   BasicBlock *B_InactiveBranch = NULL;
   BasicBlock *B_ActiveBranch = NULL;
@@ -1146,9 +1146,9 @@ bool LoopIndexSplit::splitLoop() {

   //[*] Move exit condition into split condition block to avoid
   //    executing dead loop iteration.
-  ICmpInst *B_ExitCondition = cast<ICmpInst>(ValueMap[ExitCondition]);
-  Instruction *B_IndVarIncrement = cast<Instruction>(ValueMap[IVIncrement]);
-  ICmpInst *B_SplitCondition = cast<ICmpInst>(ValueMap[SplitCondition]);
+  ICmpInst *B_ExitCondition = cast<ICmpInst>(VMap[ExitCondition]);
+  Instruction *B_IndVarIncrement = cast<Instruction>(VMap[IVIncrement]);
+  ICmpInst *B_SplitCondition = cast<ICmpInst>(VMap[SplitCondition]);

   moveExitCondition(A_SplitCondBlock, A_ActiveBranch, A_ExitBlock, ExitCondition,
                     cast<ICmpInst>(SplitCondition), IndVar, IVIncrement,
@@ -1159,7 +1159,7 @@ bool LoopIndexSplit::splitLoop() {
                     B_SplitCondition, B_IndVar, B_IndVarIncrement,
                     BLoop, EVOpNum);

-  NumIndexSplit++;
+  ++NumIndexSplit;
   return true;
 }
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 5004483..16c4a15 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -147,7 +147,7 @@ bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) {
       continue;   // PHI nodes don't count.
     if (isa<DbgInfoIntrinsic>(OI))
       continue;  // Debug intrinsics don't count as size.
-    Size++;
+    ++Size;
   }

   if (Size > MAX_HEADER_SIZE)
@@ -263,7 +263,7 @@ bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) {

   preserveCanonicalLoopForm(LPM);

-  NumRotated++;
+  ++NumRotated;
   return true;
 }
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 86ea3eb..a250a88 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -392,12 +392,13 @@ static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
   return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
 }

-/// isMulSExtable - Return true if the given add can be sign-extended
+/// isMulSExtable - Return true if the given mul can be sign-extended
 /// without changing its value.
-static bool isMulSExtable(const SCEVMulExpr *A, ScalarEvolution &SE) {
+static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
   const Type *WideTy =
-    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
-  return isa<SCEVMulExpr>(SE.getSignExtendExpr(A, WideTy));
+    IntegerType::get(SE.getContext(),
+                     SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
+  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
 }

 /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
@@ -413,20 +414,28 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
   if (LHS == RHS)
     return SE.getConstant(LHS->getType(), 1);

-  // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some
-  // folding.
-  if (RHS->isAllOnesValue())
-    return SE.getMulExpr(LHS, RHS);
+  // Handle a few RHS special cases.
+  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
+  if (RC) {
+    const APInt &RA = RC->getValue()->getValue();
+    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
+    // some folding.
+    if (RA.isAllOnesValue())
+      return SE.getMulExpr(LHS, RC);
+    // Handle x /s 1 as x.
+    if (RA == 1)
+      return LHS;
+  }

   // Check for a division of a constant by a constant.
   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
-    const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
     if (!RC)
       return 0;
-    if (C->getValue()->getValue().srem(RC->getValue()->getValue()) != 0)
+    const APInt &LA = C->getValue()->getValue();
+    const APInt &RA = RC->getValue()->getValue();
+    if (LA.srem(RA) != 0)
       return 0;
-    return SE.getConstant(C->getValue()->getValue()
-               .sdiv(RC->getValue()->getValue()));
+    return SE.getConstant(LA.sdiv(RA));
   }

   // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
@@ -440,6 +449,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
       if (!Step) return 0;
       return SE.getAddRecExpr(Start, Step, AR->getLoop());
     }
+    return 0;
   }

   // Distribute the sdiv over add operands, if the add doesn't overflow.
@@ -455,10 +465,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
       }
       return SE.getAddExpr(Ops);
     }
+    return 0;
   }

   // Check for a multiply operand that we can pull RHS out of.
-  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS))
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
     if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
       SmallVector<const SCEV *, 4> Ops;
       bool Found = false;
@@ -475,6 +486,8 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
       }
       return Found ? SE.getMulExpr(Ops) : 0;
     }
+    return 0;
+  }

   // Otherwise we don't know.
   return 0;
@@ -546,7 +559,7 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
       case Intrinsic::x86_sse2_storeu_pd:
       case Intrinsic::x86_sse2_storeu_dq:
       case Intrinsic::x86_sse2_storel_dq:
-        if (II->getOperand(1) == OperandVal)
+        if (II->getArgOperand(0) == OperandVal)
           isAddress = true;
         break;
     }
@@ -568,7 +581,7 @@ static const Type *getAccessType(const Instruction *Inst) {
     case Intrinsic::x86_sse2_storeu_pd:
     case Intrinsic::x86_sse2_storeu_dq:
     case Intrinsic::x86_sse2_storel_dq:
-      AccessTy = II->getOperand(1)->getType();
+      AccessTy = II->getArgOperand(0)->getType();
       break;
     }
   }
@@ -976,6 +989,8 @@ public:
   void dump() const;
 };

+}
+
 /// HasFormula - Test whether this use as a formula which has the same
 /// registers as the given formula.
 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
@@ -1203,6 +1218,32 @@ static bool isAlwaysFoldable(const SCEV *S,
   return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
 }

+namespace {
+
+/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
+/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
+struct UseMapDenseMapInfo {
+  static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
+    return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
+  }
+
+  static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
+    return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
+  }
+
+  static unsigned
+  getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
+    unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
+    Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
+    return Result;
+  }
+
+  static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
+                      const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
+    return LHS == RHS;
+  }
+};
+
 /// FormulaSorter - This class implements an ordering for formulae which sorts
 /// the by their standalone cost.
 class FormulaSorter {
@@ -1275,7 +1316,9 @@ class LSRInstance {
   }

   // Support for sharing of LSRUses between LSRFixups.
-  typedef DenseMap<const SCEV *, size_t> UseMapTy;
+  typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
+                   size_t,
+                   UseMapDenseMapInfo> UseMapTy;
   UseMapTy UseMap;

   bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
@@ -1613,8 +1656,11 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
     NewRHS = Sel->getOperand(1);
   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
     NewRHS = Sel->getOperand(2);
+  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
+    NewRHS = SU->getValue();
   else
-    llvm_unreachable("Max doesn't match expected pattern!");
+    // Max doesn't match expected pattern.
+    return Cond;

   // Determine the new comparison opcode. It may be signed or unsigned,
   // and the original comparison may be either equality or inequality.
@@ -1805,6 +1851,8 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
     NewMaxOffset = NewOffset;
   }
   // Check for a mismatched access type, and fall back conservatively as needed.
+  // TODO: Be less conservative when the type is similar and can use the same
+  // addressing modes.
   if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
     NewAccessTy = Type::getVoidTy(AccessTy->getContext());

@@ -1833,7 +1881,7 @@ LSRInstance::getUse(const SCEV *&Expr,
   }

   std::pair<UseMapTy::iterator, bool> P =
-    UseMap.insert(std::make_pair(Expr, 0));
+    UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0));
   if (!P.second) {
     // A use already existed with this base.
     size_t LUIdx = P.first->second;
@@ -1919,7 +1967,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() {
         Strides.insert(AR->getStepRecurrence(SE));
         Worklist.push_back(AR->getStart());
       } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
-        Worklist.insert(Worklist.end(), Add->op_begin(), Add->op_end());
+        Worklist.append(Add->op_begin(), Add->op_end());
       }
     } while (!Worklist.empty());
   }
@@ -2086,7 +2134,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
       const SCEV *S = Worklist.pop_back_val();

       if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
-        Worklist.insert(Worklist.end(), N->op_begin(), N->op_end());
+        Worklist.append(N->op_begin(), N->op_end());
       else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
         Worklist.push_back(C->getOperand());
       else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
@@ -2095,8 +2143,12 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
       } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
         if (!Inserted.insert(U)) continue;
         const Value *V = U->getValue();
-        if (const Instruction *Inst = dyn_cast<Instruction>(V))
+        if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+          // Look for instructions defined outside the loop.
           if (L->contains(Inst)) continue;
+        } else if (isa<UndefValue>(V))
+          // Undef doesn't have a live range, so it doesn't matter.
+          continue;
         for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
              UI != UE; ++UI) {
           const Instruction *UserInst = dyn_cast<Instruction>(*UI);
@@ -2155,20 +2207,23 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
 /// separate registers. If C is non-null, multiply each subexpression by C.
 static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
                             SmallVectorImpl<const SCEV *> &Ops,
+                            SmallVectorImpl<const SCEV *> &UninterestingOps,
+                            const Loop *L,
                             ScalarEvolution &SE) {
   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
     // Break out add operands.
     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
          I != E; ++I)
-      CollectSubexprs(*I, C, Ops, SE);
+      CollectSubexprs(*I, C, Ops, UninterestingOps, L, SE);
     return;
   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
     // Split a non-zero base out of an addrec.
     if (!AR->getStart()->isZero()) {
       CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
                                        AR->getStepRecurrence(SE),
-                                       AR->getLoop()), C, Ops, SE);
-      CollectSubexprs(AR->getStart(), C, Ops, SE);
+                                       AR->getLoop()),
+                      C, Ops, UninterestingOps, L, SE);
+      CollectSubexprs(AR->getStart(), C, Ops, UninterestingOps, L, SE);
       return;
     }
   } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
@@ -2178,13 +2233,17 @@ static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
           dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
       CollectSubexprs(Mul->getOperand(1),
                       C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0,
-                      Ops, SE);
+                      Ops, UninterestingOps, L, SE);
       return;
     }
   }

-  // Otherwise use the value itself.
-  Ops.push_back(C ? SE.getMulExpr(C, S) : S);
+  // Otherwise use the value itself. Loop-variant "unknown" values are
+  // uninteresting; we won't be able to do anything meaningful with them.
+  if (!C && isa<SCEVUnknown>(S) && !S->isLoopInvariant(L))
+    UninterestingOps.push_back(S);
+  else
+    Ops.push_back(C ? SE.getMulExpr(C, S) : S);
 }

 /// GenerateReassociations - Split out subexpressions from adds and the bases of
@@ -2198,8 +2257,15 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
     const SCEV *BaseReg = Base.BaseRegs[i];

-    SmallVector<const SCEV *, 8> AddOps;
-    CollectSubexprs(BaseReg, 0, AddOps, SE);
+    SmallVector<const SCEV *, 8> AddOps, UninterestingAddOps;
+    CollectSubexprs(BaseReg, 0, AddOps, UninterestingAddOps, L, SE);
+
+    // Add any uninteresting values as one register, as we won't be able to
+    // form any interesting reassociation opportunities with them. They'll
+    // just have to be added inside the loop no matter what we do.
+    if (!UninterestingAddOps.empty())
+      AddOps.push_back(SE.getAddExpr(UninterestingAddOps));
+
     if (AddOps.size() == 1) continue;

     for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
@@ -2212,11 +2278,10 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
         continue;

       // Collect all operands except *J.
-      SmallVector<const SCEV *, 8> InnerAddOps;
-      for (SmallVectorImpl<const SCEV *>::const_iterator K = AddOps.begin(),
-           KE = AddOps.end(); K != KE; ++K)
-        if (K != J)
-          InnerAddOps.push_back(*K);
+      SmallVector<const SCEV *, 8> InnerAddOps
+        ( ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
+      InnerAddOps.append
+        (next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end());

       // Don't leave just a constant behind in a register if the constant could
       // be folded into an immediate field.
@@ -2350,13 +2415,12 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
   for (SmallSetVector<int64_t, 8>::const_iterator
        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
     int64_t Factor = *I;
-    Formula F = Base;

     // Check that the multiplication doesn't overflow.
-    if (F.AM.BaseOffs == INT64_MIN && Factor == -1)
+    if (Base.AM.BaseOffs == INT64_MIN && Factor == -1)
       continue;
-    F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
-    if (F.AM.BaseOffs / Factor != Base.AM.BaseOffs)
+    int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
+    if (NewBaseOffs / Factor != Base.AM.BaseOffs)
       continue;

     // Check that multiplying with the use offset doesn't overflow.
@@ -2367,6 +2431,9 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     if (Offset / Factor != LU.MinOffset)
       continue;

+    Formula F = Base;
+    F.AM.BaseOffs = NewBaseOffs;
+
     // Check that this scale is legal.
     if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
       continue;
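Two of the LoopStrengthReduce fixes above are easy to check by hand. First, `isMulSExtable` now widens to `size * numOperands` bits: a product of k n-bit values can need up to n*k bits, so the old "+1" (which is only enough for an add) under-estimated. Second, `getExactSDiv` grows RHS special cases whose arithmetic the standalone sketch below spells out:

```cpp
#include <cstdint>

// Worked sketch of the RHS special cases added to getExactSDiv above:
// x /s 1 is x, x /s -1 is -x, and two constants divide exactly only when
// the signed remainder is zero. Returns false when no exact result exists.
bool exactSDiv(int64_t LHS, int64_t RHS, int64_t &Result) {
  if (RHS == 1)  { Result = LHS;  return true; } // x /s 1  -> x
  if (RHS == -1) { Result = -LHS; return true; } // x /s -1 -> x * -1
  if (RHS == 0 || LHS % RHS != 0)
    return false; // not an exact division: give up
  Result = LHS / RHS;
  return true;
}

int main() {
  int64_t R;
  bool Exact   = exactSDiv(12, 4, R) && R == 3;
  bool Inexact = !exactSDiv(12, 5, R);
  return (Exact && Inexact) ? 0 : 1;
}
```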
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index ae7bf40..0c900ff 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -445,7 +445,7 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) {
   // This is a very ad-hoc heuristic.
   if (Metrics.NumInsts > Threshold ||
       Metrics.NumBlocks * 5 > Threshold ||
-      Metrics.NeverInline) {
+      Metrics.containsIndirectBr || Metrics.isRecursive) {
     DEBUG(dbgs() << "NOT unswitching loop %"
           << currentLoop->getHeader()->getName() << ", cost too high: "
           << currentLoop->getBlocks().size() << "\n");
@@ -457,21 +457,21 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) {
 }

 // RemapInstruction - Convert the instruction operands from referencing the
-// current values into those specified by ValueMap.
+// current values into those specified by VMap.
 //
 static inline void RemapInstruction(Instruction *I,
-                                    DenseMap<const Value *, Value*> &ValueMap) {
+                                    ValueMap<const Value *, Value*> &VMap) {
   for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
     Value *Op = I->getOperand(op);
-    DenseMap<const Value *, Value*>::iterator It = ValueMap.find(Op);
-    if (It != ValueMap.end()) Op = It->second;
+    ValueMap<const Value *, Value*>::iterator It = VMap.find(Op);
+    if (It != VMap.end()) Op = It->second;
     I->setOperand(op, Op);
   }
 }

 /// CloneLoop - Recursively clone the specified loop and all of its children,
 /// mapping the blocks with the specified map.
-static Loop *CloneLoop(Loop *L, Loop *PL, DenseMap<const Value*, Value*> &VM,
+static Loop *CloneLoop(Loop *L, Loop *PL, ValueMap<const Value*, Value*> &VM,
                        LoopInfo *LI, LPPassManager *LPM) {
   Loop *New = new Loop();
   LPM->insertLoop(New, PL);
@@ -615,11 +615,11 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
   // the loop preheader and exit blocks), keeping track of the mapping between
   // the instructions and blocks.
   NewBlocks.reserve(LoopBlocks.size());
-  DenseMap<const Value*, Value*> ValueMap;
+  ValueMap<const Value*, Value*> VMap;
   for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
-    BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], ValueMap, ".us", F);
+    BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F);
     NewBlocks.push_back(NewBB);
-    ValueMap[LoopBlocks[i]] = NewBB;  // Keep the BB mapping.
+    VMap[LoopBlocks[i]] = NewBB;  // Keep the BB mapping.
     LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L);
   }

@@ -629,7 +629,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
                                 NewBlocks[0], F->end());

   // Now we create the new Loop object for the versioned loop.
-  Loop *NewLoop = CloneLoop(L, L->getParentLoop(), ValueMap, LI, LPM);
+  Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
   Loop *ParentLoop = L->getParentLoop();
   if (ParentLoop) {
     // Make sure to add the cloned preheader and exit blocks to the parent loop
@@ -638,7 +638,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
   }

   for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
-    BasicBlock *NewExit = cast<BasicBlock>(ValueMap[ExitBlocks[i]]);
+    BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]);
     // The new exit block should be in the same loop as the old one.
     if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
       ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase());
@@ -653,8 +653,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
     for (BasicBlock::iterator I = ExitSucc->begin(); isa<PHINode>(I); ++I) {
       PN = cast<PHINode>(I);
       Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]);
-      DenseMap<const Value *, Value*>::iterator It = ValueMap.find(V);
-      if (It != ValueMap.end()) V = It->second;
+      ValueMap<const Value *, Value*>::iterator It = VMap.find(V);
+      if (It != VMap.end()) V = It->second;
       PN->addIncoming(V, NewExit);
     }
   }
@@ -663,7 +663,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
   for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
     for (BasicBlock::iterator I = NewBlocks[i]->begin(),
            E = NewBlocks[i]->end(); I != E; ++I)
-      RemapInstruction(I, ValueMap);
+      RemapInstruction(I, VMap);

   // Rewrite the original preheader to select between versions of the loop.
   BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
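The LoopUnswitch (and LoopIndexSplit) hunks above swap the clone map from `DenseMap` to `llvm::ValueMap`, whose entries follow keys through RAUW and deletion instead of going stale. The remapping walk itself is unchanged; here is its logic in standalone form, with ints standing in for Values:

```cpp
#include <map>
#include <vector>

// Standalone sketch of RemapInstruction above: rewrite each operand that
// has an entry in the clone map, leave the rest alone. (The commit's real
// change is only the container type.)
struct ToyInst { std::vector<int> Ops; };

void remapInstruction(ToyInst &I, const std::map<int, int> &VMap) {
  for (size_t op = 0, e = I.Ops.size(); op != e; ++op) {
    std::map<int, int>::const_iterator It = VMap.find(I.Ops[op]);
    if (It != VMap.end())
      I.Ops[op] = It->second; // operand was cloned: use the clone
  }
}

int main() {
  ToyInst I{{1, 2, 3}};
  std::map<int, int> VMap;
  VMap[2] = 20; // value 2 has a cloned counterpart
  remapInstruction(I, VMap);
  return (I.Ops[1] == 20) ? 0 : 1;
}
```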
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 3611b8e..0e566c5 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -632,7 +632,7 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
   // Remove the memcpy
   MD.removeInstruction(cpy);
   cpy->eraseFromParent();
-  NumMemCpyInstr++;
+  ++NumMemCpyInstr;

   return true;
 }
@@ -710,7 +710,7 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
   if (MD.getDependency(C) == dep) {
     MD.removeInstruction(M);
     M->eraseFromParent();
-    NumMemCpyInstr++;
+    ++NumMemCpyInstr;
     return true;
   }
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 5aca9cdc..98452f5 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -407,13 +407,14 @@ static Value *NegateValue(Value *V, Instruction *BI) {
   // Okay, we need to materialize a negated version of V with an instruction.
   // Scan the use lists of V to see if we have one already.
   for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
-    if (!BinaryOperator::isNeg(*UI)) continue;
+    User *U = *UI;
+    if (!BinaryOperator::isNeg(U)) continue;

     // We found one!  Now we have to make sure that the definition dominates
     // this use.  We do this by moving it to the entry block (if it is a
     // non-instruction value) or right after the definition.  These negates will
     // be zapped by reassociate later, so we don't need much finesse here.
-    BinaryOperator *TheNeg = cast<BinaryOperator>(*UI);
+    BinaryOperator *TheNeg = cast<BinaryOperator>(U);

     // Verify that the negate is in this function, V might be a constant expr.
     if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 5ca9ce3..dd445f6 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -926,7 +926,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI,
   DeleteDeadInstructions();
   AI->eraseFromParent();
-  NumReplaced++;
+  ++NumReplaced;
 }

 /// DeleteDeadInstructions - Erase instructions on the DeadInstrs list,
@@ -965,11 +965,11 @@ void SROA::isSafeForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
       isSafeGEP(GEPI, AI, GEPOffset, Info);
       if (!Info.isUnsafe)
         isSafeForScalarRepl(GEPI, AI, GEPOffset, Info);
-    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(UI)) {
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
       ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
       if (Length)
         isSafeMemAccess(AI, Offset, Length->getZExtValue(), 0,
-                        UI.getOperandNo() == 1, Info);
+                        UI.getOperandNo() == CallInst::ArgOffset, Info);
       else
         MarkUnsafe(Info);
     } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
@@ -1272,6 +1272,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
   // If there is an other pointer, we want to convert it to the same pointer
   // type as AI has, so we can GEP through it safely.
   if (OtherPtr) {
+    unsigned AddrSpace =
+      cast<PointerType>(OtherPtr->getType())->getAddressSpace();
+
     // Remove bitcasts and all-zero GEPs from OtherPtr.  This is an
     // optimization, but it's also required to detect the corner case where
@@ -1279,20 +1281,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
     // OtherPtr may be a bitcast or GEP that currently being rewritten. (This
     // function is only called for mem intrinsics that access the whole
     // aggregate, so non-zero GEPs are not an issue here.)
-    while (1) {
-      if (BitCastInst *BC = dyn_cast<BitCastInst>(OtherPtr)) {
-        OtherPtr = BC->getOperand(0);
-        continue;
-      }
-      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(OtherPtr)) {
-        // All zero GEPs are effectively bitcasts.
-        if (GEP->hasAllZeroIndices()) {
-          OtherPtr = GEP->getOperand(0);
-          continue;
-        }
-      }
-      break;
-    }
+    OtherPtr = OtherPtr->stripPointerCasts();
+
     // Copying the alloca to itself is a no-op: just delete it.
     if (OtherPtr == AI || OtherPtr == NewElts[0]) {
       // This code will run twice for a no-op memcpy -- once for each operand.
@@ -1304,15 +1294,13 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
       return;
     }

-    if (ConstantExpr *BCE = dyn_cast<ConstantExpr>(OtherPtr))
-      if (BCE->getOpcode() == Instruction::BitCast)
-        OtherPtr = BCE->getOperand(0);
-
     // If the pointer is not the right type, insert a bitcast to the right
     // type.
-    if (OtherPtr->getType() != AI->getType())
-      OtherPtr = new BitCastInst(OtherPtr, AI->getType(), OtherPtr->getName(),
-                                 MI);
+    const Type *NewTy =
+      PointerType::get(AI->getType()->getElementType(), AddrSpace);
+
+    if (OtherPtr->getType() != NewTy)
+      OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI);
   }

   // Process each element of the aggregate.
@@ -1373,7 +1361,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
         // If the stored element is zero (common case), just store a null
         // constant.
         Constant *StoreVal;
-        if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getOperand(2))) {
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getArgOperand(1))) {
           if (CI->isZero()) {
             StoreVal = Constant::getNullValue(EltTy);  // 0.0, null, 0, <0,0>
           } else {
@@ -1436,7 +1424,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
       Value *Ops[] = {
         SROADest ? EltPtr : OtherElt,  // Dest ptr
         SROADest ? OtherElt : EltPtr,  // Src ptr
-        ConstantInt::get(MI->getOperand(3)->getType(), EltSize), // Size
+        ConstantInt::get(MI->getArgOperand(2)->getType(), EltSize), // Size
         // Align
         ConstantInt::get(Type::getInt32Ty(MI->getContext()), OtherEltAlign),
         MI->getVolatileCst()
@@ -1451,8 +1439,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
     } else {
       assert(isa<MemSetInst>(MI));
       Value *Ops[] = {
-        EltPtr, MI->getOperand(2),  // Dest, Value,
-        ConstantInt::get(MI->getOperand(3)->getType(), EltSize), // Size
+        EltPtr, MI->getArgOperand(1),  // Dest, Value,
+        ConstantInt::get(MI->getArgOperand(2)->getType(), EltSize), // Size
         Zero,  // Align
         ConstantInt::get(Type::getInt1Ty(MI->getContext()), 0) // isVolatile
       };
@@ -1655,7 +1643,12 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
       SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI);
     }

-    ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI);
+    // Don't create an 'or x, 0' on the first iteration.
+    if (!isa<Constant>(ResultVal) ||
+        !cast<Constant>(ResultVal)->isNullValue())
+      ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI);
+    else
+      ResultVal = SrcField;
   }

   // Handle tail padding by truncating the result
@@ -1794,7 +1787,7 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
       if (isOffset) return false;

       // If the memintrinsic isn't using the alloca as the dest, reject it.
-      if (UI.getOperandNo() != 1) return false;
+      if (UI.getOperandNo() != CallInst::ArgOffset) return false;

       // If the source of the memcpy/move is not a constant global, reject it.
       if (!PointsToConstantGlobal(MI->getSource()))
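The SROA hunk above folds a hand-rolled peeling loop into `Value::stripPointerCasts`, which looks through bitcasts and all-zero GEPs to the underlying object (and the rewrite now also preserves the original pointer's address space when re-casting). A standalone model of the peeling logic, with a tagged node in place of `llvm::Value`:

```cpp
// Sketch of what Value::stripPointerCasts does for the hunk above: peel
// bitcasts and all-zero GEPs until a base object is reached. Toy tagged
// node, not the LLVM class.
struct Node {
  enum Kind { Base, BitCast, ZeroGEP } K;
  Node *Operand;
};

const Node *stripPointerCasts(const Node *V) {
  while (true) {
    if (V->K == Node::BitCast) { V = V->Operand; continue; }
    // All-zero GEPs are effectively bitcasts, as the old loop noted.
    if (V->K == Node::ZeroGEP) { V = V->Operand; continue; }
    return V; // reached the underlying object
  }
}

int main() {
  Node B{Node::Base, nullptr};
  Node G{Node::ZeroGEP, &B};
  Node C{Node::BitCast, &G};
  return (stripPointerCasts(&C) == &B) ? 0 : 1;
}
```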
uint64_t Len = GetStringLength(Src); @@ -162,12 +181,12 @@ struct StrNCatOpt : public StrCatOpt { return 0; // Extract some information from the instruction - Value *Dst = CI->getOperand(1); - Value *Src = CI->getOperand(2); + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); uint64_t Len; // We don't do anything if length is not constant - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getOperand(3))) + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) Len = LengthArg->getZExtValue(); else return 0; @@ -207,11 +226,11 @@ struct StrChrOpt : public LibCallOptimization { FT->getParamType(0) != FT->getReturnType()) return 0; - Value *SrcStr = CI->getOperand(1); + Value *SrcStr = CI->getArgOperand(0); // If the second operand is non-constant, see if we can compute the length // of the input string and turn this into memchr. - ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getOperand(2)); + ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); if (CharC == 0) { // These optimizations require TargetData. if (!TD) return 0; @@ -220,7 +239,7 @@ struct StrChrOpt : public LibCallOptimization { if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32. return 0; - return EmitMemChr(SrcStr, CI->getOperand(2), // include nul. + return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. ConstantInt::get(TD->getIntPtrType(*Context), Len), B, TD); } @@ -260,12 +279,12 @@ struct StrCmpOpt : public LibCallOptimization { // Verify the "strcmp" function prototype. const FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || - !FT->getReturnType()->isIntegerTy(32) || + !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(*Context)) return 0; - Value *Str1P = CI->getOperand(1), *Str2P = CI->getOperand(2); + Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strcmp(x,x) -> 0 return ConstantInt::get(CI->getType(), 0); @@ -308,19 +327,19 @@ struct StrNCmpOpt : public LibCallOptimization { // Verify the "strncmp" function prototype. const FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || - !FT->getReturnType()->isIntegerTy(32) || + !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(*Context) || !FT->getParamType(2)->isIntegerTy()) return 0; - Value *Str1P = CI->getOperand(1), *Str2P = CI->getOperand(2); + Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strncmp(x,x,n) -> 0 return ConstantInt::get(CI->getType(), 0); // Get the length argument if it is constant. 
uint64_t Length; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getOperand(3))) + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) Length = LengthArg->getZExtValue(); else return 0; @@ -328,6 +347,9 @@ struct StrNCmpOpt : public LibCallOptimization { if (Length == 0) // strncmp(x,y,0) -> 0 return ConstantInt::get(CI->getType(), 0); + if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) + return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD); + std::string Str1, Str2; bool HasStr1 = GetConstantStringInfo(Str1P, Str1); bool HasStr2 = GetConstantStringInfo(Str2P, Str2); @@ -365,7 +387,7 @@ struct StrCpyOpt : public LibCallOptimization { FT->getParamType(0) != Type::getInt8PtrTy(*Context)) return 0; - Value *Dst = CI->getOperand(1), *Src = CI->getOperand(2); + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; @@ -381,7 +403,7 @@ struct StrCpyOpt : public LibCallOptimization { if (OptChkCall) EmitMemCpyChk(Dst, Src, ConstantInt::get(TD->getIntPtrType(*Context), Len), - CI->getOperand(3), B, TD); + CI->getArgOperand(2), B, TD); else EmitMemCpy(Dst, Src, ConstantInt::get(TD->getIntPtrType(*Context), Len), @@ -402,9 +424,9 @@ struct StrNCpyOpt : public LibCallOptimization { !FT->getParamType(2)->isIntegerTy()) return 0; - Value *Dst = CI->getOperand(1); - Value *Src = CI->getOperand(2); - Value *LenOp = CI->getOperand(3); + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + Value *LenOp = CI->getArgOperand(2); // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); @@ -452,7 +474,7 @@ struct StrLenOpt : public LibCallOptimization { !FT->getReturnType()->isIntegerTy()) return 0; - Value *Src = CI->getOperand(1); + Value *Src = CI->getArgOperand(0); // Constant folding: strlen("xyz") -> 3 if (uint64_t Len = GetStringLength(Src)) @@ -477,7 +499,7 @@ struct StrToOpt : public LibCallOptimization { !FT->getParamType(1)->isPointerTy()) return 0; - Value *EndPtr = CI->getOperand(2); + Value *EndPtr = CI->getArgOperand(1); if (isa<ConstantPointerNull>(EndPtr)) { CI->setOnlyReadsMemory(); CI->addAttribute(1, Attribute::NoCapture); @@ -500,17 +522,34 @@ struct StrStrOpt : public LibCallOptimization { return 0; // fold strstr(x, x) -> x. - if (CI->getOperand(1) == CI->getOperand(2)) - return B.CreateBitCast(CI->getOperand(1), CI->getType()); + if (CI->getArgOperand(0) == CI->getArgOperand(1)) + return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); + + // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 + if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { + Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD); + Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), + StrLen, B, TD); + for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end(); + UI != UE; ) { + ICmpInst *Old = cast<ICmpInst>(UI++); + Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, + ConstantInt::getNullValue(StrNCmp->getType()), + "cmp"); + Old->replaceAllUsesWith(Cmp); + Old->eraseFromParent(); + } + return CI; + } // See if either input string is a constant string. std::string SearchStr, ToFindStr; - bool HasStr1 = GetConstantStringInfo(CI->getOperand(1), SearchStr); - bool HasStr2 = GetConstantStringInfo(CI->getOperand(2), ToFindStr); + bool HasStr1 = GetConstantStringInfo(CI->getArgOperand(0), SearchStr); + bool HasStr2 = GetConstantStringInfo(CI->getArgOperand(1), ToFindStr); // fold strstr(x, "") -> x. 
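At the source level, the strstr fold added above recognizes that strstr(a, b) == a is a prefix test, so there is no need to search the whole haystack. A hedged C++ illustration of the before and after (function names invented for the example):

    #include <string.h>

    // Before: strstr may walk all of 'a' even though only a match
    // at position 0 matters for the comparison.
    int starts_with(const char *a, const char *b) {
      return strstr(a, b) == a;
    }

    // After the fold, the call site behaves like:
    int starts_with_folded(const char *a, const char *b) {
      return strncmp(a, b, strlen(b)) == 0;
    }

The two agree on the empty-needle edge case as well: strstr(a, "") returns a, and strncmp with length 0 returns 0.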
if (HasStr2 && ToFindStr.empty()) - return B.CreateBitCast(CI->getOperand(1), CI->getType()); + return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); // If both strings are known, constant fold it. if (HasStr1 && HasStr2) { @@ -520,14 +559,14 @@ struct StrStrOpt : public LibCallOptimization { return Constant::getNullValue(CI->getType()); // strstr("abcd", "bc") -> gep((char*)"abcd", 1) - Value *Result = CastToCStr(CI->getOperand(1), B); + Value *Result = CastToCStr(CI->getArgOperand(0), B); Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr"); return B.CreateBitCast(Result, CI->getType()); } // fold strstr(x, "y") -> strchr(x, 'y'). if (HasStr2 && ToFindStr.size() == 1) - return B.CreateBitCast(EmitStrChr(CI->getOperand(1), ToFindStr[0], B, TD), + return B.CreateBitCast(EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD), CI->getType()); return 0; } @@ -545,13 +584,13 @@ struct MemCmpOpt : public LibCallOptimization { !FT->getReturnType()->isIntegerTy(32)) return 0; - Value *LHS = CI->getOperand(1), *RHS = CI->getOperand(2); + Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); if (LHS == RHS) // memcmp(s,s,x) -> 0 return Constant::getNullValue(CI->getType()); // Make sure we have a constant length. - ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getOperand(3)); + ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); if (!LenC) return 0; uint64_t Len = LenC->getZExtValue(); @@ -598,9 +637,9 @@ struct MemCpyOpt : public LibCallOptimization { return 0; // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) - EmitMemCpy(CI->getOperand(1), CI->getOperand(2), - CI->getOperand(3), 1, false, B, TD); - return CI->getOperand(1); + EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1, false, B, TD); + return CI->getArgOperand(0); } }; @@ -620,9 +659,9 @@ struct MemMoveOpt : public LibCallOptimization { return 0; // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) - EmitMemMove(CI->getOperand(1), CI->getOperand(2), - CI->getOperand(3), 1, false, B, TD); - return CI->getOperand(1); + EmitMemMove(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1, false, B, TD); + return CI->getArgOperand(0); } }; @@ -642,10 +681,10 @@ struct MemSetOpt : public LibCallOptimization { return 0; // memset(p, v, n) -> llvm.memset(p, v, n, 1) - Value *Val = B.CreateIntCast(CI->getOperand(2), Type::getInt8Ty(*Context), + Value *Val = B.CreateIntCast(CI->getArgOperand(1), Type::getInt8Ty(*Context), false); - EmitMemSet(CI->getOperand(1), Val, CI->getOperand(3), false, B, TD); - return CI->getOperand(1); + EmitMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), false, B, TD); + return CI->getArgOperand(0); } }; @@ -666,7 +705,7 @@ struct PowOpt : public LibCallOptimization { !FT->getParamType(0)->isFloatingPointTy()) return 0; - Value *Op1 = CI->getOperand(1), *Op2 = CI->getOperand(2); + Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0 return Op1C; @@ -720,18 +759,18 @@ struct Exp2Opt : public LibCallOptimization { !FT->getParamType(0)->isFloatingPointTy()) return 0; - Value *Op = CI->getOperand(1); + Value *Op = CI->getArgOperand(0); // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 Value *LdExpArg = 0; if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) LdExpArg = 
B.CreateSExt(OpC->getOperand(0),
- Type::getInt32Ty(*Context), "tmp");
+ Type::getInt32Ty(*Context), "tmp");
 } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
 if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
 LdExpArg = B.CreateZExt(OpC->getOperand(0),
- Type::getInt32Ty(*Context), "tmp");
+ Type::getInt32Ty(*Context), "tmp");
 }
 if (LdExpArg) {
@@ -772,7 +811,7 @@ struct UnaryDoubleFPOpt : public LibCallOptimization {
 return 0;
 // If this is something like 'floor((double)floatval)', convert to floorf.
- FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getOperand(1));
+ FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
 if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
 return 0;
@@ -797,11 +836,11 @@ struct FFSOpt : public LibCallOptimization {
 // Just make sure this has one integer argument and that the result
 // type is i32.
 if (FT->getNumParams() != 1 ||
-     !FT->getReturnType()->isIntegerTy(32) ||
+     !FT->getReturnType()->isIntegerTy(32) ||
 !FT->getParamType(0)->isIntegerTy())
 return 0;
- Value *Op = CI->getOperand(1);
+ Value *Op = CI->getArgOperand(0);
 // Constant fold.
 if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
@@ -821,7 +860,7 @@ struct FFSOpt : public LibCallOptimization {
 Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp");
 return B.CreateSelect(Cond, V,
-                          ConstantInt::get(Type::getInt32Ty(*Context), 0));
+                       ConstantInt::get(Type::getInt32Ty(*Context), 0));
 }
};
@@ -837,7 +876,7 @@ struct IsDigitOpt : public LibCallOptimization {
 return 0;
 // isdigit(c) -> (c-'0') <u 10
- Value *Op = CI->getOperand(1);
+ Value *Op = CI->getArgOperand(0);
 Op = B.CreateSub(Op, ConstantInt::get(Type::getInt32Ty(*Context), '0'),
 "isdigittmp");
 Op = B.CreateICmpULT(Op, ConstantInt::get(Type::getInt32Ty(*Context), 10),
@@ -858,7 +897,7 @@ struct IsAsciiOpt : public LibCallOptimization {
 return 0;
 // isascii(c) -> c <u 128
- Value *Op = CI->getOperand(1);
+ Value *Op = CI->getArgOperand(0);
 Op = B.CreateICmpULT(Op, ConstantInt::get(Type::getInt32Ty(*Context), 128),
 "isascii");
 return B.CreateZExt(Op, CI->getType());
@@ -877,7 +916,7 @@ struct AbsOpt : public LibCallOptimization {
 return 0;
 // abs(x) -> x >s -1 ? x : -x
- Value *Op = CI->getOperand(1);
+ Value *Op = CI->getArgOperand(0);
 Value *Pos = B.CreateICmpSGT(Op,
 Constant::getAllOnesValue(Op->getType()),
 "ispos");
@@ -899,7 +938,7 @@ struct ToAsciiOpt : public LibCallOptimization {
 return 0;
 // toascii(c) -> c & 0x7f
- return B.CreateAnd(CI->getOperand(1),
+ return B.CreateAnd(CI->getArgOperand(0),
 ConstantInt::get(CI->getType(),0x7F));
 }
};
@@ -922,7 +961,7 @@ struct PrintFOpt : public LibCallOptimization {
 // Check for a fixed format string.
 std::string FormatStr;
- if (!GetConstantStringInfo(CI->getOperand(1), FormatStr))
+ if (!GetConstantStringInfo(CI->getArgOperand(0), FormatStr))
 return 0;
 // Empty format string -> noop.
@@ -954,20 +993,20 @@ struct PrintFOpt : public LibCallOptimization {
 }
 // Optimize specific format strings.
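The character-classification and abs folds a few hunks above each reduce a libcall to one or two integer operations. Written out as C++ for clarity (a sketch of what the emitted IR computes, not the pass's actual output):

    #include <assert.h>

    // isdigit(c) -> (unsigned)(c - '0') < 10, branch-free.
    int isdigit_folded(int c)  { return (unsigned)(c - '0') < 10u; }

    // isascii(c) -> (unsigned)c < 128.
    int isascii_folded(int c)  { return (unsigned)c < 128u; }

    // abs(x) -> x > -1 ? x : -x  (the compare-and-select form above).
    int abs_folded(int x)      { return x > -1 ? x : -x; }

    int main() {
      assert(isdigit_folded('7') && !isdigit_folded('a'));
      assert(isascii_folded('A') && !isascii_folded(200));
      assert(abs_folded(-5) == 5);
      return 0;
    }

The unsigned compare in isdigit_folded handles values below '0' for free: the subtraction wraps to a large unsigned number, which fails the < 10 test.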
- // printf("%c", chr) --> putchar(*(i8*)dst) - if (FormatStr == "%c" && CI->getNumOperands() > 2 && - CI->getOperand(2)->getType()->isIntegerTy()) { - Value *Res = EmitPutChar(CI->getOperand(2), B, TD); + // printf("%c", chr) --> putchar(chr) + if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && + CI->getArgOperand(1)->getType()->isIntegerTy()) { + Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD); if (CI->use_empty()) return CI; return B.CreateIntCast(Res, CI->getType(), true); } // printf("%s\n", str) --> puts(str) - if (FormatStr == "%s\n" && CI->getNumOperands() > 2 && - CI->getOperand(2)->getType()->isPointerTy() && + if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && + CI->getArgOperand(1)->getType()->isPointerTy() && CI->use_empty()) { - EmitPutS(CI->getOperand(2), B, TD); + EmitPutS(CI->getArgOperand(1), B, TD); return CI; } return 0; @@ -988,11 +1027,11 @@ struct SPrintFOpt : public LibCallOptimization { // Check for a fixed format string. std::string FormatStr; - if (!GetConstantStringInfo(CI->getOperand(2), FormatStr)) + if (!GetConstantStringInfo(CI->getArgOperand(1), FormatStr)) return 0; // If we just have a format string (nothing else crazy) transform it. - if (CI->getNumOperands() == 3) { + if (CI->getNumArgOperands() == 2) { // Make sure there's no % in the constant array. We could try to handle // %% -> % in the future if we cared. for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) @@ -1003,7 +1042,7 @@ struct SPrintFOpt : public LibCallOptimization { if (!TD) return 0; // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) - EmitMemCpy(CI->getOperand(1), CI->getOperand(2), // Copy the nul byte. + EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), // Copy the nul byte. ConstantInt::get(TD->getIntPtrType(*Context), FormatStr.size()+1), 1, false, B, TD); return ConstantInt::get(CI->getType(), FormatStr.size()); @@ -1011,16 +1050,17 @@ struct SPrintFOpt : public LibCallOptimization { // The remaining optimizations require the format string to be "%s" or "%c" // and have an extra operand. - if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumOperands() <4) + if (FormatStr.size() != 2 || FormatStr[0] != '%' || + CI->getNumArgOperands() < 3) return 0; // Decode the second character of the format string. 
if (FormatStr[1] == 'c') { // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 - if (!CI->getOperand(3)->getType()->isIntegerTy()) return 0; - Value *V = B.CreateTrunc(CI->getOperand(3), + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + Value *V = B.CreateTrunc(CI->getArgOperand(2), Type::getInt8Ty(*Context), "char"); - Value *Ptr = CastToCStr(CI->getOperand(1), B); + Value *Ptr = CastToCStr(CI->getArgOperand(0), B); B.CreateStore(V, Ptr); Ptr = B.CreateGEP(Ptr, ConstantInt::get(Type::getInt32Ty(*Context), 1), "nul"); @@ -1034,13 +1074,13 @@ struct SPrintFOpt : public LibCallOptimization { if (!TD) return 0; // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) - if (!CI->getOperand(3)->getType()->isPointerTy()) return 0; + if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0; - Value *Len = EmitStrLen(CI->getOperand(3), B, TD); + Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD); Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); - EmitMemCpy(CI->getOperand(1), CI->getOperand(3), IncLen, 1, false, B, TD); + EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1, false, B, TD); // The sprintf result is the unincremented number of bytes in the string. return B.CreateIntCast(Len, CI->getType(), false); @@ -1064,8 +1104,8 @@ struct FWriteOpt : public LibCallOptimization { return 0; // Get the element size and count. - ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getOperand(2)); - ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getOperand(3)); + ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); if (!SizeC || !CountC) return 0; uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue(); @@ -1075,8 +1115,8 @@ struct FWriteOpt : public LibCallOptimization { // If this is writing one byte, turn it into fputc. if (Bytes == 1) { // fwrite(S,1,1,F) -> fputc(S[0],F) - Value *Char = B.CreateLoad(CastToCStr(CI->getOperand(1), B), "char"); - EmitFPutC(Char, CI->getOperand(4), B, TD); + Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); + EmitFPutC(Char, CI->getArgOperand(3), B, TD); return ConstantInt::get(CI->getType(), 1); } @@ -1100,11 +1140,11 @@ struct FPutsOpt : public LibCallOptimization { return 0; // fputs(s,F) --> fwrite(s,1,strlen(s),F) - uint64_t Len = GetStringLength(CI->getOperand(1)); + uint64_t Len = GetStringLength(CI->getArgOperand(0)); if (!Len) return 0; - EmitFWrite(CI->getOperand(1), + EmitFWrite(CI->getArgOperand(0), ConstantInt::get(TD->getIntPtrType(*Context), Len-1), - CI->getOperand(2), B, TD); + CI->getArgOperand(1), B, TD); return CI; // Known to have no uses (see above). } }; @@ -1123,11 +1163,11 @@ struct FPrintFOpt : public LibCallOptimization { // All the optimizations depend on the format string. std::string FormatStr; - if (!GetConstantStringInfo(CI->getOperand(2), FormatStr)) + if (!GetConstantStringInfo(CI->getArgOperand(1), FormatStr)) return 0; // fprintf(F, "foo") --> fwrite("foo", 3, 1, F) - if (CI->getNumOperands() == 3) { + if (CI->getNumArgOperands() == 2) { for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) if (FormatStr[i] == '%') // Could handle %% -> % if we cared. return 0; // We found a format specifier. @@ -1135,31 +1175,32 @@ struct FPrintFOpt : public LibCallOptimization { // These optimizations require TargetData. 
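The fwrite and fputs folds above go in opposite directions: an fwrite of exactly one byte becomes fputc, while an fputs whose string length is known at compile time becomes fwrite. A rough source-level sketch (the demo function is invented for the example):

    #include <stdio.h>

    // Source-level view of the stdio folds:
    //   fwrite(s, 1, 1, f) -> fputc(s[0], f)              (exactly one byte)
    //   fputs(s, f)        -> fwrite(s, 1, strlen(s), f)  (known length)
    void demo(const char *s, FILE *f) {
      fputc(s[0], f);           // was: fwrite(s, 1, 1, f);
      fwrite("abc", 1, 3, f);   // was: fputs("abc", f); length 3 is known
    }

Both folds need the size and count to be ConstantInts, which is why the code bails out when either dyn_cast fails.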
if (!TD) return 0; - EmitFWrite(CI->getOperand(2), + EmitFWrite(CI->getArgOperand(1), ConstantInt::get(TD->getIntPtrType(*Context), FormatStr.size()), - CI->getOperand(1), B, TD); + CI->getArgOperand(0), B, TD); return ConstantInt::get(CI->getType(), FormatStr.size()); } // The remaining optimizations require the format string to be "%s" or "%c" // and have an extra operand. - if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumOperands() <4) + if (FormatStr.size() != 2 || FormatStr[0] != '%' || + CI->getNumArgOperands() < 3) return 0; // Decode the second character of the format string. if (FormatStr[1] == 'c') { - // fprintf(F, "%c", chr) --> *(i8*)dst = chr - if (!CI->getOperand(3)->getType()->isIntegerTy()) return 0; - EmitFPutC(CI->getOperand(3), CI->getOperand(1), B, TD); + // fprintf(F, "%c", chr) --> fputc(chr, F) + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TD); return ConstantInt::get(CI->getType(), 1); } if (FormatStr[1] == 's') { - // fprintf(F, "%s", str) -> fputs(str, F) - if (!CI->getOperand(3)->getType()->isPointerTy() || !CI->use_empty()) + // fprintf(F, "%s", str) --> fputs(str, F) + if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty()) return 0; - EmitFPutS(CI->getOperand(3), CI->getOperand(1), B, TD); + EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD); return CI; } return 0; diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp index 2306a77..9208238 100644 --- a/lib/Transforms/Scalar/TailDuplication.cpp +++ b/lib/Transforms/Scalar/TailDuplication.cpp @@ -206,12 +206,13 @@ static BasicBlock *FindObviousSharedDomOf(BasicBlock *SrcBlock, // there is only one other pred, get it, otherwise we can't handle it. PI = pred_begin(DstBlock); PE = pred_end(DstBlock); BasicBlock *DstOtherPred = 0; - if (*PI == SrcBlock) { + BasicBlock *P = *PI; + if (P == SrcBlock) { if (++PI == PE) return 0; DstOtherPred = *PI; if (++PI != PE) return 0; } else { - DstOtherPred = *PI; + DstOtherPred = P; if (++PI == PE || *PI != SrcBlock || ++PI != PE) return 0; } diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 5ad5de2..01c8e5d 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -16,9 +16,9 @@ // transformation from taking place, though currently the analysis cannot // support moving any really useful instructions (only dead ones). // 2. This pass transforms functions that are prevented from being tail -// recursive by an associative expression to use an accumulator variable, -// thus compiling the typical naive factorial or 'fib' implementation into -// efficient code. +// recursive by an associative and commutative expression to use an +// accumulator variable, thus compiling the typical naive factorial or +// 'fib' implementation into efficient code. // 3. TRE is performed if the function returns void, if the return // returns the result returned by the call, or if the function returns a // run-time constant on all exits from the function. 
It is possible, though @@ -60,6 +60,7 @@ #include "llvm/Pass.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/Loads.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CFG.h" #include "llvm/ADT/Statistic.h" @@ -252,7 +253,7 @@ static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) { // If we are passing this argument into call as the corresponding // argument operand, then the argument is dynamically constant. // Otherwise, we cannot transform this function safely. - if (CI->getOperand(ArgNo+1) == Arg) + if (CI->getArgOperand(ArgNo) == Arg) return true; } @@ -269,16 +270,16 @@ static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) { } // getCommonReturnValue - Check to see if the function containing the specified -// return instruction and tail call consistently returns the same -// runtime-constant value at all exit points. If so, return the returned value. +// tail call consistently returns the same runtime-constant value at all exit +// points except for IgnoreRI. If so, return the returned value. // -static Value *getCommonReturnValue(ReturnInst *TheRI, CallInst *CI) { - Function *F = TheRI->getParent()->getParent(); +static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { + Function *F = CI->getParent()->getParent(); Value *ReturnedValue = 0; for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator())) - if (RI != TheRI) { + if (RI != IgnoreRI) { Value *RetOp = RI->getOperand(0); // We can only perform this transformation if the value returned is @@ -301,9 +302,9 @@ static Value *getCommonReturnValue(ReturnInst *TheRI, CallInst *CI) { /// Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI) { - if (!I->isAssociative()) return 0; + if (!I->isAssociative() || !I->isCommutative()) return 0; assert(I->getNumOperands() == 2 && - "Associative operations should have 2 args!"); + "Associative/commutative operations should have 2 args!"); // Exactly one operand should be the result of the call instruction... if ((I->getOperand(0) == CI && I->getOperand(1) == CI) || @@ -368,11 +369,16 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, return false; } - // If we are introducing accumulator recursion to eliminate associative - // operations after the call instruction, this variable contains the initial - // value for the accumulator. If this value is set, we actually perform - // accumulator recursion elimination instead of simple tail recursion - // elimination. + // If we are introducing accumulator recursion to eliminate operations after + // the call instruction that are both associative and commutative, the initial + // value for the accumulator is placed in this variable. If this value is set + // then we actually perform accumulator recursion elimination instead of + // simple tail recursion elimination. If the operation is an LLVM instruction + // (eg: "add") then it is recorded in AccumulatorRecursionInstr. If not, then + // we are handling the case when the return instruction returns a constant C + // which is different to the constant returned by other return instructions + // (which is recorded in AccumulatorRecursionEliminationInitVal). This is a + // special case of accumulator recursion, the operation being "return C". 
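The accumulator transformation that comment describes is easiest to see on the classic factorial. A hedged sketch of the effect, hand-written in C++ here; the pass itself operates on LLVM IR, not source:

    // Before: the multiply after the call blocks plain tail recursion.
    unsigned fact(unsigned n) {
      if (n <= 1) return 1;
      return n * fact(n - 1);   // associative and commutative op after the call
    }

    // After accumulator recursion elimination, morally equivalent to:
    unsigned fact_tre(unsigned n) {
      unsigned acc = 1;         // AccumulatorRecursionEliminationInitVal
      while (n > 1) {
        acc *= n;               // the accumulator PHI absorbs each multiply
        --n;
      }
      return acc;
    }

The loop applies the multiplies in the opposite order from the recursion, which is exactly why the pass now insists on commutativity as well as associativity.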
Value *AccumulatorRecursionEliminationInitVal = 0;
 Instruction *AccumulatorRecursionInstr = 0;
@@ -383,9 +389,9 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
 for (BBI = CI, ++BBI; &*BBI != Ret; ++BBI)
 if (!CanMoveAboveCall(BBI, CI)) {
 // If we can't move the instruction above the call, it might be because it
- // is an associative operation that could be transformed using accumulator
- // recursion elimination. Check to see if this is the case, and if so,
- // remember the initial accumulator value for later.
+ // is an associative and commutative operation that could be transformed
+ // using accumulator recursion elimination. Check to see if this is the
+ // case, and if so, remember the initial accumulator value for later.
 if ((AccumulatorRecursionEliminationInitVal =
 CanTransformAccumulatorRecursion(BBI, CI))) {
 // Yes, this is accumulator recursion. Remember which instruction
@@ -403,8 +409,18 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
 if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
 !isa<UndefValue>(Ret->getReturnValue()) &&
 AccumulatorRecursionEliminationInitVal == 0 &&
- !getCommonReturnValue(Ret, CI))
- return false;
+ !getCommonReturnValue(0, CI)) {
+ // One case remains that we are able to handle: the current return
+ // instruction returns a constant, and all other return instructions
+ // return a different constant.
+ if (!isDynamicConstant(Ret->getReturnValue(), CI, Ret))
+ return false; // Current return instruction does not return a constant.
+ // Check that all other return instructions return a common constant. If
+ // so, record it in AccumulatorRecursionEliminationInitVal.
+ AccumulatorRecursionEliminationInitVal = getCommonReturnValue(Ret, CI);
+ if (!AccumulatorRecursionEliminationInitVal)
+ return false;
+ }
 // OK! We can transform this tail call. If this is the first one found,
 // create the new entry block, allowing us to branch back to the old entry.
@@ -453,8 +469,8 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
 // Ok, now that we know we have a pseudo-entry block WITH all of the
 // required PHI nodes, add entries into the PHI node for the actual
 // parameters passed into the tail-recursive call.
- for (unsigned i = 0, e = CI->getNumOperands()-1; i != e; ++i)
- ArgumentPHIs[i]->addIncoming(CI->getOperand(i+1), BB);
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+ ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
 // If we are introducing an accumulator variable to eliminate the recursion,
 // do so now. Note that we _know_ that no subsequent tail recursion
@@ -464,8 +480,9 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
 if (AccumulatorRecursionEliminationInitVal) {
 Instruction *AccRecInstr = AccumulatorRecursionInstr;
 // Start by inserting a new PHI node for the accumulator.
- PHINode *AccPN = PHINode::Create(AccRecInstr->getType(), "accumulator.tr",
- OldEntry->begin());
+ PHINode *AccPN =
+ PHINode::Create(AccumulatorRecursionEliminationInitVal->getType(),
+ "accumulator.tr", OldEntry->begin());
 // Loop over all of the predecessors of the tail recursion block. For the
 // real entry into the function we seed the PHI with the initial value,
@@ -475,20 +492,27 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
 // it will not show up as a predecessor.
for (pred_iterator PI = pred_begin(OldEntry), PE = pred_end(OldEntry);
 PI != PE; ++PI) {
- if (*PI == &F->getEntryBlock())
- AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, *PI);
+ BasicBlock *P = *PI;
+ if (P == &F->getEntryBlock())
+ AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, P);
 else
- AccPN->addIncoming(AccPN, *PI);
+ AccPN->addIncoming(AccPN, P);
 }
- // Add an incoming argument for the current block, which is computed by our
- // associative accumulator instruction.
- AccPN->addIncoming(AccRecInstr, BB);
-
- // Next, rewrite the accumulator recursion instruction so that it does not
- // use the result of the call anymore, instead, use the PHI node we just
- // inserted.
- AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+ if (AccRecInstr) {
+ // Add an incoming argument for the current block, which is computed by
+ // our associative and commutative accumulator instruction.
+ AccPN->addIncoming(AccRecInstr, BB);
+
+ // Next, rewrite the accumulator recursion instruction so that it does not
+ // use the result of the call anymore, instead, use the PHI node we just
+ // inserted.
+ AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+ } else {
+ // Add an incoming argument for the current block, which is just the
+ // constant returned by the current return instruction.
+ AccPN->addIncoming(Ret->getReturnValue(), BB);
+ }
 // Finally, rewrite any return instructions in the program to return the PHI
 // node instead of the "initval" that they do currently. This loop will