Diffstat (limited to 'contrib/llvm/lib/Transforms')
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 92
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp | 134
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp | 16
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 784
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp | 6
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp | 246
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp | 12
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp | 126
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp | 73
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/Inliner.cpp | 82
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/Internalize.cpp | 24
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp | 8
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp | 30
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 28
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PruneEH.cpp | 28
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp | 16
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombine.h | 71
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 948
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 410
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 160
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 388
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 137
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 39
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 421
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp | 6
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 9
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp | 173
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 354
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 23
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h | 38
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 134
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 1036
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/BlackList.cpp | 58
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/BlackList.h | 57
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp | 19
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp | 6
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp | 299
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h | 4
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 1985
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp | 16
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp | 19
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp | 160
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp | 262
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h | 79
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp | 48
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h | 395
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp | 175
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp | 162
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h | 74
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp | 541
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp | 128
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 3026
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp | 252
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp | 177
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h | 80
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ADCE.cpp | 12
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp | 6
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp | 698
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 39
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DCE.cpp | 6
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 28
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 16
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVN.cpp | 79
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp | 137
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 65
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp | 47
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LICM.cpp | 65
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp | 60
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 543
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp | 9
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp | 29
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 647
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 31
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp | 43
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp | 6
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp | 4232
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp | 358
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp | 14
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SCCP.cpp | 40
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SROA.cpp | 1489
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalar.cpp | 15
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp | 28
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 53
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp | 1481
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Sink.cpp | 6
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp | 44
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/AddrModeMatcher.cpp | 577
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 92
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp | 12
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 174
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp | 12
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp | 45
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CloneModule.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp | 21
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp | 33
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp | 85
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp | 112
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LCSSA.cpp | 14
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Local.cpp | 101
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp | 24
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp | 20
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp | 18
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp | 12
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp | 8
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp | 35
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp | 8
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 28
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp | 8
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 416
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 988
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp | 29
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp | 1116
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3547
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp | 10
128 files changed, 20331 insertions, 11722 deletions
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index be48b20..e6fa4ed 100644
--- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -31,21 +31,21 @@
#define DEBUG_TYPE "argpromotion"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Module.h"
-#include "llvm/CallGraphSCCPass.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Support/CallSite.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/CFG.h"
+#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
#include <set>
using namespace llvm;
@@ -153,8 +153,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
SmallPtrSet<Argument*, 8> ArgsToPromote;
SmallPtrSet<Argument*, 8> ByValArgsToTransform;
for (unsigned i = 0; i != PointerArgs.size(); ++i) {
- bool isByVal=F->getParamAttributes(PointerArgs[i].second+1).
- hasAttribute(Attributes::ByVal);
+ bool isByVal=F->getAttributes().
+ hasAttribute(PointerArgs[i].second+1, Attribute::ByVal);
Argument *PtrArg = PointerArgs[i].first;
Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
@@ -511,17 +511,16 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// what the new GEP/Load instructions we are inserting look like.
std::map<IndicesVector, LoadInst*> OriginalLoads;
- // Attributes - Keep track of the parameter attributes for the arguments
+ // Attribute - Keep track of the parameter attributes for the arguments
// that we are *not* promoting. For the ones that we do promote, the parameter
// attributes are lost
- SmallVector<AttributeWithIndex, 8> AttributesVec;
- const AttrListPtr &PAL = F->getAttributes();
+ SmallVector<AttributeSet, 8> AttributesVec;
+ const AttributeSet &PAL = F->getAttributes();
// Add any return attributes.
- Attributes attrs = PAL.getRetAttributes();
- if (attrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
- attrs));
+ if (PAL.hasAttributes(AttributeSet::ReturnIndex))
+ AttributesVec.push_back(AttributeSet::get(F->getContext(),
+ PAL.getRetAttributes()));
// First, determine the new argument list
unsigned ArgIndex = 1;
@@ -537,9 +536,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
} else if (!ArgsToPromote.count(I)) {
// Unchanged argument
Params.push_back(I->getType());
- Attributes attrs = PAL.getParamAttributes(ArgIndex);
- if (attrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(Params.size(), attrs));
+ AttributeSet attrs = PAL.getParamAttributes(ArgIndex);
+ if (attrs.hasAttributes(ArgIndex)) {
+ AttrBuilder B(attrs, ArgIndex);
+ AttributesVec.
+ push_back(AttributeSet::get(F->getContext(), Params.size(), B));
+ }
} else if (I->use_empty()) {
// Dead argument (which are always marked as promotable)
++NumArgumentsDead;
@@ -591,10 +593,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
}
// Add any function attributes.
- attrs = PAL.getFnAttributes();
- if (attrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
- attrs));
+ if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+ AttributesVec.push_back(AttributeSet::get(FTy->getContext(),
+ PAL.getFnAttributes()));
Type *RetTy = FTy->getReturnType();
@@ -611,7 +612,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// Recompute the parameter attributes list based on the new arguments for
// the function.
- NF->setAttributes(AttrListPtr::get(F->getContext(), AttributesVec));
+ NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec));
AttributesVec.clear();
F->getParent()->getFunctionList().insert(F, NF);
@@ -636,13 +637,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
CallSite CS(F->use_back());
assert(CS.getCalledFunction() == F);
Instruction *Call = CS.getInstruction();
- const AttrListPtr &CallPAL = CS.getAttributes();
+ const AttributeSet &CallPAL = CS.getAttributes();
// Add any return attributes.
- Attributes attrs = CallPAL.getRetAttributes();
- if (attrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
- attrs));
+ if (CallPAL.hasAttributes(AttributeSet::ReturnIndex))
+ AttributesVec.push_back(AttributeSet::get(F->getContext(),
+ CallPAL.getRetAttributes()));
// Loop over the operands, inserting GEP and loads in the caller as
// appropriate.
@@ -653,10 +653,11 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) {
Args.push_back(*AI); // Unmodified argument
- Attributes Attrs = CallPAL.getParamAttributes(ArgIndex);
- if (Attrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
-
+ if (CallPAL.hasAttributes(ArgIndex)) {
+ AttrBuilder B(CallPAL, ArgIndex);
+ AttributesVec.
+ push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+ }
} else if (ByValArgsToTransform.count(I)) {
// Emit a GEP and load for each element of the struct.
Type *AgTy = cast<PointerType>(I->getType())->getElementType();
@@ -715,28 +716,29 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// Push any varargs arguments on the list.
for (; AI != CS.arg_end(); ++AI, ++ArgIndex) {
Args.push_back(*AI);
- Attributes Attrs = CallPAL.getParamAttributes(ArgIndex);
- if (Attrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+ if (CallPAL.hasAttributes(ArgIndex)) {
+ AttrBuilder B(CallPAL, ArgIndex);
+ AttributesVec.
+ push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+ }
}
// Add any function attributes.
- attrs = CallPAL.getFnAttributes();
- if (attrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
- attrs));
+ if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
+ AttributesVec.push_back(AttributeSet::get(Call->getContext(),
+ CallPAL.getFnAttributes()));
Instruction *New;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
Args, "", Call);
cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
- cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(II->getContext(),
+ cast<InvokeInst>(New)->setAttributes(AttributeSet::get(II->getContext(),
AttributesVec));
} else {
New = CallInst::Create(NF, Args, "", Call);
cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
- cast<CallInst>(New)->setAttributes(AttrListPtr::get(New->getContext(),
+ cast<CallInst>(New)->setAttributes(AttributeSet::get(New->getContext(),
AttributesVec));
if (cast<CallInst>(Call)->isTailCall())
cast<CallInst>(New)->setTailCall();
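
The ArgumentPromotion hunks above all apply the same mechanical migration: the old AttrListPtr/AttributeWithIndex list is rebuilt as a vector of AttributeSet entries keyed by index. The following is a minimal sketch of that rebuild using the LLVM 3.3-era API shown in the hunks; the helper name and the KeptArgIndices parameter are hypothetical, not part of the patch.

// Illustrative sketch only -- not part of the patch. Shows the AttributeSet
// rebuild pattern used throughout ArgumentPromotion above; the helper name
// and KeptArgIndices are hypothetical.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static AttributeSet copyKeptParamAttrs(Function &F,
                                       ArrayRef<unsigned> KeptArgIndices) {
  const AttributeSet &PAL = F.getAttributes();
  SmallVector<AttributeSet, 8> AttributesVec;

  // Return attributes come first, keyed by ReturnIndex.
  if (PAL.hasAttributes(AttributeSet::ReturnIndex))
    AttributesVec.push_back(AttributeSet::get(F.getContext(),
                                              PAL.getRetAttributes()));

  // Each surviving parameter is re-keyed to its new 1-based position by
  // seeding an AttrBuilder from its old slot.
  for (unsigned i = 0, e = KeptArgIndices.size(); i != e; ++i) {
    unsigned OldIndex = KeptArgIndices[i];
    if (PAL.hasAttributes(OldIndex)) {
      AttrBuilder B(PAL, OldIndex);
      AttributesVec.push_back(AttributeSet::get(F.getContext(), i + 1, B));
    }
  }

  // Function-level attributes come last, keyed by FunctionIndex.
  if (PAL.hasAttributes(AttributeSet::FunctionIndex))
    AttributesVec.push_back(AttributeSet::get(F.getContext(),
                                              PAL.getFnAttributes()));

  return AttributeSet::get(F.getContext(), AttributesVec);
}
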
diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
index e2f0126..8336d3a 100644
--- a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
@@ -19,15 +19,15 @@
#define DEBUG_TYPE "constmerge"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/DataLayout.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
using namespace llvm;
STATISTIC(NumMerged, "Number of global constants merged");
diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 4cfd0b2..49ef1e7 100644
--- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -19,23 +19,23 @@
#define DEBUG_TYPE "deadargelim"
#include "llvm/Transforms/IPO.h"
-#include "llvm/CallingConv.h"
-#include "llvm/Constant.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/DIBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
#include <map>
#include <set>
using namespace llvm;
@@ -271,16 +271,15 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs);
// Drop any attributes that were on the vararg arguments.
- AttrListPtr PAL = CS.getAttributes();
- if (!PAL.isEmpty() && PAL.getSlot(PAL.getNumSlots() - 1).Index > NumArgs) {
- SmallVector<AttributeWithIndex, 8> AttributesVec;
- for (unsigned i = 0; PAL.getSlot(i).Index <= NumArgs; ++i)
- AttributesVec.push_back(PAL.getSlot(i));
- Attributes FnAttrs = PAL.getFnAttributes();
- if (FnAttrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
- FnAttrs));
- PAL = AttrListPtr::get(Fn.getContext(), AttributesVec);
+ AttributeSet PAL = CS.getAttributes();
+ if (!PAL.isEmpty() && PAL.getSlotIndex(PAL.getNumSlots() - 1) > NumArgs) {
+ SmallVector<AttributeSet, 8> AttributesVec;
+ for (unsigned i = 0; PAL.getSlotIndex(i) <= NumArgs; ++i)
+ AttributesVec.push_back(PAL.getSlotAttributes(i));
+ if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+ AttributesVec.push_back(AttributeSet::get(Fn.getContext(),
+ PAL.getFnAttributes()));
+ PAL = AttributeSet::get(Fn.getContext(), AttributesVec);
}
Instruction *New;
@@ -351,7 +350,7 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
if (Fn.use_empty())
return false;
- llvm::SmallVector<unsigned, 8> UnusedArgs;
+ SmallVector<unsigned, 8> UnusedArgs;
for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end();
I != E; ++I) {
Argument *Arg = I;
@@ -697,15 +696,10 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
std::vector<Type*> Params;
// Set up to build a new list of parameter attributes.
- SmallVector<AttributeWithIndex, 8> AttributesVec;
- const AttrListPtr &PAL = F->getAttributes();
-
- // The existing function return attributes.
- Attributes RAttrs = PAL.getRetAttributes();
- Attributes FnAttrs = PAL.getFnAttributes();
+ SmallVector<AttributeSet, 8> AttributesVec;
+ const AttributeSet &PAL = F->getAttributes();
// Find out the new return value.
-
Type *RetTy = FTy->getReturnType();
Type *NRetTy = NULL;
unsigned RetCount = NumRetVals(F);
@@ -759,22 +753,29 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
assert(NRetTy && "No new return type found?");
+ // The existing function return attributes.
+ AttributeSet RAttrs = PAL.getRetAttributes();
+
// Remove any incompatible attributes, but only if we removed all return
// values. Otherwise, ensure that we don't have any conflicting attributes
// here. Currently, this should not be possible, but special handling might be
// required when new return value attributes are added.
if (NRetTy->isVoidTy())
RAttrs =
- Attributes::get(NRetTy->getContext(), AttrBuilder(RAttrs).
- removeAttributes(Attributes::typeIncompatible(NRetTy)));
+ AttributeSet::get(NRetTy->getContext(), AttributeSet::ReturnIndex,
+ AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
+ removeAttributes(AttributeFuncs::
+ typeIncompatible(NRetTy, AttributeSet::ReturnIndex),
+ AttributeSet::ReturnIndex));
else
- assert(!AttrBuilder(RAttrs).
- hasAttributes(Attributes::typeIncompatible(NRetTy)) &&
+ assert(!AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
+ hasAttributes(AttributeFuncs::
+ typeIncompatible(NRetTy, AttributeSet::ReturnIndex),
+ AttributeSet::ReturnIndex) &&
"Return attributes no longer compatible?");
- if (RAttrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
- RAttrs));
+ if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
+ AttributesVec.push_back(AttributeSet::get(NRetTy->getContext(), RAttrs));
// Remember which arguments are still alive.
SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
@@ -791,9 +792,11 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
// Get the original parameter attributes (skipping the first one, that is
// for the return value.
- Attributes Attrs = PAL.getParamAttributes(i + 1);
- if (Attrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(Params.size(), Attrs));
+ if (PAL.hasAttributes(i + 1)) {
+ AttrBuilder B(PAL, i + 1);
+ AttributesVec.
+ push_back(AttributeSet::get(F->getContext(), Params.size(), B));
+ }
} else {
++NumArgumentsEliminated;
DEBUG(dbgs() << "DAE - Removing argument " << i << " (" << I->getName()
@@ -801,12 +804,12 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
}
}
- if (FnAttrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
- FnAttrs));
+ if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+ AttributesVec.push_back(AttributeSet::get(F->getContext(),
+ PAL.getFnAttributes()));
// Reconstruct the AttributesList based on the vector we constructed.
- AttrListPtr NewPAL = AttrListPtr::get(F->getContext(), AttributesVec);
+ AttributeSet NewPAL = AttributeSet::get(F->getContext(), AttributesVec);
// Create the new function type based on the recomputed parameters.
FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
@@ -833,18 +836,21 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
Instruction *Call = CS.getInstruction();
AttributesVec.clear();
- const AttrListPtr &CallPAL = CS.getAttributes();
+ const AttributeSet &CallPAL = CS.getAttributes();
// The call return attributes.
- Attributes RAttrs = CallPAL.getRetAttributes();
- Attributes FnAttrs = CallPAL.getFnAttributes();
+ AttributeSet RAttrs = CallPAL.getRetAttributes();
+
// Adjust in case the function was changed to return void.
RAttrs =
- Attributes::get(NF->getContext(), AttrBuilder(RAttrs).
- removeAttributes(Attributes::typeIncompatible(NF->getReturnType())));
- if (RAttrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
- RAttrs));
+ AttributeSet::get(NF->getContext(), AttributeSet::ReturnIndex,
+ AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
+ removeAttributes(AttributeFuncs::
+ typeIncompatible(NF->getReturnType(),
+ AttributeSet::ReturnIndex),
+ AttributeSet::ReturnIndex));
+ if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
+ AttributesVec.push_back(AttributeSet::get(NF->getContext(), RAttrs));
// Declare these outside of the loops, so we can reuse them for the second
// loop, which loops the varargs.
@@ -856,25 +862,29 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
if (ArgAlive[i]) {
Args.push_back(*I);
// Get original parameter attributes, but skip return attributes.
- Attributes Attrs = CallPAL.getParamAttributes(i + 1);
- if (Attrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+ if (CallPAL.hasAttributes(i + 1)) {
+ AttrBuilder B(CallPAL, i + 1);
+ AttributesVec.
+ push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+ }
}
// Push any varargs arguments on the list. Don't forget their attributes.
for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) {
Args.push_back(*I);
- Attributes Attrs = CallPAL.getParamAttributes(i + 1);
- if (Attrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
+ if (CallPAL.hasAttributes(i + 1)) {
+ AttrBuilder B(CallPAL, i + 1);
+ AttributesVec.
+ push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+ }
}
- if (FnAttrs.hasAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
- FnAttrs));
+ if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
+ AttributesVec.push_back(AttributeSet::get(Call->getContext(),
+ CallPAL.getFnAttributes()));
// Reconstruct the AttributesList based on the vector we constructed.
- AttrListPtr NewCallPAL = AttrListPtr::get(F->getContext(), AttributesVec);
+ AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec);
Instruction *New;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
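
The DeadArgumentElimination hunks above switch from the old getSlot(i).Index/getSlot(i) pair to the slot-based getSlotIndex(i)/getSlotAttributes(i) queries. A hedged sketch of the vararg-attribute trimming that the DeleteDeadVarargs hunk performs, with a hypothetical helper name but the same calls as used above:

// Illustrative sketch only -- not part of the patch. Mirrors the
// DeleteDeadVarargs hunk above: keep attribute slots whose index falls
// within the fixed-argument range, then re-append the function attributes.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

static AttributeSet dropVarargAttrs(LLVMContext &Ctx, AttributeSet PAL,
                                    unsigned NumArgs) {
  // Nothing past the fixed arguments: keep the list unchanged.
  if (PAL.isEmpty() || PAL.getSlotIndex(PAL.getNumSlots() - 1) <= NumArgs)
    return PAL;

  SmallVector<AttributeSet, 8> AttributesVec;
  for (unsigned i = 0; PAL.getSlotIndex(i) <= NumArgs; ++i)
    AttributesVec.push_back(PAL.getSlotAttributes(i));
  if (PAL.hasAttributes(AttributeSet::FunctionIndex))
    AttributesVec.push_back(AttributeSet::get(Ctx, PAL.getFnAttributes()));
  return AttributeSet::get(Ctx, AttributesVec);
}
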
diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
index 6716deb..fa3d72d 100644
--- a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
@@ -11,13 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Constants.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include <algorithm>
using namespace llvm;
@@ -60,7 +60,7 @@ namespace {
continue;
}
- bool Local = I->hasLocalLinkage();
+ bool Local = I->isDiscardableIfUnused();
if (Local)
I->setVisibility(GlobalValue::HiddenVisibility);
@@ -80,7 +80,7 @@ namespace {
continue;
}
- bool Local = I->hasLocalLinkage();
+ bool Local = I->isDiscardableIfUnused();
if (Local)
I->setVisibility(GlobalValue::HiddenVisibility);
@@ -97,7 +97,7 @@ namespace {
Module::alias_iterator CurI = I;
++I;
- if (CurI->hasLocalLinkage()) {
+ if (CurI->isDiscardableIfUnused()) {
CurI->setVisibility(GlobalValue::HiddenVisibility);
CurI->setLinkage(GlobalValue::ExternalLinkage);
}
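
The ExtractGV hunks above widen the externalization test from hasLocalLinkage() to isDiscardableIfUnused(), so linkonce and similar definitions that the linker could otherwise drop are also kept linkable after extraction. A minimal sketch of the alias-handling pattern under the same LLVM 3.3-era API (the helper name is hypothetical):

// Illustrative sketch only -- not part of the patch. The alias hunk above
// hides and externalizes anything the linker could otherwise discard.
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
using namespace llvm;

static void externalizeDiscardableAlias(GlobalAlias &GA) {
  if (GA.isDiscardableIfUnused()) {
    GA.setVisibility(GlobalValue::HiddenVisibility);
    GA.setLinkage(GlobalValue::ExternalLinkage);
  }
}
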
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 18409f7..bc5109b 100644
--- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1,4 +1,4 @@
-//===- FunctionAttrs.cpp - Pass which marks functions readnone or readonly ===//
+//===- FunctionAttrs.cpp - Pass which marks functions attributes ----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,30 +14,34 @@
// to the function does not create any copies of the pointer value that
// outlive the call. This more or less means that the pointer is only
// dereferenced, and not returned from the function or stored in a global.
+// Finally, well-known library call declarations are marked with all
+// attributes that are consistent with the function's standard definition.
// This pass is implemented as a bottom-up traversal of the call-graph.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "functionattrs"
#include "llvm/Transforms/IPO.h"
-#include "llvm/CallGraphSCCPass.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/InstIterator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
using namespace llvm;
STATISTIC(NumReadNone, "Number of functions marked readnone");
STATISTIC(NumReadOnly, "Number of functions marked readonly");
STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
STATISTIC(NumNoAlias, "Number of function returns marked noalias");
+STATISTIC(NumAnnotated, "Number of attributes added to library functions");
namespace {
struct FunctionAttrs : public CallGraphSCCPass {
@@ -62,14 +66,63 @@ namespace {
// AddNoAliasAttrs - Deduce noalias attributes for the SCC.
bool AddNoAliasAttrs(const CallGraphSCC &SCC);
+ // Utility methods used by inferPrototypeAttributes to add attributes
+ // and maintain annotation statistics.
+
+ void setDoesNotAccessMemory(Function &F) {
+ if (!F.doesNotAccessMemory()) {
+ F.setDoesNotAccessMemory();
+ ++NumAnnotated;
+ }
+ }
+
+ void setOnlyReadsMemory(Function &F) {
+ if (!F.onlyReadsMemory()) {
+ F.setOnlyReadsMemory();
+ ++NumAnnotated;
+ }
+ }
+
+ void setDoesNotThrow(Function &F) {
+ if (!F.doesNotThrow()) {
+ F.setDoesNotThrow();
+ ++NumAnnotated;
+ }
+ }
+
+ void setDoesNotCapture(Function &F, unsigned n) {
+ if (!F.doesNotCapture(n)) {
+ F.setDoesNotCapture(n);
+ ++NumAnnotated;
+ }
+ }
+
+ void setDoesNotAlias(Function &F, unsigned n) {
+ if (!F.doesNotAlias(n)) {
+ F.setDoesNotAlias(n);
+ ++NumAnnotated;
+ }
+ }
+
+ // inferPrototypeAttributes - Analyze the name and prototype of the
+ // given function and set any applicable attributes. Returns true
+ // if any attributes were set and false otherwise.
+ bool inferPrototypeAttributes(Function &F);
+
+ // annotateLibraryCalls - Adds attributes to well-known standard library
+ // call declarations.
+ bool annotateLibraryCalls(const CallGraphSCC &SCC);
+
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addRequired<AliasAnalysis>();
+ AU.addRequired<TargetLibraryInfo>();
CallGraphSCCPass::getAnalysisUsage(AU);
}
private:
AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
};
}
@@ -77,6 +130,7 @@ char FunctionAttrs::ID = 0;
INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs",
"Deduce function attributes", false, false)
INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
INITIALIZE_PASS_END(FunctionAttrs, "functionattrs",
"Deduce function attributes", false, false)
@@ -213,16 +267,15 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
// Clear out any existing attributes.
AttrBuilder B;
- B.addAttribute(Attributes::ReadOnly)
- .addAttribute(Attributes::ReadNone);
- F->removeAttribute(AttrListPtr::FunctionIndex,
- Attributes::get(F->getContext(), B));
+ B.addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::ReadNone);
+ F->removeAttributes(AttributeSet::FunctionIndex,
+ AttributeSet::get(F->getContext(),
+ AttributeSet::FunctionIndex, B));
// Add in the new attribute.
- B.clear();
- B.addAttribute(ReadsMemory ? Attributes::ReadOnly : Attributes::ReadNone);
- F->addAttribute(AttrListPtr::FunctionIndex,
- Attributes::get(F->getContext(), B));
+ F->addAttribute(AttributeSet::FunctionIndex,
+ ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
if (ReadsMemory)
++NumReadOnly;
@@ -358,7 +411,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
ArgumentGraph AG;
AttrBuilder B;
- B.addAttribute(Attributes::NoCapture);
+ B.addAttribute(Attribute::NoCapture);
// Check each function in turn, determining which pointer arguments are not
// captured.
@@ -381,7 +434,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end();
A != E; ++A) {
if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
- A->addAttr(Attributes::get(F->getContext(), B));
+ A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
++NumNoCapture;
Changed = true;
}
@@ -396,7 +449,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
if (!Tracker.Captured) {
if (Tracker.Uses.empty()) {
// If it's trivially not captured, mark it nocapture now.
- A->addAttr(Attributes::get(F->getContext(), B));
+ A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo()+1, B));
++NumNoCapture;
Changed = true;
} else {
@@ -431,7 +484,9 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
ArgumentSCC[0]->
Definition->
- addAttr(Attributes::get(ArgumentSCC[0]->Definition->getContext(), B));
+ addAttr(AttributeSet::get(ArgumentSCC[0]->Definition->getContext(),
+ ArgumentSCC[0]->Definition->getArgNo() + 1,
+ B));
++NumNoCapture;
Changed = true;
}
@@ -473,7 +528,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
Argument *A = ArgumentSCC[i]->Definition;
- A->addAttr(Attributes::get(A->getContext(), B));
+ A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
++NumNoCapture;
Changed = true;
}
@@ -530,7 +585,7 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F,
case Instruction::Call:
case Instruction::Invoke: {
CallSite CS(RVI);
- if (CS.paramHasAttr(0, Attributes::NoAlias))
+ if (CS.paramHasAttr(0, Attribute::NoAlias))
break;
if (CS.getCalledFunction() &&
SCCNodes.count(CS.getCalledFunction()))
@@ -597,10 +652,693 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) {
return MadeChange;
}
+/// inferPrototypeAttributes - Analyze the name and prototype of the
+/// given function and set any applicable attributes. Returns true
+/// if any attributes were set and false otherwise.
+bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
+ FunctionType *FTy = F.getFunctionType();
+ LibFunc::Func TheLibFunc;
+ if (!(TLI->getLibFunc(F.getName(), TheLibFunc) && TLI->has(TheLibFunc)))
+ return false;
+
+ switch (TheLibFunc) {
+ case LibFunc::strlen:
+ if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setOnlyReadsMemory(F);
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::strchr:
+ case LibFunc::strrchr:
+ if (FTy->getNumParams() != 2 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isIntegerTy())
+ return false;
+ setOnlyReadsMemory(F);
+ setDoesNotThrow(F);
+ break;
+ case LibFunc::strcpy:
+ case LibFunc::stpcpy:
+ case LibFunc::strcat:
+ case LibFunc::strtol:
+ case LibFunc::strtod:
+ case LibFunc::strtof:
+ case LibFunc::strtoul:
+ case LibFunc::strtoll:
+ case LibFunc::strtold:
+ case LibFunc::strncat:
+ case LibFunc::strncpy:
+ case LibFunc::stpncpy:
+ case LibFunc::strtoull:
+ if (FTy->getNumParams() < 2 ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::strxfrm:
+ if (FTy->getNumParams() != 3 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::strcmp:
+ case LibFunc::strspn:
+ case LibFunc::strncmp:
+ case LibFunc::strcspn:
+ case LibFunc::strcoll:
+ case LibFunc::strcasecmp:
+ case LibFunc::strncasecmp:
+ if (FTy->getNumParams() < 2 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setOnlyReadsMemory(F);
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::strstr:
+ case LibFunc::strpbrk:
+ if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setOnlyReadsMemory(F);
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::strtok:
+ case LibFunc::strtok_r:
+ if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::scanf:
+ case LibFunc::setbuf:
+ case LibFunc::setvbuf:
+ if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::strdup:
+ case LibFunc::strndup:
+ if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() ||
+ !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::stat:
+ case LibFunc::sscanf:
+ case LibFunc::sprintf:
+ case LibFunc::statvfs:
+ if (FTy->getNumParams() < 2 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::snprintf:
+ if (FTy->getNumParams() != 3 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(2)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 3);
+ break;
+ case LibFunc::setitimer:
+ if (FTy->getNumParams() != 3 ||
+ !FTy->getParamType(1)->isPointerTy() ||
+ !FTy->getParamType(2)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ setDoesNotCapture(F, 3);
+ break;
+ case LibFunc::system:
+ if (FTy->getNumParams() != 1 ||
+ !FTy->getParamType(0)->isPointerTy())
+ return false;
+ // May throw; "system" is a valid pthread cancellation point.
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::malloc:
+ if (FTy->getNumParams() != 1 ||
+ !FTy->getReturnType()->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ break;
+ case LibFunc::memcmp:
+ if (FTy->getNumParams() != 3 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setOnlyReadsMemory(F);
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::memchr:
+ case LibFunc::memrchr:
+ if (FTy->getNumParams() != 3)
+ return false;
+ setOnlyReadsMemory(F);
+ setDoesNotThrow(F);
+ break;
+ case LibFunc::modf:
+ case LibFunc::modff:
+ case LibFunc::modfl:
+ case LibFunc::memcpy:
+ case LibFunc::memccpy:
+ case LibFunc::memmove:
+ if (FTy->getNumParams() < 2 ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::memalign:
+ if (!FTy->getReturnType()->isPointerTy())
+ return false;
+ setDoesNotAlias(F, 0);
+ break;
+ case LibFunc::mkdir:
+ case LibFunc::mktime:
+ if (FTy->getNumParams() == 0 ||
+ !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::realloc:
+ if (FTy->getNumParams() != 2 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getReturnType()->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::read:
+ if (FTy->getNumParams() != 3 ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ // May throw; "read" is a valid pthread cancellation point.
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::rmdir:
+ case LibFunc::rewind:
+ case LibFunc::remove:
+ case LibFunc::realpath:
+ if (FTy->getNumParams() < 1 ||
+ !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::rename:
+ case LibFunc::readlink:
+ if (FTy->getNumParams() < 2 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::write:
+ if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
+ return false;
+ // May throw; "write" is a valid pthread cancellation point.
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::bcopy:
+ if (FTy->getNumParams() != 3 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::bcmp:
+ if (FTy->getNumParams() != 3 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setOnlyReadsMemory(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::bzero:
+ if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::calloc:
+ if (FTy->getNumParams() != 2 ||
+ !FTy->getReturnType()->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ break;
+ case LibFunc::chmod:
+ case LibFunc::chown:
+ case LibFunc::ctermid:
+ case LibFunc::clearerr:
+ case LibFunc::closedir:
+ if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::atoi:
+ case LibFunc::atol:
+ case LibFunc::atof:
+ case LibFunc::atoll:
+ if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setOnlyReadsMemory(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::access:
+ if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::fopen:
+ if (FTy->getNumParams() != 2 ||
+ !FTy->getReturnType()->isPointerTy() ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::fdopen:
+ if (FTy->getNumParams() != 2 ||
+ !FTy->getReturnType()->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::feof:
+ case LibFunc::free:
+ case LibFunc::fseek:
+ case LibFunc::ftell:
+ case LibFunc::fgetc:
+ case LibFunc::fseeko:
+ case LibFunc::ftello:
+ case LibFunc::fileno:
+ case LibFunc::fflush:
+ case LibFunc::fclose:
+ case LibFunc::fsetpos:
+ case LibFunc::flockfile:
+ case LibFunc::funlockfile:
+ case LibFunc::ftrylockfile:
+ if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::ferror:
+ if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setOnlyReadsMemory(F);
+ break;
+ case LibFunc::fputc:
+ case LibFunc::fstat:
+ case LibFunc::frexp:
+ case LibFunc::frexpf:
+ case LibFunc::frexpl:
+ case LibFunc::fstatvfs:
+ if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::fgets:
+ if (FTy->getNumParams() != 3 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(2)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 3);
+ case LibFunc::fread:
+ case LibFunc::fwrite:
+ if (FTy->getNumParams() != 4 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(3)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 4);
+ case LibFunc::fputs:
+ case LibFunc::fscanf:
+ case LibFunc::fprintf:
+ case LibFunc::fgetpos:
+ if (FTy->getNumParams() < 2 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::getc:
+ case LibFunc::getlogin_r:
+ case LibFunc::getc_unlocked:
+ if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::getenv:
+ if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setOnlyReadsMemory(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::gets:
+ case LibFunc::getchar:
+ setDoesNotThrow(F);
+ break;
+ case LibFunc::getitimer:
+ if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::getpwnam:
+ if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::ungetc:
+ if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::uname:
+ case LibFunc::unlink:
+ case LibFunc::unsetenv:
+ if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::utime:
+ case LibFunc::utimes:
+ if (FTy->getNumParams() != 2 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::putc:
+ if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::puts:
+ case LibFunc::printf:
+ case LibFunc::perror:
+ if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::pread:
+ case LibFunc::pwrite:
+ if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy())
+ return false;
+ // May throw; these are valid pthread cancellation points.
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::putchar:
+ setDoesNotThrow(F);
+ break;
+ case LibFunc::popen:
+ if (FTy->getNumParams() != 2 ||
+ !FTy->getReturnType()->isPointerTy() ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::pclose:
+ if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::vscanf:
+ if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::vsscanf:
+ case LibFunc::vfscanf:
+ if (FTy->getNumParams() != 3 ||
+ !FTy->getParamType(1)->isPointerTy() ||
+ !FTy->getParamType(2)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::valloc:
+ if (!FTy->getReturnType()->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ break;
+ case LibFunc::vprintf:
+ if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::vfprintf:
+ case LibFunc::vsprintf:
+ if (FTy->getNumParams() != 3 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::vsnprintf:
+ if (FTy->getNumParams() != 4 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(2)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 3);
+ break;
+ case LibFunc::open:
+ if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ // May throw; "open" is a valid pthread cancellation point.
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::opendir:
+ if (FTy->getNumParams() != 1 ||
+ !FTy->getReturnType()->isPointerTy() ||
+ !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::tmpfile:
+ if (!FTy->getReturnType()->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ break;
+ case LibFunc::times:
+ if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::htonl:
+ case LibFunc::htons:
+ case LibFunc::ntohl:
+ case LibFunc::ntohs:
+ setDoesNotThrow(F);
+ setDoesNotAccessMemory(F);
+ break;
+ case LibFunc::lstat:
+ if (FTy->getNumParams() != 2 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::lchown:
+ if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::qsort:
+ if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy())
+ return false;
+ // May throw; places call through function pointer.
+ setDoesNotCapture(F, 4);
+ break;
+ case LibFunc::dunder_strdup:
+ case LibFunc::dunder_strndup:
+ if (FTy->getNumParams() < 1 ||
+ !FTy->getReturnType()->isPointerTy() ||
+ !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::dunder_strtok_r:
+ if (FTy->getNumParams() != 3 ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::under_IO_getc:
+ if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::under_IO_putc:
+ if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::dunder_isoc99_scanf:
+ if (FTy->getNumParams() < 1 ||
+ !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::stat64:
+ case LibFunc::lstat64:
+ case LibFunc::statvfs64:
+ case LibFunc::dunder_isoc99_sscanf:
+ if (FTy->getNumParams() < 1 ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::fopen64:
+ if (FTy->getNumParams() != 2 ||
+ !FTy->getReturnType()->isPointerTy() ||
+ !FTy->getParamType(0)->isPointerTy() ||
+ !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ setDoesNotCapture(F, 1);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::fseeko64:
+ case LibFunc::ftello64:
+ if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 1);
+ break;
+ case LibFunc::tmpfile64:
+ if (!FTy->getReturnType()->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotAlias(F, 0);
+ break;
+ case LibFunc::fstat64:
+ case LibFunc::fstatvfs64:
+ if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+ return false;
+ setDoesNotThrow(F);
+ setDoesNotCapture(F, 2);
+ break;
+ case LibFunc::open64:
+ if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
+ return false;
+ // May throw; "open" is a valid pthread cancellation point.
+ setDoesNotCapture(F, 1);
+ break;
+ default:
+ // Didn't mark any attributes.
+ return false;
+ }
+
+ return true;
+}
+
+/// annotateLibraryCalls - Adds attributes to well-known standard library
+/// call declarations.
+bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) {
+ bool MadeChange = false;
+
+ // Check each function in turn annotating well-known library function
+ // declarations with attributes.
+ for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+ Function *F = (*I)->getFunction();
+
+ if (F != 0 && F->isDeclaration())
+ MadeChange |= inferPrototypeAttributes(*F);
+ }
+
+ return MadeChange;
+}
+
bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
AA = &getAnalysis<AliasAnalysis>();
+ TLI = &getAnalysis<TargetLibraryInfo>();
- bool Changed = AddReadAttrs(SCC);
+ bool Changed = annotateLibraryCalls(SCC);
+ Changed |= AddReadAttrs(SCC);
Changed |= AddNoCaptureAttrs(SCC);
Changed |= AddNoAliasAttrs(SCC);
return Changed;
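
The new annotateLibraryCalls/inferPrototypeAttributes code above is driven only by TargetLibraryInfo and the declared prototype. As a hedged illustration of one case, here is roughly what the strlen handling amounts to when written as a standalone helper; the helper name is hypothetical, and it calls the Function setters directly rather than the statistics-counting wrappers defined above.

// Illustrative sketch only -- not part of the patch. Shows the shape of one
// inferPrototypeAttributes case: verify the prototype against the C library
// signature, then apply the attribute setters.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/Target/TargetLibraryInfo.h"
using namespace llvm;

static bool annotateStrlenPrototype(Function &F,
                                    const TargetLibraryInfo &TLI) {
  LibFunc::Func TheLibFunc;
  if (!TLI.getLibFunc(F.getName(), TheLibFunc) || !TLI.has(TheLibFunc) ||
      TheLibFunc != LibFunc::strlen)
    return false;

  FunctionType *FTy = F.getFunctionType();
  if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
    return false;  // Prototype does not match the C library strlen.

  // strlen only reads its argument, never unwinds, and does not capture the
  // pointer it is given (argument indices are 1-based here).
  F.setOnlyReadsMemory();
  F.setDoesNotThrow();
  F.setDoesNotCapture(1);
  return true;
}
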
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
index 18c1c7b..dc99492 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -17,11 +17,11 @@
#define DEBUG_TYPE "globaldce"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Constants.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
using namespace llvm;
STATISTIC(NumAliases , "Number of global aliases removed");
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 591278f..b035a82 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -15,29 +15,29 @@
#define DEBUG_TYPE "globalopt"
#include "llvm/Transforms/IPO.h"
-#include "llvm/CallingConv.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#include "llvm/Operator.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include <algorithm>
using namespace llvm;
@@ -148,17 +148,13 @@ struct GlobalStatus {
/// an instruction (e.g. a constant expr or GV initializer).
bool HasNonInstructionUser;
- /// HasPHIUser - Set to true if this global has a user that is a PHI node.
- bool HasPHIUser;
-
/// AtomicOrdering - Set to the strongest atomic ordering requirement.
AtomicOrdering Ordering;
GlobalStatus() : isCompared(false), isLoaded(false), StoredType(NotStored),
StoredOnceValue(0), AccessingFunction(0),
HasMultipleAccessingFunctions(false),
- HasNonInstructionUser(false), HasPHIUser(false),
- Ordering(NotAtomic) {}
+ HasNonInstructionUser(false), Ordering(NotAtomic) {}
};
}
@@ -200,11 +196,11 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
const User *U = *UI;
if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
GS.HasNonInstructionUser = true;
-
+
// If the result of the constantexpr isn't pointer type, then we won't
// know to expect it in various places. Just reject early.
if (!isa<PointerType>(CE->getType())) return true;
-
+
if (AnalyzeGlobal(CE, GS, PHIUsers)) return true;
} else if (const Instruction *I = dyn_cast<Instruction>(U)) {
if (!GS.HasMultipleAccessingFunctions) {
@@ -274,7 +270,6 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
// have to be careful about infinite recursion.
if (PHIUsers.insert(PN)) // Not already visited.
if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
- GS.HasPHIUser = true;
} else if (isa<CmpInst>(I)) {
GS.isCompared = true;
} else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
@@ -453,8 +448,8 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
Dead[i].second->eraseFromParent();
Instruction *I = Dead[i].first;
do {
- if (isAllocationFn(I, TLI))
- break;
+ if (isAllocationFn(I, TLI))
+ break;
Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
if (!J)
break;
@@ -475,8 +470,9 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
DataLayout *TD, TargetLibraryInfo *TLI) {
bool Changed = false;
- for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;) {
- User *U = *UI++;
+ SmallVector<User*, 8> WorkList(V->use_begin(), V->use_end());
+ while (!WorkList.empty()) {
+ User *U = WorkList.pop_back_val();
if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
if (Init) {
@@ -539,7 +535,6 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
// us, and if they are all dead, nuke them without remorse.
if (SafeToDestroyConstant(C)) {
C->destroyConstant();
- // This could have invalidated UI, start over from scratch.
CleanupConstantGlobalUsers(V, Init, TD, TLI);
return true;
}
@@ -1830,7 +1825,8 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
GlobalValue::InternalLinkage,
ConstantInt::getFalse(GV->getContext()),
GV->getName()+".b",
- GV->getThreadLocalMode());
+ GV->getThreadLocalMode(),
+ GV->getType()->getAddressSpace());
GV->getParent()->getGlobalList().insert(GV, NewGV);
Constant *InitVal = GV->getInitializer();
@@ -1850,10 +1846,10 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
bool StoringOther = SI->getOperand(0) == OtherVal;
// Only do this if we weren't storing a loaded value.
Value *StoreVal;
- if (StoringOther || SI->getOperand(0) == InitVal)
+ if (StoringOther || SI->getOperand(0) == InitVal) {
StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()),
StoringOther);
- else {
+ } else {
// Otherwise, we are storing a previously loaded copy. To do this,
// change the copy from copying the original value to just copying the
// bool.
@@ -1892,6 +1888,9 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
UI->eraseFromParent();
}
+ // Retain the name of the old global variable. People who are debugging their
+ // programs may expect these variables to be named the same.
+ NewGV->takeName(GV);
GV->eraseFromParent();
return true;
}
@@ -1994,7 +1993,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
return Changed;
} else if (GS.StoredType <= GlobalStatus::isInitializerStored) {
- DEBUG(dbgs() << "MARKING CONSTANT: " << *GV);
+ DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
GV->setConstant(true);
// Clean up any obviously simplifiable users now.
@@ -2070,14 +2069,14 @@ static void ChangeCalleesToFastCall(Function *F) {
}
}
-static AttrListPtr StripNest(LLVMContext &C, const AttrListPtr &Attrs) {
+static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) {
for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
- if (!Attrs.getSlot(i).Attrs.hasAttribute(Attributes::Nest))
+ unsigned Index = Attrs.getSlotIndex(i);
+ if (!Attrs.getSlotAttributes(i).hasAttribute(Index, Attribute::Nest))
continue;
// There can be only one.
- return Attrs.removeAttr(C, Attrs.getSlot(i).Index,
- Attributes::get(C, Attributes::Nest));
+ return Attrs.removeAttribute(C, Index, Attribute::Nest);
}
return Attrs;
@@ -2118,7 +2117,7 @@ bool GlobalOpt::OptimizeFunctions(Module &M) {
Changed = true;
}
- if (F->getAttributes().hasAttrSomewhere(Attributes::Nest) &&
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
!F->hasAddressTaken()) {
// The function is not used by a trampoline intrinsic, so it is safe
// to remove the 'nest' attribute.
@@ -2157,7 +2156,7 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) {
GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) {
GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
if (GV == 0) return 0;
-
+
// Verify that the initializer is simple enough for us to handle. We are
// only allowed to optimize the initializer if it is unique.
if (!GV->hasUniqueInitializer()) return 0;
@@ -2263,7 +2262,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
}
-static inline bool
+static inline bool
isSimpleEnoughValueToCommit(Constant *C,
SmallPtrSet<Constant*, 8> &SimpleConstants,
const DataLayout *TD);
@@ -2285,7 +2284,7 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
if (C->getNumOperands() == 0 || isa<BlockAddress>(C) ||
isa<GlobalValue>(C))
return true;
-
+
// Aggregate values are safe if all their elements are.
if (isa<ConstantArray>(C) || isa<ConstantStruct>(C) ||
isa<ConstantVector>(C)) {
@@ -2296,7 +2295,7 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
}
return true;
}
-
+
// We don't know exactly what relocations are allowed in constant expressions,
// so we allow &global+constantoffset, which is safe and uniformly supported
// across targets.
@@ -2314,14 +2313,14 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
TD->getTypeSizeInBits(CE->getOperand(0)->getType()))
return false;
return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
-
+
// GEP is fine if it is simple + constant offset.
case Instruction::GetElementPtr:
for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
if (!isa<ConstantInt>(CE->getOperand(i)))
return false;
return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
-
+
case Instruction::Add:
// We allow simple+cst.
if (!isa<ConstantInt>(CE->getOperand(1)))
@@ -2331,7 +2330,7 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
return false;
}
-static inline bool
+static inline bool
isSimpleEnoughValueToCommit(Constant *C,
SmallPtrSet<Constant*, 8> &SimpleConstants,
const DataLayout *TD) {
@@ -2379,7 +2378,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) {
return false;
return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
-
+
// A constantexpr bitcast from a pointer to another pointer is a no-op,
// and we know how to evaluate it by moving the bitcast from the pointer
// operand to the value operand.
@@ -2390,7 +2389,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) {
return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
}
}
-
+
return false;
}
@@ -2420,7 +2419,7 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
// Return the modified struct.
return ConstantStruct::get(STy, Elts);
}
-
+
ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
SequentialType *InitTy = cast<SequentialType>(Init->getType());
@@ -2589,31 +2588,45 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
while (1) {
Constant *InstResult = 0;
+ DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
+
if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
- if (!SI->isSimple()) return false; // no volatile/atomic accesses.
+ if (!SI->isSimple()) {
+ DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
+ return false; // no volatile/atomic accesses.
+ }
Constant *Ptr = getVal(SI->getOperand(1));
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+ DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
Ptr = ConstantFoldConstantExpression(CE, TD, TLI);
- if (!isSimpleEnoughPointerToCommit(Ptr))
+ DEBUG(dbgs() << "; To: " << *Ptr << "\n");
+ }
+ if (!isSimpleEnoughPointerToCommit(Ptr)) {
// If this is too complex for us to commit, reject it.
+ DEBUG(dbgs() << "Pointer is too complex for us to evaluate store.");
return false;
-
+ }
+
Constant *Val = getVal(SI->getOperand(0));
// If this might be too difficult for the backend to handle (e.g. the addr
// of one global variable divided by another) then we can't commit it.
- if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, TD))
+ if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, TD)) {
+ DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val
+ << "\n");
return false;
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+ }
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
if (CE->getOpcode() == Instruction::BitCast) {
+ DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n");
// If we're evaluating a store through a bitcast, then we need
// to pull the bitcast off the pointer type and push it onto the
// stored value.
Ptr = CE->getOperand(0);
-
+
Type *NewTy = cast<PointerType>(Ptr->getType())->getElementType();
-
+
// In order to push the bitcast onto the stored value, a bitcast
// from NewTy to Val's type must be legal. If it's not, we can try
// introspecting NewTy to find a legal conversion.
@@ -2635,32 +2648,45 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// If we can't improve the situation by introspecting NewTy,
// we have to give up.
} else {
+ DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
+ "evaluate.\n");
return false;
}
}
-
+
// If we found compatible types, go ahead and push the bitcast
// onto the stored value.
Val = ConstantExpr::getBitCast(Val, NewTy);
+
+ DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
}
-
+ }
+
MutatedMemory[Ptr] = Val;
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
InstResult = ConstantExpr::get(BO->getOpcode(),
getVal(BO->getOperand(0)),
getVal(BO->getOperand(1)));
+ DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult
+ << "\n");
} else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
InstResult = ConstantExpr::getCompare(CI->getPredicate(),
getVal(CI->getOperand(0)),
getVal(CI->getOperand(1)));
+ DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
+ << "\n");
} else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
InstResult = ConstantExpr::getCast(CI->getOpcode(),
getVal(CI->getOperand(0)),
CI->getType());
+ DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
+ << "\n");
} else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
getVal(SI->getOperand(1)),
getVal(SI->getOperand(2)));
+ DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
+ << "\n");
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
Constant *P = getVal(GEP->getOperand(0));
SmallVector<Constant*, 8> GEPOps;
@@ -2670,41 +2696,70 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
InstResult =
ConstantExpr::getGetElementPtr(P, GEPOps,
cast<GEPOperator>(GEP)->isInBounds());
+ DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult
+ << "\n");
} else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
- if (!LI->isSimple()) return false; // no volatile/atomic accesses.
+
+ if (!LI->isSimple()) {
+ DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
+ return false; // no volatile/atomic accesses.
+ }
+
Constant *Ptr = getVal(LI->getOperand(0));
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
Ptr = ConstantFoldConstantExpression(CE, TD, TLI);
+ DEBUG(dbgs() << "Found a constant pointer expression, constant "
+ "folding: " << *Ptr << "\n");
+ }
InstResult = ComputeLoadResult(Ptr);
- if (InstResult == 0) return false; // Could not evaluate load.
+ if (InstResult == 0) {
+ DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load."
+ "\n");
+ return false; // Could not evaluate load.
+ }
+
+ DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
} else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
- if (AI->isArrayAllocation()) return false; // Cannot handle array allocs.
+ if (AI->isArrayAllocation()) {
+ DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
+ return false; // Cannot handle array allocs.
+ }
Type *Ty = AI->getType()->getElementType();
AllocaTmps.push_back(new GlobalVariable(Ty, false,
GlobalValue::InternalLinkage,
UndefValue::get(Ty),
AI->getName()));
InstResult = AllocaTmps.back();
+ DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
} else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
CallSite CS(CurInst);
// Debug info can safely be ignored here.
if (isa<DbgInfoIntrinsic>(CS.getInstruction())) {
+ DEBUG(dbgs() << "Ignoring debug info.\n");
++CurInst;
continue;
}
// Cannot handle inline asm.
- if (isa<InlineAsm>(CS.getCalledValue())) return false;
+ if (isa<InlineAsm>(CS.getCalledValue())) {
+ DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
+ return false;
+ }
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
- if (MSI->isVolatile()) return false;
+ if (MSI->isVolatile()) {
+ DEBUG(dbgs() << "Can not optimize a volatile memset " <<
+ "intrinsic.\n");
+ return false;
+ }
Constant *Ptr = getVal(MSI->getDest());
Constant *Val = getVal(MSI->getValue());
Constant *DestVal = ComputeLoadResult(getVal(Ptr));
if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
// This memset is a no-op.
+ DEBUG(dbgs() << "Ignoring no-op memset.\n");
++CurInst;
continue;
}
@@ -2712,6 +2767,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
++CurInst;
continue;
}
@@ -2719,8 +2775,10 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
if (II->getIntrinsicID() == Intrinsic::invariant_start) {
// We don't insert an entry into Values, as it doesn't have a
// meaningful return value.
- if (!II->use_empty())
+ if (!II->use_empty()) {
+            DEBUG(dbgs() << "Found a used invariant_start. Can not evaluate.\n");
return false;
+ }
ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
Value *PtrArg = getVal(II->getArgOperand(1));
Value *Ptr = PtrArg->stripPointerCasts();
@@ -2728,20 +2786,30 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
Type *ElemTy = cast<PointerType>(GV->getType())->getElementType();
if (!Size->isAllOnesValue() &&
Size->getValue().getLimitedValue() >=
- TD->getTypeStoreSize(ElemTy))
+ TD->getTypeStoreSize(ElemTy)) {
Invariants.insert(GV);
+ DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV
+ << "\n");
+ } else {
+ DEBUG(dbgs() << "Found a global var, but can not treat it as an "
+ "invariant.\n");
+ }
}
// Continue even if we do nothing.
++CurInst;
continue;
}
+
+ DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
return false;
}
// Resolve function pointers.
Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue()));
- if (!Callee || Callee->mayBeOverridden())
+ if (!Callee || Callee->mayBeOverridden()) {
+ DEBUG(dbgs() << "Can not resolve function pointer.\n");
return false; // Cannot resolve.
+ }
SmallVector<Constant*, 8> Formals;
for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i)
@@ -2751,22 +2819,38 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// If this is a function we can constant fold, do it.
if (Constant *C = ConstantFoldCall(Callee, Formals, TLI)) {
InstResult = C;
+ DEBUG(dbgs() << "Constant folded function call. Result: " <<
+ *InstResult << "\n");
} else {
+ DEBUG(dbgs() << "Can not constant fold function call.\n");
return false;
}
} else {
- if (Callee->getFunctionType()->isVarArg())
+ if (Callee->getFunctionType()->isVarArg()) {
+ DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
return false;
+ }
- Constant *RetVal;
+ Constant *RetVal = 0;
// Execute the call, if successful, use the return value.
ValueStack.push_back(new DenseMap<Value*, Constant*>);
- if (!EvaluateFunction(Callee, RetVal, Formals))
+ if (!EvaluateFunction(Callee, RetVal, Formals)) {
+ DEBUG(dbgs() << "Failed to evaluate function.\n");
return false;
+ }
delete ValueStack.pop_back_val();
InstResult = RetVal;
+
+ if (InstResult != NULL) {
+ DEBUG(dbgs() << "Successfully evaluated function. Result: " <<
+              *InstResult << "\n\n");
+ } else {
+ DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n");
+ }
}
} else if (isa<TerminatorInst>(CurInst)) {
+ DEBUG(dbgs() << "Found a terminator instruction.\n");
+
if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
if (BI->isUnconditional()) {
NextBB = BI->getSuccessor(0);
@@ -2792,26 +2876,31 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
NextBB = 0;
} else {
// invoke, unwind, resume, unreachable.
+ DEBUG(dbgs() << "Can not handle terminator.");
return false; // Cannot handle this terminator.
}
// We succeeded at evaluating this block!
+ DEBUG(dbgs() << "Successfully evaluated block.\n");
return true;
} else {
// Did not know how to evaluate this!
+ DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction."
+ "\n");
return false;
}
if (!CurInst->use_empty()) {
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult))
InstResult = ConstantFoldConstantExpression(CE, TD, TLI);
-
+
setVal(CurInst, InstResult);
}
// If we just processed an invoke, we finished evaluating the block.
if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
NextBB = II->getNormalDest();
+ DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
return true;
}
@@ -2850,6 +2939,8 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
while (1) {
BasicBlock *NextBB = 0; // Initialized to avoid compiler warnings.
+ DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
+
if (!EvaluateBlock(CurInst, NextBB))
return false;
@@ -2891,7 +2982,7 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout *TD,
Constant *RetValDummy;
bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy,
SmallVector<Constant*, 0>());
-
+
if (EvalSuccess) {
// We succeeded at evaluation: commit the result.
DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
@@ -2929,6 +3020,7 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) {
}
break;
}
+ DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
// We cannot simplify external ctor functions.
if (F->empty()) continue;
@@ -3011,13 +3103,13 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) {
return 0;
Function *Fn = M.getFunction(TLI->getName(LibFunc::cxa_atexit));
-
+
if (!Fn)
return 0;
FunctionType *FTy = Fn->getFunctionType();
-
- // Checking that the function has the right return type, the right number of
+
+ // Checking that the function has the right return type, the right number of
// parameters and that they all have pointer types should be enough.
if (!FTy->getReturnType()->isIntegerTy() ||
FTy->getNumParams() != 3 ||
@@ -3092,7 +3184,7 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
// and remove them.
bool Changed = false;
- for (Function::use_iterator I = CXAAtExitFn->use_begin(),
+ for (Function::use_iterator I = CXAAtExitFn->use_begin(),
E = CXAAtExitFn->use_end(); I != E;) {
// We're only interested in calls. Theoretically, we could handle invoke
// instructions as well, but neither llvm-gcc nor clang generate invokes
@@ -3101,7 +3193,7 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
if (!CI)
continue;
- Function *DtorFn =
+ Function *DtorFn =
dyn_cast<Function>(CI->getArgOperand(0)->stripPointerCasts());
if (!DtorFn)
continue;
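
One recurring fix in the GlobalOpt hunks above is the switch in CleanupConstantGlobalUsers() from walking V->use_begin()/use_end() directly to draining a SmallVector worklist, which stays valid while users are erased or rewritten underneath it. A minimal sketch of that pattern, with a hypothetical processUser() standing in for the pass's real per-user rewriting:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/User.h"
    #include "llvm/IR/Value.h"

    using namespace llvm;

    // Placeholder for the real per-user rewrite; it may erase U or U's siblings.
    static bool processUser(User *U) {
      (void)U;
      return false;
    }

    static bool cleanupUsers(Value *V) {
      bool Changed = false;
      // Snapshot the users up front; the use list may be mutated as we go.
      SmallVector<User*, 8> WorkList(V->use_begin(), V->use_end());
      while (!WorkList.empty())
        Changed |= processUser(WorkList.pop_back_val());
      return Changed;
    }

The snapshot trades a little memory for not having to reason about iterator invalidation each time a user is destroyed, which is why the "start over from scratch" comment could be dropped from the hunk above.
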
diff --git a/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
index d757e1f..4ac1dfc 100644
--- a/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -17,14 +17,14 @@
#define DEBUG_TYPE "ipconstprop"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CallSite.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallVector.h"
using namespace llvm;
STATISTIC(NumArgumentsProped, "Number of args turned into constants");
diff --git a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp
index b1c36c1..a0095da 100644
--- a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp
@@ -13,47 +13,58 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "inline"
-#include "llvm/CallingConv.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#include "llvm/Type.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/CallSite.h"
-#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/InlinerPass.h"
-#include "llvm/DataLayout.h"
-#include "llvm/ADT/SmallPtrSet.h"
using namespace llvm;
namespace {
- // AlwaysInliner only inlines functions that are mark as "always inline".
- class AlwaysInliner : public Inliner {
- public:
- // Use extremely low threshold.
- AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/true) {
- initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
- }
- AlwaysInliner(bool InsertLifetime) : Inliner(ID, -2000000000,
- InsertLifetime) {
- initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
- }
- static char ID; // Pass identification, replacement for typeid
- virtual InlineCost getInlineCost(CallSite CS);
- virtual bool doFinalization(CallGraph &CG) {
- return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);
- }
- virtual bool doInitialization(CallGraph &CG);
- };
+/// \brief Inliner pass which only handles "always inline" functions.
+class AlwaysInliner : public Inliner {
+ InlineCostAnalysis *ICA;
+
+public:
+ // Use extremely low threshold.
+ AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), ICA(0) {
+ initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ AlwaysInliner(bool InsertLifetime)
+ : Inliner(ID, -2000000000, InsertLifetime), ICA(0) {
+ initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ static char ID; // Pass identification, replacement for typeid
+
+ virtual InlineCost getInlineCost(CallSite CS);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual bool runOnSCC(CallGraphSCC &SCC);
+
+ using llvm::Pass::doFinalization;
+ virtual bool doFinalization(CallGraph &CG) {
+ return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/ true);
+ }
+};
+
}
char AlwaysInliner::ID = 0;
INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline",
"Inliner for always_inline functions", false, false)
INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
INITIALIZE_PASS_END(AlwaysInliner, "always-inline",
"Inliner for always_inline functions", false, false)
@@ -63,35 +74,6 @@ Pass *llvm::createAlwaysInlinerPass(bool InsertLifetime) {
return new AlwaysInliner(InsertLifetime);
}
-/// \brief Minimal filter to detect invalid constructs for inlining.
-static bool isInlineViable(Function &F) {
- bool ReturnsTwice =F.getFnAttributes().hasAttribute(Attributes::ReturnsTwice);
- for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
- // Disallow inlining of functions which contain an indirect branch.
- if (isa<IndirectBrInst>(BI->getTerminator()))
- return false;
-
- for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
- ++II) {
- CallSite CS(II);
- if (!CS)
- continue;
-
- // Disallow recursive calls.
- if (&F == CS.getCalledFunction())
- return false;
-
- // Disallow calls which expose returns-twice to a function not previously
- // attributed as such.
- if (!ReturnsTwice && CS.isCall() &&
- cast<CallInst>(CS.getInstruction())->canReturnTwice())
- return false;
- }
- }
-
- return true;
-}
-
/// \brief Get the inline cost for the always-inliner.
///
/// The always inliner *only* handles functions which are marked with the
@@ -106,27 +88,25 @@ static bool isInlineViable(Function &F) {
/// likely not worth it in practice.
InlineCost AlwaysInliner::getInlineCost(CallSite CS) {
Function *Callee = CS.getCalledFunction();
- // We assume indirect calls aren't calling an always-inline function.
- if (!Callee) return InlineCost::getNever();
- // We can't inline calls to external functions.
- // FIXME: We shouldn't even get here.
- if (Callee->isDeclaration()) return InlineCost::getNever();
+ // Only inline direct calls to functions with always-inline attributes
+ // that are viable for inlining. FIXME: We shouldn't even get here for
+ // declarations.
+ if (Callee && !Callee->isDeclaration() &&
+ Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::AlwaysInline) &&
+ ICA->isInlineViable(*Callee))
+ return InlineCost::getAlways();
- // Return never for anything not marked as always inline.
- if (!Callee->getFnAttributes().hasAttribute(Attributes::AlwaysInline))
- return InlineCost::getNever();
-
- // Do some minimal analysis to preclude non-viable functions.
- if (!isInlineViable(*Callee))
- return InlineCost::getNever();
+ return InlineCost::getNever();
+}
- // Otherwise, force inlining.
- return InlineCost::getAlways();
+bool AlwaysInliner::runOnSCC(CallGraphSCC &SCC) {
+ ICA = &getAnalysis<InlineCostAnalysis>();
+ return Inliner::runOnSCC(SCC);
}
-// doInitialization - Initializes the vector of functions that have not
-// been annotated with the "always inline" attribute.
-bool AlwaysInliner::doInitialization(CallGraph &CG) {
- return false;
+void AlwaysInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<InlineCostAnalysis>();
+ Inliner::getAnalysisUsage(AU);
}
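
For context on the check in getInlineCost() above: after the attribute rework, function-level attributes are addressed through AttributeSet::FunctionIndex. A minimal illustration of tagging and querying always_inline with that API (an illustration only, not code from this commit):

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Function.h"

    using namespace llvm;

    // Mark F always_inline, then query it back the way the always-inliner does.
    static bool tagAndCheckAlwaysInline(Function &F) {
      F.addFnAttr(Attribute::AlwaysInline);
      return F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                            Attribute::AlwaysInline);
    }
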
diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
index bf0b1f9..a4f7026 100644
--- a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
@@ -12,44 +12,57 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "inline"
-#include "llvm/CallingConv.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#include "llvm/Type.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/CallSite.h"
-#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/InlinerPass.h"
-#include "llvm/DataLayout.h"
using namespace llvm;
namespace {
- class SimpleInliner : public Inliner {
- InlineCostAnalyzer CA;
- public:
- SimpleInliner() : Inliner(ID) {
- initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
- }
- SimpleInliner(int Threshold) : Inliner(ID, Threshold,
- /*InsertLifetime*/true) {
- initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
- }
- static char ID; // Pass identification, replacement for typeid
- InlineCost getInlineCost(CallSite CS) {
- return CA.getInlineCost(CS, getInlineThreshold(CS));
- }
- virtual bool doInitialization(CallGraph &CG);
- };
-}
+/// \brief Actual inliner pass implementation.
+///
+/// The common implementation of the inlining logic is shared between this
+/// inliner pass and the always inliner pass. The two passes use different cost
+/// analyses to determine when to inline.
+class SimpleInliner : public Inliner {
+ InlineCostAnalysis *ICA;
+
+public:
+ SimpleInliner() : Inliner(ID), ICA(0) {
+ initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ SimpleInliner(int Threshold)
+ : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(0) {
+ initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ static char ID; // Pass identification, replacement for typeid
+
+ InlineCost getInlineCost(CallSite CS) {
+ return ICA->getInlineCost(CS, getInlineThreshold(CS));
+ }
+
+ virtual bool runOnSCC(CallGraphSCC &SCC);
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+};
+
+} // end anonymous namespace
char SimpleInliner::ID = 0;
INITIALIZE_PASS_BEGIN(SimpleInliner, "inline",
"Function Integration/Inlining", false, false)
INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
INITIALIZE_PASS_END(SimpleInliner, "inline",
"Function Integration/Inlining", false, false)
@@ -59,10 +72,12 @@ Pass *llvm::createFunctionInliningPass(int Threshold) {
return new SimpleInliner(Threshold);
}
-// doInitialization - Initializes the vector of functions that have been
-// annotated with the noinline attribute.
-bool SimpleInliner::doInitialization(CallGraph &CG) {
- CA.setDataLayout(getAnalysisIfAvailable<DataLayout>());
- return false;
+bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
+ ICA = &getAnalysis<InlineCostAnalysis>();
+ return Inliner::runOnSCC(SCC);
}
+void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<InlineCostAnalysis>();
+ Inliner::getAnalysisUsage(AU);
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
index abcb25f..663ddb7 100644
--- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -14,22 +14,22 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "inline"
-#include "llvm/Module.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/IPO/InlinerPass.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
STATISTIC(NumInlined, "Number of functions inlined");
@@ -64,14 +64,48 @@ Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime)
/// getAnalysisUsage - For this class, we declare that we require and preserve
/// the call graph. If the derived class implements this method, it should
/// always explicitly call the implementation here.
-void Inliner::getAnalysisUsage(AnalysisUsage &Info) const {
- CallGraphSCCPass::getAnalysisUsage(Info);
+void Inliner::getAnalysisUsage(AnalysisUsage &AU) const {
+ CallGraphSCCPass::getAnalysisUsage(AU);
}
typedef DenseMap<ArrayType*, std::vector<AllocaInst*> >
InlinedArrayAllocasTy;
+/// \brief If the inlined function had a higher stack protection level than the
+/// calling function, then bump up the caller's stack protection level.
+static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) {
+ // If upgrading the SSP attribute, clear out the old SSP Attributes first.
+ // Having multiple SSP attributes doesn't actually hurt, but it adds useless
+ // clutter to the IR.
+ AttrBuilder B;
+ B.addAttribute(Attribute::StackProtect)
+ .addAttribute(Attribute::StackProtectStrong);
+ AttributeSet OldSSPAttr = AttributeSet::get(Caller->getContext(),
+ AttributeSet::FunctionIndex,
+ B);
+ AttributeSet CallerAttr = Caller->getAttributes(),
+ CalleeAttr = Callee->getAttributes();
+
+ if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::StackProtectReq)) {
+ Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+ Caller->addFnAttr(Attribute::StackProtectReq);
+ } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::StackProtectStrong) &&
+ !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::StackProtectReq)) {
+ Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+ Caller->addFnAttr(Attribute::StackProtectStrong);
+ } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::StackProtect) &&
+ !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::StackProtectReq) &&
+ !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::StackProtectStrong))
+ Caller->addFnAttr(Attribute::StackProtect);
+}
+
/// InlineCallIfPossible - If it is possible to inline the specified call site,
/// do so and update the CallGraph for this operation.
///
@@ -91,13 +125,7 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
if (!InlineFunction(CS, IFI, InsertLifetime))
return false;
- // If the inlined function had a higher stack protection level than the
- // calling function, then bump up the caller's stack protection level.
- if (Callee->getFnAttributes().hasAttribute(Attributes::StackProtectReq))
- Caller->addFnAttr(Attributes::StackProtectReq);
- else if (Callee->getFnAttributes().hasAttribute(Attributes::StackProtect) &&
- !Caller->getFnAttributes().hasAttribute(Attributes::StackProtectReq))
- Caller->addFnAttr(Attributes::StackProtect);
+ AdjustCallerSSPLevel(Caller, Callee);
// Look at all of the allocas that we inlined through this call site. If we
// have already inlined other allocas through other calls into this function,
@@ -209,16 +237,21 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const {
// would decrease the threshold.
Function *Caller = CS.getCaller();
bool OptSize = Caller && !Caller->isDeclaration() &&
- Caller->getFnAttributes().hasAttribute(Attributes::OptimizeForSize);
+ Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize);
if (!(InlineLimit.getNumOccurrences() > 0) && OptSize &&
OptSizeThreshold < thres)
thres = OptSizeThreshold;
- // Listen to the inlinehint attribute when it would increase the threshold.
+ // Listen to the inlinehint attribute when it would increase the threshold
+ // and the caller does not need to minimize its size.
Function *Callee = CS.getCalledFunction();
bool InlineHint = Callee && !Callee->isDeclaration() &&
- Callee->getFnAttributes().hasAttribute(Attributes::InlineHint);
- if (InlineHint && HintThreshold > thres)
+ Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::InlineHint);
+ if (InlineHint && HintThreshold > thres
+ && !Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::MinSize))
thres = HintThreshold;
return thres;
@@ -534,7 +567,8 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
// about always-inline functions. This is a bit of a hack to share code
// between here and the InlineAlways pass.
if (AlwaysInlineOnly &&
- !F->getFnAttributes().hasAttribute(Attributes::AlwaysInline))
+ !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::AlwaysInline))
continue;
// If the only remaining users of the function are dead constants, remove
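
To pull the getInlineThreshold() hunk above together: the threshold starts at the pass's base value, drops when the caller is optsize, and rises on the callee's inlinehint only if the caller is not minsize. A sketch of that decision with the three limits passed in as parameters; the real pass reads them from cl::opt flags and also lets an explicitly set inline limit suppress the optsize reduction, which is omitted here:

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Function.h"

    using namespace llvm;

    static bool hasFnAttr(const Function *F, Attribute::AttrKind Kind) {
      return F && !F->isDeclaration() &&
             F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Kind);
    }

    // Mirrors the decision order in Inliner::getInlineThreshold() above.
    static int effectiveThreshold(const Function *Caller, const Function *Callee,
                                  int BaseThres, int OptSizeThres, int HintThres) {
      int Thres = BaseThres;
      if (hasFnAttr(Caller, Attribute::OptimizeForSize) && OptSizeThres < Thres)
        Thres = OptSizeThres;
      if (hasFnAttr(Callee, Attribute::InlineHint) && HintThres > Thres &&
          !hasFnAttr(Caller, Attribute::MinSize))
        Thres = HintThres;
      return Thres;
    }
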
diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
index aa629cc..4bfab5b 100644
--- a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
@@ -14,14 +14,14 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "internalize"
-#include "llvm/Analysis/CallGraph.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
#include <fstream>
#include <set>
using namespace llvm;
@@ -48,8 +48,10 @@ namespace {
public:
static char ID; // Pass identification, replacement for typeid
explicit InternalizePass();
- explicit InternalizePass(const std::vector <const char *>& exportList);
+ explicit InternalizePass(ArrayRef<const char *> exportList);
void LoadFile(const char *Filename);
+ void ClearExportList();
+ void AddToExportList(const std::string &val);
virtual bool runOnModule(Module &M);
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
@@ -72,10 +74,10 @@ InternalizePass::InternalizePass()
ExternalNames.insert(APIList.begin(), APIList.end());
}
-InternalizePass::InternalizePass(const std::vector<const char *>&exportList)
+InternalizePass::InternalizePass(ArrayRef<const char *> exportList)
: ModulePass(ID){
initializeInternalizePassPass(*PassRegistry::getPassRegistry());
- for(std::vector<const char *>::const_iterator itr = exportList.begin();
+ for(ArrayRef<const char *>::const_iterator itr = exportList.begin();
itr != exportList.end(); itr++) {
ExternalNames.insert(*itr);
}
@@ -97,6 +99,14 @@ void InternalizePass::LoadFile(const char *Filename) {
}
}
+void InternalizePass::ClearExportList() {
+ ExternalNames.clear();
+}
+
+void InternalizePass::AddToExportList(const std::string &val) {
+ ExternalNames.insert(val);
+}
+
bool InternalizePass::runOnModule(Module &M) {
CallGraph *CG = getAnalysisIfAvailable<CallGraph>();
CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0;
@@ -173,6 +183,6 @@ ModulePass *llvm::createInternalizePass() {
return new InternalizePass();
}
-ModulePass *llvm::createInternalizePass(const std::vector <const char *> &el) {
+ModulePass *llvm::createInternalizePass(ArrayRef<const char *> el) {
return new InternalizePass(el);
}
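
Since createInternalizePass() now takes an ArrayRef, callers can hand it a plain C array of names directly. A hypothetical bit of pass-setup code (the exported names here are placeholders, not anything this commit defines):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/PassManager.h"
    #include "llvm/Transforms/IPO.h"

    using namespace llvm;

    // Keep two entry points visible and internalize everything else.
    static void addInternalize(PassManager &PM) {
      const char *Exports[] = { "main", "my_public_entry" };
      PM.add(createInternalizePass(Exports)); // ArrayRef binds to the C array.
    }
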
diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
index 97d7cdc..8282a8e 100644
--- a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
@@ -16,16 +16,16 @@
#define DEBUG_TYPE "loop-extract"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/ADT/Statistic.h"
#include <fstream>
#include <set>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index 44283dd..892100f 100644
--- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -45,25 +45,25 @@
#define DEBUG_TYPE "mergefunc"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Constants.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/InlineAsm.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Operator.h"
-#include "llvm/Pass.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
#include <vector>
using namespace llvm;
@@ -346,13 +346,11 @@ bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1,
const GEPOperator *GEP2) {
// When we have target data, we can reduce the GEP down to the value in bytes
// added to the address.
- if (TD && GEP1->hasAllConstantIndices() && GEP2->hasAllConstantIndices()) {
- SmallVector<Value *, 8> Indices1(GEP1->idx_begin(), GEP1->idx_end());
- SmallVector<Value *, 8> Indices2(GEP2->idx_begin(), GEP2->idx_end());
- uint64_t Offset1 = TD->getIndexedOffset(GEP1->getPointerOperandType(),
- Indices1);
- uint64_t Offset2 = TD->getIndexedOffset(GEP2->getPointerOperandType(),
- Indices2);
+ unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 1;
+ APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0);
+ if (TD &&
+ GEP1->accumulateConstantOffset(*TD, Offset1) &&
+ GEP2->accumulateConstantOffset(*TD, Offset2)) {
return Offset1 == Offset2;
}
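
accumulateConstantOffset() folds all of a GEP's indices into one byte offset and returns false if any index is non-constant, which is what lets the comparator above drop the hasAllConstantIndices()/getIndexedOffset() pair. The same check in standalone form, assuming a DataLayout is at hand:

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Operator.h"

    using namespace llvm;

    // True only when both GEPs reduce to the same constant byte offset.
    static bool sameConstantOffset(const GEPOperator *GEP1,
                                   const GEPOperator *GEP2,
                                   const DataLayout &TD) {
      unsigned BitWidth = TD.getPointerSizeInBits();
      APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0);
      return GEP1->accumulateConstantOffset(TD, Offset1) &&
             GEP2->accumulateConstantOffset(TD, Offset2) &&
             Offset1 == Offset2;
    }
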
diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 9c9910b..fa518cb 100644
--- a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -14,14 +14,14 @@
#define DEBUG_TYPE "partialinlining"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/CFG.h"
using namespace llvm;
STATISTIC(NumPartialInlined, "Number of functions partially inlined");
diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 05253fc..027a9f2 100644
--- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -14,21 +14,17 @@
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-
#include "llvm-c/Transforms/PassManagerBuilder.h"
-
-#include "llvm/PassManager.h"
-#include "llvm/DefaultPasses.h"
-#include "llvm/PassManager.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/Verifier.h"
+#include "llvm/PassManager.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Vectorize.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/ManagedStatic.h"
using namespace llvm;
@@ -190,10 +186,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
MPM.add(createLoopDeletionPass()); // Delete dead loops
- if (LoopVectorize) {
+ if (LoopVectorize && OptLevel > 2)
MPM.add(createLoopVectorizePass());
- MPM.add(createLICMPass());
- }
if (!DisableUnrollLoops)
MPM.add(createLoopUnrollPass()); // Unroll small loops
@@ -220,6 +214,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
MPM.add(createGVNPass()); // Remove redundancies
else
MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+
+ // BBVectorize may have significantly shortened a loop body; unroll again.
+ if (!DisableUnrollLoops)
+ MPM.add(createLoopUnrollPass());
}
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
@@ -323,7 +321,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
PM.add(createGlobalDCEPass());
}
-LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate(void) {
+LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() {
PassManagerBuilder *PMB = new PassManagerBuilder();
return wrap(PMB);
}
@@ -393,9 +391,9 @@ LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB,
void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB,
LLVMPassManagerRef PM,
- bool Internalize,
- bool RunInliner) {
+ LLVMBool Internalize,
+ LLVMBool RunInliner) {
PassManagerBuilder *Builder = unwrap(PMB);
PassManagerBase *LPM = unwrap(PM);
- Builder->populateLTOPassManager(*LPM, Internalize, RunInliner);
+ Builder->populateLTOPassManager(*LPM, Internalize != 0, RunInliner != 0);
}
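
With the C binding now taking LLVMBool, a client of the llvm-c API drives the LTO pipeline roughly as below. This is a generic usage sketch against the public llvm-c headers, not code changed by this commit:

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/PassManagerBuilder.h"

    // Populate an LTO pass manager (internalize + run the inliner) and run it.
    static void runLTOPipeline(LLVMModuleRef Mod) {
      LLVMPassManagerBuilderRef PMB = LLVMPassManagerBuilderCreate();
      LLVMPassManagerRef PM = LLVMCreatePassManager();

      LLVMPassManagerBuilderSetOptLevel(PMB, 2);
      LLVMPassManagerBuilderPopulateLTOPassManager(PMB, PM, /*Internalize=*/1,
                                                   /*RunInliner=*/1);
      LLVMRunPassManager(PM, Mod);

      LLVMDisposePassManager(PM);
      LLVMPassManagerBuilderDispose(PMB);
    }
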
diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
index fb4ecbf..73d9323 100644
--- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
@@ -16,16 +16,16 @@
#define DEBUG_TYPE "prune-eh"
#include "llvm/Transforms/IPO.h"
-#include "llvm/CallGraphSCCPass.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Analysis/CallGraph.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/CFG.h"
#include <algorithm>
using namespace llvm;
@@ -140,15 +140,17 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
AttrBuilder NewAttributes;
if (!SCCMightUnwind)
- NewAttributes.addAttribute(Attributes::NoUnwind);
+ NewAttributes.addAttribute(Attribute::NoUnwind);
if (!SCCMightReturn)
- NewAttributes.addAttribute(Attributes::NoReturn);
+ NewAttributes.addAttribute(Attribute::NoReturn);
Function *F = (*I)->getFunction();
- const AttrListPtr &PAL = F->getAttributes();
- const AttrListPtr &NPAL = PAL.addAttr(F->getContext(), ~0,
- Attributes::get(F->getContext(),
- NewAttributes));
+ const AttributeSet &PAL = F->getAttributes();
+ const AttributeSet &NPAL =
+ PAL.addAttributes(F->getContext(), AttributeSet::FunctionIndex,
+ AttributeSet::get(F->getContext(),
+ AttributeSet::FunctionIndex,
+ NewAttributes));
if (PAL != NPAL) {
MadeChange = true;
F->setAttributes(NPAL);
diff --git a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
index b5f09ec..f00830a 100644
--- a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -16,9 +16,9 @@
#define DEBUG_TYPE "strip-dead-prototypes"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Pass.h"
-#include "llvm/Module.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
using namespace llvm;
STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed");
diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
index 80bfc1c..5f8681f 100644
--- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
@@ -21,17 +21,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO.h"
-#include "llvm/Constants.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Pass.h"
-#include "llvm/TypeFinder.h"
-#include "llvm/ValueSymbolTable.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
using namespace llvm;
namespace {
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h
index 7467eca..1f6a3a5 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h
@@ -11,12 +11,12 @@
#define INSTCOMBINE_INSTCOMBINE_H
#include "InstCombineWorklist.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Operator.h"
-#include "llvm/Pass.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Support/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
#include "llvm/Support/TargetFolder.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
@@ -27,7 +27,7 @@ namespace llvm {
class DbgDeclareInst;
class MemIntrinsic;
class MemSetInst;
-
+
/// SelectPatternFlavor - We can match a variety of different patterns for
/// select operations.
enum SelectPatternFlavor {
@@ -36,7 +36,7 @@ enum SelectPatternFlavor {
SPF_SMAX, SPF_UMAX
//SPF_ABS - TODO.
};
-
+
/// getComplexity: Assign a complexity or rank value to LLVM Values...
/// 0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst
static inline unsigned getComplexity(Value *V) {
@@ -51,23 +51,23 @@ static inline unsigned getComplexity(Value *V) {
return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
}
-
+
/// InstCombineIRInserter - This is an IRBuilder insertion helper that works
/// just like the normal insertion helper, but also adds any new instructions
/// to the instcombine worklist.
-class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter
+class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter
: public IRBuilderDefaultInserter<true> {
InstCombineWorklist &Worklist;
public:
InstCombineIRInserter(InstCombineWorklist &WL) : Worklist(WL) {}
-
+
void InsertHelper(Instruction *I, const Twine &Name,
BasicBlock *BB, BasicBlock::iterator InsertPt) const {
IRBuilderDefaultInserter<true>::InsertHelper(I, Name, BB, InsertPt);
Worklist.Add(I);
}
};
-
+
/// InstCombiner - The -instcombine pass.
class LLVM_LIBRARY_VISIBILITY InstCombiner
: public FunctionPass,
@@ -76,6 +76,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner
TargetLibraryInfo *TLI;
bool MadeIRChange;
LibCallSimplifier *Simplifier;
+ bool MinimizeSize;
public:
/// Worklist - All of the instructions that need to be simplified.
InstCombineWorklist Worklist;
@@ -84,15 +85,16 @@ public:
/// instructions into the worklist when they are created.
typedef IRBuilder<true, TargetFolder, InstCombineIRInserter> BuilderTy;
BuilderTy *Builder;
-
+
static char ID; // Pass identification, replacement for typeid
InstCombiner() : FunctionPass(ID), TD(0), Builder(0) {
+ MinimizeSize = false;
initializeInstCombinerPass(*PassRegistry::getPassRegistry());
}
public:
virtual bool runOnFunction(Function &F);
-
+
bool DoOneIteration(Function &F, unsigned ItNum);
virtual void getAnalysisUsage(AnalysisUsage &AU) const;
@@ -114,6 +116,8 @@ public:
Instruction *visitSub(BinaryOperator &I);
Instruction *visitFSub(BinaryOperator &I);
Instruction *visitMul(BinaryOperator &I);
+ Value *foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C,
+ Instruction *InsertBefore);
Instruction *visitFMul(BinaryOperator &I);
Instruction *visitURem(BinaryOperator &I);
Instruction *visitSRem(BinaryOperator &I);
@@ -207,11 +211,11 @@ public:
private:
bool ShouldChangeType(Type *From, Type *To) const;
Value *dyn_castNegVal(Value *V) const;
- Value *dyn_castFNegVal(Value *V) const;
- Type *FindElementAtOffset(Type *Ty, int64_t Offset,
+ Value *dyn_castFNegVal(Value *V, bool NoSignedZero=false) const;
+ Type *FindElementAtOffset(Type *Ty, int64_t Offset,
SmallVectorImpl<Value*> &NewIndices);
Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
-
+
/// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
/// results in any code being generated and is interesting to optimize out. If
/// the cast can be eliminated by some other simple transformation, we prefer
@@ -243,7 +247,7 @@ public:
return New;
}
- // InsertNewInstWith - same as InsertNewInstBefore, but also sets the
+ // InsertNewInstWith - same as InsertNewInstBefore, but also sets the
// debug loc.
//
Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
@@ -259,10 +263,10 @@ public:
//
Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist.
-
+
// If we are replacing the instruction with itself, this must be in a
// segment of unreachable code, so just clobber the instruction.
- if (&I == V)
+ if (&I == V)
V = UndefValue::get(I.getType());
DEBUG(errs() << "IC: Replacing " << I << "\n"
@@ -292,13 +296,13 @@ public:
MadeIRChange = true;
return 0; // Don't do anything with FI
}
-
+
void ComputeMaskedBits(Value *V, APInt &KnownZero,
APInt &KnownOne, unsigned Depth = 0) const {
return llvm::ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth);
}
-
- bool MaskedValueIsZero(Value *V, const APInt &Mask,
+
+ bool MaskedValueIsZero(Value *V, const APInt &Mask,
unsigned Depth = 0) const {
return llvm::MaskedValueIsZero(V, Mask, TD, Depth);
}
@@ -321,21 +325,26 @@ private:
/// SimplifyDemandedUseBits - Attempts to replace V with a simpler value
/// based on the demanded bits.
- Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
+ Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
APInt& KnownZero, APInt& KnownOne,
unsigned Depth);
- bool SimplifyDemandedBits(Use &U, APInt DemandedMask,
+ bool SimplifyDemandedBits(Use &U, APInt DemandedMask,
APInt& KnownZero, APInt& KnownOne,
unsigned Depth=0);
-
+  /// Helper routine of SimplifyDemandedUseBits. It tries to simplify the
+  /// demanded bits of the "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
+ Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl,
+ APInt DemandedMask, APInt &KnownZero,
+ APInt &KnownOne);
+
/// SimplifyDemandedInstructionBits - Inst is an integer instruction that
/// SimplifyDemandedBits knows about. See if the instruction has any
/// properties that allow us to simplify its operands.
bool SimplifyDemandedInstructionBits(Instruction &Inst);
-
+
Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt& UndefElts, unsigned Depth = 0);
-
+
// FoldOpIntoPhi - Given a binary operator, cast instruction, or select
// which has a PHI node as operand #0, see if we can fold the instruction
// into the PHI (which is only possible if all operands to the PHI are
@@ -351,10 +360,10 @@ private:
Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);
Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN);
-
+
Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
ConstantInt *AndRHS, BinaryOperator &TheAnd);
-
+
Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
bool isSub, Instruction &I);
Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
@@ -373,8 +382,8 @@ private:
Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
};
-
-
+
+
} // end namespace llvm.
#endif
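
SimplifyShrShlDemandedBits, declared above, targets the "r1 = shr x, c1; r2 = shl r1, c2" sequence. In the simplest case, equal shift amounts, the pair only clears the low bits, so it can be replaced by a mask whenever those bits are not demanded. A small self-checking demonstration of that identity on plain unsigned integers (illustration only, not part of InstCombine):

    #include <cassert>
    #include <cstdint>

    // What the IR sequence computes for equal shift amounts.
    static uint32_t shrThenShl(uint32_t X, unsigned C) {
      return (X >> C) << C;
    }

    // The simplified form: clear the low C bits.
    static uint32_t maskLowBits(uint32_t X, unsigned C) {
      return X & ~((1u << C) - 1);
    }

    int main() {
      for (uint32_t X = 0; X < 1024; ++X)
        for (unsigned C = 0; C < 8; ++C)
          assert(shrThenShl(X, C) == maskLowBits(X, C));
      return 0;
    }
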
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index d8257e6..7595da0 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -13,16 +13,840 @@
#include "InstCombine.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/DataLayout.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
using namespace llvm;
using namespace PatternMatch;
+namespace {
+
+  /// Class representing the coefficient of a floating-point addend.
+  /// This class needs to be highly efficient, which is especially true for
+  /// the constructor. As of this writing, the cost of the default
+  /// constructor is merely a 4-byte zero store (assuming the compiler is
+  /// able to perform write-merging).
+ ///
+ class FAddendCoef {
+ public:
+    // The constructor has to initialize an APFloat, which is unnecessary for
+    // most addends, whose coefficient is either 1 or -1. So the constructor
+    // is expensive. In order to avoid that cost, we reuse existing instances
+    // whenever possible; the pre-created instances FAddCombine::Add[0-5]
+    // embody this idea.
+ //
+ FAddendCoef() : IsFp(false), BufHasFpVal(false), IntVal(0) {}
+ ~FAddendCoef();
+
+ void set(short C) {
+ assert(!insaneIntVal(C) && "Insane coefficient");
+ IsFp = false; IntVal = C;
+ }
+
+ void set(const APFloat& C);
+
+ void negate();
+
+ bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); }
+ Value *getValue(Type *) const;
+
+    // If possible, don't define operator+/operator- etc., because these
+    // operators inevitably call FAddendCoef's constructor, which is not cheap.
+ void operator=(const FAddendCoef &A);
+ void operator+=(const FAddendCoef &A);
+ void operator-=(const FAddendCoef &A);
+ void operator*=(const FAddendCoef &S);
+
+ bool isOne() const { return isInt() && IntVal == 1; }
+ bool isTwo() const { return isInt() && IntVal == 2; }
+ bool isMinusOne() const { return isInt() && IntVal == -1; }
+ bool isMinusTwo() const { return isInt() && IntVal == -2; }
+
+ private:
+ bool insaneIntVal(int V) { return V > 4 || V < -4; }
+ APFloat *getFpValPtr(void)
+ { return reinterpret_cast<APFloat*>(&FpValBuf.buffer[0]); }
+ const APFloat *getFpValPtr(void) const
+ { return reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); }
+
+ const APFloat &getFpVal(void) const {
+      assert(IsFp && BufHasFpVal && "Incorrect state");
+ return *getFpValPtr();
+ }
+
+ APFloat &getFpVal(void)
+    { assert(IsFp && BufHasFpVal && "Incorrect state"); return *getFpValPtr(); }
+
+ bool isInt() const { return !IsFp; }
+
+    // If the coefficient is represented by an integer, promote it to a
+    // floating-point value.
+ void convertToFpType(const fltSemantics &Sem);
+
+    // Construct an APFloat from a signed integer.
+    // TODO: We should get rid of this function when APFloat can be constructed
+    // from a *SIGNED* integer.
+ APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val);
+ private:
+
+ bool IsFp;
+
+ // True iff FpValBuf contains an instance of APFloat.
+ bool BufHasFpVal;
+
+    // The integer coefficient of an individual addend is either 1 or -1,
+    // and we try to simplify at most 4 addends from at most two neighboring
+    // instructions. So the range of <IntVal> falls in [-4, 4]. An APInt
+    // is overkill for this purpose.
+ short IntVal;
+
+ AlignedCharArrayUnion<APFloat> FpValBuf;
+ };
+
+  /// FAddend is used to represent a floating-point addend. An addend is
+  /// represented as <C, V>, where V is a symbolic value, and C is a
+  /// constant coefficient. A constant addend is represented as <C, 0>.
+ ///
+ class FAddend {
+ public:
+ FAddend() { Val = 0; }
+
+ Value *getSymVal (void) const { return Val; }
+ const FAddendCoef &getCoef(void) const { return Coeff; }
+
+ bool isConstant() const { return Val == 0; }
+ bool isZero() const { return Coeff.isZero(); }
+
+ void set(short Coefficient, Value *V) { Coeff.set(Coefficient), Val = V; }
+ void set(const APFloat& Coefficient, Value *V)
+ { Coeff.set(Coefficient); Val = V; }
+ void set(const ConstantFP* Coefficient, Value *V)
+ { Coeff.set(Coefficient->getValueAPF()); Val = V; }
+
+ void negate() { Coeff.negate(); }
+
+ /// Drill down the U-D chain one step to find the definition of V, and
+ /// try to break the definition into one or two addends.
+ static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1);
+
+    /// Similar to FAddend::drillValueDownOneStep() except that the value
+    /// being split is the addend itself.
+ unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
+
+ void operator+=(const FAddend &T) {
+ assert((Val == T.Val) && "Symbolic-values disagree");
+ Coeff += T.Coeff;
+ }
+
+ private:
+ void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
+
+ // This addend has the value of "Coeff * Val".
+ Value *Val;
+ FAddendCoef Coeff;
+ };
+
+  /// FAddCombine is the class for optimizing an unsafe fadd/fsub along
+  /// with at most two of its neighboring instructions.
+ ///
+ class FAddCombine {
+ public:
+ FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {}
+ Value *simplify(Instruction *FAdd);
+
+ private:
+ typedef SmallVector<const FAddend*, 4> AddendVect;
+
+ Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
+
+ Value *performFactorization(Instruction *I);
+
+    /// Convert the given addend to a Value.
+ Value *createAddendVal(const FAddend &A, bool& NeedNeg);
+
+ /// Return the number of instructions needed to emit the N-ary addition.
+ unsigned calcInstrNumber(const AddendVect& Vect);
+ Value *createFSub(Value *Opnd0, Value *Opnd1);
+ Value *createFAdd(Value *Opnd0, Value *Opnd1);
+ Value *createFMul(Value *Opnd0, Value *Opnd1);
+ Value *createFDiv(Value *Opnd0, Value *Opnd1);
+ Value *createFNeg(Value *V);
+ Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
+ void createInstPostProc(Instruction *NewInst);
+
+ InstCombiner::BuilderTy *Builder;
+ Instruction *Instr;
+
+ private:
+    // Debugging stuff is clustered here.
+ #ifndef NDEBUG
+ unsigned CreateInstrNum;
+ void initCreateInstNum() { CreateInstrNum = 0; }
+ void incCreateInstNum() { CreateInstrNum++; }
+ #else
+ void initCreateInstNum() {}
+ void incCreateInstNum() {}
+ #endif
+ };
+}
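As a rough standalone sketch of the <coefficient, symbolic-value> addend model the classes above implement (a plain double standing in for APFloat and a string standing in for the symbolic Value, both hypothetical stand-ins), an expression such as 2.5*x - y + 3.0 is simply a small list of addends:

    #include <string>
    #include <vector>

    // Toy stand-ins for FAddendCoef/FAddend: the coefficient is a plain
    // double, the symbolic value a name.  A constant addend is modelled,
    // as in the comments above, by an empty symbol (the "<C, NULL>" case).
    struct ToyAddend {
      double Coeff;              // C in <C, V>
      std::string Sym;           // V in <C, V>; "" means no symbolic value
      bool isConstant() const { return Sym.empty(); }
      void negate() { Coeff = -Coeff; }
    };

    // "2.5 * x - y + 3.0"  ==>  { <2.5, x>, <-1, y>, <3.0, NULL> }
    std::vector<ToyAddend> example() {
      return { {2.5, "x"}, {-1.0, "y"}, {3.0, ""} };
    }
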
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of
+// {FAddendCoef, FAddend, FAddition, FAddCombine}.
+//
+//===----------------------------------------------------------------------===//
+FAddendCoef::~FAddendCoef() {
+ if (BufHasFpVal)
+ getFpValPtr()->~APFloat();
+}
+
+void FAddendCoef::set(const APFloat& C) {
+ APFloat *P = getFpValPtr();
+
+ if (isInt()) {
+    // As the buffer is a meaningless byte stream, we cannot call
+ // APFloat::operator=().
+ new(P) APFloat(C);
+ } else
+ *P = C;
+
+ IsFp = BufHasFpVal = true;
+}
+
+void FAddendCoef::convertToFpType(const fltSemantics &Sem) {
+ if (!isInt())
+ return;
+
+ APFloat *P = getFpValPtr();
+ if (IntVal > 0)
+ new(P) APFloat(Sem, IntVal);
+ else {
+ new(P) APFloat(Sem, 0 - IntVal);
+ P->changeSign();
+ }
+ IsFp = BufHasFpVal = true;
+}
+
+APFloat FAddendCoef::createAPFloatFromInt(const fltSemantics &Sem, int Val) {
+ if (Val >= 0)
+ return APFloat(Sem, Val);
+
+ APFloat T(Sem, 0 - Val);
+ T.changeSign();
+
+ return T;
+}
+
+void FAddendCoef::operator=(const FAddendCoef &That) {
+ if (That.isInt())
+ set(That.IntVal);
+ else
+ set(That.getFpVal());
+}
+
+void FAddendCoef::operator+=(const FAddendCoef &That) {
+ enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+ if (isInt() == That.isInt()) {
+ if (isInt())
+ IntVal += That.IntVal;
+ else
+ getFpVal().add(That.getFpVal(), RndMode);
+ return;
+ }
+
+ if (isInt()) {
+ const APFloat &T = That.getFpVal();
+ convertToFpType(T.getSemantics());
+ getFpVal().add(T, RndMode);
+ return;
+ }
+
+ APFloat &T = getFpVal();
+ T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode);
+}
+
+void FAddendCoef::operator-=(const FAddendCoef &That) {
+ enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+ if (isInt() == That.isInt()) {
+ if (isInt())
+ IntVal -= That.IntVal;
+ else
+ getFpVal().subtract(That.getFpVal(), RndMode);
+ return;
+ }
+
+ if (isInt()) {
+ const APFloat &T = That.getFpVal();
+ convertToFpType(T.getSemantics());
+ getFpVal().subtract(T, RndMode);
+ return;
+ }
+
+ APFloat &T = getFpVal();
+ T.subtract(createAPFloatFromInt(T.getSemantics(), IntVal), RndMode);
+}
+
+void FAddendCoef::operator*=(const FAddendCoef &That) {
+ if (That.isOne())
+ return;
+
+ if (That.isMinusOne()) {
+ negate();
+ return;
+ }
+
+ if (isInt() && That.isInt()) {
+ int Res = IntVal * (int)That.IntVal;
+ assert(!insaneIntVal(Res) && "Insane int value");
+ IntVal = Res;
+ return;
+ }
+
+ const fltSemantics &Semantic =
+ isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics();
+
+ if (isInt())
+ convertToFpType(Semantic);
+ APFloat &F0 = getFpVal();
+
+ if (That.isInt())
+ F0.multiply(createAPFloatFromInt(Semantic, That.IntVal),
+ APFloat::rmNearestTiesToEven);
+ else
+ F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven);
+
+ return;
+}
+
+void FAddendCoef::negate() {
+ if (isInt())
+ IntVal = 0 - IntVal;
+ else
+ getFpVal().changeSign();
+}
+
+Value *FAddendCoef::getValue(Type *Ty) const {
+ return isInt() ?
+ ConstantFP::get(Ty, float(IntVal)) :
+ ConstantFP::get(Ty->getContext(), getFpVal());
+}
+
+// The definition of <Val>          Addends
+// =========================================
+//  A + B                          <1, A>, <1, B>
+//  A - B                          <1, A>, <-1, B>
+//  0 - B                          <-1, B>
+//  C * A                          <C, A>
+//  A + C                          <1, A>, <C, NULL>
+//  0 +/- 0                        <0, NULL>  (corner case)
+//
+// Legend: A and B are not constant, C is constant
+//
+unsigned FAddend::drillValueDownOneStep
+ (Value *Val, FAddend &Addend0, FAddend &Addend1) {
+ Instruction *I = 0;
+ if (Val == 0 || !(I = dyn_cast<Instruction>(Val)))
+ return 0;
+
+ unsigned Opcode = I->getOpcode();
+
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
+ ConstantFP *C0, *C1;
+ Value *Opnd0 = I->getOperand(0);
+ Value *Opnd1 = I->getOperand(1);
+ if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
+ Opnd0 = 0;
+
+ if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
+ Opnd1 = 0;
+
+ if (Opnd0) {
+ if (!C0)
+ Addend0.set(1, Opnd0);
+ else
+ Addend0.set(C0, 0);
+ }
+
+ if (Opnd1) {
+ FAddend &Addend = Opnd0 ? Addend1 : Addend0;
+ if (!C1)
+ Addend.set(1, Opnd1);
+ else
+ Addend.set(C1, 0);
+ if (Opcode == Instruction::FSub)
+ Addend.negate();
+ }
+
+ if (Opnd0 || Opnd1)
+ return Opnd0 && Opnd1 ? 2 : 1;
+
+ // Both operands are zero. Weird!
+ Addend0.set(APFloat(C0->getValueAPF().getSemantics()), 0);
+ return 1;
+ }
+
+ if (I->getOpcode() == Instruction::FMul) {
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
+ Addend0.set(C, V1);
+ return 1;
+ }
+
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
+ Addend0.set(C, V0);
+ return 1;
+ }
+ }
+
+ return 0;
+}
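The decomposition table above can be exercised with a tiny model of a single fadd/fsub/fmul (the ToyOperand/ToyAddend/drill names below are hypothetical, and the 0 +/- 0 corner case is ignored); a minimal sketch:

    #include <cassert>
    #include <string>
    #include <vector>

    // Toy model: each operand is either a constant (Sym == "") or a symbol.
    struct ToyOperand { double C; std::string Sym; bool isConst() const { return Sym.empty(); } };
    struct ToyAddend  { double Coeff; std::string Sym; };
    enum class ToyOp { FAdd, FSub, FMul };

    // Returns the zero, one or two addends a single instruction breaks into.
    static std::vector<ToyAddend> drill(ToyOp Op, ToyOperand L, ToyOperand R) {
      std::vector<ToyAddend> Out;
      if (Op == ToyOp::FMul) {                       // C * A  ->  <C, A>
        if (L.isConst())      Out.push_back({L.C, R.Sym});
        else if (R.isConst()) Out.push_back({R.C, L.Sym});
        return Out;
      }
      double RSign = (Op == ToyOp::FSub) ? -1.0 : 1.0;
      if (!(L.isConst() && L.C == 0.0))
        Out.push_back(L.isConst() ? ToyAddend{L.C, ""} : ToyAddend{1.0, L.Sym});
      if (!(R.isConst() && R.C == 0.0))
        Out.push_back(R.isConst() ? ToyAddend{RSign * R.C, ""}
                                  : ToyAddend{RSign, R.Sym});
      return Out;
    }

    int main() {
      auto V = drill(ToyOp::FSub, {0, "A"}, {0, "B"});  // A - B -> <1, A>, <-1, B>
      assert(V.size() == 2 && V[0].Coeff == 1 && V[1].Coeff == -1);
      V = drill(ToyOp::FSub, {0, ""}, {0, "B"});        // 0 - B -> <-1, B>
      assert(V.size() == 1 && V[0].Coeff == -1 && V[0].Sym == "B");
      V = drill(ToyOp::FAdd, {0, "A"}, {3.0, ""});      // A + 3 -> <1, A>, <3, NULL>
      assert(V.size() == 2 && V[1].Coeff == 3.0 && V[1].Sym.empty());
      return 0;
    }
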
+
+// Try to break *this* addend into two addends. E.g., suppose this addend is
+// <2.3, V> and V = X + Y; by calling this function, we obtain two addends,
+// i.e. <2.3, X> and <2.3, Y>.
+//
+unsigned FAddend::drillAddendDownOneStep
+ (FAddend &Addend0, FAddend &Addend1) const {
+ if (isConstant())
+ return 0;
+
+ unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1);
+ if (!BreakNum || Coeff.isOne())
+ return BreakNum;
+
+ Addend0.Scale(Coeff);
+
+ if (BreakNum == 2)
+ Addend1.Scale(Coeff);
+
+ return BreakNum;
+}
+
+// Try to perform the following optimization on the input instruction I. Return
+// the simplified expression if it was successful; otherwise, return 0.
+//
+// Instruction "I" is Simplified into
+// -------------------------------------------------------
+// (x * y) +/- (x * z) x * (y +/- z)
+// (y / x) +/- (z / x) (y +/- z) / x
+//
+Value *FAddCombine::performFactorization(Instruction *I) {
+ assert((I->getOpcode() == Instruction::FAdd ||
+ I->getOpcode() == Instruction::FSub) && "Expect add/sub");
+
+ Instruction *I0 = dyn_cast<Instruction>(I->getOperand(0));
+ Instruction *I1 = dyn_cast<Instruction>(I->getOperand(1));
+
+ if (!I0 || !I1 || I0->getOpcode() != I1->getOpcode())
+ return 0;
+
+ bool isMpy = false;
+ if (I0->getOpcode() == Instruction::FMul)
+ isMpy = true;
+ else if (I0->getOpcode() != Instruction::FDiv)
+ return 0;
+
+ Value *Opnd0_0 = I0->getOperand(0);
+ Value *Opnd0_1 = I0->getOperand(1);
+ Value *Opnd1_0 = I1->getOperand(0);
+ Value *Opnd1_1 = I1->getOperand(1);
+
+  //   Input Instr I       Factor    AddSub0    AddSub1
+  //   ------------------------------------------------
+  //   (x*y) +/- (x*z)       x          y          z
+  //   (y/x) +/- (z/x)       x          y          z
+ //
+ Value *Factor = 0;
+ Value *AddSub0 = 0, *AddSub1 = 0;
+
+ if (isMpy) {
+ if (Opnd0_0 == Opnd1_0 || Opnd0_0 == Opnd1_1)
+ Factor = Opnd0_0;
+ else if (Opnd0_1 == Opnd1_0 || Opnd0_1 == Opnd1_1)
+ Factor = Opnd0_1;
+
+ if (Factor) {
+ AddSub0 = (Factor == Opnd0_0) ? Opnd0_1 : Opnd0_0;
+ AddSub1 = (Factor == Opnd1_0) ? Opnd1_1 : Opnd1_0;
+ }
+ } else if (Opnd0_1 == Opnd1_1) {
+ Factor = Opnd0_1;
+ AddSub0 = Opnd0_0;
+ AddSub1 = Opnd1_0;
+ }
+
+ if (!Factor)
+ return 0;
+
+  // Create expression "NewAddSub = AddSub0 +/- AddSub1"
+ Value *NewAddSub = (I->getOpcode() == Instruction::FAdd) ?
+ createFAdd(AddSub0, AddSub1) :
+ createFSub(AddSub0, AddSub1);
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(NewAddSub)) {
+ const APFloat &F = CFP->getValueAPF();
+ if (!F.isNormal() || F.isDenormal())
+ return 0;
+ }
+
+ if (isMpy)
+ return createFMul(Factor, NewAddSub);
+
+ return createFDiv(NewAddSub, Factor);
+}
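The rewrite performFactorization() applies is plain distributivity, which is only guaranteed bit-exact under the fast-math assumptions this combiner requires; a small numeric sanity check with hypothetical values:

    #include <cassert>
    #include <cmath>

    int main() {
      double x = 3.0, y = 5.0, z = 7.0;
      // (x*y) +/- (x*z)  ->  x * (y +/- z): three instructions become two.
      assert(std::fabs((x * y + x * z) - x * (y + z)) < 1e-12);
      // (y/x) +/- (z/x)  ->  (y +/- z) / x: the expensive fdiv is done once.
      assert(std::fabs((y / x - z / x) - (y - z) / x) < 1e-12);
      // Both rewrites reassociate floating-point arithmetic, so they are only
      // performed under the unsafe-algebra (fast-math) precondition that
      // FAddCombine::simplify() asserts on.
      return 0;
    }
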
+
+Value *FAddCombine::simplify(Instruction *I) {
+ assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode");
+
+  // Currently we are not able to handle vector types.
+ if (I->getType()->isVectorTy())
+ return 0;
+
+ assert((I->getOpcode() == Instruction::FAdd ||
+ I->getOpcode() == Instruction::FSub) && "Expect add/sub");
+
+ // Save the instruction before calling other member-functions.
+ Instr = I;
+
+ FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
+
+ unsigned OpndNum = FAddend::drillValueDownOneStep(I, Opnd0, Opnd1);
+
+ // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1.
+ unsigned Opnd0_ExpNum = 0;
+ unsigned Opnd1_ExpNum = 0;
+
+ if (!Opnd0.isConstant())
+ Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1);
+
+ // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
+ if (OpndNum == 2 && !Opnd1.isConstant())
+ Opnd1_ExpNum = Opnd1.drillAddendDownOneStep(Opnd1_0, Opnd1_1);
+
+ // Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
+ if (Opnd0_ExpNum && Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0_0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ // Compute instruction quota. We should save at least one instruction.
+ unsigned InstQuota = 0;
+
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&
+ (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
+
+ if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
+ return R;
+ }
+
+ if (OpndNum != 2) {
+    // The input instruction is: "I = 0.0 +/- V". If "V" could have been
+    // split into two addends, say "V = X - Y", the instruction would have
+    // been optimized into "I = Y - X" in the previous steps.
+ //
+ const FAddendCoef &CE = Opnd0.getCoef();
+ return CE.isOne() ? Opnd0.getSymVal() : 0;
+ }
+
+  // Step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
+ if (Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+  // Step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
+ if (Opnd0_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd1);
+ AllOpnds.push_back(&Opnd0_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+  // Step 6: Try factorization as the last resort.
+ return performFactorization(I);
+}
+
+Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
+
+ unsigned AddendNum = Addends.size();
+ assert(AddendNum <= 4 && "Too many addends");
+
+  // For saving intermediate results.
+ unsigned NextTmpIdx = 0;
+ FAddend TmpResult[3];
+
+ // Points to the constant addend of the resulting simplified expression.
+  // If the resulting expression has a constant addend, it is desirable for
+  // that constant to reside at the top of the resulting expression tree.
+  // Placing constants close to the super-expression(s) will potentially
+  // reveal some optimization opportunities in the super-expression(s).
+ //
+ const FAddend *ConstAdd = 0;
+
+  // Simplified addends are placed in <SimpVect>.
+ AddendVect SimpVect;
+
+  // The outer loop works on one symbolic value at a time. Suppose the input
+  // addends are: <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ...
+  // The symbolic values will be processed in this order: x, y, z.
+ //
+ for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) {
+
+ const FAddend *ThisAddend = Addends[SymIdx];
+ if (!ThisAddend) {
+ // This addend was processed before.
+ continue;
+ }
+
+ Value *Val = ThisAddend->getSymVal();
+ unsigned StartIdx = SimpVect.size();
+ SimpVect.push_back(ThisAddend);
+
+    // The inner loop collects addends that share the same symbolic value;
+    // these addends will later be folded into a single addend. Following the
+    // above example, if the symbolic value "y" is being processed, the inner
+    // loop will collect the two addends "<b1, y>" and "<b2, y>". These two
+    // addends will later be folded into "<b1+b2, y>".
+ //
+ for (unsigned SameSymIdx = SymIdx + 1;
+ SameSymIdx < AddendNum; SameSymIdx++) {
+ const FAddend *T = Addends[SameSymIdx];
+ if (T && T->getSymVal() == Val) {
+        // Set to null so that the next iteration of the outer loop will not
+        // process this addend again.
+ Addends[SameSymIdx] = 0;
+ SimpVect.push_back(T);
+ }
+ }
+
+    // If multiple addends share the same symbolic value, fold them together.
+ if (StartIdx + 1 != SimpVect.size()) {
+ FAddend &R = TmpResult[NextTmpIdx ++];
+ R = *SimpVect[StartIdx];
+ for (unsigned Idx = StartIdx + 1; Idx < SimpVect.size(); Idx++)
+ R += *SimpVect[Idx];
+
+ // Pop all addends being folded and push the resulting folded addend.
+ SimpVect.resize(StartIdx);
+ if (Val != 0) {
+ if (!R.isZero()) {
+ SimpVect.push_back(&R);
+ }
+ } else {
+        // Don't push the constant addend at this time. It will be the last
+        // element of <SimpVect>.
+ ConstAdd = &R;
+ }
+ }
+ }
+
+ assert((NextTmpIdx <= sizeof(TmpResult)/sizeof(TmpResult[0]) + 1) &&
+ "out-of-bound access");
+
+ if (ConstAdd)
+ SimpVect.push_back(ConstAdd);
+
+ Value *Result;
+ if (!SimpVect.empty())
+ Result = createNaryFAdd(SimpVect, InstrQuota);
+ else {
+ // The addition is folded to 0.0.
+ Result = ConstantFP::get(Instr->getType(), 0.0);
+ }
+
+ return Result;
+}
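Ignoring the instruction quota and the placement of the constant addend, the effect of the two loops above is a fold keyed on the symbolic value; a minimal sketch with doubles (foldBySymbol is a hypothetical helper):

    #include <cassert>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    // <a1, x>, <b1, y>, <a2, x>  ==>  <a1+a2, x>, <b1, y>
    static std::map<std::string, double>
    foldBySymbol(const std::vector<std::pair<double, std::string>> &Addends) {
      std::map<std::string, double> Folded;   // "" keys the constant addend
      for (const auto &A : Addends)
        Folded[A.second] += A.first;
      return Folded;
    }

    int main() {
      auto R = foldBySymbol({{2.0, "x"}, {1.0, "y"}, {-2.0, "x"}, {3.0, ""}});
      assert(R["x"] == 0.0 && R["y"] == 1.0 && R[""] == 3.0);  // x cancels out
      return 0;
    }
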
+
+Value *FAddCombine::createNaryFAdd
+ (const AddendVect &Opnds, unsigned InstrQuota) {
+ assert(!Opnds.empty() && "Expect at least one addend");
+
+ // Step 1: Check if the # of instructions needed exceeds the quota.
+ //
+ unsigned InstrNeeded = calcInstrNumber(Opnds);
+ if (InstrNeeded > InstrQuota)
+ return 0;
+
+ initCreateInstNum();
+
+  // Step 2: Emit the N-ary addition.
+  // Note that at most three instructions are involved in FAdd-InstCombine: the
+  // addition in question, and at most two neighboring instructions.
+  // The resulting optimized addition should have at least one fewer instruction
+  // than the original addition expression tree. This implies that the resulting
+  // N-ary addition has at most two instructions, and we don't need to worry
+  // about tree height when constructing the N-ary addition.
+
+ Value *LastVal = 0;
+ bool LastValNeedNeg = false;
+
+  // Iterate over the addends, creating fadd/fsub from two adjacent addends.
+ for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
+ I != E; I++) {
+ bool NeedNeg;
+ Value *V = createAddendVal(**I, NeedNeg);
+ if (!LastVal) {
+ LastVal = V;
+ LastValNeedNeg = NeedNeg;
+ continue;
+ }
+
+ if (LastValNeedNeg == NeedNeg) {
+ LastVal = createFAdd(LastVal, V);
+ continue;
+ }
+
+ if (LastValNeedNeg)
+ LastVal = createFSub(V, LastVal);
+ else
+ LastVal = createFSub(LastVal, V);
+
+ LastValNeedNeg = false;
+ }
+
+ if (LastValNeedNeg) {
+ LastVal = createFNeg(LastVal);
+ }
+
+ #ifndef NDEBUG
+ assert(CreateInstrNum == InstrNeeded &&
+ "Inconsistent in instruction numbers");
+ #endif
+
+ return LastVal;
+}
+
+Value *FAddCombine::createFSub
+ (Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder->CreateFSub(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+Value *FAddCombine::createFNeg(Value *V) {
+ Value *Zero = cast<Value>(ConstantFP::get(V->getType(), 0.0));
+ return createFSub(Zero, V);
+}
+
+Value *FAddCombine::createFAdd
+ (Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder->CreateFAdd(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder->CreateFMul(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+Value *FAddCombine::createFDiv(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder->CreateFDiv(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+void FAddCombine::createInstPostProc(Instruction *NewInstr) {
+ NewInstr->setDebugLoc(Instr->getDebugLoc());
+
+  // Keep track of the number of instructions created.
+ incCreateInstNum();
+
+ // Propagate fast-math flags
+ NewInstr->setFastMathFlags(Instr->getFastMathFlags());
+}
+
+// Return the number of instructions needed to emit the N-ary addition.
+// NOTE: Keep this function in sync with createAddendVal().
+unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
+ unsigned OpndNum = Opnds.size();
+ unsigned InstrNeeded = OpndNum - 1;
+
+ // The number of addends in the form of "(-1)*x".
+ unsigned NegOpndNum = 0;
+
+ // Adjust the number of instructions needed to emit the N-ary add.
+ for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
+ I != E; I++) {
+ const FAddend *Opnd = *I;
+ if (Opnd->isConstant())
+ continue;
+
+ const FAddendCoef &CE = Opnd->getCoef();
+ if (CE.isMinusOne() || CE.isMinusTwo())
+ NegOpndNum++;
+
+ // Let the addend be "c * x". If "c == +/-1", the value of the addend
+ // is immediately available; otherwise, it needs exactly one instruction
+ // to evaluate the value.
+ if (!CE.isMinusOne() && !CE.isOne())
+ InstrNeeded++;
+ }
+ if (NegOpndNum == OpndNum)
+ InstrNeeded++;
+ return InstrNeeded;
+}
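The counting rule implemented above (a chain of N-1 fadd/fsub, plus one extra instruction for every coefficient other than +/-1, plus one final negation when every addend is negated) can be mirrored on a toy addend list; a hedged sketch with hypothetical names:

    #include <cassert>
    #include <string>
    #include <vector>

    struct Addend {
      double Coeff;
      std::string Sym;                                 // "" means constant
      bool isConst() const { return Sym.empty(); }
    };

    static unsigned instrNumber(const std::vector<Addend> &Opnds) {
      unsigned Needed = static_cast<unsigned>(Opnds.size()) - 1;  // fadd/fsub chain
      unsigned NegOpnds = 0;
      for (const Addend &A : Opnds) {
        if (A.isConst())
          continue;
        if (A.Coeff == -1.0 || A.Coeff == -2.0)
          ++NegOpnds;
        if (A.Coeff != 1.0 && A.Coeff != -1.0)   // needs one fmul (or fadd V, V)
          ++Needed;
      }
      if (NegOpnds == Opnds.size())              // everything negated: one extra fneg
        ++Needed;
      return Needed;
    }

    int main() {
      assert(instrNumber({{1, "x"}, {-1, "y"}}) == 1);   // a single fsub
      assert(instrNumber({{3, "x"}, {1, "y"}}) == 2);    // one fmul + one fadd
      return 0;
    }
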
+
+//  Input Addend              Value              NeedNeg(output)
+//  ================================================================
+//  Constant C                C                  false
+//  <+/-1, V>                 V                  coefficient is -1
+//  <2/-2, V>                 "fadd V, V"        coefficient is -2
+//  <C, V>                    "fmul V, C"        false
+//
+// NOTE: Keep this function in sync with FAddCombine::calcInstrNumber.
+Value *FAddCombine::createAddendVal
+ (const FAddend &Opnd, bool &NeedNeg) {
+ const FAddendCoef &Coeff = Opnd.getCoef();
+
+ if (Opnd.isConstant()) {
+ NeedNeg = false;
+ return Coeff.getValue(Instr->getType());
+ }
+
+ Value *OpndVal = Opnd.getSymVal();
+
+ if (Coeff.isMinusOne() || Coeff.isOne()) {
+ NeedNeg = Coeff.isMinusOne();
+ return OpndVal;
+ }
+
+ if (Coeff.isTwo() || Coeff.isMinusTwo()) {
+ NeedNeg = Coeff.isMinusTwo();
+ return createFAdd(OpndVal, OpndVal);
+ }
+
+ NeedNeg = false;
+ return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
+}
+
/// AddOne - Add one to a ConstantInt.
static Constant *AddOne(Constant *C) {
return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
}
+
/// SubOne - Subtract one from a ConstantInt.
static Constant *SubOne(ConstantInt *C) {
return ConstantInt::get(C->getContext(), C->getValue()-1);
@@ -37,10 +861,10 @@ static Constant *SubOne(ConstantInt *C) {
static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) {
if (!V->hasOneUse() || !V->getType()->isIntegerTy())
return 0;
-
+
Instruction *I = dyn_cast<Instruction>(V);
if (I == 0) return 0;
-
+
if (I->getOpcode() == Instruction::Mul)
if ((CST = dyn_cast<ConstantInt>(I->getOperand(1))))
return I->getOperand(0);
@@ -64,22 +888,22 @@ static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) {
bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) {
// There are different heuristics we can use for this. Here are some simple
// ones.
-
- // Add has the property that adding any two 2's complement numbers can only
+
+ // Add has the property that adding any two 2's complement numbers can only
// have one carry bit which can change a sign. As such, if LHS and RHS each
// have at least two sign bits, we know that the addition of the two values
// will sign extend fine.
if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1)
return true;
-
-
+
+
// If one of the operands only has one non-zero bit, and if the other operand
// has a known-zero bit in a more significant place than it (not including the
// sign bit) the ripple may go up to and fill the zero, but won't change the
// sign. For example, (X & ~4) + 1.
-
+
// TODO: Implement.
-
+
return false;
}
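The two-sign-bit argument above is easy to check exhaustively on a narrow type: every i8 value with at least two sign bits lies in [-64, 63], so the sum of two such values stays inside the i8 range; a small standalone check:

    #include <cassert>

    int main() {
      // i8 values with at least two sign bits are exactly the range [-64, 63].
      for (int a = -64; a <= 63; ++a)
        for (int b = -64; b <= 63; ++b) {
          int sum = a + b;                     // exact, computed in int
          assert(sum >= -128 && sum <= 127);   // fits in i8: no signed wrap
        }
      return 0;
    }
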
@@ -100,7 +924,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
const APInt &Val = CI->getValue();
if (Val.isSignBit())
return BinaryOperator::CreateXor(LHS, RHS);
-
+
// See if SimplifyDemandedBits can simplify this. This handles stuff like
// (X & 254)+1 -> (X&254)|1
if (SimplifyDemandedInstructionBits(I))
@@ -110,7 +934,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
if (ZI->getSrcTy()->isIntegerTy(1))
return SelectInst::Create(ZI->getOperand(0), AddOne(CI), CI);
-
+
Value *XorLHS = 0; ConstantInt *XorRHS = 0;
if (match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) {
uint32_t TySizeBits = I.getType()->getScalarSizeInBits();
@@ -124,13 +948,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
else if (XorRHS->getValue().isPowerOf2())
ExtendAmt = TySizeBits - XorRHS->getValue().logBase2() - 1;
}
-
+
if (ExtendAmt) {
APInt Mask = APInt::getHighBitsSet(TySizeBits, ExtendAmt);
if (!MaskedValueIsZero(XorLHS, Mask))
ExtendAmt = 0;
}
-
+
if (ExtendAmt) {
Constant *ShAmt = ConstantInt::get(I.getType(), ExtendAmt);
Value *NewShl = Builder->CreateShl(XorLHS, ShAmt, "sext");
@@ -175,7 +999,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum");
return BinaryOperator::CreateNeg(NewAdd);
}
-
+
return BinaryOperator::CreateSub(RHS, LHSV);
}
@@ -209,7 +1033,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
APInt RHSKnownOne(IT->getBitWidth(), 0);
APInt RHSKnownZero(IT->getBitWidth(), 0);
ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne);
-
+
// No bits in common -> bitwise or.
if ((LHSKnownZero|RHSKnownZero).isAllOnesValue())
return BinaryOperator::CreateOr(LHS, RHS);
@@ -251,7 +1075,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// See if all bits from the first bit set in the Add RHS up are included
// in the mask. First, get the rightmost bit.
const APInt &AddRHSV = CRHS->getValue();
-
+
// Form a mask of all bits from the lowest bit added through the top.
APInt AddRHSHighBits(~((AddRHSV & -AddRHSV)-1));
@@ -289,7 +1113,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
// Fold the add into the true select value.
return SelectInst::Create(SI->getCondition(), N, A);
-
+
if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
// Fold the add into the false select value.
return SelectInst::Create(SI->getCondition(), A, N);
@@ -301,18 +1125,18 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (SExtInst *LHSConv = dyn_cast<SExtInst>(LHS)) {
// (add (sext x), cst) --> (sext (add x, cst'))
if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
- Constant *CI =
+ Constant *CI =
ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
if (LHSConv->hasOneUse() &&
ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
// Insert the new, smaller add.
- Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
+ Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
CI, "addconv");
return new SExtInst(NewAdd, I.getType());
}
}
-
+
// (add (sext x), (sext y)) --> (sext (add int x, y))
if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) {
       // Only do this if x/y have the same type, if at least one of them has a
@@ -323,7 +1147,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
WillNotOverflowSignedAdd(LHSConv->getOperand(0),
RHSConv->getOperand(0))) {
// Insert the new integer add.
- Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
+ Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
RHSConv->getOperand(0), "addconv");
return new SExtInst(NewAdd, I.getType());
}
@@ -351,18 +1175,12 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- if (Constant *RHSC = dyn_cast<Constant>(RHS)) {
- // X + 0 --> X
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
- if (CFP->isExactlyValue(ConstantFP::getNegativeZero
- (I.getType())->getValueAPF()))
- return ReplaceInstUsesWith(I, LHS);
- }
+ if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), TD))
+ return ReplaceInstUsesWith(I, V);
- if (isa<PHINode>(LHS))
- if (Instruction *NV = FoldOpIntoPhi(I))
- return NV;
- }
+ if (isa<Constant>(RHS) && isa<PHINode>(LHS))
+ if (Instruction *NV = FoldOpIntoPhi(I))
+ return NV;
// -A + B --> B - A
// -A + -B --> -(A + B)
@@ -374,11 +1192,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
if (Value *V = dyn_castFNegVal(RHS))
return BinaryOperator::CreateFSub(LHS, V);
- // Check for X+0.0. Simplify it to X if we know X is not -0.0.
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
- if (CFP->getValueAPF().isPosZero() && CannotBeNegativeZero(LHS))
- return ReplaceInstUsesWith(I, LHS);
-
// Check for (fadd double (sitofp x), y), see if we can merge this into an
// integer add followed by a promotion.
if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
@@ -388,7 +1201,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
// requires a constant pool load, and generally allows the add to be better
// instcombined.
if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) {
- Constant *CI =
+ Constant *CI =
ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType());
if (LHSConv->hasOneUse() &&
ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
@@ -399,7 +1212,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
return new SIToFPInst(NewAdd, I.getType());
}
}
-
+
// (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
       // Only do this if x/y have the same type, if at least one of them has a
@@ -410,13 +1223,18 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
WillNotOverflowSignedAdd(LHSConv->getOperand(0),
RHSConv->getOperand(0))) {
// Insert the new integer add.
- Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
+ Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
RHSConv->getOperand(0),"addconv");
return new SIToFPInst(NewAdd, I.getType());
}
}
}
-
+
+ if (I.hasUnsafeAlgebra()) {
+ if (Value *V = FAddCombine(Builder).simplify(&I))
+ return ReplaceInstUsesWith(I, V);
+ }
+
return Changed ? &I : 0;
}
@@ -428,7 +1246,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
Type *Ty) {
assert(TD && "Must have target data info for this");
-
+
// If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
// this.
bool Swapped = false;
@@ -451,7 +1269,7 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
}
}
}
-
+
if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) {
// X - (gep X, ...)
if (RHSGEP->getOperand(0) == LHS) {
@@ -467,16 +1285,16 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
}
}
}
-
+
// Avoid duplicating the arithmetic if GEP2 has non-constant indices and
// multiple users.
if (GEP1 == 0 ||
(GEP2 != 0 && !GEP2->hasAllConstantIndices() && !GEP2->hasOneUse()))
return 0;
-
+
   // Emit the offset of the GEP as an intptr_t.
Value *Result = EmitGEPOffset(GEP1);
-
+
// If we had a constant expression GEP on the other side offsetting the
// pointer, subtract it from the offset we have.
if (GEP2) {
@@ -517,7 +1335,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
// Replace (-1 - A) with (~A).
if (match(Op0, m_AllOnes()))
return BinaryOperator::CreateNot(Op1);
-
+
if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) {
// C - ~X == X + (1+C)
Value *X = 0;
@@ -551,20 +1369,30 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (SimplifyDemandedInstructionBits(I))
return &I;
+
+ // Fold (sub 0, (zext bool to B)) --> (sext bool to B)
+ if (C->isZero() && match(Op1, m_ZExt(m_Value(X))))
+ if (X->getType()->isIntegerTy(1))
+ return CastInst::CreateSExtOrBitCast(X, Op1->getType());
+
+ // Fold (sub 0, (sext bool to B)) --> (zext bool to B)
+ if (C->isZero() && match(Op1, m_SExt(m_Value(X))))
+ if (X->getType()->isIntegerTy(1))
+ return CastInst::CreateZExtOrBitCast(X, Op1->getType());
}
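For an i1 value, zext yields 0 or 1 and sext yields 0 or -1, so negating one form produces the other, which is what the two folds above rely on; a quick standalone check (zext1/sext1 are hypothetical helpers):

    #include <cassert>
    #include <cstdint>

    static int32_t zext1(bool b) { return b ? 1 : 0; }    // zext i1 -> i32
    static int32_t sext1(bool b) { return b ? -1 : 0; }   // sext i1 -> i32

    int main() {
      for (int i = 0; i < 2; ++i) {
        bool b = (i != 0);
        assert(0 - zext1(b) == sext1(b));   // (sub 0, (zext b)) --> (sext b)
        assert(0 - sext1(b) == zext1(b));   // (sub 0, (sext b)) --> (zext b)
      }
      return 0;
    }
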
-
+
{ Value *Y;
// X-(X+Y) == -Y X-(Y+X) == -Y
if (match(Op1, m_Add(m_Specific(Op0), m_Value(Y))) ||
match(Op1, m_Add(m_Value(Y), m_Specific(Op0))))
return BinaryOperator::CreateNeg(Y);
-
+
// (X-Y)-X == -Y
if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y))))
return BinaryOperator::CreateNeg(Y);
}
-
+
if (Op1->hasOneUse()) {
Value *X = 0, *Y = 0, *Z = 0;
Constant *C = 0;
@@ -581,7 +1409,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
match(Op1, m_And(m_Specific(Op0), m_Value(Y))))
return BinaryOperator::CreateAnd(Op0,
Builder->CreateNot(Y, Y->getName() + ".not"));
-
+
// 0 - (X sdiv C) -> (X sdiv -C)
if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) &&
match(Op0, m_Zero()))
@@ -604,14 +1432,14 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
C = ConstantExpr::getSub(One, ConstantExpr::getShl(One, CI));
return BinaryOperator::CreateMul(Op0, C);
}
-
+
// X - A*-B -> X + A*B
// X - -A*B -> X + A*B
Value *A, *B;
if (match(Op1, m_Mul(m_Value(A), m_Neg(m_Value(B)))) ||
match(Op1, m_Mul(m_Neg(m_Value(A)), m_Value(B))))
return BinaryOperator::CreateAdd(Op0, Builder->CreateMul(A, B));
-
+
// X - A*CI -> X + A*-CI
// X - CI*A -> X + A*-CI
if (match(Op1, m_Mul(m_Value(A), m_ConstantInt(CI))) ||
@@ -630,7 +1458,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (X == dyn_castFoldableMul(Op1, C2))
return BinaryOperator::CreateMul(X, ConstantExpr::getSub(C1, C2));
}
-
+
   // Optimize pointer differences in the same array into a size. Consider:
// &A[10] - &A[0]: we should compile this to "10".
if (TD) {
@@ -639,23 +1467,31 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
match(Op1, m_PtrToInt(m_Value(RHSOp))))
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
return ReplaceInstUsesWith(I, Res);
-
+
// trunc(p)-trunc(q) -> trunc(p-q)
if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
return ReplaceInstUsesWith(I, Res);
}
-
+
return 0;
}
Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), TD))
+ return ReplaceInstUsesWith(I, V);
+
// If this is a 'B = x-(-A)', change to B = x+A...
if (Value *V = dyn_castFNegVal(Op1))
return BinaryOperator::CreateFAdd(Op0, V);
+ if (I.hasUnsafeAlgebra()) {
+ if (Value *V = FAddCombine(Builder).simplify(&I))
+ return ReplaceInstUsesWith(I, V);
+ }
+
return 0;
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 7d0af0d..990cbc3 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -12,18 +12,18 @@
//===----------------------------------------------------------------------===//
#include "InstCombine.h"
-#include "llvm/Intrinsics.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Transforms/Utils/CmpInstAnalysis.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/ConstantRange.h"
#include "llvm/Support/PatternMatch.h"
+#include "llvm/Transforms/Utils/CmpInstAnalysis.h"
using namespace llvm;
using namespace PatternMatch;
/// AddOne - Add one to a ConstantInt.
-static Constant *AddOne(Constant *C) {
- return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
+static Constant *AddOne(ConstantInt *C) {
+ return ConstantInt::get(C->getContext(), C->getValue() + 1);
}
/// SubOne - Subtract one from a ConstantInt.
static Constant *SubOne(ConstantInt *C) {
@@ -36,15 +36,15 @@ static inline bool isFreeToInvert(Value *V) {
// ~(~(X)) -> X.
if (BinaryOperator::isNot(V))
return true;
-
+
// Constants can be considered to be not'ed values.
if (isa<ConstantInt>(V))
return true;
-
+
// Compares can be inverted if they have a single use.
if (CmpInst *CI = dyn_cast<CmpInst>(V))
return CI->hasOneUse();
-
+
return false;
}
@@ -56,7 +56,7 @@ static inline Value *dyn_castNotVal(Value *V) {
if (!isFreeToInvert(Operand))
return Operand;
}
-
+
// Constants can be considered to be not'ed values...
if (ConstantInt *C = dyn_cast<ConstantInt>(V))
return ConstantInt::get(C->getType(), ~C->getValue());
@@ -91,7 +91,7 @@ static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) {
}
/// getNewICmpValue - This is the complement of getICmpCode, which turns an
-/// opcode and two operands into either a constant true or false, or a brand
+/// opcode and two operands into either a constant true or false, or a brand
/// new ICmp instruction. The sign is passed in to determine which kind
/// of predicate to use in the new icmp instruction.
static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
@@ -118,7 +118,7 @@ static Value *getFCmpValue(bool isordered, unsigned code,
case 4: Pred = isordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; break;
case 5: Pred = isordered ? FCmpInst::FCMP_ONE : FCmpInst::FCMP_UNE; break;
case 6: Pred = isordered ? FCmpInst::FCMP_OLE : FCmpInst::FCMP_ULE; break;
- case 7:
+ case 7:
if (!isordered) return ConstantInt::getTrue(LHS->getContext());
Pred = FCmpInst::FCMP_ORD; break;
}
@@ -154,7 +154,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
Or->takeName(Op);
return BinaryOperator::CreateAnd(Or, AndRHS);
}
-
+
ConstantInt *TogetherCI = dyn_cast<ConstantInt>(Together);
if (TogetherCI && !TogetherCI->isZero()){
// (X | C1) & C2 --> (X & (C2^(C1&C2))) | C1
@@ -166,7 +166,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
return BinaryOperator::CreateOr(And, OpRHS);
}
}
-
+
break;
case Instruction::Add:
if (Op->hasOneUse()) {
@@ -215,7 +215,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
if (CI->getValue() == ShlMask)
// Masking out bits that the shift already masks.
return ReplaceInstUsesWith(TheAnd, Op); // No need for the and.
-
+
if (CI != AndRHS) { // Reducing bits set in and.
TheAnd.setOperand(1, CI);
return &TheAnd;
@@ -236,7 +236,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
if (CI->getValue() == ShrMask)
// Masking out bits that the shift already masks.
return ReplaceInstUsesWith(TheAnd, Op);
-
+
if (CI != AndRHS) {
TheAnd.setOperand(1, CI); // Reduce bits set in and cst.
return &TheAnd;
@@ -269,22 +269,22 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
/// InsertRangeTest - Emit a computation of: (V >= Lo && V < Hi) if Inside is
/// true, otherwise (V < Lo || V >= Hi). In practice, we emit the more efficient
-/// (V-Lo) <u Hi-Lo. This method expects that Lo <= Hi. isSigned indicates
+/// (V-Lo) \<u Hi-Lo. This method expects that Lo <= Hi. isSigned indicates
/// whether to treat V, Lo, and Hi as signed or not. IB is the location to
/// insert new instructions.
Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
bool isSigned, bool Inside) {
- assert(cast<ConstantInt>(ConstantExpr::getICmp((isSigned ?
+ assert(cast<ConstantInt>(ConstantExpr::getICmp((isSigned ?
ICmpInst::ICMP_SLE:ICmpInst::ICMP_ULE), Lo, Hi))->getZExtValue() &&
"Lo is not <= Hi in range emission code!");
-
+
if (Inside) {
if (Lo == Hi) // Trivially false.
return ConstantInt::getFalse(V->getContext());
// V >= Min && V < Hi --> V < Hi
if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
- ICmpInst::Predicate pred = (isSigned ?
+ ICmpInst::Predicate pred = (isSigned ?
ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT);
return Builder->CreateICmp(pred, V, Hi);
}
@@ -302,7 +302,7 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
// V < Min || V >= Hi -> V > Hi-1
Hi = SubOne(cast<ConstantInt>(Hi));
if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
- ICmpInst::Predicate pred = (isSigned ?
+ ICmpInst::Predicate pred = (isSigned ?
ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT);
return Builder->CreateICmp(pred, V, Hi);
}
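The (V-Lo) <u Hi-Lo trick that InsertRangeTest's comment describes replaces the two compares with a single unsigned compare because wrap-around pushes every value below Lo past the upper bound; a small exhaustive check for the hypothetical range 10 <= V < 20:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Lo = 10, Hi = 20;
      for (uint32_t V = 0; V < 64; ++V) {
        bool twoCompares = (V >= Lo) && (V < Hi);
        // (V - Lo) <u (Hi - Lo): unsigned wrap makes values below Lo huge.
        bool oneCompare = (V - Lo) < (Hi - Lo);
        assert(twoCompares == oneCompare);
      }
      return 0;
    }
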
@@ -327,14 +327,14 @@ static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {
// look for the first zero bit after the run of ones
MB = BitWidth - ((V - 1) ^ V).countLeadingZeros();
// look for the first non-zero bit
- ME = V.getActiveBits();
+ ME = V.getActiveBits();
return true;
}
/// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask,
/// where isSub determines whether the operator is a sub. If we can fold one of
/// the following xforms:
-///
+///
/// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask
/// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
/// ((A ^ N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
@@ -355,8 +355,8 @@ Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS,
case Instruction::And:
if (ConstantExpr::getAnd(N, Mask) == Mask) {
// If the AndRHS is a power of two minus one (0+1+), this is simple.
- if ((Mask->getValue().countLeadingZeros() +
- Mask->getValue().countPopulation()) ==
+ if ((Mask->getValue().countLeadingZeros() +
+ Mask->getValue().countPopulation()) ==
Mask->getValue().getBitWidth())
break;
@@ -375,33 +375,33 @@ Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS,
case Instruction::Or:
case Instruction::Xor:
// If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0
- if ((Mask->getValue().countLeadingZeros() +
+ if ((Mask->getValue().countLeadingZeros() +
Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth()
&& ConstantExpr::getAnd(N, Mask)->isNullValue())
break;
return 0;
}
-
+
if (isSub)
return Builder->CreateSub(LHSI->getOperand(0), RHS, "fold");
return Builder->CreateAdd(LHSI->getOperand(0), RHS, "fold");
}
/// enum for classifying (icmp eq (A & B), C) and (icmp ne (A & B), C)
-/// One of A and B is considered the mask, the other the value. This is
-/// described as the "AMask" or "BMask" part of the enum. If the enum
+/// One of A and B is considered the mask, the other the value. This is
+/// described as the "AMask" or "BMask" part of the enum. If the enum
/// contains only "Mask", then both A and B can be considered masks.
/// If A is the mask, then it was proven that (A & C) == C. This
/// is trivial if C == A, or C == 0. If both A and C are constants, this
/// proof is also easy.
/// For the following explanations we assume that A is the mask.
-/// The part "AllOnes" declares, that the comparison is true only
+/// The part "AllOnes" declares, that the comparison is true only
/// if (A & B) == A, or all bits of A are set in B.
/// Example: (icmp eq (A & 3), 3) -> FoldMskICmp_AMask_AllOnes
-/// The part "AllZeroes" declares, that the comparison is true only
+/// The part "AllZeroes" declares, that the comparison is true only
/// if (A & B) == 0, or all bits of A are cleared in B.
/// Example: (icmp eq (A & 3), 0) -> FoldMskICmp_Mask_AllZeroes
-/// The part "Mixed" declares, that (A & B) == C and C might or might not
+/// The part "Mixed" declares, that (A & B) == C and C might or might not
/// contain any number of one bits and zero bits.
/// Example: (icmp eq (A & 3), 1) -> FoldMskICmp_AMask_Mixed
/// The Part "Not" means, that in above descriptions "==" should be replaced
@@ -425,16 +425,16 @@ enum MaskedICmpType {
/// return the set of pattern classes (from MaskedICmpType)
/// that (icmp SCC (A & B), C) satisfies
-static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
+static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
ICmpInst::Predicate SCC)
{
ConstantInt *ACst = dyn_cast<ConstantInt>(A);
ConstantInt *BCst = dyn_cast<ConstantInt>(B);
ConstantInt *CCst = dyn_cast<ConstantInt>(C);
bool icmp_eq = (SCC == ICmpInst::ICMP_EQ);
- bool icmp_abit = (ACst != 0 && !ACst->isZero() &&
+ bool icmp_abit = (ACst != 0 && !ACst->isZero() &&
ACst->getValue().isPowerOf2());
- bool icmp_bbit = (BCst != 0 && !BCst->isZero() &&
+ bool icmp_bbit = (BCst != 0 && !BCst->isZero() &&
BCst->getValue().isPowerOf2());
unsigned result = 0;
if (CCst != 0 && CCst->isZero()) {
@@ -449,12 +449,12 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
FoldMskICmp_BMask_NotMixed));
if (icmp_abit)
result |= (icmp_eq ? (FoldMskICmp_AMask_NotAllOnes |
- FoldMskICmp_AMask_NotMixed)
+ FoldMskICmp_AMask_NotMixed)
: (FoldMskICmp_AMask_AllOnes |
FoldMskICmp_AMask_Mixed));
if (icmp_bbit)
result |= (icmp_eq ? (FoldMskICmp_BMask_NotAllOnes |
- FoldMskICmp_BMask_NotMixed)
+ FoldMskICmp_BMask_NotMixed)
: (FoldMskICmp_BMask_AllOnes |
FoldMskICmp_BMask_Mixed));
return result;
@@ -469,26 +469,23 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
FoldMskICmp_AMask_NotMixed)
: (FoldMskICmp_Mask_AllZeroes |
FoldMskICmp_AMask_Mixed));
- }
- else if (ACst != 0 && CCst != 0 &&
- ConstantExpr::getAnd(ACst, CCst) == CCst) {
+ } else if (ACst != 0 && CCst != 0 &&
+ ConstantExpr::getAnd(ACst, CCst) == CCst) {
result |= (icmp_eq ? FoldMskICmp_AMask_Mixed
: FoldMskICmp_AMask_NotMixed);
}
- if (B == C)
- {
+ if (B == C) {
result |= (icmp_eq ? (FoldMskICmp_BMask_AllOnes |
FoldMskICmp_BMask_Mixed)
: (FoldMskICmp_BMask_NotAllOnes |
FoldMskICmp_BMask_NotMixed));
if (icmp_bbit)
result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes |
- FoldMskICmp_BMask_NotMixed)
+ FoldMskICmp_BMask_NotMixed)
: (FoldMskICmp_Mask_AllZeroes |
FoldMskICmp_BMask_Mixed));
- }
- else if (BCst != 0 && CCst != 0 &&
- ConstantExpr::getAnd(BCst, CCst) == CCst) {
+ } else if (BCst != 0 && CCst != 0 &&
+ ConstantExpr::getAnd(BCst, CCst) == CCst) {
result |= (icmp_eq ? FoldMskICmp_BMask_Mixed
: FoldMskICmp_BMask_NotMixed);
}
@@ -531,7 +528,7 @@ static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred,
/// handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
/// return the set of pattern classes (from MaskedICmpType)
/// that both LHS and RHS satisfy
-static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
+static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
Value*& B, Value*& C,
Value*& D, Value*& E,
ICmpInst *LHS, ICmpInst *RHS,
@@ -542,10 +539,10 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
if (LHS->getOperand(0)->getType()->isVectorTy()) return 0;
// Here comes the tricky part:
- // LHS might be of the form L11 & L12 == X, X == L21 & L22,
+ // LHS might be of the form L11 & L12 == X, X == L21 & L22,
// and L11 & L12 == L21 & L22. The same goes for RHS.
// Now we must find those components L** and R**, that are equal, so
- // that we can extract the parameters A, B, C, D, and E for the canonical
+ // that we can extract the parameters A, B, C, D, and E for the canonical
// above.
Value *L1 = LHS->getOperand(0);
Value *L2 = LHS->getOperand(1);
@@ -610,14 +607,11 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
if (L11 == A) {
B = L12; C = L2;
- }
- else if (L12 == A) {
+ } else if (L12 == A) {
B = L11; C = L2;
- }
- else if (L21 == A) {
+ } else if (L21 == A) {
B = L22; C = L1;
- }
- else if (L22 == A) {
+ } else if (L22 == A) {
B = L21; C = L1;
}
@@ -643,32 +637,32 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
mask >>= 1; // treat "Not"-states as normal states
if (mask & FoldMskICmp_Mask_AllZeroes) {
- // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
+ // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
// -> (icmp eq (A & (B|D)), 0)
Value* newOr = Builder->CreateOr(B, D);
Value* newAnd = Builder->CreateAnd(A, newOr);
// we can't use C as zero, because we might actually handle
- // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
// with B and D, having a single bit set
Value* zero = Constant::getNullValue(A->getType());
return Builder->CreateICmp(NEWCC, newAnd, zero);
}
- else if (mask & FoldMskICmp_BMask_AllOnes) {
- // (icmp eq (A & B), B) & (icmp eq (A & D), D)
+ if (mask & FoldMskICmp_BMask_AllOnes) {
+ // (icmp eq (A & B), B) & (icmp eq (A & D), D)
// -> (icmp eq (A & (B|D)), (B|D))
Value* newOr = Builder->CreateOr(B, D);
Value* newAnd = Builder->CreateAnd(A, newOr);
return Builder->CreateICmp(NEWCC, newAnd, newOr);
- }
- else if (mask & FoldMskICmp_AMask_AllOnes) {
- // (icmp eq (A & B), A) & (icmp eq (A & D), A)
+ }
+ if (mask & FoldMskICmp_AMask_AllOnes) {
+ // (icmp eq (A & B), A) & (icmp eq (A & D), A)
// -> (icmp eq (A & (B&D)), A)
Value* newAnd1 = Builder->CreateAnd(B, D);
Value* newAnd = Builder->CreateAnd(A, newAnd1);
return Builder->CreateICmp(NEWCC, newAnd, A);
}
- else if (mask & FoldMskICmp_BMask_Mixed) {
- // (icmp eq (A & B), C) & (icmp eq (A & D), E)
+ if (mask & FoldMskICmp_BMask_Mixed) {
+ // (icmp eq (A & B), C) & (icmp eq (A & D), E)
// We already know that B & C == C && D & E == E.
// If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
// C and E, which are shared by both the mask B and the mask D, don't
@@ -680,7 +674,7 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
ConstantInt *DCst = dyn_cast<ConstantInt>(D);
if (DCst == 0) return 0;
// we can't simply use C and E, because we might actually handle
- // (icmp ne (A & B), B) & (icmp eq (A & D), D)
+ // (icmp ne (A & B), B) & (icmp eq (A & D), D)
// with B and D, having a single bit set
ConstantInt *CCst = dyn_cast<ConstantInt>(C);
@@ -727,13 +721,13 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
// handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E)
if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_EQ, Builder))
return V;
-
+
// This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
if (LHSCst == 0 || RHSCst == 0) return 0;
-
+
if (LHSCst == RHSCst && LHSCC == RHSCC) {
// (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
// where C is a power of 2
@@ -742,7 +736,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
Value *NewOr = Builder->CreateOr(Val, Val2);
return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
}
-
+
// (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
if (LHSCC == ICmpInst::ICMP_EQ && LHSCst->isZero()) {
Value *NewOr = Builder->CreateOr(Val, Val2);
@@ -759,14 +753,13 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
ConstantInt *AndCst, *SmallCst = 0, *BigCst = 0;
// (trunc x) == C1 & (and x, CA) == C2
+ // (and x, CA) == C2 & (trunc x) == C1
if (match(Val2, m_Trunc(m_Value(V))) &&
match(Val, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
SmallCst = RHSCst;
BigCst = LHSCst;
- }
- // (and x, CA) == C2 & (trunc x) == C1
- else if (match(Val, m_Trunc(m_Value(V))) &&
- match(Val2, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
+ } else if (match(Val, m_Trunc(m_Value(V))) &&
+ match(Val2, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
SmallCst = LHSCst;
BigCst = RHSCst;
}
@@ -789,7 +782,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
// From here on, we only handle:
// (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
if (Val != Val2) return 0;
-
+
// ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
@@ -799,9 +792,9 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
// Make a constant range that's the intersection of the two icmp ranges.
// If the intersection is empty, we know that the result is false.
- ConstantRange LHSRange =
+ ConstantRange LHSRange =
ConstantRange::makeICmpRegion(LHSCC, LHSCst->getValue());
- ConstantRange RHSRange =
+ ConstantRange RHSRange =
ConstantRange::makeICmpRegion(RHSCC, RHSCst->getValue());
if (LHSRange.intersectWith(RHSRange).isEmptySet())
@@ -810,16 +803,16 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
// We can't fold (ugt x, C) & (sgt x, C2).
if (!PredicatesFoldable(LHSCC, RHSCC))
return 0;
-
+
// Ensure that the larger constant is on the RHS.
bool ShouldSwap;
if (CmpInst::isSigned(LHSCC) ||
- (ICmpInst::isEquality(LHSCC) &&
+ (ICmpInst::isEquality(LHSCC) &&
CmpInst::isSigned(RHSCC)))
ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
else
ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
-
+
if (ShouldSwap) {
std::swap(LHS, RHS);
std::swap(LHSCst, RHSCst);
@@ -829,8 +822,8 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
// At this point, we know we have two icmp instructions
// comparing a value against two constants and and'ing the result
// together. Because of the above check, we know that we only have
- // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know
- // (from the icmp folding check above), that the two constants
+ // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know
+ // (from the icmp folding check above), that the two constants
// are not equal and that the larger constant is on the RHS
assert(LHSCst != RHSCst && "Compares not folded above?");
@@ -932,7 +925,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
}
break;
}
-
+
return 0;
}
@@ -951,7 +944,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
return ConstantInt::getFalse(LHS->getContext());
return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0));
}
-
+
// Handle vector zeros. This occurs because the canonical form of
// "fcmp ord x,x" is "fcmp ord x, 0".
if (isa<ConstantAggregateZero>(LHS->getOperand(1)) &&
@@ -959,18 +952,18 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0));
return 0;
}
-
+
Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
-
-
+
+
if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
// Swap RHS operands to match LHS.
Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
std::swap(Op1LHS, Op1RHS);
}
-
+
if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) {
// Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
if (Op0CC == Op1CC)
@@ -981,7 +974,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
return RHS;
if (Op1CC == FCmpInst::FCMP_TRUE)
return LHS;
-
+
bool Op0Ordered;
bool Op1Ordered;
unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered);
@@ -1001,7 +994,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
return LHS;
if (Op0Ordered && (Op0Ordered == Op1Ordered))
return RHS;
-
+
// uno && oeq -> uno && (ord && eq) -> false
if (!Op0Ordered)
return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
@@ -1025,10 +1018,10 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (Value *V = SimplifyUsingDistributiveLaws(I))
return ReplaceInstUsesWith(I, V);
- // See if we can simplify any instructions used by the instruction whose sole
+ // See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(I))
- return &I;
+ return &I;
if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
const APInt &AndRHSMask = AndRHS->getValue();
@@ -1043,7 +1036,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
case Instruction::Or: {
// If the mask is only needed on one incoming arm, push it up.
if (!Op0I->hasOneUse()) break;
-
+
APInt NotAndRHS(~AndRHSMask);
if (MaskedValueIsZero(Op0LHS, NotAndRHS)) {
// Not masking anything out for the LHS, move to RHS.
@@ -1103,12 +1096,12 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
}
break;
}
-
+
if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1)))
if (Instruction *Res = OptAndOp(Op0I, Op0CI, AndRHS, I))
return Res;
}
-
+
// If this is an integer truncation, and if the source is an 'and' with
// immediate, transform it. This frequently occurs for bitfield accesses.
{
@@ -1116,7 +1109,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (match(Op0, m_Trunc(m_And(m_Value(X), m_ConstantInt(YC))))) {
// Change: and (trunc (and X, YC) to T), C2
// into : and (trunc X to T), trunc(YC) & C2
- // This will fold the two constants together, which may allow
+ // This will fold the two constants together, which may allow
// other simplifications.
Value *NewCast = Builder->CreateTrunc(X, I.getType(), "and.shrunk");
Constant *C3 = ConstantExpr::getTrunc(YC, I.getType());
@@ -1143,7 +1136,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
I.getName()+".demorgan");
return BinaryOperator::CreateNot(Or);
}
-
+
{
Value *A = 0, *B = 0, *C = 0, *D = 0;
// (A|B) & ~(A&B) -> A^B
@@ -1151,13 +1144,13 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
match(Op1, m_Not(m_And(m_Value(C), m_Value(D)))) &&
((A == C && B == D) || (A == D && B == C)))
return BinaryOperator::CreateXor(A, B);
-
+
// ~(A&B) & (A|B) -> A^B
if (match(Op1, m_Or(m_Value(A), m_Value(B))) &&
match(Op0, m_Not(m_And(m_Value(C), m_Value(D)))) &&
((A == C && B == D) || (A == D && B == C)))
return BinaryOperator::CreateXor(A, B);
-
+
// A&(A^B) => A & ~B
{
Value *tmpOp0 = Op0;
@@ -1193,19 +1186,19 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
match(Op1, m_Or(m_Value(A), m_Not(m_Specific(Op0)))))
return BinaryOperator::CreateAnd(A, Op0);
}
-
+
if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1))
if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0))
if (Value *Res = FoldAndOfICmps(LHS, RHS))
return ReplaceInstUsesWith(I, Res);
-
+
// If and'ing two fcmp, try combine them into one.
if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
if (Value *Res = FoldAndOfFCmps(LHS, RHS))
return ReplaceInstUsesWith(I, Res);
-
-
+
+
// fold (and (cast A), (cast B)) -> (cast (and A, B))
if (CastInst *Op0C = dyn_cast<CastInst>(Op0))
if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) {
@@ -1214,21 +1207,21 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
SrcTy == Op1C->getOperand(0)->getType() &&
SrcTy->isIntOrIntVectorTy()) {
Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0);
-
+
// Only do this if the casts both really cause code to be generated.
if (ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) &&
ShouldOptimizeCast(Op1C->getOpcode(), Op1COp, I.getType())) {
Value *NewOp = Builder->CreateAnd(Op0COp, Op1COp, I.getName());
return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
}
-
+
// If this is and(cast(icmp), cast(icmp)), try to fold this even if the
// cast is otherwise not optimizable. This happens for vector sexts.
if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1COp))
if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0COp))
if (Value *Res = FoldAndOfICmps(LHS, RHS))
return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
-
+
// If this is and(cast(fcmp), cast(fcmp)), try to fold this even if the
// cast is otherwise not optimizable. This happens for vector sexts.
if (FCmpInst *RHS = dyn_cast<FCmpInst>(Op1COp))
@@ -1237,21 +1230,49 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
}
}
-
+
// (X >> Z) & (Y >> Z) -> (X&Y) >> Z for all shifts.
if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
- if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() &&
+ if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() &&
SI0->getOperand(1) == SI1->getOperand(1) &&
(SI0->hasOneUse() || SI1->hasOneUse())) {
Value *NewOp =
Builder->CreateAnd(SI0->getOperand(0), SI1->getOperand(0),
SI0->getName());
- return BinaryOperator::Create(SI1->getOpcode(), NewOp,
+ return BinaryOperator::Create(SI1->getOpcode(), NewOp,
SI1->getOperand(1));
}
}
+ {
+ Value *X = 0;
+ bool OpsSwapped = false;
+ // Canonicalize SExt or Not to the LHS
+ if (match(Op1, m_SExt(m_Value())) ||
+ match(Op1, m_Not(m_Value()))) {
+ std::swap(Op0, Op1);
+ OpsSwapped = true;
+ }
+
+ // Fold (and (sext bool to A), B) --> (select bool, B, 0)
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ X->getType()->getScalarType()->isIntegerTy(1)) {
+ Value *Zero = Constant::getNullValue(Op1->getType());
+ return SelectInst::Create(X, Op1, Zero);
+ }
+
+ // Fold (and ~(sext bool to A), B) --> (select bool, 0, B)
+ if (match(Op0, m_Not(m_SExt(m_Value(X)))) &&
+ X->getType()->getScalarType()->isIntegerTy(1)) {
+ Value *Zero = Constant::getNullValue(Op0->getType());
+ return SelectInst::Create(X, Zero, Op1);
+ }
+
+ if (OpsSwapped)
+ std::swap(Op0, Op1);
+ }
+
return Changed ? &I : 0;
}
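
The select fold added above rests on a simple bit identity: sign-extending an i1 yields an all-ones or all-zeros mask, so and'ing with it picks either the other operand or zero. A minimal standalone C++ sketch of that identity (plain integers, not the LLVM APIs; helper names are illustrative):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Models the fold (and (sext i1 b to i32), x) --> (select b, x, 0):
// sign-extending a 1-bit value gives an all-ones or all-zeros mask.
static uint32_t andOfSext(bool b, uint32_t x) {
  uint32_t mask = b ? 0xFFFFFFFFu : 0u;  // stands in for "sext i1 b to i32"
  return mask & x;
}

// Models the negated form (and ~(sext i1 b to i32), x) --> (select b, 0, x).
static uint32_t andOfNotSext(bool b, uint32_t x) {
  uint32_t mask = b ? 0xFFFFFFFFu : 0u;
  return ~mask & x;
}

int main() {
  for (uint32_t x : {0u, 1u, 0xDEADBEEFu}) {
    assert(andOfSext(true, x)  == x);     // select true,  x, 0
    assert(andOfSext(false, x) == 0u);    // select false, x, 0
    assert(andOfNotSext(true, x)  == 0u); // select true,  0, x
    assert(andOfNotSext(false, x) == x);  // select false, 0, x
  }
  return 0;
}
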
@@ -1288,11 +1309,11 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
CollectBSwapParts(I->getOperand(1), OverallLeftShift, ByteMask,
ByteValues);
}
-
+
// If this is a logical shift by a constant multiple of 8, recurse with
// OverallLeftShift and ByteMask adjusted.
if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
- unsigned ShAmt =
+ unsigned ShAmt =
cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
// Ensure the shift amount is defined and of a byte value.
if ((ShAmt & 7) || (ShAmt > 8*ByteValues.size()))
@@ -1313,7 +1334,7 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
if (OverallLeftShift >= (int)ByteValues.size()) return true;
if (OverallLeftShift <= -(int)ByteValues.size()) return true;
- return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask,
+ return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask,
ByteValues);
}
@@ -1325,20 +1346,20 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
unsigned NumBytes = ByteValues.size();
APInt Byte(I->getType()->getPrimitiveSizeInBits(), 255);
const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
-
+
for (unsigned i = 0; i != NumBytes; ++i, Byte <<= 8) {
// If this byte is masked out by a later operation, we don't care what
// the and mask is.
if ((ByteMask & (1 << i)) == 0)
continue;
-
+
// If the AndMask is all zeros for this byte, clear the bit.
APInt MaskB = AndMask & Byte;
if (MaskB == 0) {
ByteMask &= ~(1U << i);
continue;
}
-
+
// If the AndMask is not all ones for this byte, it's not a bytezap.
if (MaskB != Byte)
return true;
@@ -1346,11 +1367,11 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
// Otherwise, this byte is kept.
}
- return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask,
+ return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask,
ByteValues);
}
}
-
+
// Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
// the input value to the bswap. Some observations: 1) if more than one byte
// is demanded from this input, then it could not be successfully assembled
@@ -1358,7 +1379,7 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
// their ultimate destination.
if (!isPowerOf2_32(ByteMask)) return true;
unsigned InputByteNo = CountTrailingZeros_32(ByteMask);
-
+
// 2) The input and ultimate destinations must line up: if byte 3 of an i32
// is demanded, it needs to go into byte 0 of the result. This means that the
// byte needs to be shifted until it lands in the right byte bucket. The
@@ -1368,7 +1389,7 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
unsigned DestByteNo = InputByteNo + OverallLeftShift;
if (ByteValues.size()-1-DestByteNo != InputByteNo)
return true;
-
+
// If the destination byte value is already defined, the values are or'd
// together, which isn't a bswap (unless it's an or of the same bits).
if (ByteValues[DestByteNo] && ByteValues[DestByteNo] != V)
@@ -1381,25 +1402,25 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
/// If so, insert the new bswap intrinsic and return it.
Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) {
IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
- if (!ITy || ITy->getBitWidth() % 16 ||
+ if (!ITy || ITy->getBitWidth() % 16 ||
// ByteMask only allows up to 32-byte values.
- ITy->getBitWidth() > 32*8)
+ ITy->getBitWidth() > 32*8)
return 0; // Can only bswap pairs of bytes. Can't do vectors.
-
+
/// ByteValues - For each byte of the result, we keep track of which value
/// defines each byte.
SmallVector<Value*, 8> ByteValues;
ByteValues.resize(ITy->getBitWidth()/8);
-
+
// Try to find all the pieces corresponding to the bswap.
uint32_t ByteMask = ~0U >> (32-ByteValues.size());
if (CollectBSwapParts(&I, 0, ByteMask, ByteValues))
return 0;
-
+
// Check to see if all of the bytes come from the same value.
Value *V = ByteValues[0];
if (V == 0) return 0; // Didn't find a byte? Must be zero.
-
+
// Check to make sure that all of the bytes come from the same value.
for (unsigned i = 1, e = ByteValues.size(); i != e; ++i)
if (ByteValues[i] != V)
@@ -1425,7 +1446,7 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
return SelectInst::Create(Cond, C, B);
if (match(D, m_SExt(m_Not(m_Specific(Cond)))))
return SelectInst::Create(Cond, C, B);
-
+
// ((cond?-1:0)&C) | ((cond?0:-1)&D) -> cond ? C : D.
if (match(B, m_Not(m_SExt(m_Specific(Cond)))))
return SelectInst::Create(Cond, C, D);
@@ -1483,33 +1504,33 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
// From here on, we only handle:
// (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
if (Val != Val2) return 0;
-
+
// ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
return 0;
-
+
// We can't fold (ugt x, C) | (sgt x, C2).
if (!PredicatesFoldable(LHSCC, RHSCC))
return 0;
-
+
// Ensure that the larger constant is on the RHS.
bool ShouldSwap;
if (CmpInst::isSigned(LHSCC) ||
- (ICmpInst::isEquality(LHSCC) &&
+ (ICmpInst::isEquality(LHSCC) &&
CmpInst::isSigned(RHSCC)))
ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
else
ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
-
+
if (ShouldSwap) {
std::swap(LHS, RHS);
std::swap(LHSCst, RHSCst);
std::swap(LHSCC, RHSCC);
}
-
+
// At this point, we know we have two icmp instructions
// comparing a value against two constants and or'ing the result
// together. Because of the above check, we know that we only have
@@ -1531,6 +1552,20 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst);
return Builder->CreateICmpULT(Add, AddCST);
}
+
+ if (LHS->getOperand(0) == RHS->getOperand(0)) {
+ // if LHSCst and RHSCst differ only by one bit:
+ // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1
+      assert(LHSCst->getValue().ule(RHSCst->getValue()));
+
+ APInt Xor = LHSCst->getValue() ^ RHSCst->getValue();
+ if (Xor.isPowerOf2()) {
+ Value *NegCst = Builder->getInt(~Xor);
+ Value *And = Builder->CreateAnd(LHS->getOperand(0), NegCst);
+ return Builder->CreateICmp(ICmpInst::ICMP_EQ, And, LHSCst);
+ }
+ }
+
break; // (X == 13 | X == 15) -> no change
case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change
case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change
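
The new equality fold added in the hunk above relies on C1 and C2 differing in a single bit, so clearing that bit reduces both comparisons to one. A small standalone C++ check of the identity (constants are illustrative, not taken from the patch):

#include <cassert>
#include <cstdint>

// If C1 <= C2 and C1 ^ C2 has exactly one bit set, then
//   (A == C1 || A == C2)  <=>  ((A & ~(C1 ^ C2)) == C1)
// because clearing the differing bit maps both constants onto C1.
static bool foldedForm(uint32_t A, uint32_t C1, uint32_t C2) {
  return (A & ~(C1 ^ C2)) == C1;
}

int main() {
  const uint32_t C1 = 13, C2 = 15;             // differ only in bit 1
  assert(((C1 ^ C2) & ((C1 ^ C2) - 1)) == 0u); // the xor is a power of two
  for (uint32_t A = 0; A < 64; ++A)
    assert(foldedForm(A, C1, C2) == (A == C1 || A == C2));
  return 0;
}
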
@@ -1632,7 +1667,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
/// function.
Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
if (LHS->getPredicate() == FCmpInst::FCMP_UNO &&
- RHS->getPredicate() == FCmpInst::FCMP_UNO &&
+ RHS->getPredicate() == FCmpInst::FCMP_UNO &&
LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType()) {
if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1)))
if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) {
@@ -1640,25 +1675,25 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
// true.
if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
return ConstantInt::getTrue(LHS->getContext());
-
+
// Otherwise, no need to compare the two constants, compare the
// rest.
return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0));
}
-
+
// Handle vector zeros. This occurs because the canonical form of
// "fcmp uno x,x" is "fcmp uno x, 0".
if (isa<ConstantAggregateZero>(LHS->getOperand(1)) &&
isa<ConstantAggregateZero>(RHS->getOperand(1)))
return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0));
-
+
return 0;
}
-
+
Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
-
+
if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
// Swap RHS operands to match LHS.
Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
@@ -1692,7 +1727,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
/// ((A | B) & C1) | (B & C2)
///
/// into:
-///
+///
/// (A & C1) | B
///
/// when the XOR of the two constants is "all ones" (-1).
@@ -1727,7 +1762,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Value *V = SimplifyUsingDistributiveLaws(I))
return ReplaceInstUsesWith(I, V);
- // See if we can simplify any instructions used by the instruction whose sole
+ // See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(I))
return &I;
@@ -1741,7 +1776,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
Op0->hasOneUse()) {
Value *Or = Builder->CreateOr(X, RHS);
Or->takeName(Op0);
- return BinaryOperator::CreateAnd(Or,
+ return BinaryOperator::CreateAnd(Or,
ConstantInt::get(I.getContext(),
RHS->getValue() | C1->getValue()));
}
@@ -1778,7 +1813,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Instruction *BSwap = MatchBSwap(I))
return BSwap;
}
-
+
// (X^C)|Y -> (X|Y)^C iff Y&C == 0
if (Op0->hasOneUse() &&
match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
@@ -1827,7 +1862,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
return ReplaceInstUsesWith(I, B);
}
}
-
+
if ((C1->getValue() & C2->getValue()) == 0) {
// ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
// iff (C1&C2) == 0 and (N&~C1) == 0
@@ -1844,7 +1879,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
return BinaryOperator::CreateAnd(B,
ConstantInt::get(B->getContext(),
C1->getValue()|C2->getValue()));
-
+
// ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2)
// iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0.
ConstantInt *C3 = 0, *C4 = 0;
@@ -1904,16 +1939,16 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Ret) return Ret;
}
}
-
+
// (X >> Z) | (Y >> Z) -> (X|Y) >> Z for all shifts.
if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
- if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() &&
+ if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() &&
SI0->getOperand(1) == SI1->getOperand(1) &&
(SI0->hasOneUse() || SI1->hasOneUse())) {
Value *NewOp = Builder->CreateOr(SI0->getOperand(0), SI1->getOperand(0),
SI0->getName());
- return BinaryOperator::Create(SI1->getOpcode(), NewOp,
+ return BinaryOperator::Create(SI1->getOpcode(), NewOp,
SI1->getOperand(1));
}
}
@@ -1975,13 +2010,13 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
if (Value *Res = FoldOrOfICmps(LHS, RHS))
return ReplaceInstUsesWith(I, Res);
-
+
// (fcmp uno x, c) | (fcmp uno y, c) -> (fcmp uno x, y)
if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
if (Value *Res = FoldOrOfFCmps(LHS, RHS))
return ReplaceInstUsesWith(I, Res);
-
+
// fold (or (cast A), (cast B)) -> (cast (or A, B))
if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
CastInst *Op1C = dyn_cast<CastInst>(Op1);
@@ -1999,14 +2034,14 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
Value *NewOp = Builder->CreateOr(Op0COp, Op1COp, I.getName());
return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
}
-
+
// If this is or(cast(icmp), cast(icmp)), try to fold this even if the
// cast is otherwise not optimizable. This happens for vector sexts.
if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1COp))
if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0COp))
if (Value *Res = FoldOrOfICmps(LHS, RHS))
return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
-
+
// If this is or(cast(fcmp), cast(fcmp)), try to fold this even if the
// cast is otherwise not optimizable. This happens for vector sexts.
if (FCmpInst *RHS = dyn_cast<FCmpInst>(Op1COp))
@@ -2035,7 +2070,21 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
Inner->takeName(Op0);
return BinaryOperator::CreateOr(Inner, C1);
}
-
+
+ // Change (or (bool?A:B),(bool?C:D)) --> (bool?(or A,C):(or B,D))
+ // Since this OR statement hasn't been optimized further yet, we hope
+ // that this transformation will allow the new ORs to be optimized.
+ {
+ Value *X = 0, *Y = 0;
+ if (Op0->hasOneUse() && Op1->hasOneUse() &&
+ match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) &&
+ match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) {
+ Value *orTrue = Builder->CreateOr(A, C);
+ Value *orFalse = Builder->CreateOr(B, D);
+ return SelectInst::Create(X, orTrue, orFalse);
+ }
+ }
+
return Changed ? &I : 0;
}
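
The select-merging fold added above is just distribution of 'or' over a shared condition. A short standalone C++ sketch (illustrative values only, independent of the LLVM data structures):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// (c ? A : B) | (c ? C : D)  ==  c ? (A | C) : (B | D)
// when both selects share the same condition c.
static uint32_t mergedSelect(bool c, uint32_t A, uint32_t B,
                             uint32_t C, uint32_t D) {
  return c ? (A | C) : (B | D);
}

int main() {
  const uint32_t A = 0xF0, B = 0x0F, C = 0x30, D = 0x03;
  for (bool c : {false, true})
    assert(((c ? A : B) | (c ? C : D)) == mergedSelect(c, A, B, C, D));
  return 0;
}
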
@@ -2050,7 +2099,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (Value *V = SimplifyUsingDistributiveLaws(I))
return ReplaceInstUsesWith(I, V);
- // See if we can simplify any instructions used by the instruction whose sole
+ // See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(I))
return &I;
@@ -2058,7 +2107,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
// Is this a ~ operation?
if (Value *NotOp = dyn_castNotVal(&I)) {
if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(NotOp)) {
- if (Op0I->getOpcode() == Instruction::And ||
+ if (Op0I->getOpcode() == Instruction::And ||
Op0I->getOpcode() == Instruction::Or) {
// ~(~X & Y) --> (X | ~Y) - De Morgan's Law
// ~(~X | Y) === (X & ~Y) - De Morgan's Law
@@ -2072,10 +2121,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
return BinaryOperator::CreateOr(Op0NotVal, NotY);
return BinaryOperator::CreateAnd(Op0NotVal, NotY);
}
-
+
// ~(X & Y) --> (~X | ~Y) - De Morgan's Law
// ~(X | Y) === (~X & ~Y) - De Morgan's Law
- if (isFreeToInvert(Op0I->getOperand(0)) &&
+ if (isFreeToInvert(Op0I->getOperand(0)) &&
isFreeToInvert(Op0I->getOperand(1))) {
Value *NotX =
Builder->CreateNot(Op0I->getOperand(0), "notlhs");
@@ -2093,8 +2142,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
}
-
-
+
+
if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
if (RHS->isOne() && Op0->hasOneUse())
// xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
@@ -2109,7 +2158,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (CI->hasOneUse() && Op0C->hasOneUse()) {
Instruction::CastOps Opcode = Op0C->getOpcode();
if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
- (RHS == ConstantExpr::getCast(Opcode,
+ (RHS == ConstantExpr::getCast(Opcode,
ConstantInt::getTrue(I.getContext()),
Op0C->getDestTy()))) {
CI->setPredicate(CI->getInversePredicate());
@@ -2128,7 +2177,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
ConstantInt::get(I.getType(), 1));
return BinaryOperator::CreateAdd(Op0I->getOperand(1), ConstantRHS);
}
-
+
if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
if (Op0I->getOpcode() == Instruction::Add) {
// ~(X-c) --> (-c-1)-X
@@ -2152,13 +2201,34 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
// Anything in both C1 and C2 is known to be zero, remove it from
// NewRHS.
Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHS);
- NewRHS = ConstantExpr::getAnd(NewRHS,
+ NewRHS = ConstantExpr::getAnd(NewRHS,
ConstantExpr::getNot(CommonBits));
Worklist.Add(Op0I);
I.setOperand(0, Op0I->getOperand(0));
I.setOperand(1, NewRHS);
return &I;
}
+ } else if (Op0I->getOpcode() == Instruction::LShr) {
+ // ((X^C1) >> C2) ^ C3 -> (X>>C2) ^ ((C1>>C2)^C3)
+ // E1 = "X ^ C1"
+ BinaryOperator *E1;
+ ConstantInt *C1;
+ if (Op0I->hasOneUse() &&
+ (E1 = dyn_cast<BinaryOperator>(Op0I->getOperand(0))) &&
+ E1->getOpcode() == Instruction::Xor &&
+ (C1 = dyn_cast<ConstantInt>(E1->getOperand(1)))) {
+ // fold (C1 >> C2) ^ C3
+ ConstantInt *C2 = Op0CI, *C3 = RHS;
+ APInt FoldConst = C1->getValue().lshr(C2->getValue());
+ FoldConst ^= C3->getValue();
+ // Prepare the two operands.
+ Value *Opnd0 = Builder->CreateLShr(E1->getOperand(0), C2);
+ Opnd0->takeName(Op0I);
+ cast<Instruction>(Opnd0)->setDebugLoc(I.getDebugLoc());
+ Value *FoldVal = ConstantInt::get(Opnd0->getType(), FoldConst);
+
+ return BinaryOperator::CreateXor(Opnd0, FoldVal);
+ }
}
}
}
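
The new lshr/xor fold above works because a logical shift right distributes over xor, so the constant part can be folded ahead of time. A minimal standalone C++ check (constants chosen arbitrarily):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// ((X ^ C1) >> C2) ^ C3  ==  (X >> C2) ^ ((C1 >> C2) ^ C3)
// because a logical shift right distributes over xor, so the
// constant part (C1 >> C2) ^ C3 can be folded at compile time.
int main() {
  const uint32_t C1 = 0xA5A5A5A5u, C2 = 7, C3 = 0x1234u;
  for (uint32_t X : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu}) {
    uint32_t lhs = ((X ^ C1) >> C2) ^ C3;
    uint32_t rhs = (X >> C2) ^ ((C1 >> C2) ^ C3);
    assert(lhs == rhs);
  }
  return 0;
}
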
@@ -2184,7 +2254,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
I.swapOperands(); // Simplified below.
std::swap(Op0, Op1);
}
- } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) &&
+ } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) &&
Op1I->hasOneUse()){
if (A == Op0) { // A^(A&B) -> A^(B&A)
Op1I->swapOperands();
@@ -2196,7 +2266,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
}
-
+
BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0);
if (Op0I) {
Value *A, *B;
@@ -2206,7 +2276,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
std::swap(A, B);
if (B == Op1) // (A|B)^B == A & ~B
return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1));
- } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
+ } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
Op0I->hasOneUse()){
if (A == Op1) // (A&B)^A -> (B&A)^A
std::swap(A, B);
@@ -2216,31 +2286,31 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
}
-
+
// (X >> Z) ^ (Y >> Z) -> (X^Y) >> Z for all shifts.
- if (Op0I && Op1I && Op0I->isShift() &&
- Op0I->getOpcode() == Op1I->getOpcode() &&
+ if (Op0I && Op1I && Op0I->isShift() &&
+ Op0I->getOpcode() == Op1I->getOpcode() &&
Op0I->getOperand(1) == Op1I->getOperand(1) &&
(Op0I->hasOneUse() || Op1I->hasOneUse())) {
Value *NewOp =
Builder->CreateXor(Op0I->getOperand(0), Op1I->getOperand(0),
Op0I->getName());
- return BinaryOperator::Create(Op1I->getOpcode(), NewOp,
+ return BinaryOperator::Create(Op1I->getOpcode(), NewOp,
Op1I->getOperand(1));
}
-
+
if (Op0I && Op1I) {
Value *A, *B, *C, *D;
// (A & B)^(A | B) -> A ^ B
if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
match(Op1I, m_Or(m_Value(C), m_Value(D)))) {
- if ((A == C && B == D) || (A == D && B == C))
+ if ((A == C && B == D) || (A == D && B == C))
return BinaryOperator::CreateXor(A, B);
}
// (A | B)^(A & B) -> A ^ B
if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
match(Op1I, m_And(m_Value(C), m_Value(D)))) {
- if ((A == C && B == D) || (A == D && B == C))
+ if ((A == C && B == D) || (A == D && B == C))
return BinaryOperator::CreateXor(A, B);
}
}
@@ -2257,7 +2327,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
bool isSigned = LHS->isSigned() || RHS->isSigned();
- return ReplaceInstUsesWith(I,
+ return ReplaceInstUsesWith(I,
getNewICmpValue(isSigned, Code, Op0, Op1,
Builder));
}
@@ -2270,9 +2340,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
Type *SrcTy = Op0C->getOperand(0)->getType();
if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntegerTy() &&
// Only do this if the casts both really cause code to be generated.
- ShouldOptimizeCast(Op0C->getOpcode(), Op0C->getOperand(0),
+ ShouldOptimizeCast(Op0C->getOpcode(), Op0C->getOperand(0),
I.getType()) &&
- ShouldOptimizeCast(Op1C->getOpcode(), Op1C->getOperand(0),
+ ShouldOptimizeCast(Op1C->getOpcode(), Op1C->getOperand(0),
I.getType())) {
Value *NewOp = Builder->CreateXor(Op0C->getOperand(0),
Op1C->getOperand(0), I.getName());
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 48f2704..64cd1bd 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -12,12 +12,17 @@
//===----------------------------------------------------------------------===//
#include "InstCombine.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/DataLayout.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/PatternMatch.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
+using namespace PatternMatch;
+
+STATISTIC(NumSimplified, "Number of library calls simplified");
/// getPromotedType - Return the specified type promoted as it would be to pass
/// though a va_arg area.
@@ -273,25 +278,25 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size));
return 0;
}
- case Intrinsic::bswap:
+ case Intrinsic::bswap: {
+ Value *IIOperand = II->getArgOperand(0);
+ Value *X = 0;
+
// bswap(bswap(x)) -> x
- if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(II->getArgOperand(0)))
- if (Operand->getIntrinsicID() == Intrinsic::bswap)
- return ReplaceInstUsesWith(CI, Operand->getArgOperand(0));
+ if (match(IIOperand, m_BSwap(m_Value(X))))
+ return ReplaceInstUsesWith(CI, X);
// bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
- if (TruncInst *TI = dyn_cast<TruncInst>(II->getArgOperand(0))) {
- if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(TI->getOperand(0)))
- if (Operand->getIntrinsicID() == Intrinsic::bswap) {
- unsigned C = Operand->getType()->getPrimitiveSizeInBits() -
- TI->getType()->getPrimitiveSizeInBits();
- Value *CV = ConstantInt::get(Operand->getType(), C);
- Value *V = Builder->CreateLShr(Operand->getArgOperand(0), CV);
- return new TruncInst(V, TI->getType());
- }
+ if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
+ unsigned C = X->getType()->getPrimitiveSizeInBits() -
+ IIOperand->getType()->getPrimitiveSizeInBits();
+ Value *CV = ConstantInt::get(X->getType(), C);
+ Value *V = Builder->CreateLShr(X, CV);
+ return new TruncInst(V, IIOperand->getType());
}
-
break;
+ }
+
case Intrinsic::powi:
if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
// powi(x, 0) -> 1.0
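
The bswap simplifications rewritten in the hunk above reduce to two byte-level identities: bswap is an involution, and truncating a swapped value keeps the original high bytes. A small standalone C++ sketch with hand-written byte-swap helpers (not the LLVM intrinsics):

#include <cassert>
#include <cstdint>

// Hand-written byte swaps standing in for llvm.bswap.i32 / llvm.bswap.i16.
static uint32_t bswap32(uint32_t x) {
  return (x >> 24) | ((x >> 8) & 0x0000FF00u) |
         ((x << 8) & 0x00FF0000u) | (x << 24);
}
static uint16_t bswap16(uint16_t x) {
  return static_cast<uint16_t>((x >> 8) | (x << 8));
}

int main() {
  const uint32_t x = 0x11223344u;
  // bswap(bswap(x)) -> x: byte reversal is an involution.
  assert(bswap32(bswap32(x)) == x);
  // bswap(trunc(bswap(x))) -> trunc(x >> c), with c = 32 - 16.
  uint16_t lhs = bswap16(static_cast<uint16_t>(bswap32(x)));
  uint16_t rhs = static_cast<uint16_t>(x >> 16);
  assert(lhs == rhs);
  return 0;
}
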
@@ -690,7 +695,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (Splat->isOne()) {
if (Zext)
return CastInst::CreateZExtOrBitCast(Arg0, II->getType());
- // else
+ // else
return CastInst::CreateSExtOrBitCast(Arg0, II->getType());
}
}
@@ -785,8 +790,10 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *TD) {
if (CI->getCalledFunction() == 0) return 0;
- if (Value *With = Simplifier->optimizeCall(CI))
- return ReplaceInstUsesWith(*CI, With);
+ if (Value *With = Simplifier->optimizeCall(CI)) {
+ ++NumSimplified;
+ return CI->use_empty() ? CI : ReplaceInstUsesWith(*CI, With);
+ }
return 0;
}
@@ -894,7 +901,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
new StoreInst(ConstantInt::getTrue(Callee->getContext()),
UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
OldCall);
- // If OldCall dues not return void then replaceAllUsesWith undef.
+ // If OldCall does not return void then replaceAllUsesWith undef.
// This allows ValueHandlers and custom metadata to adjust itself.
if (!OldCall->getType()->isVoidTy())
ReplaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
@@ -977,7 +984,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
if (Callee == 0)
return false;
Instruction *Caller = CS.getInstruction();
- const AttrListPtr &CallerPAL = CS.getAttributes();
+ const AttributeSet &CallerPAL = CS.getAttributes();
// Okay, this is a cast from a function to a different type. Unless doing so
// would cause a type conversion of one of our arguments, change this call to
@@ -1007,8 +1014,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
return false; // Cannot transform this return value.
if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
- AttrBuilder RAttrs = CallerPAL.getRetAttributes();
- if (RAttrs.hasAttributes(Attributes::typeIncompatible(NewRetTy)))
+ AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
+ if (RAttrs.
+ hasAttributes(AttributeFuncs::
+ typeIncompatible(NewRetTy, AttributeSet::ReturnIndex),
+ AttributeSet::ReturnIndex))
return false; // Attribute not compatible with transformed value.
}
@@ -1037,14 +1047,16 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
if (!CastInst::isCastable(ActTy, ParamTy))
return false; // Cannot transform this parameter value.
- Attributes Attrs = CallerPAL.getParamAttributes(i + 1);
- if (AttrBuilder(Attrs).
- hasAttributes(Attributes::typeIncompatible(ParamTy)))
+ if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1).
+ hasAttributes(AttributeFuncs::
+ typeIncompatible(ParamTy, i + 1), i + 1))
return false; // Attribute not compatible with transformed value.
// If the parameter is passed as a byval argument, then we have to have a
// sized type and the sized type has to have the same size as the old type.
- if (ParamTy != ActTy && Attrs.hasAttribute(Attributes::ByVal)) {
+ if (ParamTy != ActTy &&
+ CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1,
+ Attribute::ByVal)) {
PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0)
return false;
@@ -1093,10 +1105,13 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// won't be dropping them. Check that these extra arguments have attributes
// that are compatible with being a vararg call argument.
for (unsigned i = CallerPAL.getNumSlots(); i; --i) {
- if (CallerPAL.getSlot(i - 1).Index <= FT->getNumParams())
+ unsigned Index = CallerPAL.getSlotIndex(i - 1);
+ if (Index <= FT->getNumParams())
break;
- Attributes PAttrs = CallerPAL.getSlot(i - 1).Attrs;
- if (PAttrs.hasIncompatibleWithVarArgsAttrs())
+
+ // Check if it has an attribute that's incompatible with varargs.
+ AttributeSet PAttrs = CallerPAL.getSlotAttributes(i - 1);
+ if (PAttrs.hasAttribute(Index, Attribute::StructRet))
return false;
}
@@ -1105,21 +1120,23 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// inserting cast instructions as necessary.
std::vector<Value*> Args;
Args.reserve(NumActualArgs);
- SmallVector<AttributeWithIndex, 8> attrVec;
+ SmallVector<AttributeSet, 8> attrVec;
attrVec.reserve(NumCommonArgs);
// Get any return attributes.
- AttrBuilder RAttrs = CallerPAL.getRetAttributes();
+ AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
// If the return value is not being used, the type may not be compatible
// with the existing attributes. Wipe out any problematic attributes.
- RAttrs.removeAttributes(Attributes::typeIncompatible(NewRetTy));
+ RAttrs.
+ removeAttributes(AttributeFuncs::
+ typeIncompatible(NewRetTy, AttributeSet::ReturnIndex),
+ AttributeSet::ReturnIndex);
// Add the new return attributes.
if (RAttrs.hasAttributes())
- attrVec.push_back(
- AttributeWithIndex::get(AttrListPtr::ReturnIndex,
- Attributes::get(FT->getContext(), RAttrs)));
+ attrVec.push_back(AttributeSet::get(Caller->getContext(),
+ AttributeSet::ReturnIndex, RAttrs));
AI = CS.arg_begin();
for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
@@ -1133,9 +1150,10 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
}
// Add any parameter attributes.
- Attributes PAttrs = CallerPAL.getParamAttributes(i + 1);
+ AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
if (PAttrs.hasAttributes())
- attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
+ attrVec.push_back(AttributeSet::get(Caller->getContext(), i + 1,
+ PAttrs));
}
// If the function takes more arguments than the call was taking, add them
@@ -1145,10 +1163,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// If we are removing arguments to the function, emit an obnoxious warning.
if (FT->getNumParams() < NumActualArgs) {
- if (!FT->isVarArg()) {
- errs() << "WARNING: While resolving call to function '"
- << Callee->getName() << "' arguments were dropped!\n";
- } else {
+ // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
+ if (FT->isVarArg()) {
// Add all of the arguments in their promoted form to the arg list.
for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
Type *PTy = getPromotedType((*AI)->getType());
@@ -1162,23 +1178,23 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
}
// Add any parameter attributes.
- Attributes PAttrs = CallerPAL.getParamAttributes(i + 1);
+ AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
if (PAttrs.hasAttributes())
- attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
+ attrVec.push_back(AttributeSet::get(FT->getContext(), i + 1,
+ PAttrs));
}
}
}
- Attributes FnAttrs = CallerPAL.getFnAttributes();
- if (FnAttrs.hasAttributes())
- attrVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
- FnAttrs));
+ AttributeSet FnAttrs = CallerPAL.getFnAttributes();
+ if (CallerPAL.hasAttributes(AttributeSet::FunctionIndex))
+ attrVec.push_back(AttributeSet::get(Callee->getContext(), FnAttrs));
if (NewRetTy->isVoidTy())
Caller->setName(""); // Void type should not have a name.
- const AttrListPtr &NewCallerPAL = AttrListPtr::get(Callee->getContext(),
- attrVec);
+ const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(),
+ attrVec);
Instruction *NC;
if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
@@ -1238,13 +1254,12 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
Value *Callee = CS.getCalledValue();
PointerType *PTy = cast<PointerType>(Callee->getType());
FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
- const AttrListPtr &Attrs = CS.getAttributes();
+ const AttributeSet &Attrs = CS.getAttributes();
// If the call already has the 'nest' attribute somewhere then give up -
// otherwise 'nest' would occur twice after splicing in the chain.
- for (unsigned I = 0, E = Attrs.getNumAttrs(); I != E; ++I)
- if (Attrs.getAttributesAtIndex(I).hasAttribute(Attributes::Nest))
- return 0;
+ if (Attrs.hasAttrSomewhere(Attribute::Nest))
+ return 0;
assert(Tramp &&
"transformCallThroughTrampoline called with incorrect CallSite.");
@@ -1253,16 +1268,16 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
PointerType *NestFPTy = cast<PointerType>(NestF->getType());
FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType());
- const AttrListPtr &NestAttrs = NestF->getAttributes();
+ const AttributeSet &NestAttrs = NestF->getAttributes();
if (!NestAttrs.isEmpty()) {
unsigned NestIdx = 1;
Type *NestTy = 0;
- Attributes NestAttr;
+ AttributeSet NestAttr;
// Look for a parameter marked with the 'nest' attribute.
for (FunctionType::param_iterator I = NestFTy->param_begin(),
E = NestFTy->param_end(); I != E; ++NestIdx, ++I)
- if (NestAttrs.getParamAttributes(NestIdx).hasAttribute(Attributes::Nest)){
+ if (NestAttrs.hasAttribute(NestIdx, Attribute::Nest)) {
// Record the parameter type and any other attributes.
NestTy = *I;
NestAttr = NestAttrs.getParamAttributes(NestIdx);
@@ -1274,17 +1289,16 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
std::vector<Value*> NewArgs;
NewArgs.reserve(unsigned(CS.arg_end()-CS.arg_begin())+1);
- SmallVector<AttributeWithIndex, 8> NewAttrs;
+ SmallVector<AttributeSet, 8> NewAttrs;
NewAttrs.reserve(Attrs.getNumSlots() + 1);
// Insert the nest argument into the call argument list, which may
// mean appending it. Likewise for attributes.
// Add any result attributes.
- Attributes Attr = Attrs.getRetAttributes();
- if (Attr.hasAttributes())
- NewAttrs.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
- Attr));
+ if (Attrs.hasAttributes(AttributeSet::ReturnIndex))
+ NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
+ Attrs.getRetAttributes()));
{
unsigned Idx = 1;
@@ -1296,7 +1310,8 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
if (NestVal->getType() != NestTy)
NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest");
NewArgs.push_back(NestVal);
- NewAttrs.push_back(AttributeWithIndex::get(NestIdx, NestAttr));
+ NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
+ NestAttr));
}
if (I == E)
@@ -1304,20 +1319,21 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
// Add the original argument and attributes.
NewArgs.push_back(*I);
- Attr = Attrs.getParamAttributes(Idx);
- if (Attr.hasAttributes())
- NewAttrs.push_back
- (AttributeWithIndex::get(Idx + (Idx >= NestIdx), Attr));
+ AttributeSet Attr = Attrs.getParamAttributes(Idx);
+ if (Attr.hasAttributes(Idx)) {
+ AttrBuilder B(Attr, Idx);
+ NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
+ Idx + (Idx >= NestIdx), B));
+ }
++Idx, ++I;
} while (1);
}
// Add any function attributes.
- Attr = Attrs.getFnAttributes();
- if (Attr.hasAttributes())
- NewAttrs.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
- Attr));
+ if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+ NewAttrs.push_back(AttributeSet::get(FTy->getContext(),
+ Attrs.getFnAttributes()));
// The trampoline may have been bitcast to a bogus type (FTy).
// Handle this by synthesizing a new function type, equal to FTy
@@ -1356,7 +1372,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
NestF->getType() == PointerType::getUnqual(NewFTy) ?
NestF : ConstantExpr::getBitCast(NestF,
PointerType::getUnqual(NewFTy));
- const AttrListPtr &NewPAL = AttrListPtr::get(FTy->getContext(), NewAttrs);
+ const AttributeSet &NewPAL = AttributeSet::get(FTy->getContext(), NewAttrs);
Instruction *NewCaller;
if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index bb59db8..2ee1278 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -13,9 +13,9 @@
#include "InstCombine.h"
#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/Support/PatternMatch.h"
+#include "llvm/Target/TargetLibraryInfo.h"
using namespace llvm;
using namespace PatternMatch;
@@ -30,7 +30,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
Scale = 0;
return ConstantInt::get(Val->getType(), 0);
}
-
+
if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
// Cannot look past anything that might overflow.
OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val);
@@ -47,19 +47,19 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
Offset = 0;
return I->getOperand(0);
}
-
+
if (I->getOpcode() == Instruction::Mul) {
// This value is scaled by 'RHS'.
Scale = RHS->getZExtValue();
Offset = 0;
return I->getOperand(0);
}
-
+
if (I->getOpcode() == Instruction::Add) {
- // We have X+C. Check to see if we really have (X*C2)+C1,
+ // We have X+C. Check to see if we really have (X*C2)+C1,
// where C1 is divisible by C2.
unsigned SubScale;
- Value *SubVal =
+ Value *SubVal =
DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
Offset += RHS->getZExtValue();
Scale = SubScale;
@@ -82,7 +82,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
if (!TD) return 0;
PointerType *PTy = cast<PointerType>(CI.getType());
-
+
BuilderTy AllocaBuilder(*Builder);
AllocaBuilder.SetInsertPoint(AI.getParent(), &AI);
@@ -104,13 +104,19 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
uint64_t CastElTySize = TD->getTypeAllocSize(CastElTy);
if (CastElTySize == 0 || AllocElTySize == 0) return 0;
+ // If the allocation has multiple uses, only promote it if we're not
+ // shrinking the amount of memory being allocated.
+ uint64_t AllocElTyStoreSize = TD->getTypeStoreSize(AllocElTy);
+ uint64_t CastElTyStoreSize = TD->getTypeStoreSize(CastElTy);
+ if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return 0;
+
// See if we can satisfy the modulus by pulling a scale out of the array
// size argument.
unsigned ArraySizeScale;
uint64_t ArrayOffset;
Value *NumElements = // See if the array size is a decomposable linear expr.
DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
-
+
// If we can now satisfy the modulus, by using a non-1 scale, we really can
// do the xform.
if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
@@ -125,17 +131,17 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
// Insert before the alloca, not before the cast.
Amt = AllocaBuilder.CreateMul(Amt, NumElements);
}
-
+
if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
Offset, true);
Amt = AllocaBuilder.CreateAdd(Amt, Off);
}
-
+
AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt);
New->setAlignment(AI.getAlignment());
New->takeName(&AI);
-
+
// If the allocation has multiple real uses, insert a cast and change all
// things that used it to use the new cast. This will also hack on CI, but it
// will die soon.
@@ -148,10 +154,10 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
return ReplaceInstUsesWith(CI, New);
}
-/// EvaluateInDifferentType - Given an expression that
+/// EvaluateInDifferentType - Given an expression that
/// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually
/// insert the code to evaluate the expression.
-Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
+Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
bool isSigned) {
if (Constant *C = dyn_cast<Constant>(V)) {
C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
@@ -181,7 +187,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
break;
- }
+ }
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
@@ -190,7 +196,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
// new.
if (I->getOperand(0)->getType() == Ty)
return I->getOperand(0);
-
+
// Otherwise, must be the same type of cast, so just reinsert a new one.
// This also handles the case of zext(trunc(x)) -> zext(x).
Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty,
@@ -212,11 +218,11 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
Res = NPN;
break;
}
- default:
+ default:
// TODO: Can handle more cases here.
llvm_unreachable("Unreachable!");
}
-
+
Res->takeName(I);
return InsertNewInstWith(Res, *I);
}
@@ -224,7 +230,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
/// This function is a wrapper around CastInst::isEliminableCastPair. It
/// simply extracts arguments and returns what that function returns.
-static Instruction::CastOps
+static Instruction::CastOps
isEliminableCastPair(
const CastInst *CI, ///< The first cast instruction
unsigned opcode, ///< The opcode of the second cast instruction
@@ -253,7 +259,7 @@ isEliminableCastPair(
if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) ||
(Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy))
Res = 0;
-
+
return Instruction::CastOps(Res);
}
@@ -265,18 +271,18 @@ bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V,
Type *Ty) {
// Noop casts and casts of constants should be eliminated trivially.
if (V->getType() == Ty || isa<Constant>(V)) return false;
-
+
// If this is another cast that can be eliminated, we prefer to have it
// eliminated.
if (const CastInst *CI = dyn_cast<CastInst>(V))
if (isEliminableCastPair(CI, opc, Ty, TD))
return false;
-
+
// If this is a vector sext from a compare, then we don't want to break the
// idiom where each element of the extended vector is either zero or all ones.
if (opc == Instruction::SExt && isa<CmpInst>(V) && Ty->isVectorTy())
return false;
-
+
return true;
}
@@ -288,7 +294,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
// Many cases of "cast of a cast" are eliminable. If it's eliminable we just
// eliminate it now.
if (CastInst *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
- if (Instruction::CastOps opc =
+ if (Instruction::CastOps opc =
isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), TD)) {
// The first cast (CSrc) is eliminable so we need to fix up or replace
// the second cast (CI). CSrc will then have a good chance of being dead.
@@ -311,7 +317,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
if (Instruction *NV = FoldOpIntoPhi(CI))
return NV;
}
-
+
return 0;
}
@@ -330,15 +336,15 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty) {
// We can always evaluate constants in another type.
if (isa<Constant>(V))
return true;
-
+
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return false;
-
+
Type *OrigTy = V->getType();
-
+
// If this is an extension from the dest type, we can eliminate it, even if it
// has multiple uses.
- if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+ if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
I->getOperand(0)->getType() == Ty)
return true;
@@ -423,29 +429,29 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty) {
// TODO: Can handle more cases here.
break;
}
-
+
return false;
}
Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
if (Instruction *Result = commonCastTransforms(CI))
return Result;
-
- // See if we can simplify any instructions used by the input whose sole
+
+ // See if we can simplify any instructions used by the input whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(CI))
return &CI;
-
+
Value *Src = CI.getOperand(0);
Type *DestTy = CI.getType(), *SrcTy = Src->getType();
-
+
// Attempt to truncate the entire input expression tree to the destination
// type. Only do this if the dest type is a simple type, don't convert the
// expression tree to something weird like i93 unless the source is also
// strange.
if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
CanEvaluateTruncated(Src, DestTy)) {
-
+
    // If this cast is a truncate, evaluating in a different type always
// eliminates the cast, so it is always a win.
DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
@@ -462,7 +468,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
Value *Zero = Constant::getNullValue(Src->getType());
return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
}
-
+
// Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
Value *A = 0; ConstantInt *Cst = 0;
if (Src->hasOneUse() &&
@@ -472,7 +478,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// ASize < MidSize and MidSize > ResultSize, but don't know the relation
// between ASize and ResultSize.
unsigned ASize = A->getType()->getPrimitiveSizeInBits();
-
+
// If the shift amount is larger than the size of A, then the result is
// known to be zero because all the input bits got shifted out.
if (Cst->getZExtValue() >= ASize)
@@ -485,7 +491,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
Shift->takeName(Src);
return CastInst::CreateIntegerCast(Shift, CI.getType(), false);
}
-
+
// Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest
// type isn't non-native.
if (Src->hasOneUse() && isa<IntegerType>(Src->getType()) &&
@@ -508,7 +514,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
// cast to integer to avoid the comparison.
if (ConstantInt *Op1C = dyn_cast<ConstantInt>(ICI->getOperand(1))) {
const APInt &Op1CV = Op1C->getValue();
-
+
// zext (x <s 0) to i32 --> x>>u31 true if signbit set.
// zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear.
if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) ||
@@ -538,14 +544,14 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
// zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set.
// zext (X != 1) to i32 --> X^1 iff X has only the low bit set.
// zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
- if ((Op1CV == 0 || Op1CV.isPowerOf2()) &&
+ if ((Op1CV == 0 || Op1CV.isPowerOf2()) &&
// This only works for EQ and NE
ICI->isEquality()) {
      // If Op1C is some other power of two, convert:
uint32_t BitWidth = Op1C->getType()->getBitWidth();
APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
ComputeMaskedBits(ICI->getOperand(0), KnownZero, KnownOne);
-
+
APInt KnownZeroMask(~KnownZero);
if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
if (!DoXform) return ICI;
@@ -559,7 +565,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
Res = ConstantExpr::getZExt(Res, CI.getType());
return ReplaceInstUsesWith(CI, Res);
}
-
+
uint32_t ShiftAmt = KnownZeroMask.logBase2();
Value *In = ICI->getOperand(0);
if (ShiftAmt) {
@@ -568,12 +574,12 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
In = Builder->CreateLShr(In, ConstantInt::get(In->getType(),ShiftAmt),
In->getName()+".lobit");
}
-
+
if ((Op1CV != 0) == isNE) { // Toggle the low bit.
Constant *One = ConstantInt::get(In->getType(), 1);
In = Builder->CreateXor(In, One);
}
-
+
if (CI.getType() == In->getType())
return ReplaceInstUsesWith(CI, In);
return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/);
@@ -646,19 +652,19 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) {
BitsToClear = 0;
if (isa<Constant>(V))
return true;
-
+
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return false;
-
+
// If the input is a truncate from the destination type, we can trivially
// eliminate it.
if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
return true;
-
+
// We can't extend or shrink something that has multiple uses: doing so would
// require duplicating the instruction in general, which isn't profitable.
if (!I->hasOneUse()) return false;
-
+
unsigned Opc = I->getOpcode(), Tmp;
switch (Opc) {
case Instruction::ZExt: // zext(zext(x)) -> zext(x).
@@ -678,7 +684,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) {
// These can all be promoted if neither operand has 'bits to clear'.
if (BitsToClear == 0 && Tmp == 0)
return true;
-
+
// If the operation is an AND/OR/XOR and the bits to clear are zero in the
// other side, BitsToClear is ok.
if (Tmp == 0 &&
@@ -691,10 +697,10 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) {
APInt::getHighBitsSet(VSize, BitsToClear)))
return true;
}
-
+
// Otherwise, we don't know how to analyze this BitsToClear case yet.
return false;
-
+
case Instruction::LShr:
// We can promote lshr(x, cst) if we can promote x. This requires the
// ultimate 'and' to clear out the high zero bits we're clearing out though.
@@ -716,7 +722,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) {
Tmp != BitsToClear)
return false;
return true;
-
+
case Instruction::PHI: {
// We can change a phi if we can change all operands. Note that we never
// get into trouble with cyclic PHIs here because we only consider
@@ -739,48 +745,48 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) {
}
Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
- // If this zero extend is only used by a truncate, let the truncate by
+ // If this zero extend is only used by a truncate, let the truncate be
// eliminated before we try to optimize this zext.
if (CI.hasOneUse() && isa<TruncInst>(CI.use_back()))
return 0;
-
+
  // If one of the common conversions will work, do it.
if (Instruction *Result = commonCastTransforms(CI))
return Result;
- // See if we can simplify any instructions used by the input whose sole
+ // See if we can simplify any instructions used by the input whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(CI))
return &CI;
-
+
Value *Src = CI.getOperand(0);
Type *SrcTy = Src->getType(), *DestTy = CI.getType();
-
+
// Attempt to extend the entire input expression tree to the destination
// type. Only do this if the dest type is a simple type, don't convert the
// expression tree to something weird like i93 unless the source is also
// strange.
unsigned BitsToClear;
if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
- CanEvaluateZExtd(Src, DestTy, BitsToClear)) {
+ CanEvaluateZExtd(Src, DestTy, BitsToClear)) {
assert(BitsToClear < SrcTy->getScalarSizeInBits() &&
"Unreasonable BitsToClear");
-
+
// Okay, we can transform this! Insert the new expression now.
DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
" to avoid zero extend: " << CI);
Value *Res = EvaluateInDifferentType(Src, DestTy, false);
assert(Res->getType() == DestTy);
-
+
uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear;
uint32_t DestBitSize = DestTy->getScalarSizeInBits();
-
+
// If the high bits are already filled with zeros, just replace this
// cast with the result.
if (MaskedValueIsZero(Res, APInt::getHighBitsSet(DestBitSize,
DestBitSize-SrcBitsKept)))
return ReplaceInstUsesWith(CI, Res);
-
+
// We need to emit an AND to clear the high bits.
Constant *C = ConstantInt::get(Res->getType(),
APInt::getLowBitsSet(DestBitSize, SrcBitsKept));
@@ -792,7 +798,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
// 'and' which will be much cheaper than the pair of casts.
if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast
// TODO: Subsume this into EvaluateInDifferentType.
-
+
// Get the sizes of the types involved. We know that the intermediate type
// will be smaller than A or C, but don't know the relation between A and C.
Value *A = CSrc->getOperand(0);
@@ -809,7 +815,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
Value *And = Builder->CreateAnd(A, AndConst, CSrc->getName()+".mask");
return new ZExtInst(And, CI.getType());
}
-
+
if (SrcSize == DstSize) {
APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(),
@@ -818,7 +824,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
if (SrcSize > DstSize) {
Value *Trunc = Builder->CreateTrunc(A, CI.getType());
APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
- return BinaryOperator::CreateAnd(Trunc,
+ return BinaryOperator::CreateAnd(Trunc,
ConstantInt::get(Trunc->getType(),
AndValue));
}
@@ -876,7 +882,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
Value *New = Builder->CreateZExt(X, CI.getType());
return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1));
}
-
+
return 0;
}
@@ -989,14 +995,14 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) {
// If this is a constant, it can be trivially promoted.
if (isa<Constant>(V))
return true;
-
+
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return false;
-
+
// If this is a truncate from the dest type, we can trivially eliminate it.
if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
return true;
-
+
// We can't extend or shrink something that has multiple uses: doing so would
// require duplicating the instruction in general, which isn't profitable.
if (!I->hasOneUse()) return false;
@@ -1015,14 +1021,14 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) {
// These operators can all arbitrarily be extended if their inputs can.
return CanEvaluateSExtd(I->getOperand(0), Ty) &&
CanEvaluateSExtd(I->getOperand(1), Ty);
-
+
//case Instruction::Shl: TODO
//case Instruction::LShr: TODO
-
+
case Instruction::Select:
return CanEvaluateSExtd(I->getOperand(1), Ty) &&
CanEvaluateSExtd(I->getOperand(2), Ty);
-
+
case Instruction::PHI: {
// We can change a phi if we can change all operands. Note that we never
// get into trouble with cyclic PHIs here because we only consider
@@ -1036,24 +1042,24 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) {
// TODO: Can handle more cases here.
break;
}
-
+
return false;
}
Instruction *InstCombiner::visitSExt(SExtInst &CI) {
- // If this sign extend is only used by a truncate, let the truncate by
- // eliminated before we try to optimize this zext.
+ // If this sign extend is only used by a truncate, let the truncate be
+ // eliminated before we try to optimize this sext.
if (CI.hasOneUse() && isa<TruncInst>(CI.use_back()))
return 0;
-
+
if (Instruction *I = commonCastTransforms(CI))
return I;
-
- // See if we can simplify any instructions used by the input whose sole
+
+ // See if we can simplify any instructions used by the input whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(CI))
return &CI;
-
+
Value *Src = CI.getOperand(0);
Type *SrcTy = Src->getType(), *DestTy = CI.getType();
@@ -1076,7 +1082,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
// cast with the result.
if (ComputeNumSignBits(Res) > DestBitSize - SrcBitSize)
return ReplaceInstUsesWith(CI, Res);
-
+
// We need to emit a shl + ashr to do the sign extend.
Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
return BinaryOperator::CreateAShr(Builder->CreateShl(Res, ShAmt, "sext"),
@@ -1089,7 +1095,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
if (TI->hasOneUse() && TI->getOperand(0)->getType() == DestTy) {
uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
uint32_t DestBitSize = DestTy->getScalarSizeInBits();
-
+
// We need to emit a shl + ashr to do the sign extend.
Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
Value *Res = Builder->CreateShl(TI->getOperand(0), ShAmt, "sext");
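
Editor's note: the comment above describes the classic shl+ashr idiom for sign extension: shift the narrow payload up to the top of the wide type, then shift it back down arithmetically so the sign bit is replicated. A minimal standalone sketch of that idiom in plain C++ follows; the 8/16-bit widths and the two's-complement, arithmetic-shift behaviour it relies on are assumptions of the illustration, not LLVM code.

#include <cassert>
#include <cstdint>

// Sign-extend the low 'srcBits' of 'v' to a full 32-bit signed value with a
// shift-up / arithmetic-shift-down pair, mirroring the shl+ashr the combiner
// emits. Assumes two's complement and arithmetic right shift of negatives
// (guaranteed from C++20 on, universal in practice before that).
static int32_t signExtendViaShifts(uint32_t v, unsigned srcBits) {
  unsigned shAmt = 32 - srcBits;           // DestBitSize - SrcBitSize
  return (int32_t)(v << shAmt) >> shAmt;   // shl, then ashr
}

int main() {
  assert(signExtendViaShifts(0xFF, 8) == -1);        // i8 0xFF is -1
  assert(signExtendViaShifts(0x7F, 8) == 127);       // positive values survive
  assert(signExtendViaShifts(0x8000, 16) == -32768); // i16 sign bit replicated
  return 0;
}
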
@@ -1125,7 +1131,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
A = Builder->CreateShl(A, ShAmtV, CI.getName());
return BinaryOperator::CreateAShr(A, ShAmtV);
}
-
+
return 0;
}
@@ -1147,7 +1153,7 @@ static Value *LookThroughFPExtensions(Value *V) {
if (Instruction *I = dyn_cast<Instruction>(V))
if (I->getOpcode() == Instruction::FPExt)
return LookThroughFPExtensions(I->getOperand(0));
-
+
// If this value is a constant, return the constant in the smallest FP type
// that can accurately represent it. This allows us to turn
// (float)((double)X+2.0) into x+2.0f.
@@ -1166,14 +1172,14 @@ static Value *LookThroughFPExtensions(Value *V) {
return V;
// Don't try to shrink to various long double types.
}
-
+
return V;
}
Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
if (Instruction *I = commonCastTransforms(CI))
return I;
-
+
// If we have fptrunc(fadd (fpextend x), (fpextend y)), where x and y are
// smaller than the destination type, we can eliminate the truncate by doing
// the add as the smaller type. This applies to fadd/fsub/fmul/fdiv as well
@@ -1190,7 +1196,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
Type *SrcTy = OpI->getType();
Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0));
Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1));
- if (LHSTrunc->getType() != SrcTy &&
+ if (LHSTrunc->getType() != SrcTy &&
RHSTrunc->getType() != SrcTy) {
unsigned DstSize = CI.getType()->getScalarSizeInBits();
// If the source types were both smaller than the destination type of
@@ -1202,10 +1208,36 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
return BinaryOperator::Create(OpI->getOpcode(), LHSTrunc, RHSTrunc);
}
}
- break;
+ break;
+ }
+
+ // (fptrunc (fneg x)) -> (fneg (fptrunc x))
+ if (BinaryOperator::isFNeg(OpI)) {
+ Value *InnerTrunc = Builder->CreateFPTrunc(OpI->getOperand(1),
+ CI.getType());
+ return BinaryOperator::CreateFNeg(InnerTrunc);
}
}
-
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0));
+ if (II) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::fabs: {
+ // (fptrunc (fabs x)) -> (fabs (fptrunc x))
+ Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0),
+ CI.getType());
+ Type *IntrinsicType[] = { CI.getType() };
+ Function *Overload =
+ Intrinsic::getDeclaration(CI.getParent()->getParent()->getParent(),
+ II->getIntrinsicID(), IntrinsicType);
+
+ Value *Args[] = { InnerTrunc };
+ return CallInst::Create(Overload, Args, II->getName());
+ }
+ }
+ }
+
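
Editor's note: the two folds added above push an fptrunc through fneg and through the fabs intrinsic. That is sound because both operations only touch the sign bit, and IEEE rounding from double to float is symmetric in sign, so negation and absolute value commute with the truncation. A small self-contained check of the underlying arithmetic fact, as an illustration only and not the InstCombine code:

#include <cassert>
#include <cmath>

int main() {
  const double samples[] = {0.0, -0.0, 1.5, -2.25, 3.14159, -1e30, 1e-30};
  for (double x : samples) {
    // (fptrunc (fneg x)) == (fneg (fptrunc x))
    assert((float)(-x) == -(float)x);
    // (fptrunc (fabs x)) == (fabs (fptrunc x))
    assert((float)std::fabs(x) == std::fabs((float)x));
  }
  return 0;
}
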
// Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x)
CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0));
if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) &&
@@ -1220,7 +1252,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
Arg->getOperand(0)->getType()->isFloatTy()) {
Function *Callee = Call->getCalledFunction();
Module *M = CI.getParent()->getParent()->getParent();
- Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf",
+ Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf",
Callee->getAttributes(),
Builder->getFloatTy(),
Builder->getFloatTy(),
@@ -1228,15 +1260,15 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0),
"sqrtfcall");
ret->setAttributes(Callee->getAttributes());
-
-
+
+
// Remove the old Call. With -fmath-errno, it won't get marked readnone.
ReplaceInstUsesWith(*Call, UndefValue::get(Call->getType()));
EraseInstFromFunction(*Call);
return ret;
}
}
-
+
return 0;
}
@@ -1254,7 +1286,7 @@ Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) {
// This is safe if the intermediate type has enough bits in its mantissa to
// accurately represent all values of X. For example, do not do this with
// i64->float->i64. This is also safe for sitofp case, because any negative
- // 'X' value would cause an undefined result for the fptoui.
+ // 'X' value would cause an undefined result for the fptoui.
if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
OpI->getOperand(0)->getType() == FI.getType() &&
(int)FI.getType()->getScalarSizeInBits() < /*extra bit for sign */
@@ -1268,19 +1300,19 @@ Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) {
Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
if (OpI == 0)
return commonCastTransforms(FI);
-
+
// fptosi(sitofp(X)) --> X
// fptosi(uitofp(X)) --> X
// This is safe if the intermediate type has enough bits in its mantissa to
// accurately represent all values of X. For example, do not do this with
// i64->float->i64. This is also safe for sitofp case, because any negative
- // 'X' value would cause an undefined result for the fptoui.
+ // 'X' value would cause an undefined result for the fptoui.
if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
OpI->getOperand(0)->getType() == FI.getType() &&
(int)FI.getType()->getScalarSizeInBits() <=
OpI->getType()->getFPMantissaWidth())
return ReplaceInstUsesWith(FI, OpI->getOperand(0));
-
+
return commonCastTransforms(FI);
}
@@ -1296,21 +1328,16 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
// If the source integer type is not the intptr_t type for this target, do a
// trunc or zext to the intptr_t type, then inttoptr of it. This allows the
// cast to be exposed to other transforms.
- if (TD) {
- if (CI.getOperand(0)->getType()->getScalarSizeInBits() >
- TD->getPointerSizeInBits()) {
- Value *P = Builder->CreateTrunc(CI.getOperand(0),
- TD->getIntPtrType(CI.getContext()));
- return new IntToPtrInst(P, CI.getType());
- }
- if (CI.getOperand(0)->getType()->getScalarSizeInBits() <
- TD->getPointerSizeInBits()) {
- Value *P = Builder->CreateZExt(CI.getOperand(0),
- TD->getIntPtrType(CI.getContext()));
- return new IntToPtrInst(P, CI.getType());
- }
+ if (TD && CI.getOperand(0)->getType()->getScalarSizeInBits() !=
+ TD->getPointerSizeInBits()) {
+ Type *Ty = TD->getIntPtrType(CI.getContext());
+ if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
+ Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
+
+ Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
+ return new IntToPtrInst(P, CI.getType());
}
-
+
if (Instruction *I = commonCastTransforms(CI))
return I;
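
Editor's note: the rewritten branch above collapses the old trunc/zext pair of cases into a single zext-or-trunc to the pointer-sized integer type, now handling vectors of pointers as well; the symmetric change for ptrtoint appears later in this diff. As plain integer arithmetic the normalization just keeps the low pointer-width bits and zero-fills when widening. A hedged sketch with hard-coded widths (the widths are illustrative assumptions):

#include <cassert>
#include <cstdint>

// Model a zext-or-trunc on values that fit in 64 bits: truncation keeps the
// low 'dstBits', zero extension fills the new high bits with zero.
static uint64_t zextOrTrunc(uint64_t v, unsigned srcBits, unsigned dstBits) {
  uint64_t srcMask = srcBits >= 64 ? ~0ULL : ((1ULL << srcBits) - 1);
  uint64_t dstMask = dstBits >= 64 ? ~0ULL : ((1ULL << dstBits) - 1);
  return (v & srcMask) & dstMask;
}

int main() {
  // i16 operand headed for a 64-bit intptr_t: a zero extension.
  assert(zextOrTrunc(0xBEEF, 16, 64) == 0xBEEFULL);
  // i64 operand headed for a 32-bit intptr_t: a truncation to the low bits.
  assert(zextOrTrunc(0x1122334455667788ULL, 64, 32) == 0x55667788ULL);
  return 0;
}
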
@@ -1320,34 +1347,32 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
/// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint)
Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
Value *Src = CI.getOperand(0);
-
+
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
// If casting the result of a getelementptr instruction with no offset, turn
// this into a cast of the original pointer!
if (GEP->hasAllZeroIndices()) {
// Changing the cast operand is usually not a good idea but it is safe
- // here because the pointer operand is being replaced with another
+ // here because the pointer operand is being replaced with another
// pointer operand so the opcode doesn't need to change.
Worklist.Add(GEP);
CI.setOperand(0, GEP->getOperand(0));
return &CI;
}
-
+
// If the GEP has a single use, and the base pointer is a bitcast, and the
// GEP computes a constant offset, see if we can convert these three
// instructions into fewer. This typically happens with unions and other
// non-type-safe code.
+ APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0);
if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) &&
- GEP->hasAllConstantIndices()) {
- SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end());
- int64_t Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops);
-
+ GEP->accumulateConstantOffset(*TD, Offset)) {
// Get the base pointer input of the bitcast, and the type it points to.
Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
Type *GEPIdxTy =
cast<PointerType>(OrigBase->getType())->getElementType();
SmallVector<Value*, 8> NewIndices;
- if (FindElementAtOffset(GEPIdxTy, Offset, NewIndices)) {
+ if (FindElementAtOffset(GEPIdxTy, Offset.getSExtValue(), NewIndices)) {
// If we were able to index down into an element, create the GEP
// and bitcast the result. This eliminates one bitcast, potentially
// two.
@@ -1355,15 +1380,15 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
Builder->CreateGEP(OrigBase, NewIndices);
NGEP->takeName(GEP);
-
+
if (isa<BitCastInst>(CI))
return new BitCastInst(NGEP, CI.getType());
assert(isa<PtrToIntInst>(CI));
return new PtrToIntInst(NGEP, CI.getType());
- }
+ }
}
}
-
+
return commonCastTransforms(CI);
}
@@ -1371,19 +1396,15 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
// If the destination integer type is not the intptr_t type for this target,
// do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast
// to be exposed to other transforms.
- if (TD) {
- if (CI.getType()->getScalarSizeInBits() < TD->getPointerSizeInBits()) {
- Value *P = Builder->CreatePtrToInt(CI.getOperand(0),
- TD->getIntPtrType(CI.getContext()));
- return new TruncInst(P, CI.getType());
- }
- if (CI.getType()->getScalarSizeInBits() > TD->getPointerSizeInBits()) {
- Value *P = Builder->CreatePtrToInt(CI.getOperand(0),
- TD->getIntPtrType(CI.getContext()));
- return new ZExtInst(P, CI.getType());
- }
+ if (TD && CI.getType()->getScalarSizeInBits() != TD->getPointerSizeInBits()) {
+ Type *Ty = TD->getIntPtrType(CI.getContext());
+ if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
+ Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
+
+ Value *P = Builder->CreatePtrToInt(CI.getOperand(0), Ty);
+ return CastInst::CreateIntegerCast(P, CI.getType(), /*isSigned=*/false);
}
-
+
return commonPointerCastTransforms(CI);
}
@@ -1398,33 +1419,33 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy,
// element size, or the input is a multiple of the output element size.
// Convert the input type to have the same element type as the output.
VectorType *SrcTy = cast<VectorType>(InVal->getType());
-
+
if (SrcTy->getElementType() != DestTy->getElementType()) {
// The input types don't need to be identical, but for now they must be the
// same size. There is no specific reason we couldn't handle things like
// <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten
- // there yet.
+ // there yet.
if (SrcTy->getElementType()->getPrimitiveSizeInBits() !=
DestTy->getElementType()->getPrimitiveSizeInBits())
return 0;
-
+
SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements());
InVal = IC.Builder->CreateBitCast(InVal, SrcTy);
}
-
+
// Now that the element types match, get the shuffle mask and RHS of the
// shuffle to use, which depends on whether we're increasing or decreasing the
// size of the input.
SmallVector<uint32_t, 16> ShuffleMask;
Value *V2;
-
+
if (SrcTy->getNumElements() > DestTy->getNumElements()) {
// If we're shrinking the number of elements, just shuffle in the low
// elements from the input and use undef as the second shuffle input.
V2 = UndefValue::get(SrcTy);
for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i)
ShuffleMask.push_back(i);
-
+
} else {
// If we're increasing the number of elements, shuffle in all of the
// elements from InVal and fill the rest of the result elements with zeros
@@ -1438,7 +1459,7 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy,
for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i)
ShuffleMask.push_back(SrcElts);
}
-
+
return new ShuffleVectorInst(InVal, V2,
ConstantDataVector::get(V2->getContext(),
ShuffleMask));
@@ -1465,7 +1486,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
Type *VecEltTy) {
// Undef values never contribute useful bits to the result.
if (isa<UndefValue>(V)) return true;
-
+
// If we got down to a value of the right type, we win, try inserting into the
// right element.
if (V->getType() == VecEltTy) {
@@ -1473,15 +1494,15 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
if (Constant *C = dyn_cast<Constant>(V))
if (C->isNullValue())
return true;
-
+
// Fail if multiple elements are inserted into this slot.
if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0)
return false;
-
+
Elements[ElementIndex] = V;
return true;
}
-
+
if (Constant *C = dyn_cast<Constant>(V)) {
// Figure out the # elements this provides, and bitcast it or slice it up
// as required.
@@ -1492,7 +1513,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
if (NumElts == 1)
return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
ElementIndex, Elements, VecEltTy);
-
+
// Okay, this is a constant that covers multiple elements. Slice it up into
// pieces and insert each element-sized piece into the vector.
if (!isa<IntegerType>(C->getType()))
@@ -1500,7 +1521,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
C->getType()->getPrimitiveSizeInBits()));
unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits();
Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
-
+
for (unsigned i = 0; i != NumElts; ++i) {
Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
i*ElementSize));
@@ -1510,23 +1531,23 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
}
return true;
}
-
+
if (!V->hasOneUse()) return false;
-
+
Instruction *I = dyn_cast<Instruction>(V);
if (I == 0) return false;
switch (I->getOpcode()) {
default: return false; // Unhandled case.
case Instruction::BitCast:
return CollectInsertionElements(I->getOperand(0), ElementIndex,
- Elements, VecEltTy);
+ Elements, VecEltTy);
case Instruction::ZExt:
if (!isMultipleOfTypeSize(
I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
VecEltTy))
return false;
return CollectInsertionElements(I->getOperand(0), ElementIndex,
- Elements, VecEltTy);
+ Elements, VecEltTy);
case Instruction::Or:
return CollectInsertionElements(I->getOperand(0), ElementIndex,
Elements, VecEltTy) &&
@@ -1538,11 +1559,11 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
if (CI == 0) return false;
if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false;
unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy);
-
+
return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift,
Elements, VecEltTy);
}
-
+
}
}
@@ -1577,11 +1598,11 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
Value *Result = Constant::getNullValue(CI.getType());
for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
if (Elements[i] == 0) continue; // Unset element.
-
+
Result = IC.Builder->CreateInsertElement(Result, Elements[i],
IC.Builder->getInt32(i));
}
-
+
return Result;
}
@@ -1589,6 +1610,9 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
/// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double
/// bitcast. The various long double bitcasts can't get in here.
static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
+ // We need to know the target byte order to perform this optimization.
+ if (!IC.getDataLayout()) return 0;
+
Value *Src = CI.getOperand(0);
Type *DestTy = CI.getType();
@@ -1609,11 +1633,14 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
VecTy->getPrimitiveSizeInBits() / DestWidth);
VecInput = IC.Builder->CreateBitCast(VecInput, VecTy);
}
-
- return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(0));
+
+ unsigned Elt = 0;
+ if (IC.getDataLayout()->isBigEndian())
+ Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1;
+ return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
}
}
-
+
// bitcast(trunc(lshr(bitcast(somevector), cst))
ConstantInt *ShAmt = 0;
if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)),
@@ -1630,8 +1657,10 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
VecTy->getPrimitiveSizeInBits() / DestWidth);
VecInput = IC.Builder->CreateBitCast(VecInput, VecTy);
}
-
+
unsigned Elt = ShAmt->getZExtValue() / DestWidth;
+ if (IC.getDataLayout()->isBigEndian())
+ Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1 - Elt;
return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
}
}
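
Editor's note: both hunks above adjust which vector element gets extracted once the data layout reports a big-endian target. Element 0 of a vector bitcast from a wide integer holds the integer's low-order bits only on little-endian targets; on big-endian targets those bits land in the last element. A self-contained host check of that layout fact, which simply inspects whatever endianness the machine running it happens to have:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // The same reinterpretation a bitcast i64 -> <2 x i32> performs.
  uint64_t wide = 0x1122334455667788ULL;
  uint32_t lanes[2];
  std::memcpy(lanes, &wide, sizeof(wide));

  uint32_t probe = 1;
  bool littleEndian = *reinterpret_cast<unsigned char *>(&probe) == 1;

  // The low 32 bits (0x55667788) sit in lane 0 on little-endian hosts and in
  // the last lane on big-endian hosts, hence the "NumLanes - 1 - Elt" fixup.
  unsigned lowLane = littleEndian ? 0 : 1;
  assert(lanes[lowLane] == 0x55667788u);
  assert(lanes[1 - lowLane] == 0x11223344u);
  return 0;
}
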
@@ -1654,12 +1683,12 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
PointerType *SrcPTy = cast<PointerType>(SrcTy);
Type *DstElTy = DstPTy->getElementType();
Type *SrcElTy = SrcPTy->getElementType();
-
+
// If the address spaces don't match, don't eliminate the bitcast, which is
// required for changing types.
if (SrcPTy->getAddressSpace() != DstPTy->getAddressSpace())
return 0;
-
+
// If we are casting a alloca to a pointer to a type of the same
// size, rewrite the allocation instruction to allocate the "right" type.
// There is no need to modify malloc calls because it is their bitcast that
@@ -1667,14 +1696,14 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(Src))
if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
return V;
-
+
// If the source and destination are pointers, and this cast is equivalent
// to a getelementptr X, 0, 0, 0... turn it into the appropriate gep.
// This can enhance SROA and other transforms that want type-safe pointers.
Constant *ZeroUInt =
Constant::getNullValue(Type::getInt32Ty(CI.getContext()));
unsigned NumZeros = 0;
- while (SrcElTy != DstElTy &&
+ while (SrcElTy != DstElTy &&
isa<CompositeType>(SrcElTy) && !SrcElTy->isPointerTy() &&
SrcElTy->getNumContainedTypes() /* not "{}" */) {
SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(ZeroUInt);
@@ -1687,7 +1716,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
return GetElementPtrInst::CreateInBounds(Src, Idxs);
}
}
-
+
// Try to optimize int -> float bitcasts.
if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy))
if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this))
@@ -1700,7 +1729,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
// FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast)
}
-
+
if (isa<IntegerType>(SrcTy)) {
// If this is a cast from an integer to vector, check to see if the input
// is a trunc or zext of a bitcast from vector. If so, we can replace all
@@ -1713,7 +1742,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
cast<VectorType>(DestTy), *this))
return I;
}
-
+
// If the input is an 'or' instruction, we may be doing shifts and ors to
// assemble the elements of the vector manually. Try to rip the code out
// and replace it with insertelements.
@@ -1723,18 +1752,29 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
}
if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) {
- if (SrcVTy->getNumElements() == 1 && !DestTy->isVectorTy()) {
- Value *Elem =
- Builder->CreateExtractElement(Src,
- Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
- return CastInst::Create(Instruction::BitCast, Elem, DestTy);
+ if (SrcVTy->getNumElements() == 1) {
+ // If our destination is not a vector, then make this a straight
+ // scalar-scalar cast.
+ if (!DestTy->isVectorTy()) {
+ Value *Elem =
+ Builder->CreateExtractElement(Src,
+ Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
+ return CastInst::Create(Instruction::BitCast, Elem, DestTy);
+ }
+
+ // Otherwise, see if our source is an insert. If so, then use the scalar
+ // component directly.
+ if (InsertElementInst *IEI =
+ dyn_cast<InsertElementInst>(CI.getOperand(0)))
+ return CastInst::Create(Instruction::BitCast, IEI->getOperand(1),
+ DestTy);
}
}
if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) {
// Okay, we have (bitcast (shuffle ..)). Check to see if this is
// a bitcast to a vector with the same # elts.
- if (SVI->hasOneUse() && DestTy->isVectorTy() &&
+ if (SVI->hasOneUse() && DestTy->isVectorTy() &&
cast<VectorType>(DestTy)->getNumElements() ==
SVI->getType()->getNumElements() &&
SVI->getType()->getNumElements() ==
@@ -1743,9 +1783,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// If either of the operands is a cast from CI.getType(), then
// evaluating the shuffle in the casted destination's type will allow
// us to eliminate at least one cast.
- if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) &&
+ if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) &&
Tmp->getOperand(0)->getType() == DestTy) ||
- ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) &&
+ ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) &&
Tmp->getOperand(0)->getType() == DestTy)) {
Value *LHS = Builder->CreateBitCast(SVI->getOperand(0), DestTy);
Value *RHS = Builder->CreateBitCast(SVI->getOperand(1), DestTy);
@@ -1755,7 +1795,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
}
}
}
-
+
if (SrcTy->isPointerTy())
return commonPointerCastTransforms(CI);
return commonCastTransforms(CI);
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 7c3f8fe..a96e754 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -12,15 +12,15 @@
//===----------------------------------------------------------------------===//
#include "InstCombine.h"
-#include "llvm/IntrinsicInst.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/ConstantRange.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
+#include "llvm/Target/TargetLibraryInfo.h"
using namespace llvm;
using namespace PatternMatch;
@@ -139,6 +139,31 @@ static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS,
}
}
+/// Returns true if the exploded icmp can be expressed as a signed comparison
+/// to zero and updates the predicate accordingly.
+/// The signedness of the comparison is preserved.
+static bool isSignTest(ICmpInst::Predicate &pred, const ConstantInt *RHS) {
+ if (!ICmpInst::isSigned(pred))
+ return false;
+
+ if (RHS->isZero())
+ return ICmpInst::isRelational(pred);
+
+ if (RHS->isOne()) {
+ if (pred == ICmpInst::ICMP_SLT) {
+ pred = ICmpInst::ICMP_SLE;
+ return true;
+ }
+ } else if (RHS->isAllOnesValue()) {
+ if (pred == ICmpInst::ICMP_SGT) {
+ pred = ICmpInst::ICMP_SGE;
+ return true;
+ }
+ }
+
+ return false;
+}
+
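
Editor's note: isSignTest accepts slt 1 and sgt -1 in addition to the relational comparisons against zero because, over signed integers, x < 1 is the same test as x <= 0 and x > -1 is the same as x >= 0. A quick brute-force confirmation of the integer identity, not of the LLVM helper itself:

#include <cassert>

int main() {
  // The two predicate rewrites isSignTest performs:
  //   (x <  1)  <=>  (x <= 0)     ICMP_SLT 1  becomes ICMP_SLE 0
  //   (x > -1)  <=>  (x >= 0)     ICMP_SGT -1 becomes ICMP_SGE 0
  for (int x = -1000; x <= 1000; ++x) {
    assert((x < 1) == (x <= 0));
    assert((x > -1) == (x >= 0));
  }
  return 0;
}
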
// isHighOnes - Return true if the constant is of the form 1+0+.
// This is the same as lowones(~X).
static bool isHighOnes(const ConstantInt *CI) {
@@ -443,20 +468,29 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
}
- // If a 32-bit or 64-bit magic bitvector captures the entire comparison state
+ // If a magic bitvector captures the entire comparison state
// of this load, replace it with computation that does:
// ((magic_cst >> i) & 1) != 0
- if (ArrayElementCount <= 32 ||
- (TD && ArrayElementCount <= 64 && TD->isLegalInteger(64))) {
- Type *Ty;
- if (ArrayElementCount <= 32)
+ {
+ Type *Ty = 0;
+
+ // Look for an appropriate type:
+ // - The type of Idx if the magic fits
+ // - The smallest fitting legal type if we have a DataLayout
+ // - Default to i32
+ if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
+ Ty = Idx->getType();
+ else if (TD)
+ Ty = TD->getSmallestLegalIntType(Init->getContext(), ArrayElementCount);
+ else if (ArrayElementCount <= 32)
Ty = Type::getInt32Ty(Init->getContext());
- else
- Ty = Type::getInt64Ty(Init->getContext());
- Value *V = Builder->CreateIntCast(Idx, Ty, false);
- V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
- V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V);
- return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
+
+ if (Ty != 0) {
+ Value *V = Builder->CreateIntCast(Idx, Ty, false);
+ V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
+ V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V);
+ return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
+ }
}
return 0;
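
Editor's note: the rewritten block keeps the existing magic-bitvector trick but is more flexible about which integer type holds the bitvector. The trick itself: precompute one bit per array element recording whether that element satisfies the comparison, then answer "does element i compare true?" with a shift and a mask instead of a load. A standalone illustration with a small made-up table:

#include <cassert>
#include <cstdint>

int main() {
  // A tiny constant table and the question "is table[i] == 7 ?".
  const int table[] = {7, 3, 7, 9, 0, 7};
  const unsigned N = sizeof(table) / sizeof(table[0]);

  // Precompute the magic bitvector: bit i is set iff table[i] == 7.
  uint64_t magic = 0;
  for (unsigned i = 0; i != N; ++i)
    if (table[i] == 7)
      magic |= uint64_t(1) << i;

  // ((magic >> i) & 1) != 0 now answers the comparison without touching the
  // table, which is the computation the combiner emits in place of the load.
  for (unsigned i = 0; i != N; ++i)
    assert((((magic >> i) & 1) != 0) == (table[i] == 7));
  return 0;
}
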
@@ -1226,6 +1260,16 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
ICI.setOperand(0, NewAnd);
return &ICI;
}
+
+ // Replace ((X & AndCST) > RHSV) with ((X & AndCST) != 0), if any
+ // bit set in (X & AndCST) will produce a result greater than RHSV.
+ if (ICI.getPredicate() == ICmpInst::ICMP_UGT) {
+ unsigned NTZ = AndCST->getValue().countTrailingZeros();
+ if ((NTZ < AndCST->getBitWidth()) &&
+ APInt::getOneBitSet(AndCST->getBitWidth(), NTZ).ugt(RHSV))
+ return new ICmpInst(ICmpInst::ICMP_NE, LHSI,
+ Constant::getNullValue(RHS->getType()));
+ }
}
// Try to optimize things like "A[i]&42 == 0" to index computations.
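
Editor's note: the new ICMP_UGT case above rests on a small bit-level fact: if the lowest set bit of the mask is already greater, as an unsigned value, than the right-hand constant, then every nonzero value of (X & mask) exceeds that constant, so the unsigned ">" collapses to "!= 0". A brute-force check over all 8-bit values with one mask/constant pair that satisfies the precondition (the pair is an arbitrary example):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t mask = 0xF0;  // lowest set bit is 0x10
  const uint8_t rhs  = 0x0C;  // 0x10 >u 0x0C, so the precondition holds
  for (unsigned x = 0; x <= 0xFF; ++x) {
    uint8_t masked = uint8_t(x) & mask;
    // ((X & 0xF0) >u 0x0C)  <=>  ((X & 0xF0) != 0)
    assert((masked > rhs) == (masked != 0));
  }
  return 0;
}
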
@@ -1263,6 +1307,23 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
break;
}
+ case Instruction::Mul: { // (icmp pred (mul X, Val), CI)
+ ConstantInt *Val = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+ if (!Val) break;
+
+ // If this is a signed comparison to 0 and the mul is sign preserving,
+ // use the mul LHS operand instead.
+ ICmpInst::Predicate pred = ICI.getPredicate();
+ if (isSignTest(pred, RHS) && !Val->isZero() &&
+ cast<BinaryOperator>(LHSI)->hasNoSignedWrap())
+ return new ICmpInst(Val->isNegative() ?
+ ICmpInst::getSwappedPredicate(pred) : pred,
+ LHSI->getOperand(0),
+ Constant::getNullValue(RHS->getType()));
+
+ break;
+ }
+
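
Editor's note: the new Mul case relies on a no-signed-wrap multiply by a nonzero constant preserving the sign of its variable operand, with the predicate swapped when the constant is negative. A small check of the arithmetic fact, using a wide type so the product genuinely cannot overflow, which stands in for the nsw guarantee:

#include <cassert>
#include <cstdint>

int main() {
  // With no signed wrap, sign(x * C) follows from sign(x) and sign(C):
  //   (x*C < 0)  <=>  (x < 0)   when C > 0
  //   (x*C < 0)  <=>  (x > 0)   when C < 0   (swapped predicate)
  const int64_t posC = 5, negC = -3;
  for (int64_t x = -100; x <= 100; ++x) {
    assert((x * posC < 0) == (x < 0));
    assert((x * negC < 0) == (x > 0));
  }
  return 0;
}
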
case Instruction::Shl: { // (icmp pred (shl X, ShAmt), CI)
ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
if (!ShAmt) break;
@@ -1294,6 +1355,12 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
ConstantExpr::getLShr(RHS, ShAmt));
+ // If the shift is NSW and we compare to 0, then it is just shifting out
+ // sign bits, no need for an AND either.
+ if (cast<BinaryOperator>(LHSI)->hasNoSignedWrap() && RHSV == 0)
+ return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
+ ConstantExpr::getLShr(RHS, ShAmt));
+
if (LHSI->hasOneUse()) {
// Otherwise strength reduce the shift into an and.
uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
@@ -1308,6 +1375,15 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
}
}
+ // If this is a signed comparison to 0 and the shift is sign preserving,
+ // use the shift LHS operand instead.
+ ICmpInst::Predicate pred = ICI.getPredicate();
+ if (isSignTest(pred, RHS) &&
+ cast<BinaryOperator>(LHSI)->hasNoSignedWrap())
+ return new ICmpInst(pred,
+ LHSI->getOperand(0),
+ Constant::getNullValue(RHS->getType()));
+
// Otherwise, if this is a comparison of the sign bit, simplify to and/test.
bool TrueIfSigned = false;
if (LHSI->hasOneUse() &&
@@ -1321,6 +1397,26 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
And, Constant::getNullValue(And->getType()));
}
+
+ // Transform (icmp pred iM (shl iM %v, N), CI)
+ // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (CI>>N))
+      // Transform the shl to a trunc if (trunc (CI>>N)) loses no bits, and do
+      // the comparison in the narrower type i(M-N).
+      // This enables us to get rid of the shift in favor of a trunc, which can
+      // be free on the target. It has the additional benefit of comparing to a
+      // smaller constant, which tends to be more target friendly.
+ unsigned Amt = ShAmt->getLimitedValue(TypeBits-1);
+ if (LHSI->hasOneUse() &&
+ Amt != 0 && RHSV.countTrailingZeros() >= Amt) {
+ Type *NTy = IntegerType::get(ICI.getContext(), TypeBits - Amt);
+ Constant *NCI = ConstantExpr::getTrunc(
+ ConstantExpr::getAShr(RHS,
+ ConstantInt::get(RHS->getType(), Amt)),
+ NTy);
+ return new ICmpInst(ICI.getPredicate(),
+ Builder->CreateTrunc(LHSI->getOperand(0), NTy),
+ NCI);
+ }
+
break;
}
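
Editor's note: the shl-to-trunc rewrite above is easiest to see on the equality form: when the constant's low N bits are zero, comparing (X << N) against it in the wide type is the same as comparing the low (M-N) bits of X against the constant shifted right by N. A brute-force spot check at i8 with N = 4; this only exercises the equality case of the identity, and the constants are arbitrary:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned N  = 4;      // shift amount
  const uint8_t  CI = 0xA0;   // low N bits are zero, as the fold requires
  for (unsigned x = 0; x <= 0xFF; ++x) {
    uint8_t shifted = uint8_t(x << N);     // icmp eq i8 (shl i8 %x, 4), 0xA0
    uint8_t truncX  = uint8_t(x) & 0x0F;   // trunc %x to i4
    uint8_t truncCI = CI >> N;             // the narrowed constant, 0x0A
    assert((shifted == CI) == (truncX == truncCI));
  }
  return 0;
}
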
@@ -1502,6 +1598,19 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
return new ICmpInst(pred, X, NegX);
}
}
+ break;
+ case Instruction::Mul:
+ if (RHSV == 0 && BO->hasNoSignedWrap()) {
+ if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+ // The trivial case (mul X, 0) is handled by InstSimplify
+ // General case : (mul X, C) != 0 iff X != 0
+ // (mul X, C) == 0 iff X == 0
+ if (!BOC->isZero())
+ return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+ Constant::getNullValue(RHS->getType()));
+ }
+ }
+ break;
default: break;
}
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHSI)) {
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 4d106fc..337cfe3 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -12,12 +12,12 @@
//===----------------------------------------------------------------------===//
#include "InstCombine.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/DataLayout.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
STATISTIC(NumDeadStore, "Number of dead stores eliminated");
@@ -150,26 +150,6 @@ isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
return 0;
}
-/// getPointeeAlignment - Compute the minimum alignment of the value pointed
-/// to by the given pointer.
-static unsigned getPointeeAlignment(Value *V, const DataLayout &TD) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- if (CE->getOpcode() == Instruction::BitCast ||
- (CE->getOpcode() == Instruction::GetElementPtr &&
- cast<GEPOperator>(CE)->hasAllZeroIndices()))
- return getPointeeAlignment(CE->getOperand(0), TD);
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
- if (!GV->isDeclaration())
- return TD.getPreferredAlignment(GV);
-
- if (PointerType *PT = dyn_cast<PointerType>(V->getType()))
- if (PT->getElementType()->isSized())
- return TD.getABITypeAlignment(PT->getElementType());
-
- return 0;
-}
-
Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// Ensure that the alloca array size argument has type intptr_t, so that
// any casting is exposed early.
@@ -265,7 +245,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
}
}
- if (TD) {
+ if (AI.getAlignment()) {
// Check to see if this allocation is only modified by a memcpy/memmove from
// a constant global whose alignment is equal to or exceeds that of the
// allocation. If this is the case, we can change all users to use
@@ -274,7 +254,9 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// is only subsequently read.
SmallVector<Instruction *, 4> ToDelete;
if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
- if (AI.getAlignment() <= getPointeeAlignment(Copy->getSource(), *TD)) {
+ unsigned SourceAlign = getOrEnforceKnownAlignment(Copy->getSource(),
+ AI.getAlignment(), TD);
+ if (AI.getAlignment() <= SourceAlign) {
DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
@@ -820,6 +802,13 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
InsertNewInstBefore(NewSI, *BBI);
NewSI->setDebugLoc(OtherStore->getDebugLoc());
+ // If the two stores had the same TBAA tag, preserve it.
+ if (MDNode *TBAATag = SI.getMetadata(LLVMContext::MD_tbaa))
+ if ((TBAATag = MDNode::getMostGenericTBAA(TBAATag,
+ OtherStore->getMetadata(LLVMContext::MD_tbaa))))
+ NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+
// Nuke the old stores.
EraseInstFromFunction(SI);
EraseInstFromFunction(*OtherStore);
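
Editor's note: the added lines keep TBAA information on the merged store by replacing the two tags with their most generic common tag. Conceptually that is a walk up the TBAA type tree to the nearest type both accesses agree on, dropping the tag if they share nothing. The sketch below models that idea with a toy tree of strings; the type names and the tree are invented for illustration and are not LLVM's metadata representation:

#include <cassert>
#include <map>
#include <string>
#include <vector>

// Toy TBAA-like hierarchy: each type names its parent ("" marks the root).
static const std::map<std::string, std::string> parentOf = {
    {"omnipotent char", ""},
    {"int", "omnipotent char"},
    {"float", "omnipotent char"},
};

static std::vector<std::string> pathToRoot(std::string t) {
  std::vector<std::string> path;
  while (!t.empty()) {
    path.push_back(t);
    t = parentOf.at(t);
  }
  return path;
}

// "Most generic" tag of two accesses: the deepest type on both root paths,
// or the empty string (meaning: drop the tag) if the paths share nothing.
static std::string mostGeneric(const std::string &a, const std::string &b) {
  std::vector<std::string> pa = pathToRoot(a);
  for (const std::string &t : pathToRoot(b))
    for (const std::string &u : pa)
      if (t == u)
        return t;
  return "";
}

int main() {
  assert(mostGeneric("int", "int") == "int");               // same tag kept
  assert(mostGeneric("int", "float") == "omnipotent char"); // widened to char
  return 0;
}
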
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index cefe45e..173f2bf 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -13,8 +13,8 @@
//===----------------------------------------------------------------------===//
#include "InstCombine.h"
-#include "llvm/IntrinsicInst.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/PatternMatch.h"
using namespace llvm;
using namespace PatternMatch;
@@ -37,7 +37,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(PowerOf2), m_Value(A))),
m_Value(B))) &&
// The "1" can be any value known to be a power of 2.
- isPowerOfTwo(PowerOf2, IC.getDataLayout())) {
+ isKnownToBeAPowerOfTwo(PowerOf2)) {
A = IC.Builder->CreateSub(A, B);
return IC.Builder->CreateShl(PowerOf2, A);
}
@@ -45,8 +45,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
// (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
// inexact. Similarly for <<.
if (BinaryOperator *I = dyn_cast<BinaryOperator>(V))
- if (I->isLogicalShift() &&
- isPowerOfTwo(I->getOperand(0), IC.getDataLayout())) {
+ if (I->isLogicalShift() && isKnownToBeAPowerOfTwo(I->getOperand(0))) {
// We know that this is an exact/nuw shift and that the input is a
// non-zero context as well.
if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC)) {
@@ -252,24 +251,136 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
return Changed ? &I : 0;
}
+//
+// Detect pattern:
+//
+// log2(Y*0.5)
+//
+// And check for corresponding fast math flags
+//
+
+static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
+
+ if (!Op->hasOneUse())
+ return;
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op);
+ if (!II)
+ return;
+ if (II->getIntrinsicID() != Intrinsic::log2 || !II->hasUnsafeAlgebra())
+ return;
+ Log2 = II;
+
+ Value *OpLog2Of = II->getArgOperand(0);
+ if (!OpLog2Of->hasOneUse())
+ return;
+
+ Instruction *I = dyn_cast<Instruction>(OpLog2Of);
+ if (!I)
+ return;
+ if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
+ return;
+
+ ConstantFP *CFP = dyn_cast<ConstantFP>(I->getOperand(0));
+ if (CFP && CFP->isExactlyValue(0.5)) {
+ Y = I->getOperand(1);
+ return;
+ }
+ CFP = dyn_cast<ConstantFP>(I->getOperand(1));
+ if (CFP && CFP->isExactlyValue(0.5))
+ Y = I->getOperand(0);
+}
+
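
Editor's note: detectLog2OfHalf feeds the fold used further down, X * log2(0.5*Y) -> X*log2(Y) - X, which is just the identity log2(0.5*Y) = log2(Y) - 1 multiplied through by X; it is gated on unsafe algebra because reassociating floating point changes rounding. A small numeric sanity check of the identity, with an approximate comparison on purpose and arbitrary sample values:

#include <cassert>
#include <cmath>

int main() {
  const double X = 3.7, Ys[] = {0.125, 1.0, 2.5, 1024.0};
  for (double Y : Ys) {
    double original  = X * std::log2(0.5 * Y);  // what the IR computed before
    double rewritten = X * std::log2(Y) - X;    // what the fold emits
    // Equal up to rounding error, which is why fast-math flags are required.
    assert(std::fabs(original - rewritten) <=
           1e-9 * (1.0 + std::fabs(original)));
  }
  return 0;
}
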
+/// Helper function of InstCombiner::visitFMul(BinaryOperator &I). It returns
+/// true iff the given value is FMul or FDiv with one and only one operand
+/// being a normal constant (i.e. not Zero/NaN/Infinity).
+static bool isFMulOrFDivWithConstant(Value *V) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || (I->getOpcode() != Instruction::FMul &&
+ I->getOpcode() != Instruction::FDiv))
+ return false;
+
+ ConstantFP *C0 = dyn_cast<ConstantFP>(I->getOperand(0));
+ ConstantFP *C1 = dyn_cast<ConstantFP>(I->getOperand(1));
+
+ if (C0 && C1)
+ return false;
+
+ return (C0 && C0->getValueAPF().isNormal()) ||
+ (C1 && C1->getValueAPF().isNormal());
+}
+
+static bool isNormalFp(const ConstantFP *C) {
+ const APFloat &Flt = C->getValueAPF();
+ return Flt.isNormal() && !Flt.isDenormal();
+}
+
+/// foldFMulConst() is a helper routine of InstCombiner::visitFMul().
+/// The input \p FMulOrDiv is a FMul/FDiv with one and only one operand
+/// being a constant (i.e. isFMulOrFDivWithConstant(FMulOrDiv) == true).
+/// This function is to simplify "FMulOrDiv * C" and returns the
+/// resulting expression. Note that this function could return NULL in
+/// case the constants cannot be folded into a normal floating-point value.
+///
+Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C,
+ Instruction *InsertBefore) {
+ assert(isFMulOrFDivWithConstant(FMulOrDiv) && "V is invalid");
+
+ Value *Opnd0 = FMulOrDiv->getOperand(0);
+ Value *Opnd1 = FMulOrDiv->getOperand(1);
+
+ ConstantFP *C0 = dyn_cast<ConstantFP>(Opnd0);
+ ConstantFP *C1 = dyn_cast<ConstantFP>(Opnd1);
+
+ BinaryOperator *R = 0;
+
+ // (X * C0) * C => X * (C0*C)
+ if (FMulOrDiv->getOpcode() == Instruction::FMul) {
+ Constant *F = ConstantExpr::getFMul(C1 ? C1 : C0, C);
+ if (isNormalFp(cast<ConstantFP>(F)))
+ R = BinaryOperator::CreateFMul(C1 ? Opnd0 : Opnd1, F);
+ } else {
+ if (C0) {
+ // (C0 / X) * C => (C0 * C) / X
+ ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C));
+ if (isNormalFp(F))
+ R = BinaryOperator::CreateFDiv(F, Opnd1);
+ } else {
+ // (X / C1) * C => X * (C/C1) if C/C1 is not a denormal
+ ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFDiv(C, C1));
+ if (isNormalFp(F)) {
+ R = BinaryOperator::CreateFMul(Opnd0, F);
+ } else {
+ // (X / C1) * C => X / (C1/C)
+ Constant *F = ConstantExpr::getFDiv(C1, C);
+ if (isNormalFp(cast<ConstantFP>(F)))
+ R = BinaryOperator::CreateFDiv(Opnd0, F);
+ }
+ }
+ }
+
+ if (R) {
+ R->setHasUnsafeAlgebra(true);
+ InsertNewInstWith(R, *InsertBefore);
+ }
+
+ return R;
+}
+
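
Editor's note: foldFMulConst applies the reassociations (X*C0)*C -> X*(C0*C), (C0/X)*C -> (C0*C)/X and (X/C1)*C -> X*(C/C1) or X/(C1/C), refusing any fold whose folded constant is not a normal number. With power-of-two constants every one of those constant folds is exact, which makes the identities easy to spot check. The snippet below is a sketch of the arithmetic, not of the InstCombiner routine, and the constants are arbitrary powers of two:

#include <cassert>

int main() {
  const double C = 4.0, C0 = 0.5, C1 = 8.0;  // powers of two: exact folds
  for (double X = -3.0; X <= 3.0; X += 0.25) {
    if (X == 0.0)
      continue;                              // keep the divisions well defined
    assert((X * C0) * C == X * (C0 * C));    // (X*C0)*C => X*(C0*C)
    assert((C0 / X) * C == (C0 * C) / X);    // (C0/X)*C => (C0*C)/X
    assert((X / C1) * C == X * (C / C1));    // (X/C1)*C => X*(C/C1)
  }
  return 0;
}
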
Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- // Simplify mul instructions with a constant RHS.
- if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
- if (ConstantFP *Op1F = dyn_cast<ConstantFP>(Op1C)) {
- // "In IEEE floating point, x*1 is not equivalent to x for nans. However,
- // ANSI says we can drop signals, so we can do this anyway." (from GCC)
- if (Op1F->isExactlyValue(1.0))
- return ReplaceInstUsesWith(I, Op0); // Eliminate 'fmul double %X, 1.0'
- } else if (ConstantDataVector *Op1V = dyn_cast<ConstantDataVector>(Op1C)) {
- // As above, vector X*splat(1.0) -> X in all defined cases.
- if (ConstantFP *F = dyn_cast_or_null<ConstantFP>(Op1V->getSplatValue()))
- if (F->isExactlyValue(1.0))
- return ReplaceInstUsesWith(I, Op0);
- }
+ if (isa<Constant>(Op0))
+ std::swap(Op0, Op1);
+
+ if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), TD))
+ return ReplaceInstUsesWith(I, V);
+
+ bool AllowReassociate = I.hasUnsafeAlgebra();
+ // Simplify mul instructions with a constant RHS.
+ if (isa<Constant>(Op1)) {
// Try to fold constant mul into select arguments.
if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
if (Instruction *R = FoldOpIntoSelect(I, SI))
@@ -278,11 +389,146 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
if (isa<PHINode>(Op0))
if (Instruction *NV = FoldOpIntoPhi(I))
return NV;
+
+ ConstantFP *C = dyn_cast<ConstantFP>(Op1);
+ if (C && AllowReassociate && C->getValueAPF().isNormal()) {
+ // Let MDC denote an expression in one of these forms:
+ // X * C, C/X, X/C, where C is a constant.
+ //
+ // Try to simplify "MDC * Constant"
+ if (isFMulOrFDivWithConstant(Op0)) {
+ Value *V = foldFMulConst(cast<Instruction>(Op0), C, &I);
+ if (V)
+ return ReplaceInstUsesWith(I, V);
+ }
+
+ // (MDC +/- C1) * C => (MDC * C) +/- (C1 * C)
+ Instruction *FAddSub = dyn_cast<Instruction>(Op0);
+ if (FAddSub &&
+ (FAddSub->getOpcode() == Instruction::FAdd ||
+ FAddSub->getOpcode() == Instruction::FSub)) {
+ Value *Opnd0 = FAddSub->getOperand(0);
+ Value *Opnd1 = FAddSub->getOperand(1);
+ ConstantFP *C0 = dyn_cast<ConstantFP>(Opnd0);
+ ConstantFP *C1 = dyn_cast<ConstantFP>(Opnd1);
+ bool Swap = false;
+ if (C0) {
+ std::swap(C0, C1);
+ std::swap(Opnd0, Opnd1);
+ Swap = true;
+ }
+
+ if (C1 && C1->getValueAPF().isNormal() &&
+ isFMulOrFDivWithConstant(Opnd0)) {
+ Value *M1 = ConstantExpr::getFMul(C1, C);
+ Value *M0 = isNormalFp(cast<ConstantFP>(M1)) ?
+ foldFMulConst(cast<Instruction>(Opnd0), C, &I) :
+ 0;
+ if (M0 && M1) {
+ if (Swap && FAddSub->getOpcode() == Instruction::FSub)
+ std::swap(M0, M1);
+
+ Value *R = (FAddSub->getOpcode() == Instruction::FAdd) ?
+ BinaryOperator::CreateFAdd(M0, M1) :
+ BinaryOperator::CreateFSub(M0, M1);
+ Instruction *RI = cast<Instruction>(R);
+ RI->copyFastMathFlags(&I);
+ return RI;
+ }
+ }
+ }
+ }
+ }
+
+
+ // Under unsafe algebra do:
+ // X * log2(0.5*Y) = X*log2(Y) - X
+ if (I.hasUnsafeAlgebra()) {
+ Value *OpX = NULL;
+ Value *OpY = NULL;
+ IntrinsicInst *Log2;
+ detectLog2OfHalf(Op0, OpY, Log2);
+ if (OpY) {
+ OpX = Op1;
+ } else {
+ detectLog2OfHalf(Op1, OpY, Log2);
+ if (OpY) {
+ OpX = Op0;
+ }
+ }
+    // If the pattern was detected, emit the alternate sequence.
+ if (OpX && OpY) {
+ Log2->setArgOperand(0, OpY);
+ Value *FMulVal = Builder->CreateFMul(OpX, Log2);
+ Instruction *FMul = cast<Instruction>(FMulVal);
+ FMul->copyFastMathFlags(Log2);
+ Instruction *FSub = BinaryOperator::CreateFSub(FMulVal, OpX);
+ FSub->copyFastMathFlags(Log2);
+ return FSub;
+ }
}
- if (Value *Op0v = dyn_castFNegVal(Op0)) // -X * -Y = X*Y
- if (Value *Op1v = dyn_castFNegVal(Op1))
- return BinaryOperator::CreateFMul(Op0v, Op1v);
+ // Handle symmetric situation in a 2-iteration loop
+ Value *Opnd0 = Op0;
+ Value *Opnd1 = Op1;
+ for (int i = 0; i < 2; i++) {
+ bool IgnoreZeroSign = I.hasNoSignedZeros();
+ if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) {
+ Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign);
+ Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign);
+
+ // -X * -Y => X*Y
+ if (N1)
+ return BinaryOperator::CreateFMul(N0, N1);
+
+ if (Opnd0->hasOneUse()) {
+ // -X * Y => -(X*Y) (Promote negation as high as possible)
+ Value *T = Builder->CreateFMul(N0, Opnd1);
+ cast<Instruction>(T)->setDebugLoc(I.getDebugLoc());
+ Instruction *Neg = BinaryOperator::CreateFNeg(T);
+ if (I.getFastMathFlags().any()) {
+ cast<Instruction>(T)->copyFastMathFlags(&I);
+ Neg->copyFastMathFlags(&I);
+ }
+ return Neg;
+ }
+ }
+
+ // (X*Y) * X => (X*X) * Y where Y != X
+ // The purpose is two-fold:
+ // 1) to form a power expression (of X).
+ // 2) potentially shorten the critical path: After transformation, the
+ // latency of the instruction Y is amortized by the expression of X*X,
+ // and therefore Y is in a "less critical" position compared to what it
+ // was before the transformation.
+ //
+ if (AllowReassociate) {
+ Value *Opnd0_0, *Opnd0_1;
+ if (Opnd0->hasOneUse() &&
+ match(Opnd0, m_FMul(m_Value(Opnd0_0), m_Value(Opnd0_1)))) {
+ Value *Y = 0;
+ if (Opnd0_0 == Opnd1 && Opnd0_1 != Opnd1)
+ Y = Opnd0_1;
+ else if (Opnd0_1 == Opnd1 && Opnd0_0 != Opnd1)
+ Y = Opnd0_0;
+
+ if (Y) {
+ Instruction *T = cast<Instruction>(Builder->CreateFMul(Opnd1, Opnd1));
+ T->copyFastMathFlags(&I);
+ T->setDebugLoc(I.getDebugLoc());
+
+ Instruction *R = BinaryOperator::CreateFMul(T, Y);
+ R->copyFastMathFlags(&I);
+ return R;
+ }
+ }
+ }
+
+ if (!isa<Constant>(Op1))
+ std::swap(Opnd0, Opnd1);
+ else
+ break;
+ }
return Changed ? &I : 0;
}
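
Editor's note: the (X*Y) * X -> (X*X) * Y rewrite in the loop above is pure reassociation: it forms the power X*X and moves Y off the critical path, and it is only legal under the instruction's fast-math flags because the two association orders can round differently. A tiny numeric illustration of both the identity and the rounding caveat, with arbitrary sample values:

#include <cassert>
#include <cmath>

int main() {
  const double X = 0.1, Y = 0.7;
  double lhs = (X * Y) * X;  // original association
  double rhs = (X * X) * Y;  // what the fold produces
  // Mathematically identical, so the results agree to rounding error...
  assert(std::fabs(lhs - rhs) <= 1e-12 * std::fabs(lhs));
  // ...but they need not be bit-identical, which is why the transform is
  // gated on AllowReassociate / unsafe algebra.
  return 0;
}
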
@@ -567,21 +813,140 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
return 0;
}
+/// CvtFDivConstToReciprocal tries to convert X/C into X*1/C if C is not a
+/// special FP value and:
+/// 1) 1/C is exact, or
+/// 2) reciprocal is allowed.
+/// If the conversion was successful, the simplified expression "X * 1/C" is
+/// returned; otherwise, NULL is returned.
+///
+static Instruction *CvtFDivConstToReciprocal(Value *Dividend,
+ ConstantFP *Divisor,
+ bool AllowReciprocal) {
+ const APFloat &FpVal = Divisor->getValueAPF();
+ APFloat Reciprocal(FpVal.getSemantics());
+ bool Cvt = FpVal.getExactInverse(&Reciprocal);
+
+ if (!Cvt && AllowReciprocal && FpVal.isNormal()) {
+ Reciprocal = APFloat(FpVal.getSemantics(), 1.0f);
+ (void)Reciprocal.divide(FpVal, APFloat::rmNearestTiesToEven);
+ Cvt = !Reciprocal.isDenormal();
+ }
+
+ if (!Cvt)
+ return 0;
+
+ ConstantFP *R;
+ R = ConstantFP::get(Dividend->getType()->getContext(), Reciprocal);
+ return BinaryOperator::CreateFMul(Dividend, R);
+}
+
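
Editor's note: the reciprocal conversion above is unconditionally safe when 1/C is exactly representable, for example when C is a power of two; otherwise it is only attempted when reciprocal math is allowed, because X * (1/C) can differ from X / C by a rounding error. A short check of both halves of that statement with hand-picked constants:

#include <cassert>
#include <cmath>

int main() {
  // Exact case: 1/8 is representable, so x/8 and x*0.125 agree bit for bit.
  for (float x = -10.0f; x <= 10.0f; x += 0.37f)
    assert(x / 8.0f == x * 0.125f);

  // Inexact case: 1/3 is not representable, so the two forms may differ by an
  // ulp or so, which is why this path is guarded by the allow-reciprocal flag.
  float y = 7.0f;
  float viaDiv = y / 3.0f;
  float viaMul = y * (1.0f / 3.0f);
  assert(std::fabs(viaDiv - viaMul) <= 1e-6f * std::fabs(viaDiv));
  return 0;
}
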
Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V = SimplifyFDivInst(Op0, Op1, TD))
return ReplaceInstUsesWith(I, V);
+ bool AllowReassociate = I.hasUnsafeAlgebra();
+ bool AllowReciprocal = I.hasAllowReciprocal();
+
if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
- const APFloat &Op1F = Op1C->getValueAPF();
-
- // If the divisor has an exact multiplicative inverse we can turn the fdiv
- // into a cheaper fmul.
- APFloat Reciprocal(Op1F.getSemantics());
- if (Op1F.getExactInverse(&Reciprocal)) {
- ConstantFP *RFP = ConstantFP::get(Builder->getContext(), Reciprocal);
- return BinaryOperator::CreateFMul(Op0, RFP);
+ if (AllowReassociate) {
+ ConstantFP *C1 = 0;
+ ConstantFP *C2 = Op1C;
+ Value *X;
+ Instruction *Res = 0;
+
+ if (match(Op0, m_FMul(m_Value(X), m_ConstantFP(C1)))) {
+ // (X*C1)/C2 => X * (C1/C2)
+ //
+ Constant *C = ConstantExpr::getFDiv(C1, C2);
+ const APFloat &F = cast<ConstantFP>(C)->getValueAPF();
+ if (F.isNormal() && !F.isDenormal())
+ Res = BinaryOperator::CreateFMul(X, C);
+ } else if (match(Op0, m_FDiv(m_Value(X), m_ConstantFP(C1)))) {
+ // (X/C1)/C2 => X /(C2*C1) [=> X * 1/(C2*C1) if reciprocal is allowed]
+ //
+ Constant *C = ConstantExpr::getFMul(C1, C2);
+ const APFloat &F = cast<ConstantFP>(C)->getValueAPF();
+ if (F.isNormal() && !F.isDenormal()) {
+ Res = CvtFDivConstToReciprocal(X, cast<ConstantFP>(C),
+ AllowReciprocal);
+ if (!Res)
+ Res = BinaryOperator::CreateFDiv(X, C);
+ }
+ }
+
+ if (Res) {
+ Res->setFastMathFlags(I.getFastMathFlags());
+ return Res;
+ }
+ }
+
+ // X / C => X * 1/C
+ if (Instruction *T = CvtFDivConstToReciprocal(Op0, Op1C, AllowReciprocal))
+ return T;
+
+ return 0;
+ }
+
+ if (AllowReassociate && isa<ConstantFP>(Op0)) {
+ ConstantFP *C1 = cast<ConstantFP>(Op0), *C2;
+ Constant *Fold = 0;
+ Value *X;
+ bool CreateDiv = true;
+
+ // C1 / (X*C2) => (C1/C2) / X
+ if (match(Op1, m_FMul(m_Value(X), m_ConstantFP(C2))))
+ Fold = ConstantExpr::getFDiv(C1, C2);
+ else if (match(Op1, m_FDiv(m_Value(X), m_ConstantFP(C2)))) {
+ // C1 / (X/C2) => (C1*C2) / X
+ Fold = ConstantExpr::getFMul(C1, C2);
+ } else if (match(Op1, m_FDiv(m_ConstantFP(C2), m_Value(X)))) {
+ // C1 / (C2/X) => (C1/C2) * X
+ Fold = ConstantExpr::getFDiv(C1, C2);
+ CreateDiv = false;
+ }
+
+ if (Fold) {
+ const APFloat &FoldC = cast<ConstantFP>(Fold)->getValueAPF();
+ if (FoldC.isNormal() && !FoldC.isDenormal()) {
+ Instruction *R = CreateDiv ?
+ BinaryOperator::CreateFDiv(Fold, X) :
+ BinaryOperator::CreateFMul(X, Fold);
+ R->setFastMathFlags(I.getFastMathFlags());
+ return R;
+ }
+ }
+ return 0;
+ }
+
+ if (AllowReassociate) {
+ Value *X, *Y;
+ Value *NewInst = 0;
+ Instruction *SimpR = 0;
+
+ if (Op0->hasOneUse() && match(Op0, m_FDiv(m_Value(X), m_Value(Y)))) {
+ // (X/Y) / Z => X / (Y*Z)
+ //
+ if (!isa<ConstantFP>(Y) || !isa<ConstantFP>(Op1)) {
+ NewInst = Builder->CreateFMul(Y, Op1);
+ SimpR = BinaryOperator::CreateFDiv(X, NewInst);
+ }
+ } else if (Op1->hasOneUse() && match(Op1, m_FDiv(m_Value(X), m_Value(Y)))) {
+ // Z / (X/Y) => Z*Y / X
+ //
+ if (!isa<ConstantFP>(Y) || !isa<ConstantFP>(Op0)) {
+ NewInst = Builder->CreateFMul(Op0, Y);
+ SimpR = BinaryOperator::CreateFDiv(NewInst, X);
+ }
+ }
+
+ if (NewInst) {
+ if (Instruction *T = dyn_cast<Instruction>(NewInst))
+ T->setDebugLoc(I.getDebugLoc());
+ SimpR->setFastMathFlags(I.getFastMathFlags());
+ return SimpR;
}
}
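
Editor's note: the last two cases reassociate nested divisions, (X/Y)/Z -> X/(Y*Z) and Z/(X/Y) -> (Z*Y)/X, trading one of the divisions for a multiplication, and again only fire under unsafe algebra since the rounding differs. A small numeric illustration with arbitrary operands:

#include <cassert>
#include <cmath>

static bool roughlyEqual(double a, double b) {
  return std::fabs(a - b) <= 1e-12 * (std::fabs(a) + std::fabs(b) + 1.0);
}

int main() {
  const double X = 7.5, Y = 3.2, Z = 1.7;
  // (X/Y)/Z => X/(Y*Z): one fdiv instead of two after the rewrite.
  assert(roughlyEqual((X / Y) / Z, X / (Y * Z)));
  // Z/(X/Y) => (Z*Y)/X: likewise replaces a division with a multiplication.
  assert(roughlyEqual(Z / (X / Y), (Z * Y) / X));
  return 0;
}
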
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index de9c77e..b0a998c 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -12,10 +12,10 @@
//===----------------------------------------------------------------------===//
#include "InstCombine.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/DataLayout.h"
-#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/DataLayout.h"
using namespace llvm;
/// FoldPHIArgBinOpIntoPHI - If we have something like phi [add (a,b), add(a,c)]
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index a2d4c88..121aa1f 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -12,9 +12,9 @@
//===----------------------------------------------------------------------===//
#include "InstCombine.h"
-#include "llvm/Support/PatternMatch.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Support/PatternMatch.h"
using namespace llvm;
using namespace PatternMatch;
@@ -127,13 +127,14 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
// If this is a non-volatile load or a cast from the same type,
// merge.
if (TI->isCast()) {
- if (TI->getOperand(0)->getType() != FI->getOperand(0)->getType())
+ Type *FIOpndTy = FI->getOperand(0)->getType();
+ if (TI->getOperand(0)->getType() != FIOpndTy)
return 0;
// The select condition may be a vector. We may only change the operand
// type if the vector width remains the same (and matches the condition).
Type *CondTy = SI.getCondition()->getType();
- if (CondTy->isVectorTy() && CondTy->getVectorNumElements() !=
- FI->getOperand(0)->getType()->getVectorNumElements())
+ if (CondTy->isVectorTy() && (!FIOpndTy->isVectorTy() ||
+ CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements()))
return 0;
} else {
return 0; // unknown unary op.
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 57021f1..8cf76e5 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -12,9 +12,9 @@
//===----------------------------------------------------------------------===//
#include "InstCombine.h"
-#include "llvm/IntrinsicInst.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/PatternMatch.h"
using namespace llvm;
using namespace PatternMatch;
@@ -49,7 +49,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
I.setOperand(1, Rem);
return &I;
}
-
+
return 0;
}
@@ -70,10 +70,10 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
// We can always evaluate constants shifted.
if (isa<Constant>(V))
return true;
-
+
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return false;
-
+
// If this is the opposite shift, we can directly reuse the input of the shift
// if the needed bits are already zero in the input. This allows us to reuse
// the value which means that we don't care if the shift has multiple uses.
@@ -95,14 +95,14 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
return CanEvaluateTruncated(I->getOperand(0), Ty);
}
#endif
-
+
}
}
-
+
// We can't mutate something that has multiple uses: doing so would
// require duplicating the instruction in general, which isn't profitable.
if (!I->hasOneUse()) return false;
-
+
switch (I->getOpcode()) {
default: return false;
case Instruction::And:
@@ -111,7 +111,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
// Bitwise operators can all arbitrarily be arbitrarily evaluated shifted.
return CanEvaluateShifted(I->getOperand(0), NumBits, isLeftShift, IC) &&
CanEvaluateShifted(I->getOperand(1), NumBits, isLeftShift, IC);
-
+
case Instruction::Shl: {
// We can often fold the shift into shifts-by-a-constant.
CI = dyn_cast<ConstantInt>(I->getOperand(1));
@@ -119,10 +119,10 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
// We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
if (isLeftShift) return true;
-
+
// We can always turn shl(c)+shr(c) -> and(c2).
if (CI->getValue() == NumBits) return true;
-
+
unsigned TypeWidth = I->getType()->getScalarSizeInBits();
// We can turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but it isn't
@@ -133,20 +133,20 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits))
return true;
}
-
+
return false;
}
case Instruction::LShr: {
// We can often fold the shift into shifts-by-a-constant.
CI = dyn_cast<ConstantInt>(I->getOperand(1));
if (CI == 0) return false;
-
+
// We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
if (!isLeftShift) return true;
-
+
// We can always turn lshr(c)+shl(c) -> and(c2).
if (CI->getValue() == NumBits) return true;
-
+
unsigned TypeWidth = I->getType()->getScalarSizeInBits();
// We can always turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but it isn't
@@ -157,7 +157,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits))
return true;
}
-
+
return false;
}
case Instruction::Select: {
@@ -175,7 +175,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
return false;
return true;
}
- }
+ }
}
/// GetShiftedValue - When CanEvaluateShifted returned true for an expression,
@@ -194,7 +194,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
IC.getTargetLibraryInfo());
return V;
}
-
+
Instruction *I = cast<Instruction>(V);
IC.Worklist.Add(I);
@@ -207,7 +207,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
I->setOperand(0, GetShiftedValue(I->getOperand(0), NumBits,isLeftShift,IC));
I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
return I;
-
+
case Instruction::Shl: {
BinaryOperator *BO = cast<BinaryOperator>(I);
unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
@@ -227,7 +227,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
BO->setHasNoSignedWrap(false);
return I;
}
-
+
// We turn shl(c)+lshr(c) -> and(c2) if the input doesn't already have
// zeros.
if (CI->getValue() == NumBits) {
@@ -240,7 +240,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
}
return V;
}
-
+
// We turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but only when we know that
// the and won't be needed.
assert(CI->getZExtValue() > NumBits);
@@ -255,19 +255,19 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
// We only accept shifts-by-a-constant in CanEvaluateShifted.
ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
-
+
// We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
if (!isLeftShift) {
// If this is oversized composite shift, then unsigned shifts get 0.
unsigned NewShAmt = NumBits+CI->getZExtValue();
if (NewShAmt >= TypeWidth)
return Constant::getNullValue(BO->getType());
-
+
BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt));
BO->setIsExact(false);
return I;
}
-
+
// We turn lshr(c)+shl(c) -> and(c2) if the input doesn't already have
// zeros.
if (CI->getValue() == NumBits) {
@@ -280,7 +280,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
}
return V;
}
-
+
// We turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but only when we know that
// the and won't be needed.
assert(CI->getZExtValue() > NumBits);
@@ -289,7 +289,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
BO->setIsExact(false);
return BO;
}
-
+
case Instruction::Select:
I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
I->setOperand(2, GetShiftedValue(I->getOperand(2), NumBits,isLeftShift,IC));
@@ -304,7 +304,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
NumBits, isLeftShift, IC));
return PN;
}
- }
+ }
}
@@ -312,24 +312,24 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
BinaryOperator &I) {
bool isLeftShift = I.getOpcode() == Instruction::Shl;
-
-
+
+
// See if we can propagate this shift into the input, this covers the trivial
// cast of lshr(shl(x,c1),c2) as well as other more complex cases.
if (I.getOpcode() != Instruction::AShr &&
CanEvaluateShifted(Op0, Op1->getZExtValue(), isLeftShift, *this)) {
DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression"
" to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n");
-
- return ReplaceInstUsesWith(I,
+
+ return ReplaceInstUsesWith(I,
GetShiftedValue(Op0, Op1->getZExtValue(), isLeftShift, *this));
}
-
-
- // See if we can simplify any instructions used by the instruction whose sole
+
+
+ // See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
uint32_t TypeBits = Op0->getType()->getScalarSizeInBits();
-
+
// shl i32 X, 32 = 0 and srl i8 Y, 9 = 0, ... just don't eliminate
// a signed shift.
//
@@ -340,14 +340,14 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
I.setOperand(1, ConstantInt::get(I.getType(), TypeBits-1));
return &I;
}
-
+
// ((X*C1) << C2) == (X * (C1 << C2))
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0))
if (BO->getOpcode() == Instruction::Mul && isLeftShift)
if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1)))
return BinaryOperator::CreateMul(BO->getOperand(0),
ConstantExpr::getShl(BOOp, Op1));
-
+
// Try to fold constant and into select arguments.
if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
if (Instruction *R = FoldOpIntoSelect(I, SI))
@@ -355,7 +355,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
if (isa<PHINode>(Op0))
if (Instruction *NV = FoldOpIntoPhi(I))
return NV;
-
+
// Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) {
Instruction *TrOp = dyn_cast<Instruction>(TI->getOperand(0));
@@ -364,7 +364,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
// require that the input operand is a shift-by-constant so that we have
// confidence that the shifts will get folded together. We could do this
// xform in more cases, but it is unlikely to be profitable.
- if (TrOp && I.isLogicalShift() && TrOp->isShift() &&
+ if (TrOp && I.isLogicalShift() && TrOp->isShift() &&
isa<ConstantInt>(TrOp->getOperand(1))) {
// Okay, we'll do this xform. Make the shift of shift.
Constant *ShAmt = ConstantExpr::getZExt(Op1, TrOp->getType());
@@ -378,7 +378,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
unsigned SrcSize = TrOp->getType()->getScalarSizeInBits();
unsigned DstSize = TI->getType()->getScalarSizeInBits();
APInt MaskV(APInt::getLowBitsSet(SrcSize, DstSize));
-
+
// The mask we constructed says what the trunc would do if occurring
// between the shifts. We want to know the effect *after* the second
// shift. We know that it is a logical shift by a constant, so adjust the
@@ -399,7 +399,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
return new TruncInst(And, I.getType());
}
}
-
+
if (Op0->hasOneUse()) {
if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
// Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
@@ -425,14 +425,13 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(),
APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
}
-
+
// Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C))
Value *Op0BOOp1 = Op0BO->getOperand(1);
if (isLeftShift && Op0BOOp1->hasOneUse() &&
- match(Op0BOOp1,
- m_And(m_Shr(m_Value(V1), m_Specific(Op1)),
- m_ConstantInt(CC))) &&
- cast<BinaryOperator>(Op0BOOp1)->getOperand(0)->hasOneUse()) {
+ match(Op0BOOp1,
+ m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))),
+ m_ConstantInt(CC)))) {
Value *YS = // (Y << C)
Builder->CreateShl(Op0BO->getOperand(0), Op1,
Op0BO->getName());
@@ -442,7 +441,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
}
}
-
+
// FALL THROUGH.
case Instruction::Sub: {
// Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
@@ -458,34 +457,32 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(),
APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
}
-
+
// Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C)
if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
match(Op0BO->getOperand(0),
- m_And(m_Shr(m_Value(V1), m_Value(V2)),
- m_ConstantInt(CC))) && V2 == Op1 &&
- cast<BinaryOperator>(Op0BO->getOperand(0))
- ->getOperand(0)->hasOneUse()) {
+ m_And(m_OneUse(m_Shr(m_Value(V1), m_Value(V2))),
+ m_ConstantInt(CC))) && V2 == Op1) {
Value *YS = // (Y << C)
Builder->CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
// X & (CC << C)
Value *XM = Builder->CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
V1->getName()+".mask");
-
+
return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
}
-
+
break;
}
}
-
-
+
+
    // If the operand is a bitwise operator with a constant RHS, and the
// shift is the only use, we can pull it out of the shift.
if (ConstantInt *Op0C = dyn_cast<ConstantInt>(Op0BO->getOperand(1))) {
bool isValid = true; // Valid only for And, Or, Xor
bool highBitSet = false; // Transform if high bit of constant set?
-
+
switch (Op0BO->getOpcode()) {
default: isValid = false; break; // Do not perform transform!
case Instruction::Add:
@@ -499,7 +496,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
highBitSet = true;
break;
}
-
+
// If this is a signed shift right, and the high bit is modified
// by the logical operation, do not perform the transformation.
// The highBitSet boolean indicates the value of the high bit of
@@ -508,26 +505,26 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
//
if (isValid && I.getOpcode() == Instruction::AShr)
isValid = Op0C->getValue()[TypeBits-1] == highBitSet;
-
+
if (isValid) {
Constant *NewRHS = ConstantExpr::get(I.getOpcode(), Op0C, Op1);
-
+
Value *NewShift =
Builder->CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1);
NewShift->takeName(Op0BO);
-
+
return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
NewRHS);
}
}
}
}
-
+
// Find out if this is a shift of a shift by a constant.
BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0);
if (ShiftOp && !ShiftOp->isShift())
ShiftOp = 0;
-
+
if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) {
// This is a constant shift of a constant shift. Be careful about hiding
@@ -548,9 +545,9 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
assert(ShiftAmt2 != 0 && "Should have been simplified earlier");
if (ShiftAmt1 == 0) return 0; // Will be simplified in the future.
Value *X = ShiftOp->getOperand(0);
-
+
IntegerType *Ty = cast<IntegerType>(I.getType());
-
+
// Check for (X << c1) << c2 and (X >> c1) >> c2
if (I.getOpcode() == ShiftOp->getOpcode()) {
uint32_t AmtSum = ShiftAmt1+ShiftAmt2; // Fold into one big shift.
@@ -561,11 +558,11 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
AmtSum = TypeBits-1; // Saturate to 31 for i32 ashr.
}
-
+
return BinaryOperator::Create(I.getOpcode(), X,
ConstantInt::get(Ty, AmtSum));
}
-
+
if (ShiftAmt1 == ShiftAmt2) {
// If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
if (I.getOpcode() == Instruction::LShr &&
@@ -605,7 +602,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
return NewLShr;
}
Value *Shift = Builder->CreateLShr(X, ShiftDiffCst);
-
+
APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
return BinaryOperator::CreateAnd(Shift,
ConstantInt::get(I.getContext(),Mask));
@@ -653,12 +650,12 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
return NewShl;
}
Value *Shift = Builder->CreateShl(X, ShiftDiffCst);
-
+
APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
return BinaryOperator::CreateAnd(Shift,
ConstantInt::get(I.getContext(),Mask));
}
-
+
// We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However,
// we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
if (I.getOpcode() == Instruction::AShr &&
@@ -682,21 +679,21 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
TD))
return ReplaceInstUsesWith(I, V);
-
+
if (Instruction *V = commonShiftTransforms(I))
return V;
-
+
if (ConstantInt *Op1C = dyn_cast<ConstantInt>(I.getOperand(1))) {
unsigned ShAmt = Op1C->getZExtValue();
-
+
// If the shifted-out value is known-zero, then this is a NUW shift.
- if (!I.hasNoUnsignedWrap() &&
+ if (!I.hasNoUnsignedWrap() &&
MaskedValueIsZero(I.getOperand(0),
APInt::getHighBitsSet(Op1C->getBitWidth(), ShAmt))) {
I.setHasNoUnsignedWrap();
return &I;
}
-
+
// If the shifted out value is all signbits, this is a NSW shift.
if (!I.hasNoSignedWrap() &&
ComputeNumSignBits(I.getOperand(0)) > ShAmt) {
@@ -712,7 +709,7 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
match(I.getOperand(1), m_Constant(C2)))
return BinaryOperator::CreateShl(ConstantExpr::getShl(C1, C2), A);
- return 0;
+ return 0;
}
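
Illustrative only (not in the patch): the NUW/NSW inference in visitShl above can be sanity-checked in plain C++; a minimal sketch, assuming 32-bit integers:

#include <cassert>
#include <climits>
#include <cstdint>

int main() {
  const unsigned ShAmt = 8;

  // NUW: when the top ShAmt bits are already zero, shl by ShAmt discards
  // nothing, so the unsigned value cannot wrap.
  const uint32_t UVals[] = {0u, 1u, 0x00ABCDEFu, 0x00FFFFFFu};
  for (uint32_t X : UVals) {
    uint32_t Masked = X & (~0u >> ShAmt);   // high ShAmt bits known zero
    assert(((Masked << ShAmt) >> ShAmt) == Masked);
  }

  // NSW: when the operand has more than ShAmt sign bits (values that fit in
  // int8_t have at least 25 in an i32), the shifted value still fits in i32.
  const int32_t SVals[] = {0, 1, -1, 127, -128};
  for (int32_t Y : SVals) {
    long long Wide = (long long)Y * (1LL << ShAmt);
    assert(Wide >= INT32_MIN && Wide <= INT32_MAX);
  }
  return 0;
}
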
Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
@@ -722,9 +719,9 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
if (Instruction *R = commonShiftTransforms(I))
return R;
-
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
+
if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
unsigned ShAmt = Op1C->getZExtValue();
@@ -743,15 +740,15 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
return new ZExtInst(Cmp, II->getType());
}
}
-
+
// If the shifted-out value is known-zero, then this is an exact shift.
- if (!I.isExact() &&
+ if (!I.isExact() &&
MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){
I.setIsExact();
return &I;
- }
+ }
}
-
+
return 0;
}
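
As an aside (not part of the patch): the "exact" flag set above merely records that the shifted-out low bits are zero, i.e. the lshr loses no information. A minimal standalone check:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned ShAmt = 4;
  const uint32_t Vals[] = {0x10u, 0xABCD0u, 0xFFFFFFF0u};
  for (uint32_t X : Vals) {
    // MaskedValueIsZero on the low-bit mask: no set bit is shifted out...
    assert((X & ((1u << ShAmt) - 1)) == 0);
    // ...so the lshr is lossless ("exact"): it can be undone by shl.
    assert(((X >> ShAmt) << ShAmt) == X);
  }
  return 0;
}
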
@@ -762,12 +759,12 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
if (Instruction *R = commonShiftTransforms(I))
return R;
-
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
unsigned ShAmt = Op1C->getZExtValue();
-
+
// If the input is a SHL by the same constant (ashr (shl X, C), C), then we
// have a sign-extend idiom.
Value *X;
@@ -791,23 +788,23 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
}
// If the shifted-out value is known-zero, then this is an exact shift.
- if (!I.isExact() &&
+ if (!I.isExact() &&
MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){
I.setIsExact();
return &I;
}
- }
-
+ }
+
// See if we can turn a signed shr into an unsigned shr.
if (MaskedValueIsZero(Op0,
APInt::getSignBit(I.getType()->getScalarSizeInBits())))
return BinaryOperator::CreateLShr(Op0, Op1);
-
+
// Arithmetic shifting an all-sign-bit value is a no-op.
unsigned NumSignBits = ComputeNumSignBits(Op0);
if (NumSignBits == Op0->getType()->getScalarSizeInBits())
return ReplaceInstUsesWith(I, Op0);
-
+
return 0;
}
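
Illustrative sketch (not part of the patch) of the visitAShr rules above: ashr agrees with lshr when the sign bit is known zero, shl+ashr by the same amount is the sign-extend idiom, and arithmetic-shifting an all-sign-bit value is a no-op. Assumes two's complement and an arithmetic >> on signed values (true on mainstream compilers, guaranteed since C++20):

#include <cassert>
#include <cstdint>

int main() {
  // If the sign bit is known zero, ashr and lshr agree.
  const int32_t NonNeg[] = {0, 1, 12345, 0x7FFFFFFF};
  for (int32_t X : NonNeg)
    assert((X >> 3) == (int32_t)((uint32_t)X >> 3));

  // ashr(shl(X, 24), 24) is the sign-extend-from-i8 idiom noted above.
  const int32_t Vals[] = {0, 1, -1, 200, -200};
  for (int32_t X : Vals) {
    int32_t Ext = (int32_t)((uint32_t)X << 24) >> 24;
    assert(Ext == (int32_t)(int8_t)X);
  }

  // Arithmetic-shifting an all-sign-bit value (0 or -1) is a no-op.
  assert((-1 >> 7) == -1 && (0 >> 7) == 0);
  return 0;
}
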
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 602b203..8add1ea 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -14,17 +14,18 @@
#include "InstCombine.h"
-#include "llvm/DataLayout.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/PatternMatch.h"
using namespace llvm;
+using namespace llvm::PatternMatch;
-
-/// ShrinkDemandedConstant - Check to see if the specified operand of the
+/// ShrinkDemandedConstant - Check to see if the specified operand of the
/// specified instruction is a constant integer. If so, check to see if there
/// are any bits set in the constant that are not demanded. If so, shrink the
/// constant and return true.
-static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
+static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
APInt Demanded) {
assert(I && "No instruction?");
assert(OpNo < I->getNumOperands() && "Operand index too large");
@@ -53,8 +54,8 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
-
- Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask,
+
+ Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask,
KnownZero, KnownOne, 0);
if (V == 0) return false;
if (V == &Inst) return true;
@@ -65,7 +66,7 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
/// SimplifyDemandedBits - This form of SimplifyDemandedBits simplifies the
/// specified instruction operand if possible, updating it in place. It returns
/// true if it made any change and false otherwise.
-bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask,
+bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask,
APInt &KnownZero, APInt &KnownOne,
unsigned Depth) {
Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask,
@@ -86,7 +87,7 @@ bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask,
/// to be one in the expression. KnownZero contains all the bits that are known
/// to be zero in the expression. These are provided to potentially allow the
/// caller (which might recursively be SimplifyDemandedBits itself) to simplify
-/// the expression. KnownOne and KnownZero always follow the invariant that
+/// the expression. KnownOne and KnownZero always follow the invariant that
/// KnownOne & KnownZero == 0. That is, a bit can't be both 1 and 0. Note that
/// the bits in KnownOne and KnownZero may only be accurate for those bits set
/// in DemandedMask. Note also that the bitwidth of V, DemandedMask, KnownZero
@@ -133,10 +134,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
return 0;
return UndefValue::get(VTy);
}
-
+
if (Depth == 6) // Limit search depth.
return 0;
-
+
APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
@@ -158,61 +159,74 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// If either the LHS or the RHS are Zero, the result is zero.
ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1);
ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
-
+
// If all of the demanded bits are known 1 on one side, return the other.
// These bits cannot contribute to the result of the 'and' in this
// context.
- if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
+ if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
(DemandedMask & ~LHSKnownZero))
return I->getOperand(0);
- if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
+ if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
(DemandedMask & ~RHSKnownZero))
return I->getOperand(1);
-
+
// If all of the demanded bits in the inputs are known zeros, return zero.
if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
return Constant::getNullValue(VTy);
-
+
} else if (I->getOpcode() == Instruction::Or) {
// We can simplify (X|Y) -> X or Y in the user's context if we know that
// only bits from X or Y are demanded.
-
+
// If either the LHS or the RHS are One, the result is One.
ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1);
ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
-
+
// If all of the demanded bits are known zero on one side, return the
// other. These bits cannot contribute to the result of the 'or' in this
// context.
- if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
+ if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
(DemandedMask & ~LHSKnownOne))
return I->getOperand(0);
- if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
+ if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
(DemandedMask & ~RHSKnownOne))
return I->getOperand(1);
-
+
// If all of the potentially set bits on one side are known to be set on
// the other side, just use the 'other' side.
- if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
+ if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
(DemandedMask & (~RHSKnownZero)))
return I->getOperand(0);
- if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
+ if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
(DemandedMask & (~LHSKnownZero)))
return I->getOperand(1);
+ } else if (I->getOpcode() == Instruction::Xor) {
+ // We can simplify (X^Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1);
+ ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other.
+ if ((DemandedMask & RHSKnownZero) == DemandedMask)
+ return I->getOperand(0);
+ if ((DemandedMask & LHSKnownZero) == DemandedMask)
+ return I->getOperand(1);
}
-
+
// Compute the KnownZero/KnownOne bits to simplify things downstream.
ComputeMaskedBits(I, KnownZero, KnownOne, Depth);
return 0;
}
-
+
// If this is the root being simplified, allow it to have multiple uses,
// just set the DemandedMask to all bits so that we can try to simplify the
// operands. This allows visitTruncInst (for example) to simplify the
// operand of a trunc without duplicating all the logic below.
if (Depth == 0 && !V->hasOneUse())
DemandedMask = APInt::getAllOnesValue(BitWidth);
-
+
switch (I->getOpcode()) {
default:
ComputeMaskedBits(I, KnownZero, KnownOne, Depth);
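
Not part of the patch, but for illustration: the new multiple-use Xor case above is just the observation that xoring with a value whose demanded bits are all known zero is invisible under the demanded mask. A minimal standalone check:

#include <cassert>
#include <cstdint>

int main() {
  // The caller only demands the high half of the result.
  const uint32_t DemandedMask = 0xFFFF0000u;
  const uint32_t Xs[] = {0u, 0x12345678u, 0xFFFFFFFFu};
  const uint32_t Ys[] = {0x00000001u, 0x0000ABCDu, 0x0000FFFFu};
  for (uint32_t X : Xs)
    for (uint32_t Y : Ys) {
      // Every demanded bit of Y is known zero, so within the demanded bits
      // X ^ Y is indistinguishable from X: the xor can be dropped.
      assert((Y & DemandedMask) == 0);
      assert(((X ^ Y) & DemandedMask) == (X & DemandedMask));
    }
  return 0;
}
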
@@ -224,26 +238,26 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero,
LHSKnownZero, LHSKnownOne, Depth+1))
return I;
- assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
- assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
// If all of the demanded bits are known 1 on one side, return the other.
// These bits cannot contribute to the result of the 'and'.
- if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
+ if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
(DemandedMask & ~LHSKnownZero))
return I->getOperand(0);
- if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
+ if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
(DemandedMask & ~RHSKnownZero))
return I->getOperand(1);
-
+
// If all of the demanded bits in the inputs are known zeros, return zero.
if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
return Constant::getNullValue(VTy);
-
+
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnownZero))
return I;
-
+
// Output known-1 bits are only known if set in both the LHS & RHS.
KnownOne = RHSKnownOne & LHSKnownOne;
// Output known-0 are known to be clear if zero in either the LHS | RHS.
@@ -251,36 +265,36 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
break;
case Instruction::Or:
// If either the LHS or the RHS are One, the result is One.
- if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+ if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
RHSKnownZero, RHSKnownOne, Depth+1) ||
- SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne,
+ SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne,
LHSKnownZero, LHSKnownOne, Depth+1))
return I;
- assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
- assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
-
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'or'.
- if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
+ if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
(DemandedMask & ~LHSKnownOne))
return I->getOperand(0);
- if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
+ if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
(DemandedMask & ~RHSKnownOne))
return I->getOperand(1);
// If all of the potentially set bits on one side are known to be set on
// the other side, just use the 'other' side.
- if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
+ if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
(DemandedMask & (~RHSKnownZero)))
return I->getOperand(0);
- if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
+ if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
(DemandedMask & (~LHSKnownZero)))
return I->getOperand(1);
-
+
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(I, 1, DemandedMask))
return I;
-
+
// Output known-0 bits are only known if clear in both the LHS & RHS.
KnownZero = RHSKnownZero & LHSKnownZero;
// Output known-1 are known to be set if set in either the LHS | RHS.
@@ -289,34 +303,34 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
case Instruction::Xor: {
if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
RHSKnownZero, RHSKnownOne, Depth+1) ||
- SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+ SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
LHSKnownZero, LHSKnownOne, Depth+1))
return I;
- assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
- assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
-
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'xor'.
if ((DemandedMask & RHSKnownZero) == DemandedMask)
return I->getOperand(0);
if ((DemandedMask & LHSKnownZero) == DemandedMask)
return I->getOperand(1);
-
+
// If all of the demanded bits are known to be zero on one side or the
// other, turn this into an *inclusive* or.
// e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
if ((DemandedMask & ~RHSKnownZero & ~LHSKnownZero) == 0) {
- Instruction *Or =
+ Instruction *Or =
BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
I->getName());
return InsertNewInstWith(Or, *I);
}
-
+
// If all of the demanded bits on one side are known, and all of the set
// bits on that side are also known to be set on the other side, turn this
// into an AND, as we know the bits will be cleared.
// e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
- if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) {
+ if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) {
// all known
if ((RHSKnownOne & LHSKnownOne) == RHSKnownOne) {
Constant *AndC = Constant::getIntegerValue(VTy,
@@ -325,12 +339,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
return InsertNewInstWith(And, *I);
}
}
-
+
// If the RHS is a constant, see if we can simplify it.
// FIXME: for XOR, we prefer to force bits to 1 if they will make a -1.
if (ShrinkDemandedConstant(I, 1, DemandedMask))
return I;
-
+
// If our LHS is an 'and' and if it has one use, and if any of the bits we
// are flipping are known to be set, then the xor is just resetting those
// bits to zero. We can just knock out bits from the 'and' and the 'xor',
@@ -343,12 +357,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
ConstantInt *AndRHS = cast<ConstantInt>(LHSInst->getOperand(1));
ConstantInt *XorRHS = cast<ConstantInt>(I->getOperand(1));
APInt NewMask = ~(LHSKnownOne & RHSKnownOne & DemandedMask);
-
+
Constant *AndC =
ConstantInt::get(I->getType(), NewMask & AndRHS->getValue());
Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
InsertNewInstWith(NewAnd, *I);
-
+
Constant *XorC =
ConstantInt::get(I->getType(), NewMask & XorRHS->getValue());
Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC);
@@ -364,17 +378,17 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
case Instruction::Select:
if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask,
RHSKnownZero, RHSKnownOne, Depth+1) ||
- SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+ SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
LHSKnownZero, LHSKnownOne, Depth+1))
return I;
- assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
- assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
-
+ assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+
// If the operands are constants, see if we can simplify them.
if (ShrinkDemandedConstant(I, 1, DemandedMask) ||
ShrinkDemandedConstant(I, 2, DemandedMask))
return I;
-
+
// Only known if known in both the LHS and RHS.
KnownOne = RHSKnownOne & LHSKnownOne;
KnownZero = RHSKnownZero & LHSKnownZero;
@@ -384,13 +398,13 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
DemandedMask = DemandedMask.zext(truncBf);
KnownZero = KnownZero.zext(truncBf);
KnownOne = KnownOne.zext(truncBf);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
KnownZero, KnownOne, Depth+1))
return I;
DemandedMask = DemandedMask.trunc(BitWidth);
KnownZero = KnownZero.trunc(BitWidth);
KnownOne = KnownOne.trunc(BitWidth);
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+ assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
break;
}
case Instruction::BitCast:
@@ -413,12 +427,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
KnownZero, KnownOne, Depth+1))
return I;
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+ assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
break;
case Instruction::ZExt: {
// Compute the bits in the result that are not present in the input.
unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits();
-
+
DemandedMask = DemandedMask.trunc(SrcBitWidth);
KnownZero = KnownZero.trunc(SrcBitWidth);
KnownOne = KnownOne.trunc(SrcBitWidth);
@@ -428,7 +442,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
DemandedMask = DemandedMask.zext(BitWidth);
KnownZero = KnownZero.zext(BitWidth);
KnownOne = KnownOne.zext(BitWidth);
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+ assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
// The top bits are known to be zero.
KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
break;
@@ -436,8 +450,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
case Instruction::SExt: {
// Compute the bits in the result that are not present in the input.
unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits();
-
- APInt InputDemandedBits = DemandedMask &
+
+ APInt InputDemandedBits = DemandedMask &
APInt::getLowBitsSet(BitWidth, SrcBitWidth);
APInt NewBits(APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth));
@@ -445,7 +459,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// bit is demanded.
if ((NewBits & DemandedMask) != 0)
InputDemandedBits.setBit(SrcBitWidth-1);
-
+
InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth);
KnownZero = KnownZero.trunc(SrcBitWidth);
KnownOne = KnownOne.trunc(SrcBitWidth);
@@ -455,8 +469,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
InputDemandedBits = InputDemandedBits.zext(BitWidth);
KnownZero = KnownZero.zext(BitWidth);
KnownOne = KnownOne.zext(BitWidth);
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
-
+ assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
@@ -476,7 +490,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// are not demanded, then the add doesn't demand them from its input
// either.
unsigned NLZ = DemandedMask.countLeadingZeros();
-
+
// If there is a constant on the RHS, there are a variety of xformations
// we can do.
if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
@@ -484,13 +498,13 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// won't work if the RHS is zero.
if (RHS->isZero())
break;
-
+
// If the top bit of the output is demanded, demand everything from the
// input. Otherwise, we demand all the input bits except NLZ top bits.
APInt InDemandedBits(APInt::getLowBitsSet(BitWidth, BitWidth - NLZ));
// Find information about known zero/one bits in the input.
- if (SimplifyDemandedBits(I->getOperandUse(0), InDemandedBits,
+ if (SimplifyDemandedBits(I->getOperandUse(0), InDemandedBits,
LHSKnownZero, LHSKnownOne, Depth+1))
return I;
@@ -498,11 +512,11 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// the constant.
if (ShrinkDemandedConstant(I, 1, InDemandedBits))
return I;
-
+
// Avoid excess work.
if (LHSKnownZero == 0 && LHSKnownOne == 0)
break;
-
+
// Turn it into OR if input bits are zero.
if ((LHSKnownZero & RHS->getValue()) == RHS->getValue()) {
Instruction *Or =
@@ -510,26 +524,26 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
I->getName());
return InsertNewInstWith(Or, *I);
}
-
+
// We can say something about the output known-zero and known-one bits,
// depending on potential carries from the input constant and the
// unknowns. For example if the LHS is known to have at most the 0x0F0F0
// bits set and the RHS constant is 0x01001, then we know we have a known
// one mask of 0x00001 and a known zero mask of 0xE0F0E.
-
+
// To compute this, we first compute the potential carry bits. These are
// the bits which may be modified. I'm not aware of a better way to do
// this scan.
const APInt &RHSVal = RHS->getValue();
APInt CarryBits((~LHSKnownZero + RHSVal) ^ (~LHSKnownZero ^ RHSVal));
-
+
// Now that we know which bits have carries, compute the known-1/0 sets.
-
+
// Bits are known one if they are known zero in one operand and one in the
// other, and there is no input carry.
- KnownOne = ((LHSKnownZero & RHSVal) |
+ KnownOne = ((LHSKnownZero & RHSVal) |
(LHSKnownOne & ~RHSVal)) & ~CarryBits;
-
+
// Bits are known zero if they are known zero in both operands and there
// is no input carry.
KnownZero = LHSKnownZero & ~RHSVal & ~CarryBits;
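
As a standalone aside (not part of the patch), the carry-bit formula above can be replayed on the worked example from the comment (LHS bounded by the 0x0F0F0 bits, RHS constant 0x01001); restricted to the low 20 bits it reproduces the quoted known-one mask 0x00001 and known-zero mask 0xE0F0E:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // "At most the 0x0F0F0 bits set" => every other bit is known zero.
  uint32_t LHSKnownZero = ~0x0F0F0u;
  uint32_t LHSKnownOne  = 0;            // nothing is known to be one
  uint32_t RHSVal       = 0x01001u;     // the constant RHS of the add

  // Potential carry bits, exactly as computed above.
  uint32_t MaybeOne  = ~LHSKnownZero;
  uint32_t CarryBits = (MaybeOne + RHSVal) ^ (MaybeOne ^ RHSVal);

  uint32_t KnownOne  = ((LHSKnownZero & RHSVal) | (LHSKnownOne & ~RHSVal))
                       & ~CarryBits;
  uint32_t KnownZero = LHSKnownZero & ~RHSVal & ~CarryBits;

  // Matches the figures quoted in the comment (low 20 bits).
  assert(KnownOne == 0x00001u);
  assert((KnownZero & 0xFFFFFu) == 0xE0F0Eu);
  printf("CarryBits=%#x KnownOne=%#x KnownZero(low 20)=%#x\n",
         (unsigned)CarryBits, (unsigned)KnownOne,
         (unsigned)(KnownZero & 0xFFFFFu));
  return 0;
}
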
@@ -580,17 +594,28 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
break;
case Instruction::Shl:
if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ {
+ Value *VarX; ConstantInt *C1;
+ if (match(I->getOperand(0), m_Shr(m_Value(VarX), m_ConstantInt(C1)))) {
+ Instruction *Shr = cast<Instruction>(I->getOperand(0));
+ Value *R = SimplifyShrShlDemandedBits(Shr, I, DemandedMask,
+ KnownZero, KnownOne);
+ if (R)
+ return R;
+ }
+ }
+
uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
-
+
// If the shift is NUW/NSW, then it does demand the high bits.
ShlOperator *IOp = cast<ShlOperator>(I);
if (IOp->hasNoSignedWrap())
DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1);
else if (IOp->hasNoUnsignedWrap())
DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
-
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
+
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
KnownZero, KnownOne, Depth+1))
return I;
assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
@@ -605,15 +630,15 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// For a logical shift right
if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
-
+
// Unsigned shift right.
APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
-
+
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<LShrOperator>(I)->isExact())
DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
-
+
if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
KnownZero, KnownOne, Depth+1))
return I;
@@ -637,28 +662,28 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
Instruction *NewVal = BinaryOperator::CreateLShr(
I->getOperand(0), I->getOperand(1), I->getName());
return InsertNewInstWith(NewVal, *I);
- }
+ }
// If the sign bit is the only bit demanded by this ashr, then there is no
// need to do it, the shift doesn't change the high bit.
if (DemandedMask.isSignBit())
return I->getOperand(0);
-
+
if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
-
+
// Signed shift right.
APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
// If any of the "high bits" are demanded, we should set the sign bit as
// demanded.
if (DemandedMask.countLeadingZeros() <= ShiftAmt)
DemandedMaskIn.setBit(BitWidth-1);
-
+
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<AShrOperator>(I)->isExact())
DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
-
+
if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
KnownZero, KnownOne, Depth+1))
return I;
@@ -667,15 +692,15 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
KnownOne = APIntOps::lshr(KnownOne, ShiftAmt);
-
+
// Handle the sign bits.
APInt SignBit(APInt::getSignBit(BitWidth));
// Adjust to where it is now in the mask.
- SignBit = APIntOps::lshr(SignBit, ShiftAmt);
-
+ SignBit = APIntOps::lshr(SignBit, ShiftAmt);
+
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
- if (BitWidth <= ShiftAmt || KnownZero[BitWidth-ShiftAmt-1] ||
+ if (BitWidth <= ShiftAmt || KnownZero[BitWidth-ShiftAmt-1] ||
(HighBits & ~DemandedMask) == HighBits) {
// Perform the logical shift right.
BinaryOperator *NewVal = BinaryOperator::CreateLShr(I->getOperand(0),
@@ -718,7 +743,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (LHSKnownOne[BitWidth-1] && ((LHSKnownOne & LowBits) != 0))
KnownOne |= ~LowBits;
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+ assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
}
}
@@ -756,7 +781,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// just shift the input byte into position to eliminate the bswap.
unsigned NLZ = DemandedMask.countLeadingZeros();
unsigned NTZ = DemandedMask.countTrailingZeros();
-
+
// Round NTZ down to the next byte. If we have 11 trailing zeros, then
// we need all the bits down to bit 8. Likewise, round NLZ. If we
// have 14 leading zeros, round to 8.
@@ -766,7 +791,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (BitWidth-NLZ-NTZ == 8) {
unsigned ResultBit = NTZ;
unsigned InputBit = BitWidth-NTZ-8;
-
+
// Replace this with either a left or right shift to get the byte into
// the right place.
Instruction *NewVal;
@@ -779,7 +804,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
NewVal->takeName(I);
return InsertNewInstWith(NewVal, *I);
}
-
+
// TODO: Could compute known zero/one bits based on the input.
break;
}
@@ -792,7 +817,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
ComputeMaskedBits(V, KnownZero, KnownOne, Depth);
break;
}
-
+
// If the client is only demanding bits that we know, return the known
// constant.
if ((DemandedMask & (KnownZero|KnownOne)) == DemandedMask)
@@ -800,6 +825,81 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
return 0;
}
+/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
+/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into
+/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
+/// of "C2-C1".
+///
+/// Suppose E1 and E2 can differ only in the bits S = {bm, bm+1, ..., bn},
+/// without considering the specific value X holds.
+/// This transformation is legal iff one of the following conditions holds:
+///  1) All the bits in S are 0, in which case E1 == E2.
+///  2) We don't care about those bits in S, per the input DemandedMask.
+///  3) A combination of 1) and 2): some bits in S are 0, and we don't care
+///    about the rest.
+///
+/// Currently we only test condition 2).
+///
+/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
+/// not successful.
+Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
+ Instruction *Shl, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne) {
+
+ unsigned ShlAmt = cast<ConstantInt>(Shl->getOperand(1))->getZExtValue();
+ unsigned ShrAmt = cast<ConstantInt>(Shr->getOperand(1))->getZExtValue();
+
+ KnownOne.clearAllBits();
+ KnownZero = APInt::getBitsSet(KnownZero.getBitWidth(), 0, ShlAmt-1);
+ KnownZero &= DemandedMask;
+
+ if (ShlAmt == 0 || ShrAmt == 0)
+ return 0;
+
+ Value *VarX = Shr->getOperand(0);
+ Type *Ty = VarX->getType();
+
+ APInt BitMask1(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
+ APInt BitMask2(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
+
+ bool isLshr = (Shr->getOpcode() == Instruction::LShr);
+ BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
+ (BitMask1.ashr(ShrAmt) << ShlAmt);
+
+ if (ShrAmt <= ShlAmt) {
+ BitMask2 <<= (ShlAmt - ShrAmt);
+ } else {
+ BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt):
+ BitMask2.ashr(ShrAmt - ShlAmt);
+ }
+
+  // Check whether condition 2) (see the comment on this function) is satisfied.
+ if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) {
+ if (ShrAmt == ShlAmt)
+ return VarX;
+
+ if (!Shr->hasOneUse())
+ return 0;
+
+ BinaryOperator *New;
+ if (ShrAmt < ShlAmt) {
+ Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt);
+ New = BinaryOperator::CreateShl(VarX, Amt);
+ BinaryOperator *Orig = cast<BinaryOperator>(Shl);
+ New->setHasNoSignedWrap(Orig->hasNoSignedWrap());
+ New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap());
+ } else {
+ Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt);
+ New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) :
+ BinaryOperator::CreateAShr(VarX, Amt);
+ if (cast<BinaryOperator>(Shr)->isExact())
+ New->setIsExact(true);
+ }
+
+ return InsertNewInstWith(New, *Shl);
+ }
+
+ return 0;
+}
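
For illustration only (not part of the patch): "condition 2)" of the new helper above can be seen with C1 = 3 and C2 = 8, where E1 = (X lshr 3) shl 8 and E2 = X shl 5 differ only in bits 5..7, so E2 may replace E1 whenever the demanded mask ignores those bits. A minimal standalone check:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned C1 = 3, C2 = 8;            // ShrAmt and ShlAmt
  // E1 and E2 can only differ in bits [C2-C1, C2), i.e. bits 5..7 here.
  const uint32_t DifferingBits = ((1u << C1) - 1) << (C2 - C1);
  const uint32_t DemandedMask  = ~DifferingBits;   // caller ignores those bits

  const uint32_t Vals[] = {0u, 1u, 0x12345678u, 0xDEADBEEFu, 0xFFFFFFFFu};
  for (uint32_t X : Vals) {
    uint32_t E1 = (X >> C1) << C2;          // original (X lshr C1) shl C2
    uint32_t E2 = X << (C2 - C1);           // proposed replacement
    assert((E1 & DemandedMask) == (E2 & DemandedMask));
  }
  return 0;
}

Were ShrAmt larger than ShlAmt, the replacement would instead be a right shift by the difference, as in the else branch above.
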
/// SimplifyDemandedVectorElts - The specified value produces a vector with
/// any number of elements. DemandedElts contains the set of elements that are
@@ -821,14 +921,14 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
UndefElts = EltMask;
return 0;
}
-
+
if (DemandedElts == 0) { // If nothing is demanded, provide undef.
UndefElts = EltMask;
return UndefValue::get(V->getType());
}
UndefElts = 0;
-
+
// Handle ConstantAggregateZero, ConstantVector, ConstantDataSequential.
if (Constant *C = dyn_cast<Constant>(V)) {
// Check if this is identity. If so, return 0 since we are not simplifying
@@ -838,7 +938,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
Type *EltTy = cast<VectorType>(V->getType())->getElementType();
Constant *Undef = UndefValue::get(EltTy);
-
+
SmallVector<Constant*, 16> Elts;
for (unsigned i = 0; i != VWidth; ++i) {
if (!DemandedElts[i]) { // If not demanded, set to undef.
@@ -846,10 +946,10 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
UndefElts.setBit(i);
continue;
}
-
+
Constant *Elt = C->getAggregateElement(i);
if (Elt == 0) return 0;
-
+
if (isa<UndefValue>(Elt)) { // Already undef.
Elts.push_back(Undef);
UndefElts.setBit(i);
@@ -857,12 +957,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
Elts.push_back(Elt);
}
}
-
+
// If we changed the constant, return it.
Constant *NewCV = ConstantVector::get(Elts);
return NewCV != C ? NewCV : 0;
}
-
+
// Limit search depth.
if (Depth == 10)
return 0;
@@ -881,16 +981,16 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// Conservatively assume that all elements are needed.
DemandedElts = EltMask;
}
-
+
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return 0; // Only analyze instructions.
-
+
bool MadeChange = false;
APInt UndefElts2(VWidth, 0);
Value *TmpV;
switch (I->getOpcode()) {
default: break;
-
+
case Instruction::InsertElement: {
// If this is a variable index, we don't know which element it overwrites.
// demand exactly the same input as we produce.
@@ -903,7 +1003,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
break;
}
-
+
// If this is inserting an element that isn't demanded, remove this
// insertelement.
unsigned IdxNo = Idx->getZExtValue();
@@ -911,7 +1011,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
Worklist.Add(I);
return I->getOperand(0);
}
-
+
// Otherwise, the element inserted overwrites whatever was there, so the
// input demanded set is simpler than the output set.
APInt DemandedElts2 = DemandedElts;
@@ -1007,7 +1107,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
TmpV = SimplifyDemandedVectorElts(I->getOperand(2), RightDemanded,
UndefElts2, Depth+1);
if (TmpV) { I->setOperand(2, TmpV); MadeChange = true; }
-
+
// Output elements are undefined if both are undefined.
UndefElts &= UndefElts2;
break;
@@ -1028,7 +1128,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
} else if (VWidth > InVWidth) {
// Untested so far.
break;
-
+
// If there are more elements in the result than there are in the source,
// then an input element is live if any of the corresponding output
// elements are live.
@@ -1040,7 +1140,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
} else {
// Untested so far.
break;
-
+
// If there are more elements in the source than there are in the result,
// then an input element is live if the corresponding output element is
// live.
@@ -1049,7 +1149,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
if (DemandedElts[InIdx/Ratio])
InputDemandedElts.setBit(InIdx);
}
-
+
// div/rem demand all inputs, because they don't want divide by zero.
TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts,
UndefElts2, Depth+1);
@@ -1057,7 +1157,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
I->setOperand(0, TmpV);
MadeChange = true;
}
-
+
UndefElts = UndefElts2;
if (VWidth > InVWidth) {
llvm_unreachable("Unimp");
@@ -1092,7 +1192,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts,
UndefElts2, Depth+1);
if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
-
+
// Output elements are undefined if both are undefined. Consider things
// like undef&0. The result is known zero, not undef.
UndefElts &= UndefElts2;
@@ -1103,13 +1203,13 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
UndefElts, Depth+1);
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
break;
-
+
case Instruction::Call: {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
if (!II) break;
switch (II->getIntrinsicID()) {
default: break;
-
+
// Binary vector operations that work column-wise. A dest element is a
// function of the corresponding input elements from the two inputs.
case Intrinsic::x86_sse_sub_ss:
@@ -1140,11 +1240,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
Value *LHS = II->getArgOperand(0);
Value *RHS = II->getArgOperand(1);
// Extract the element as scalars.
- LHS = InsertNewInstWith(ExtractElementInst::Create(LHS,
+ LHS = InsertNewInstWith(ExtractElementInst::Create(LHS,
ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II);
RHS = InsertNewInstWith(ExtractElementInst::Create(RHS,
ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II);
-
+
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Case stmts out of sync!");
case Intrinsic::x86_sse_sub_ss:
@@ -1158,7 +1258,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
II->getName()), *II);
break;
}
-
+
Instruction *New =
InsertElementInst::Create(
UndefValue::get(II->getType()), TmpV,
@@ -1166,9 +1266,9 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
II->getName());
InsertNewInstWith(New, *II);
return New;
- }
+ }
}
-
+
// Output elements are undefined if both are undefined. Consider things
// like undef&0. The result is known zero, not undef.
UndefElts &= UndefElts2;
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index dd7ea14..4f71db1 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -13,7 +13,9 @@
//===----------------------------------------------------------------------===//
#include "InstCombine.h"
+#include "llvm/Support/PatternMatch.h"
using namespace llvm;
+using namespace PatternMatch;
/// CheapToScalarize - Return true if the value is cheaper to scalarize than it
/// is to leave as a vector operation. isConstant indicates whether we're
@@ -92,6 +94,13 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) {
return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth);
}
+  // Extract a value from a vector add whose constant operand is zero in the
+  // requested lane.
+ Value *Val = 0; Constant *Con = 0;
+ if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) {
+ if (Con->getAggregateElement(EltNo)->isNullValue())
+ return FindScalarElement(Val, EltNo);
+ }
+
// Otherwise, we don't know.
return 0;
}
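
Not part of the patch, but as a scalar model of the new case above: adding a constant whose selected lane is zero leaves that lane unchanged, which is exactly what makes the fold valid.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  // Scalar model of a <4 x i32> add: extracting lane i from (Val + Con)
  // equals extracting lane i from Val whenever Con's lane i is zero.
  std::array<int32_t, 4> Val = {10, 20, 30, 40};
  std::array<int32_t, 4> Con = {0, 5, 0, 7};     // lanes 0 and 2 are zero

  for (unsigned I = 0; I < 4; ++I)
    if (Con[I] == 0)
      assert(Val[I] + Con[I] == Val[I]);         // the fold is valid for this lane
  return 0;
}
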
@@ -295,12 +304,12 @@ static Value *CollectShuffleElements(Value *V, SmallVectorImpl<Constant*> &Mask,
Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
return V;
}
-
+
if (isa<ConstantAggregateZero>(V)) {
Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0));
return V;
}
-
+
if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
// If this is an insert of an extract from some other vector, include it.
Value *VecOp = IEI->getOperand(0);
@@ -595,12 +604,12 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
// ShuffleVectorInst is equivalent to the original one.
for (unsigned i = 0; i < VWidth; ++i) {
int eltMask;
- if (Mask[i] == -1) {
+ if (Mask[i] < 0) {
// This element is an undef value.
eltMask = -1;
} else if (Mask[i] < (int)LHSWidth) {
// This element is from left hand side vector operand.
- //
+ //
// If LHS is going to be replaced (case 1, 2, or 4), calculate the
// new mask value for the element.
if (newLHS != LHS) {
@@ -609,8 +618,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
// with a -1 mask value.
if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1))
eltMask = -1;
- }
- else
+ } else
eltMask = Mask[i];
} else {
// This element is from right hand side vector operand
@@ -630,8 +638,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
&& "should have been check above");
eltMask = -1;
}
- }
- else
+ } else
eltMask = Mask[i]-LHSWidth;
// If LHS's width is changed, shift the mask value accordingly.
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h
index ea654ae..49efce5 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h
@@ -11,28 +11,28 @@
#define INSTCOMBINE_WORKLIST_H
#define DEBUG_TYPE "instcombine"
-#include "llvm/Instruction.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
-
+
/// InstCombineWorklist - This is the worklist management logic for
/// InstCombine.
class LLVM_LIBRARY_VISIBILITY InstCombineWorklist {
SmallVector<Instruction*, 256> Worklist;
DenseMap<Instruction*, unsigned> WorklistMap;
-
+
void operator=(const InstCombineWorklist&RHS) LLVM_DELETED_FUNCTION;
InstCombineWorklist(const InstCombineWorklist&) LLVM_DELETED_FUNCTION;
public:
InstCombineWorklist() {}
-
+
bool isEmpty() const { return Worklist.empty(); }
-
+
/// Add - Add the specified instruction to the worklist if it isn't already
/// in it.
void Add(Instruction *I) {
@@ -41,12 +41,12 @@ public:
Worklist.push_back(I);
}
}
-
+
void AddValue(Value *V) {
if (Instruction *I = dyn_cast<Instruction>(V))
Add(I);
}
-
+
  /// AddInitialGroup - Add the specified batch of stuff in reverse order,
/// which should only be done when the worklist is empty and when the group
/// has no duplicates.
@@ -61,25 +61,25 @@ public:
Worklist.push_back(I);
}
}
-
+
// Remove - remove I from the worklist if it exists.
void Remove(Instruction *I) {
DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I);
if (It == WorklistMap.end()) return; // Not in worklist.
-
+
// Don't bother moving everything down, just null out the slot.
Worklist[It->second] = 0;
-
+
WorklistMap.erase(It);
}
-
+
Instruction *RemoveOne() {
Instruction *I = Worklist.back();
Worklist.pop_back();
WorklistMap.erase(I);
return I;
}
-
+
/// AddUsersToWorkList - When an instruction is simplified, add all users of
/// the instruction to the work lists because they might get more simplified
/// now.
@@ -89,18 +89,18 @@ public:
UI != UE; ++UI)
Add(cast<Instruction>(*UI));
}
-
-
+
+
/// Zap - check that the worklist is empty and nuke the backing store for
/// the map if it is large.
void Zap() {
assert(WorklistMap.empty() && "Worklist empty, but map not?");
-
+
// Do an explicit clear, this shrinks the map if needed.
WorklistMap.clear();
}
};
-
+
} // end namespace llvm.
#endif
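
As an aside (not part of the patch): the header above is the classic vector-plus-map worklist. A minimal standalone sketch of the same pattern over plain ints follows; the names are hypothetical and this is not LLVM's API.

#include <cassert>
#include <unordered_map>
#include <vector>

// The vector preserves insertion order, the map gives O(1) duplicate
// detection, and Remove nulls out the slot (here: writes 0) instead of
// shifting everything down.
struct TinyWorklist {
  std::vector<int> Items;                      // 0 plays the role of a dead slot
  std::unordered_map<int, size_t> Positions;

  void Add(int V) {
    if (Positions.insert({V, Items.size()}).second)
      Items.push_back(V);                      // only if not already present
  }
  void Remove(int V) {
    auto It = Positions.find(V);
    if (It == Positions.end()) return;         // not in worklist
    Items[It->second] = 0;                     // don't bother moving everything down
    Positions.erase(It);
  }
  bool Empty() const { return Positions.empty(); }
};

int main() {
  TinyWorklist WL;
  WL.Add(3); WL.Add(5); WL.Add(3);             // the duplicate is dropped
  assert(WL.Items.size() == 2);
  WL.Remove(3);
  assert(WL.Items[0] == 0 && !WL.Empty());
  WL.Remove(5);
  assert(WL.Empty());
  return 0;
}

The real worklist stores Instruction* and uses the null pointer as the dead-slot marker; the bookkeeping has the same shape.
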
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 9a46f25..c6115e3 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -36,22 +36,23 @@
#define DEBUG_TYPE "instcombine"
#include "llvm/Transforms/Scalar.h"
#include "InstCombine.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
#include "llvm/Support/ValueHandle.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm-c/Initialization.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <climits>
using namespace llvm;
@@ -65,6 +66,11 @@ STATISTIC(NumExpand, "Number of expansions");
STATISTIC(NumFactor , "Number of factorizations");
STATISTIC(NumReassoc , "Number of reassociations");
+static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
+ cl::init(false),
+ cl::desc("Enable unsafe double to float "
+ "shrinking for math lib calls"));
+
// Initialization Routines
void llvm::initializeInstCombine(PassRegistry &Registry) {
initializeInstCombinerPass(Registry);
@@ -156,6 +162,21 @@ static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
return !Overflow;
}
+/// Conservatively clears subclassOptionalData after a reassociation or
+/// commutation. Fast-math flags are preserved when applicable, since they
+/// remain valid across these rewrites.
+static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
+ FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
+ if (!FPMO) {
+ I.clearSubclassOptionalData();
+ return;
+ }
+
+ FastMathFlags FMF = I.getFastMathFlags();
+ I.clearSubclassOptionalData();
+ I.setFastMathFlags(FMF);
+}
+
/// SimplifyAssociativeOrCommutative - This performs a few simplifications for
/// operators which are associative or commutative:
//
@@ -213,7 +234,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
I.clearSubclassOptionalData();
I.setHasNoSignedWrap(true);
} else {
- I.clearSubclassOptionalData();
+ ClearSubclassDataAfterReassociation(I);
}
Changed = true;
@@ -235,7 +256,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
I.setOperand(1, C);
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
- I.clearSubclassOptionalData();
+ ClearSubclassDataAfterReassociation(I);
Changed = true;
++NumReassoc;
continue;
@@ -257,7 +278,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
I.setOperand(1, B);
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
- I.clearSubclassOptionalData();
+ ClearSubclassDataAfterReassociation(I);
Changed = true;
++NumReassoc;
continue;
@@ -277,7 +298,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
I.setOperand(1, V);
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
- I.clearSubclassOptionalData();
+ ClearSubclassDataAfterReassociation(I);
Changed = true;
++NumReassoc;
continue;
@@ -304,7 +325,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
I.setOperand(1, Folded);
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
- I.clearSubclassOptionalData();
+ ClearSubclassDataAfterReassociation(I);
Changed = true;
continue;
@@ -510,8 +531,8 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const {
// instruction if the LHS is a constant negative zero (which is the 'negate'
// form).
//
-Value *InstCombiner::dyn_castFNegVal(Value *V) const {
- if (BinaryOperator::isFNeg(V))
+Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const {
+ if (BinaryOperator::isFNeg(V, IgnoreZeroSign))
return BinaryOperator::getFNegArgument(V);
// Constants can be considered to be negated values if they can be folded.
@@ -1303,17 +1324,15 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
/// into a gep of the original struct. This is important for SROA and alias
/// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
+ APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0);
if (TD &&
- !isa<BitCastInst>(BCI->getOperand(0)) && GEP.hasAllConstantIndices() &&
+ !isa<BitCastInst>(BCI->getOperand(0)) &&
+ GEP.accumulateConstantOffset(*TD, Offset) &&
StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) {
- // Determine how much the GEP moves the pointer.
- SmallVector<Value*, 8> Ops(GEP.idx_begin(), GEP.idx_end());
- int64_t Offset = TD->getIndexedOffset(GEP.getPointerOperandType(), Ops);
-
// If this GEP instruction doesn't move the pointer, just replace the GEP
// with a bitcast of the real input to the dest type.
- if (Offset == 0) {
+ if (!Offset) {
// If the bitcast is of an allocation, and the allocation will be
// converted to match the type of the cast, don't touch this.
if (isa<AllocaInst>(BCI->getOperand(0)) ||
@@ -1337,7 +1356,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value*, 8> NewIndices;
Type *InTy =
cast<PointerType>(BCI->getOperand(0)->getType())->getElementType();
- if (FindElementAtOffset(InTy, Offset, NewIndices)) {
+ if (FindElementAtOffset(InTy, Offset.getSExtValue(), NewIndices)) {
Value *NGEP = GEP.isInBounds() ?
Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) :
Builder->CreateGEP(BCI->getOperand(0), NewIndices);
@@ -1471,6 +1490,62 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
return 0;
}
+/// \brief Move the call to free before a NULL test.
+///
+/// Check whether this call to free is reached only after its argument has
+/// been tested against NULL (property 0).
+/// If so, it is legal to move this call into its predecessor block.
+///
+/// The move is performed only if the block containing the call to free
+/// will be removed, i.e.:
+/// 1. it has only one predecessor P, and P has two successors
+/// 2. it contains the call and an unconditional branch
+/// 3. its successor is the same as its predecessor's successor
+///
+/// Profitability is not evaluated here; this function should be called only
+/// if the caller already knows the transformation would be profitable
+/// (e.g., for code size).
+static Instruction *
+tryToMoveFreeBeforeNullTest(CallInst &FI) {
+ Value *Op = FI.getArgOperand(0);
+ BasicBlock *FreeInstrBB = FI.getParent();
+ BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
+
+ // Validate part of constraint #1: Only one predecessor
+  // FIXME: We could handle more than one predecessor, but in that case we
+  //        would duplicate the call to free in each predecessor, which may
+  //        not be profitable even for code size.
+ if (!PredBB)
+ return 0;
+
+  // Validate constraint #2: Does this block contain only the call to
+  //                         free and an unconditional branch?
+ // FIXME: We could check if we can speculate everything in the
+ // predecessor block
+ if (FreeInstrBB->size() != 2)
+ return 0;
+ BasicBlock *SuccBB;
+ if (!match(FreeInstrBB->getTerminator(), m_UnconditionalBr(SuccBB)))
+ return 0;
+
+ // Validate the rest of constraint #1 by matching on the pred branch.
+ TerminatorInst *TI = PredBB->getTerminator();
+ BasicBlock *TrueBB, *FalseBB;
+ ICmpInst::Predicate Pred;
+ if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Op), m_Zero()), TrueBB, FalseBB)))
+ return 0;
+ if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
+ return 0;
+
+ // Validate constraint #3: Ensure the null case just falls through.
+ if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB))
+ return 0;
+ assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
+ "Broken CFG: missing edge from predecessor to successor");
+
+ FI.moveBefore(TI);
+ return &FI;
+}
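+// An illustrative sketch of the CFG shape this matches (block and value
+// names are made up; this is the usual lowering of "if (p) free(p);"):
+//
+//   pred:
+//     %c = icmp eq i8* %p, null
+//     br i1 %c, label %cont, label %freebb
+//   freebb:                    ; one predecessor, call + unconditional branch
+//     call void @free(i8* %p)
+//     br label %cont
+//   cont:
+//     ...
+//
+// After the move, the call to free sits in pred just before its terminator,
+// and SimplifyCFG/DCE can then remove the empty freebb and the null test.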
Instruction *InstCombiner::visitFree(CallInst &FI) {
@@ -1489,6 +1564,16 @@ Instruction *InstCombiner::visitFree(CallInst &FI) {
if (isa<ConstantPointerNull>(Op))
return EraseInstFromFunction(FI);
+  // If we optimize for code size, try to move the call to free before the null
+  // test so that SimplifyCFG can remove the empty block and dead-code
+  // elimination can remove the branch. I.e., this helps turn something like:
+ // if (foo) free(foo);
+ // into
+ // free(foo);
+ if (MinimizeSize)
+ if (Instruction *I = tryToMoveFreeBeforeNullTest(FI))
+ return I;
+
return 0;
}
@@ -2374,7 +2459,7 @@ public:
InstCombinerLibCallSimplifier(const DataLayout *TD,
const TargetLibraryInfo *TLI,
InstCombiner *IC)
- : LibCallSimplifier(TD, TLI) {
+ : LibCallSimplifier(TD, TLI, UnsafeFPShrink) {
this->IC = IC;
}
@@ -2389,6 +2474,9 @@ public:
bool InstCombiner::runOnFunction(Function &F) {
TD = getAnalysisIfAvailable<DataLayout>();
TLI = &getAnalysis<TargetLibraryInfo>();
+ // Minimizing size?
+ MinimizeSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::MinSize);
/// Builder - This is an IRBuilder that automatically inserts new
/// instructions into the worklist when they are created.
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index b7be462..623c470 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -15,41 +15,47 @@
#define DEBUG_TYPE "asan"
-#include "BlackList.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/InlineAsm.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Type.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Support/CallSite.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/system_error.h"
-#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BlackList.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-#include <string>
#include <algorithm>
+#include <string>
using namespace llvm;
static const uint64_t kDefaultShadowScale = 3;
static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
-static const uint64_t kDefaultShadowOffsetAndroid = 0;
+static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000; // < 2G.
+static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41;
static const size_t kMaxStackMallocSize = 1 << 16; // 64K
static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
@@ -59,16 +65,22 @@ static const char *kAsanModuleCtorName = "asan.module_ctor";
static const char *kAsanModuleDtorName = "asan.module_dtor";
static const int kAsanCtorAndCtorPriority = 1;
static const char *kAsanReportErrorTemplate = "__asan_report_";
+static const char *kAsanReportLoadN = "__asan_report_load_n";
+static const char *kAsanReportStoreN = "__asan_report_store_n";
static const char *kAsanRegisterGlobalsName = "__asan_register_globals";
static const char *kAsanUnregisterGlobalsName = "__asan_unregister_globals";
static const char *kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
static const char *kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
-static const char *kAsanInitName = "__asan_init";
+static const char *kAsanInitName = "__asan_init_v3";
static const char *kAsanHandleNoReturnName = "__asan_handle_no_return";
static const char *kAsanMappingOffsetName = "__asan_mapping_offset";
static const char *kAsanMappingScaleName = "__asan_mapping_scale";
static const char *kAsanStackMallocName = "__asan_stack_malloc";
static const char *kAsanStackFreeName = "__asan_stack_free";
+static const char *kAsanGenPrefix = "__asan_gen_";
+static const char *kAsanPoisonStackMemoryName = "__asan_poison_stack_memory";
+static const char *kAsanUnpoisonStackMemoryName =
+ "__asan_unpoison_stack_memory";
static const int kAsanStackLeftRedzoneMagic = 0xf1;
static const int kAsanStackMidRedzoneMagic = 0xf2;
@@ -112,9 +124,10 @@ static cl::opt<bool> ClInitializers("asan-initialization-order",
cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false));
static cl::opt<bool> ClMemIntrin("asan-memintrin",
cl::desc("Handle memset/memcpy/memmove"), cl::Hidden, cl::init(true));
-// This flag may need to be replaced with -fasan-blacklist.
-static cl::opt<std::string> ClBlackListFile("asan-blacklist",
- cl::desc("File containing the list of functions to ignore "
+static cl::opt<bool> ClRealignStack("asan-realign-stack",
+ cl::desc("Realign stack to 32"), cl::Hidden, cl::init(true));
+static cl::opt<std::string> ClBlacklistFile("asan-blacklist",
+ cl::desc("File containing the list of objects to ignore "
"during instrumentation"), cl::Hidden);
// These flags allow to change the shadow mapping.
@@ -124,6 +137,9 @@ static cl::opt<int> ClMappingScale("asan-mapping-scale",
cl::desc("scale of asan shadow mapping"), cl::Hidden, cl::init(0));
static cl::opt<int> ClMappingOffsetLog("asan-mapping-offset-log",
cl::desc("offset of asan shadow mapping"), cl::Hidden, cl::init(-1));
+static cl::opt<bool> ClShort64BitOffset("asan-short-64bit-mapping-offset",
+ cl::desc("Use short immediate constant as the mapping offset for 64bit"),
+ cl::Hidden, cl::init(true));
// Optimization flags. Not user visible, used mostly for testing
// and benchmarking the tool.
@@ -135,6 +151,10 @@ static cl::opt<bool> ClOptSameTemp("asan-opt-same-temp",
static cl::opt<bool> ClOptGlobals("asan-opt-globals",
cl::desc("Don't instrument scalar globals"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClCheckLifetime("asan-check-lifetime",
+ cl::desc("Use llvm.lifetime intrinsics to insert extra checks"),
+ cl::Hidden, cl::init(false));
+
// Debug flags.
static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
cl::init(0));
@@ -148,74 +168,332 @@ static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"),
cl::Hidden, cl::init(-1));
namespace {
+/// A set of dynamically initialized globals extracted from metadata.
+class SetOfDynamicallyInitializedGlobals {
+ public:
+ void Init(Module& M) {
+ // Clang generates metadata identifying all dynamically initialized globals.
+ NamedMDNode *DynamicGlobals =
+ M.getNamedMetadata("llvm.asan.dynamically_initialized_globals");
+ if (!DynamicGlobals)
+ return;
+ for (int i = 0, n = DynamicGlobals->getNumOperands(); i < n; ++i) {
+ MDNode *MDN = DynamicGlobals->getOperand(i);
+ assert(MDN->getNumOperands() == 1);
+ Value *VG = MDN->getOperand(0);
+ // The optimizer may optimize away a global entirely, in which case we
+ // cannot instrument access to it.
+ if (!VG)
+ continue;
+ DynInitGlobals.insert(cast<GlobalVariable>(VG));
+ }
+ }
+ bool Contains(GlobalVariable *G) { return DynInitGlobals.count(G) != 0; }
+ private:
+ SmallSet<GlobalValue*, 32> DynInitGlobals;
+};
+
+/// This struct defines the shadow mapping using the rule:
+/// shadow = (mem >> Scale) ADD-or-OR Offset.
+struct ShadowMapping {
+ int Scale;
+ uint64_t Offset;
+ bool OrShadowOffset;
+};
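+// For example (a sketch, using the x86_64 non-Mac defaults computed by
+// getShadowMapping below: Scale == 3, Offset == kDefaultShort64bitShadowOffset
+// and an ADD-based mapping): an access to address 0x100000000 is checked via
+// the shadow byte at (0x100000000 >> 3) + 0x7FFF8000 == 0x9FFF8000.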
+
+static ShadowMapping getShadowMapping(const Module &M, int LongSize,
+ bool ZeroBaseShadow) {
+ llvm::Triple TargetTriple(M.getTargetTriple());
+ bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android;
+ bool IsMacOSX = TargetTriple.getOS() == llvm::Triple::MacOSX;
+ bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64;
+ bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
+
+ ShadowMapping Mapping;
+
+  // OR-ing the shadow offset is more efficient (at least on x86), but on
+  // ppc64 we have to use add since the shadow offset is not necessarily
+  // 1/8-th of the address space.
+ Mapping.OrShadowOffset = !IsPPC64 && !ClShort64BitOffset;
+
+ Mapping.Offset = (IsAndroid || ZeroBaseShadow) ? 0 :
+ (LongSize == 32 ? kDefaultShadowOffset32 :
+ IsPPC64 ? kPPC64_ShadowOffset64 : kDefaultShadowOffset64);
+ if (!ZeroBaseShadow && ClShort64BitOffset && IsX86_64 && !IsMacOSX) {
+ assert(LongSize == 64);
+ Mapping.Offset = kDefaultShort64bitShadowOffset;
+ }
+ if (!ZeroBaseShadow && ClMappingOffsetLog >= 0) {
+    // A zero offset log is a special case.
+ Mapping.Offset = (ClMappingOffsetLog == 0) ? 0 : 1ULL << ClMappingOffsetLog;
+ }
+
+ Mapping.Scale = kDefaultShadowScale;
+ if (ClMappingScale) {
+ Mapping.Scale = ClMappingScale;
+ }
+
+ return Mapping;
+}
+
+static size_t RedzoneSizeForScale(int MappingScale) {
+ // Redzone used for stack and globals is at least 32 bytes.
+ // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
+ return std::max(32U, 1U << MappingScale);
+}
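+// For instance, RedzoneSizeForScale(3) == 32, RedzoneSizeForScale(6) == 64
+// and RedzoneSizeForScale(7) == 128, matching the comment above.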
+
/// AddressSanitizer: instrument the code in module to find memory bugs.
struct AddressSanitizer : public FunctionPass {
- AddressSanitizer();
- virtual const char *getPassName() const;
+ AddressSanitizer(bool CheckInitOrder = true,
+ bool CheckUseAfterReturn = false,
+ bool CheckLifetime = false,
+ StringRef BlacklistFile = StringRef(),
+ bool ZeroBaseShadow = false)
+ : FunctionPass(ID),
+ CheckInitOrder(CheckInitOrder || ClInitializers),
+ CheckUseAfterReturn(CheckUseAfterReturn || ClUseAfterReturn),
+ CheckLifetime(CheckLifetime || ClCheckLifetime),
+ BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
+ : BlacklistFile),
+ ZeroBaseShadow(ZeroBaseShadow) {}
+ virtual const char *getPassName() const {
+ return "AddressSanitizerFunctionPass";
+ }
void instrumentMop(Instruction *I);
- void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB,
- Value *Addr, uint32_t TypeSize, bool IsWrite);
+ void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
+ Value *Addr, uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument);
Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
Value *ShadowValue, uint32_t TypeSize);
Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr,
- bool IsWrite, size_t AccessSizeIndex);
+ bool IsWrite, size_t AccessSizeIndex,
+ Value *SizeArgument);
bool instrumentMemIntrinsic(MemIntrinsic *MI);
void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr,
Value *Size,
Instruction *InsertBefore, bool IsWrite);
Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
bool runOnFunction(Function &F);
- void createInitializerPoisonCalls(Module &M,
- Value *FirstAddr, Value *LastAddr);
bool maybeInsertAsanInitAtFunctionEntry(Function &F);
- bool poisonStackInFunction(Function &F);
+ void emitShadowMapping(Module &M, IRBuilder<> &IRB) const;
virtual bool doInitialization(Module &M);
- virtual bool doFinalization(Module &M);
- bool insertGlobalRedzones(Module &M);
static char ID; // Pass identification, replacement for typeid
private:
- uint64_t getAllocaSizeInBytes(AllocaInst *AI) {
- Type *Ty = AI->getAllocatedType();
- uint64_t SizeInBytes = TD->getTypeAllocSize(Ty);
- return SizeInBytes;
- }
- uint64_t getAlignedSize(uint64_t SizeInBytes) {
- return ((SizeInBytes + RedzoneSize - 1)
- / RedzoneSize) * RedzoneSize;
- }
- uint64_t getAlignedAllocaSize(AllocaInst *AI) {
- uint64_t SizeInBytes = getAllocaSizeInBytes(AI);
- return getAlignedSize(SizeInBytes);
- }
+ void initializeCallbacks(Module &M);
- Function *checkInterfaceFunction(Constant *FuncOrBitcast);
bool ShouldInstrumentGlobal(GlobalVariable *G);
- void PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB,
- Value *ShadowBase, bool DoPoison);
bool LooksLikeCodeInBug11395(Instruction *I);
void FindDynamicInitializers(Module &M);
- bool HasDynamicInitializer(GlobalVariable *G);
+
+ bool CheckInitOrder;
+ bool CheckUseAfterReturn;
+ bool CheckLifetime;
+ SmallString<64> BlacklistFile;
+ bool ZeroBaseShadow;
LLVMContext *C;
DataLayout *TD;
- uint64_t MappingOffset;
- int MappingScale;
- size_t RedzoneSize;
int LongSize;
Type *IntptrTy;
- Type *IntptrPtrTy;
+ ShadowMapping Mapping;
Function *AsanCtorFunction;
Function *AsanInitFunction;
- Function *AsanStackMallocFunc, *AsanStackFreeFunc;
Function *AsanHandleNoReturnFunc;
- Instruction *CtorInsertBefore;
OwningPtr<BlackList> BL;
// This array is indexed by AccessIsWrite and log2(AccessSize).
Function *AsanErrorCallback[2][kNumberOfAccessSizes];
+ // This array is indexed by AccessIsWrite.
+ Function *AsanErrorCallbackSized[2];
InlineAsm *EmptyAsm;
- SmallSet<GlobalValue*, 32> DynamicallyInitializedGlobals;
- SmallSet<GlobalValue*, 32> GlobalsCreatedByAsan;
+ SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals;
+
+ friend struct FunctionStackPoisoner;
+};
+
+class AddressSanitizerModule : public ModulePass {
+ public:
+ AddressSanitizerModule(bool CheckInitOrder = true,
+ StringRef BlacklistFile = StringRef(),
+ bool ZeroBaseShadow = false)
+ : ModulePass(ID),
+ CheckInitOrder(CheckInitOrder || ClInitializers),
+ BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
+ : BlacklistFile),
+ ZeroBaseShadow(ZeroBaseShadow) {}
+ bool runOnModule(Module &M);
+ static char ID; // Pass identification, replacement for typeid
+ virtual const char *getPassName() const {
+ return "AddressSanitizerModule";
+ }
+
+ private:
+ void initializeCallbacks(Module &M);
+
+ bool ShouldInstrumentGlobal(GlobalVariable *G);
+ void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
+ size_t RedzoneSize() const {
+ return RedzoneSizeForScale(Mapping.Scale);
+ }
+
+ bool CheckInitOrder;
+ SmallString<64> BlacklistFile;
+ bool ZeroBaseShadow;
+
+ OwningPtr<BlackList> BL;
+ SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals;
+ Type *IntptrTy;
+ LLVMContext *C;
+ DataLayout *TD;
+ ShadowMapping Mapping;
+ Function *AsanPoisonGlobals;
+ Function *AsanUnpoisonGlobals;
+ Function *AsanRegisterGlobals;
+ Function *AsanUnregisterGlobals;
+};
+
+// Stack poisoning does not play well with exception handling.
+// When an exception is thrown, we essentially bypass the code
+// that unpoisons the stack. This is why the run-time library has
+// to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire
+// stack in the interceptor. This however does not work inside the
+// actual function which catches the exception. Most likely because the
+// compiler hoists the load of the shadow value somewhere too high.
+// This causes asan to report a non-existent bug on 453.povray.
+// It sounds like an LLVM bug.
+struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
+ Function &F;
+ AddressSanitizer &ASan;
+ DIBuilder DIB;
+ LLVMContext *C;
+ Type *IntptrTy;
+ Type *IntptrPtrTy;
+ ShadowMapping Mapping;
+
+ SmallVector<AllocaInst*, 16> AllocaVec;
+ SmallVector<Instruction*, 8> RetVec;
+ uint64_t TotalStackSize;
+ unsigned StackAlignment;
+
+ Function *AsanStackMallocFunc, *AsanStackFreeFunc;
+ Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc;
+
+  // Stores the insertion point and arguments of a poisoning/unpoisoning call
+  // for an alloca.
+ struct AllocaPoisonCall {
+ IntrinsicInst *InsBefore;
+ uint64_t Size;
+ bool DoPoison;
+ };
+ SmallVector<AllocaPoisonCall, 8> AllocaPoisonCallVec;
+
+  // Maps a Value to the AllocaInst from which the Value originates.
+ typedef DenseMap<Value*, AllocaInst*> AllocaForValueMapTy;
+ AllocaForValueMapTy AllocaForValue;
+
+ FunctionStackPoisoner(Function &F, AddressSanitizer &ASan)
+ : F(F), ASan(ASan), DIB(*F.getParent()), C(ASan.C),
+ IntptrTy(ASan.IntptrTy), IntptrPtrTy(PointerType::get(IntptrTy, 0)),
+ Mapping(ASan.Mapping),
+ TotalStackSize(0), StackAlignment(1 << Mapping.Scale) {}
+
+ bool runOnFunction() {
+ if (!ClStack) return false;
+ // Collect alloca, ret, lifetime instructions etc.
+ for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
+ DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) {
+ BasicBlock *BB = *DI;
+ visit(*BB);
+ }
+ if (AllocaVec.empty()) return false;
+
+ initializeCallbacks(*F.getParent());
+
+ poisonStack();
+
+ if (ClDebugStack) {
+ DEBUG(dbgs() << F);
+ }
+ return true;
+ }
+
+  // Finds all static Alloca instructions, puts poisoned red zones around
+  // all of them, and then unpoisons everything back before the function
+  // returns.
+ void poisonStack();
+
+ // ----------------------- Visitors.
+ /// \brief Collect all Ret instructions.
+ void visitReturnInst(ReturnInst &RI) {
+ RetVec.push_back(&RI);
+ }
+
+ /// \brief Collect Alloca instructions we want (and can) handle.
+ void visitAllocaInst(AllocaInst &AI) {
+ if (!isInterestingAlloca(AI)) return;
+
+ StackAlignment = std::max(StackAlignment, AI.getAlignment());
+ AllocaVec.push_back(&AI);
+ uint64_t AlignedSize = getAlignedAllocaSize(&AI);
+ TotalStackSize += AlignedSize;
+ }
+
+ /// \brief Collect lifetime intrinsic calls to check for use-after-scope
+ /// errors.
+ void visitIntrinsicInst(IntrinsicInst &II) {
+ if (!ASan.CheckLifetime) return;
+ Intrinsic::ID ID = II.getIntrinsicID();
+ if (ID != Intrinsic::lifetime_start &&
+ ID != Intrinsic::lifetime_end)
+ return;
+ // Found lifetime intrinsic, add ASan instrumentation if necessary.
+ ConstantInt *Size = dyn_cast<ConstantInt>(II.getArgOperand(0));
+    // If the size argument is not a known constant or is undefined (-1),
+    // don't do anything.
+    if (!Size || Size->isMinusOne()) return;
+ // Check that size doesn't saturate uint64_t and can
+ // be stored in IntptrTy.
+ const uint64_t SizeValue = Size->getValue().getLimitedValue();
+ if (SizeValue == ~0ULL ||
+ !ConstantInt::isValueValidForType(IntptrTy, SizeValue))
+ return;
+ // Find alloca instruction that corresponds to llvm.lifetime argument.
+ AllocaInst *AI = findAllocaForValue(II.getArgOperand(1));
+ if (!AI) return;
+ bool DoPoison = (ID == Intrinsic::lifetime_end);
+ AllocaPoisonCall APC = {&II, SizeValue, DoPoison};
+ AllocaPoisonCallVec.push_back(APC);
+ }
+
+ // ---------------------- Helpers.
+ void initializeCallbacks(Module &M);
+
+ // Check if we want (and can) handle this alloca.
+ bool isInterestingAlloca(AllocaInst &AI) {
+ return (!AI.isArrayAllocation() &&
+ AI.isStaticAlloca() &&
+ AI.getAllocatedType()->isSized());
+ }
+
+ size_t RedzoneSize() const {
+ return RedzoneSizeForScale(Mapping.Scale);
+ }
+ uint64_t getAllocaSizeInBytes(AllocaInst *AI) {
+ Type *Ty = AI->getAllocatedType();
+ uint64_t SizeInBytes = ASan.TD->getTypeAllocSize(Ty);
+ return SizeInBytes;
+ }
+ uint64_t getAlignedSize(uint64_t SizeInBytes) {
+ size_t RZ = RedzoneSize();
+ return ((SizeInBytes + RZ - 1) / RZ) * RZ;
+ }
+ uint64_t getAlignedAllocaSize(AllocaInst *AI) {
+ uint64_t SizeInBytes = getAllocaSizeInBytes(AI);
+ return getAlignedSize(SizeInBytes);
+ }
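+  // For instance (illustrative numbers only): with the default 32-byte
+  // redzone, a 40-byte alloca is rounded up by getAlignedSize to 64 bytes.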
+ /// Finds alloca where the value comes from.
+ AllocaInst *findAllocaForValue(Value *V);
+ void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB,
+ Value *ShadowBase, bool DoPoison);
+ void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> IRB, bool DoPoison);
};
} // namespace
@@ -224,13 +502,21 @@ char AddressSanitizer::ID = 0;
INITIALIZE_PASS(AddressSanitizer, "asan",
"AddressSanitizer: detects use-after-free and out-of-bounds bugs.",
false, false)
-AddressSanitizer::AddressSanitizer() : FunctionPass(ID) { }
-FunctionPass *llvm::createAddressSanitizerPass() {
- return new AddressSanitizer();
+FunctionPass *llvm::createAddressSanitizerFunctionPass(
+ bool CheckInitOrder, bool CheckUseAfterReturn, bool CheckLifetime,
+ StringRef BlacklistFile, bool ZeroBaseShadow) {
+ return new AddressSanitizer(CheckInitOrder, CheckUseAfterReturn,
+ CheckLifetime, BlacklistFile, ZeroBaseShadow);
}
-const char *AddressSanitizer::getPassName() const {
- return "AddressSanitizer";
+char AddressSanitizerModule::ID = 0;
+INITIALIZE_PASS(AddressSanitizerModule, "asan-module",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
+ "ModulePass", false, false)
+ModulePass *llvm::createAddressSanitizerModulePass(
+ bool CheckInitOrder, StringRef BlacklistFile, bool ZeroBaseShadow) {
+ return new AddressSanitizerModule(CheckInitOrder, BlacklistFile,
+ ZeroBaseShadow);
}
static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
@@ -242,38 +528,44 @@ static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
// Create a constant for Str so that we can pass it to the run-time lib.
static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) {
Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
- return new GlobalVariable(M, StrConst->getType(), true,
- GlobalValue::PrivateLinkage, StrConst, "");
+ GlobalVariable *GV = new GlobalVariable(M, StrConst->getType(), true,
+ GlobalValue::PrivateLinkage, StrConst,
+ kAsanGenPrefix);
+ GV->setUnnamedAddr(true); // Ok to merge these.
+ GV->setAlignment(1); // Strings may not be merged w/o setting align 1.
+ return GV;
+}
+
+static bool GlobalWasGeneratedByAsan(GlobalVariable *G) {
+ return G->getName().find(kAsanGenPrefix) == 0;
}
Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
// Shadow >> scale
- Shadow = IRB.CreateLShr(Shadow, MappingScale);
- if (MappingOffset == 0)
+ Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
+ if (Mapping.Offset == 0)
return Shadow;
// (Shadow >> scale) | offset
- return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy,
- MappingOffset));
+ if (Mapping.OrShadowOffset)
+ return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset));
+ else
+ return IRB.CreateAdd(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset));
}
void AddressSanitizer::instrumentMemIntrinsicParam(
Instruction *OrigIns,
Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) {
+ IRBuilder<> IRB(InsertBefore);
+ if (Size->getType() != IntptrTy)
+ Size = IRB.CreateIntCast(Size, IntptrTy, false);
// Check the first byte.
- {
- IRBuilder<> IRB(InsertBefore);
- instrumentAddress(OrigIns, IRB, Addr, 8, IsWrite);
- }
+ instrumentAddress(OrigIns, InsertBefore, Addr, 8, IsWrite, Size);
// Check the last byte.
- {
- IRBuilder<> IRB(InsertBefore);
- Value *SizeMinusOne = IRB.CreateSub(
- Size, ConstantInt::get(Size->getType(), 1));
- SizeMinusOne = IRB.CreateIntCast(SizeMinusOne, IntptrTy, false);
- Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
- Value *AddrPlusSizeMinisOne = IRB.CreateAdd(AddrLong, SizeMinusOne);
- instrumentAddress(OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite);
- }
+ IRB.SetInsertPoint(InsertBefore);
+ Value *SizeMinusOne = IRB.CreateSub(Size, ConstantInt::get(IntptrTy, 1));
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ Value *AddrLast = IRB.CreateAdd(AddrLong, SizeMinusOne);
+ instrumentAddress(OrigIns, InsertBefore, AddrLast, 8, IsWrite, Size);
}
// Instrument memset/memmove/memcpy
@@ -328,30 +620,6 @@ static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) {
return NULL;
}
-void AddressSanitizer::FindDynamicInitializers(Module& M) {
- // Clang generates metadata identifying all dynamically initialized globals.
- NamedMDNode *DynamicGlobals =
- M.getNamedMetadata("llvm.asan.dynamically_initialized_globals");
- if (!DynamicGlobals)
- return;
- for (int i = 0, n = DynamicGlobals->getNumOperands(); i < n; ++i) {
- MDNode *MDN = DynamicGlobals->getOperand(i);
- assert(MDN->getNumOperands() == 1);
- Value *VG = MDN->getOperand(0);
- // The optimizer may optimize away a global entirely, in which case we
- // cannot instrument access to it.
- if (!VG)
- continue;
-
- GlobalVariable *G = cast<GlobalVariable>(VG);
- DynamicallyInitializedGlobals.insert(G);
- }
-}
-// Returns true if a global variable is initialized dynamically in this TU.
-bool AddressSanitizer::HasDynamicInitializer(GlobalVariable *G) {
- return DynamicallyInitializedGlobals.count(G);
-}
-
void AddressSanitizer::instrumentMop(Instruction *I) {
bool IsWrite = false;
Value *Addr = isInterestingMemoryAccess(I, &IsWrite);
@@ -360,14 +628,12 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) {
// If initialization order checking is disabled, a simple access to a
// dynamically initialized global is always valid.
- if (!ClInitializers)
+ if (!CheckInitOrder)
return;
// If a global variable does not have dynamic initialization we don't
- // have to instrument it. However, if a global has external linkage, we
- // assume it has dynamic initialization, as it may have an initializer
- // in a different TU.
- if (G->getLinkage() != GlobalVariable::ExternalLinkage &&
- !HasDynamicInitializer(G))
+      // have to instrument it. However, if a global does not have an
+      // initializer at all, we assume it has a dynamic initializer (in
+      // another TU).
+ if (G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G))
return;
}
}
@@ -378,21 +644,31 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
assert(OrigTy->isSized());
uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy);
- if (TypeSize != 8 && TypeSize != 16 &&
- TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
- // Ignore all unusual sizes.
- return;
- }
+ assert((TypeSize % 8) == 0);
+ // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check.
+ if (TypeSize == 8 || TypeSize == 16 ||
+ TypeSize == 32 || TypeSize == 64 || TypeSize == 128)
+ return instrumentAddress(I, I, Addr, TypeSize, IsWrite, 0);
+ // Instrument unusual size (but still multiple of 8).
+  // We cannot do it with a single check, so we do a 1-byte check for the first
+  // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able
+ // to report the actual access size.
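+  // For instance (illustrative size), a 10-byte access at address A gets
+  // 1-byte checks at A and A + 9; a failing check reports the faulting
+  // address together with the real access size, 10.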
IRBuilder<> IRB(I);
- instrumentAddress(I, IRB, Addr, TypeSize, IsWrite);
+ Value *LastByte = IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePointerCast(Addr, IntptrTy),
+ ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
+ OrigPtrTy);
+ Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
+ instrumentAddress(I, I, Addr, 8, IsWrite, Size);
+ instrumentAddress(I, I, LastByte, 8, IsWrite, Size);
}
// Validate the result of Module::getOrInsertFunction called for an interface
// function of AddressSanitizer. If the instrumented module defines a function
// with the same name, their prototypes must match, otherwise
// getOrInsertFunction returns a bitcast.
-Function *AddressSanitizer::checkInterfaceFunction(Constant *FuncOrBitcast) {
+static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
if (isa<Function>(FuncOrBitcast)) return cast<Function>(FuncOrBitcast);
FuncOrBitcast->dump();
report_fatal_error("trying to redefine an AddressSanitizer "
@@ -401,10 +677,12 @@ Function *AddressSanitizer::checkInterfaceFunction(Constant *FuncOrBitcast) {
Instruction *AddressSanitizer::generateCrashCode(
Instruction *InsertBefore, Value *Addr,
- bool IsWrite, size_t AccessSizeIndex) {
+ bool IsWrite, size_t AccessSizeIndex, Value *SizeArgument) {
IRBuilder<> IRB(InsertBefore);
- CallInst *Call = IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex],
- Addr);
+ CallInst *Call = SizeArgument
+ ? IRB.CreateCall2(AsanErrorCallbackSized[IsWrite], Addr, SizeArgument)
+ : IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], Addr);
+
// We don't do Call->setDoesNotReturn() because the BB already has
// UnreachableInst at the end.
// This EmptyAsm is required to avoid callback merge.
@@ -415,7 +693,7 @@ Instruction *AddressSanitizer::generateCrashCode(
Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
Value *ShadowValue,
uint32_t TypeSize) {
- size_t Granularity = 1 << MappingScale;
+ size_t Granularity = 1 << Mapping.Scale;
// Addr & (Granularity - 1)
Value *LastAccessedByte = IRB.CreateAnd(
AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
@@ -431,12 +709,14 @@ Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
}
void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
- IRBuilder<> &IRB, Value *Addr,
- uint32_t TypeSize, bool IsWrite) {
+ Instruction *InsertBefore,
+ Value *Addr, uint32_t TypeSize,
+ bool IsWrite, Value *SizeArgument) {
+ IRBuilder<> IRB(InsertBefore);
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
Type *ShadowTy = IntegerType::get(
- *C, std::max(8U, TypeSize >> MappingScale));
+ *C, std::max(8U, TypeSize >> Mapping.Scale));
Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
Value *ShadowPtr = memToShadow(AddrLong, IRB);
Value *CmpVal = Constant::getNullValue(ShadowTy);
@@ -445,7 +725,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
- size_t Granularity = 1 << MappingScale;
+ size_t Granularity = 1 << Mapping.Scale;
TerminatorInst *CrashTerm = 0;
if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
@@ -464,14 +744,13 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
CrashTerm = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), true);
}
- Instruction *Crash =
- generateCrashCode(CrashTerm, AddrLong, IsWrite, AccessSizeIndex);
+ Instruction *Crash = generateCrashCode(
+ CrashTerm, AddrLong, IsWrite, AccessSizeIndex, SizeArgument);
Crash->setDebugLoc(OrigIns->getDebugLoc());
}
-void AddressSanitizer::createInitializerPoisonCalls(Module &M,
- Value *FirstAddr,
- Value *LastAddr) {
+void AddressSanitizerModule::createInitializerPoisonCalls(
+ Module &M, GlobalValue *ModuleName) {
// We do all of our poisoning and unpoisoning within _GLOBAL__I_a.
Function *GlobalInit = M.getFunction("_GLOBAL__I_a");
// If that function is not present, this TU contains no globals, or they have
@@ -482,16 +761,9 @@ void AddressSanitizer::createInitializerPoisonCalls(Module &M,
// Set up the arguments to our poison/unpoison functions.
IRBuilder<> IRB(GlobalInit->begin()->getFirstInsertionPt());
- // Declare our poisoning and unpoisoning functions.
- Function *AsanPoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
- AsanPoisonGlobals->setLinkage(Function::ExternalLinkage);
- Function *AsanUnpoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanUnpoisonGlobalsName, IRB.getVoidTy(), NULL));
- AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage);
-
// Add a call to poison all external globals before the given function starts.
- IRB.CreateCall2(AsanPoisonGlobals, FirstAddr, LastAddr);
+ Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
+ IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr);
// Add calls to unpoison all globals before each return instruction.
for (Function::iterator I = GlobalInit->begin(), E = GlobalInit->end();
@@ -502,14 +774,14 @@ void AddressSanitizer::createInitializerPoisonCalls(Module &M,
}
}
-bool AddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) {
+bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
Type *Ty = cast<PointerType>(G->getType())->getElementType();
DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
if (BL->isIn(*G)) return false;
if (!Ty->isSized()) return false;
if (!G->hasInitializer()) return false;
- if (GlobalsCreatedByAsan.count(G)) return false; // Our own global.
+ if (GlobalWasGeneratedByAsan(G)) return false; // Our own global.
// Touch only those globals that will not be defined in other modules.
// Don't handle ODR type linkages since other modules may be built w/o asan.
if (G->getLinkage() != GlobalVariable::ExternalLinkage &&
@@ -522,7 +794,7 @@ bool AddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) {
if (G->isThreadLocal())
return false;
// For now, just ignore this Alloca if the alignment is large.
- if (G->getAlignment() > RedzoneSize) return false;
+ if (G->getAlignment() > RedzoneSize()) return false;
// Ignore all the globals with the names starting with "\01L_OBJC_".
// Many of those are put into the .cstring section. The linker compresses
@@ -561,10 +833,43 @@ bool AddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) {
return true;
}
+void AddressSanitizerModule::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+ // Declare our poisoning and unpoisoning functions.
+ AsanPoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, NULL));
+ AsanPoisonGlobals->setLinkage(Function::ExternalLinkage);
+ AsanUnpoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanUnpoisonGlobalsName, IRB.getVoidTy(), NULL));
+ AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage);
+ // Declare functions that register/unregister globals.
+ AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanRegisterGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, NULL));
+ AsanRegisterGlobals->setLinkage(Function::ExternalLinkage);
+ AsanUnregisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanUnregisterGlobalsName,
+ IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+ AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
+}
+
// This function replaces all global variables with new variables that have
// trailing redzones. It also creates a function that poisons
// redzones and inserts this function into llvm.global_ctors.
-bool AddressSanitizer::insertGlobalRedzones(Module &M) {
+bool AddressSanitizerModule::runOnModule(Module &M) {
+ if (!ClGlobals) return false;
+ TD = getAnalysisIfAvailable<DataLayout>();
+ if (!TD)
+ return false;
+ BL.reset(new BlackList(BlacklistFile));
+ if (BL->isIn(M)) return false;
+ C = &(M.getContext());
+ int LongSize = TD->getPointerSizeInBits();
+ IntptrTy = Type::getIntNTy(*C, LongSize);
+ Mapping = getShadowMapping(M, LongSize, ZeroBaseShadow);
+ initializeCallbacks(M);
+ DynamicallyInitializedGlobals.Init(M);
+
SmallVector<GlobalVariable *, 16> GlobalsToChange;
for (Module::GlobalListType::iterator G = M.global_begin(),
@@ -581,32 +886,48 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
// size_t size;
// size_t size_with_redzone;
// const char *name;
+ // const char *module_name;
// size_t has_dynamic_init;
// We initialize an array of such structures and pass it to a run-time call.
StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy,
IntptrTy, IntptrTy,
- IntptrTy, NULL);
+ IntptrTy, IntptrTy, NULL);
SmallVector<Constant *, 16> Initializers(n), DynamicInit;
- IRBuilder<> IRB(CtorInsertBefore);
- if (ClInitializers)
- FindDynamicInitializers(M);
+ Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
+ assert(CtorFunc);
+ IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
+
+ bool HasDynamicallyInitializedGlobals = false;
- // The addresses of the first and last dynamically initialized globals in
- // this TU. Used in initialization order checking.
- Value *FirstDynamic = 0, *LastDynamic = 0;
+ GlobalVariable *ModuleName = createPrivateGlobalForString(
+ M, M.getModuleIdentifier());
+  // We shouldn't merge identical module names, as this string serves as a
+  // unique module ID at runtime.
+ ModuleName->setUnnamedAddr(false);
for (size_t i = 0; i < n; i++) {
+ static const uint64_t kMaxGlobalRedzone = 1 << 18;
GlobalVariable *G = GlobalsToChange[i];
PointerType *PtrTy = cast<PointerType>(G->getType());
Type *Ty = PtrTy->getElementType();
uint64_t SizeInBytes = TD->getTypeAllocSize(Ty);
- uint64_t RightRedzoneSize = RedzoneSize +
- (RedzoneSize - (SizeInBytes % RedzoneSize));
+ uint64_t MinRZ = RedzoneSize();
+      // MinRZ <= RZ <= kMaxGlobalRedzone,
+      // trying to make RZ roughly 1/4 of SizeInBytes.
+ uint64_t RZ = std::max(MinRZ,
+ std::min(kMaxGlobalRedzone,
+ (SizeInBytes / MinRZ / 4) * MinRZ));
+ uint64_t RightRedzoneSize = RZ;
+ // Round up to MinRZ
+ if (SizeInBytes % MinRZ)
+ RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ);
+ assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
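+    // Illustrative sizing, assuming the default MinRZ of 32: a 1024-byte
+    // global gets RZ == min(kMaxGlobalRedzone, (1024 / 32 / 4) * 32) == 256
+    // with no extra rounding, while a 40-byte global gets RZ == MinRZ == 32
+    // plus 24 bytes of rounding, i.e. a 56-byte right redzone.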
Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
// Determine whether this global should be poisoned in initialization.
- bool GlobalHasDynamicInitializer = HasDynamicInitializer(G);
+ bool GlobalHasDynamicInitializer =
+ DynamicallyInitializedGlobals.Contains(G);
// Don't check initialization order if this global is blacklisted.
GlobalHasDynamicInitializer &= !BL->isInInit(*G);
@@ -615,18 +936,14 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
NewTy, G->getInitializer(),
Constant::getNullValue(RightRedZoneTy), NULL);
- SmallString<2048> DescriptionOfGlobal = G->getName();
- DescriptionOfGlobal += " (";
- DescriptionOfGlobal += M.getModuleIdentifier();
- DescriptionOfGlobal += ")";
- GlobalVariable *Name = createPrivateGlobalForString(M, DescriptionOfGlobal);
+ GlobalVariable *Name = createPrivateGlobalForString(M, G->getName());
// Create a new global variable with enough space for a redzone.
GlobalVariable *NewGlobal = new GlobalVariable(
M, NewTy, G->isConstant(), G->getLinkage(),
NewInitializer, "", G, G->getThreadLocalMode());
NewGlobal->copyAttributesFrom(G);
- NewGlobal->setAlignment(RedzoneSize);
+ NewGlobal->setAlignment(MinRZ);
Value *Indices2[2];
Indices2[0] = IRB.getInt32(0);
@@ -643,15 +960,13 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
ConstantInt::get(IntptrTy, SizeInBytes),
ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
ConstantExpr::getPointerCast(Name, IntptrTy),
+ ConstantExpr::getPointerCast(ModuleName, IntptrTy),
ConstantInt::get(IntptrTy, GlobalHasDynamicInitializer),
NULL);
// Populate the first and last globals declared in this TU.
- if (ClInitializers && GlobalHasDynamicInitializer) {
- LastDynamic = ConstantExpr::getPointerCast(NewGlobal, IntptrTy);
- if (FirstDynamic == 0)
- FirstDynamic = LastDynamic;
- }
+ if (CheckInitOrder && GlobalHasDynamicInitializer)
+ HasDynamicallyInitializedGlobals = true;
DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
}
@@ -662,14 +977,8 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
// Create calls for poisoning before initializers run and unpoisoning after.
- if (ClInitializers && FirstDynamic && LastDynamic)
- createInitializerPoisonCalls(M, FirstDynamic, LastDynamic);
-
- Function *AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanRegisterGlobalsName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, NULL));
- AsanRegisterGlobals->setLinkage(Function::ExternalLinkage);
-
+ if (CheckInitOrder && HasDynamicallyInitializedGlobals)
+ createInitializerPoisonCalls(M, ModuleName);
IRB.CreateCall2(AsanRegisterGlobals,
IRB.CreatePointerCast(AllGlobals, IntptrTy),
ConstantInt::get(IntptrTy, n));
@@ -681,12 +990,6 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB));
- Function *AsanUnregisterGlobals =
- checkInterfaceFunction(M.getOrInsertFunction(
- kAsanUnregisterGlobalsName,
- IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
- AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
-
IRB_Dtor.CreateCall2(AsanUnregisterGlobals,
IRB.CreatePointerCast(AllGlobals, IntptrTy),
ConstantInt::get(IntptrTy, n));
@@ -696,33 +999,8 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
return true;
}
-// virtual
-bool AddressSanitizer::doInitialization(Module &M) {
- // Initialize the private fields. No one has accessed them before.
- TD = getAnalysisIfAvailable<DataLayout>();
-
- if (!TD)
- return false;
- BL.reset(new BlackList(ClBlackListFile));
-
- C = &(M.getContext());
- LongSize = TD->getPointerSizeInBits();
- IntptrTy = Type::getIntNTy(*C, LongSize);
- IntptrPtrTy = PointerType::get(IntptrTy, 0);
-
- AsanCtorFunction = Function::Create(
- FunctionType::get(Type::getVoidTy(*C), false),
- GlobalValue::InternalLinkage, kAsanModuleCtorName, &M);
- BasicBlock *AsanCtorBB = BasicBlock::Create(*C, "", AsanCtorFunction);
- CtorInsertBefore = ReturnInst::Create(*C, AsanCtorBB);
-
- // call __asan_init in the module ctor.
- IRBuilder<> IRB(CtorInsertBefore);
- AsanInitFunction = checkInterfaceFunction(
- M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL));
- AsanInitFunction->setLinkage(Function::ExternalLinkage);
- IRB.CreateCall(AsanInitFunction);
-
+void AddressSanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
// Create __asan_report* callbacks.
for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
@@ -736,74 +1014,68 @@ bool AddressSanitizer::doInitialization(Module &M) {
FunctionName, IRB.getVoidTy(), IntptrTy, NULL));
}
}
+ AsanErrorCallbackSized[0] = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanReportLoadN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+ AsanErrorCallbackSized[1] = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanReportStoreN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
- AsanStackMallocFunc = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL));
- AsanStackFreeFunc = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanStackFreeName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, IntptrTy, NULL));
AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction(
kAsanHandleNoReturnName, IRB.getVoidTy(), NULL));
-
// We insert an empty inline asm after __asan_report* to avoid callback merge.
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
StringRef(""), StringRef(""),
/*hasSideEffects=*/true);
+}
- llvm::Triple targetTriple(M.getTargetTriple());
- bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::Android;
+void AddressSanitizer::emitShadowMapping(Module &M, IRBuilder<> &IRB) const {
+ // Tell the values of mapping offset and scale to the run-time.
+ GlobalValue *asan_mapping_offset =
+ new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage,
+ ConstantInt::get(IntptrTy, Mapping.Offset),
+ kAsanMappingOffsetName);
+ // Read the global, otherwise it may be optimized away.
+ IRB.CreateLoad(asan_mapping_offset, true);
+
+ GlobalValue *asan_mapping_scale =
+ new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage,
+ ConstantInt::get(IntptrTy, Mapping.Scale),
+ kAsanMappingScaleName);
+ // Read the global, otherwise it may be optimized away.
+ IRB.CreateLoad(asan_mapping_scale, true);
+}
- MappingOffset = isAndroid ? kDefaultShadowOffsetAndroid :
- (LongSize == 32 ? kDefaultShadowOffset32 : kDefaultShadowOffset64);
- if (ClMappingOffsetLog >= 0) {
- if (ClMappingOffsetLog == 0) {
- // special case
- MappingOffset = 0;
- } else {
- MappingOffset = 1ULL << ClMappingOffsetLog;
- }
- }
- MappingScale = kDefaultShadowScale;
- if (ClMappingScale) {
- MappingScale = ClMappingScale;
- }
- // Redzone used for stack and globals is at least 32 bytes.
- // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
- RedzoneSize = std::max(32, (int)(1 << MappingScale));
+// virtual
+bool AddressSanitizer::doInitialization(Module &M) {
+ // Initialize the private fields. No one has accessed them before.
+ TD = getAnalysisIfAvailable<DataLayout>();
+ if (!TD)
+ return false;
+ BL.reset(new BlackList(BlacklistFile));
+ DynamicallyInitializedGlobals.Init(M);
- if (ClMappingOffsetLog >= 0) {
- // Tell the run-time the current values of mapping offset and scale.
- GlobalValue *asan_mapping_offset =
- new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage,
- ConstantInt::get(IntptrTy, MappingOffset),
- kAsanMappingOffsetName);
- // Read the global, otherwise it may be optimized away.
- IRB.CreateLoad(asan_mapping_offset, true);
- }
- if (ClMappingScale) {
- GlobalValue *asan_mapping_scale =
- new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage,
- ConstantInt::get(IntptrTy, MappingScale),
- kAsanMappingScaleName);
- // Read the global, otherwise it may be optimized away.
- IRB.CreateLoad(asan_mapping_scale, true);
- }
+ C = &(M.getContext());
+ LongSize = TD->getPointerSizeInBits();
+ IntptrTy = Type::getIntNTy(*C, LongSize);
- appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndCtorPriority);
+ AsanCtorFunction = Function::Create(
+ FunctionType::get(Type::getVoidTy(*C), false),
+ GlobalValue::InternalLinkage, kAsanModuleCtorName, &M);
+ BasicBlock *AsanCtorBB = BasicBlock::Create(*C, "", AsanCtorFunction);
+ // call __asan_init in the module ctor.
+ IRBuilder<> IRB(ReturnInst::Create(*C, AsanCtorBB));
+ AsanInitFunction = checkInterfaceFunction(
+ M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL));
+ AsanInitFunction->setLinkage(Function::ExternalLinkage);
+ IRB.CreateCall(AsanInitFunction);
- return true;
-}
+ Mapping = getShadowMapping(M, LongSize, ZeroBaseShadow);
+ emitShadowMapping(M, IRB);
-bool AddressSanitizer::doFinalization(Module &M) {
- // We transform the globals at the very end so that the optimization analysis
- // works on the original globals.
- if (ClGlobals)
- return insertGlobalRedzones(M);
- return false;
+ appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndCtorPriority);
+ return true;
}
-
bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
// For each NSObject descendant having a +load method, this method is invoked
// by the ObjC runtime before any of the static constructors is called.
@@ -823,12 +1095,15 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
bool AddressSanitizer::runOnFunction(Function &F) {
if (BL->isIn(F)) return false;
if (&F == AsanCtorFunction) return false;
+ if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
+ initializeCallbacks(*F.getParent());
- // If needed, insert __asan_init before checking for AddressSafety attr.
+ // If needed, insert __asan_init before checking for SanitizeAddress attr.
maybeInsertAsanInitAtFunctionEntry(F);
- if (!F.getFnAttributes().hasAttribute(Attributes::AddressSafety))
+ if (!F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::SanitizeAddress))
return false;
if (!ClDebugFunc.empty() && ClDebugFunc != F.getName())
@@ -857,12 +1132,12 @@ bool AddressSanitizer::runOnFunction(Function &F) {
} else if (isa<MemIntrinsic>(BI) && ClMemIntrin) {
// ok, take it.
} else {
- if (CallInst *CI = dyn_cast<CallInst>(BI)) {
+ CallSite CS(BI);
+ if (CS) {
// A call inside BB.
TempsToInstrument.clear();
- if (CI->doesNotReturn()) {
- NoReturnCalls.push_back(CI);
- }
+ if (CS.doesNotReturn())
+ NoReturnCalls.push_back(CS.getInstruction());
}
continue;
}
@@ -887,7 +1162,8 @@ bool AddressSanitizer::runOnFunction(Function &F) {
NumInstrumented++;
}
- bool ChangedStack = poisonStackInFunction(F);
+ FunctionStackPoisoner FSP(F, *this);
+ bool ChangedStack = FSP.runOnFunction();
// We must unpoison the stack before every NoReturn call (throw, _exit, etc).
// See e.g. http://code.google.com/p/address-sanitizer/issues/detail?id=37
@@ -912,10 +1188,10 @@ static uint64_t ValueForPoison(uint64_t PoisonByte, size_t ShadowRedzoneSize) {
static void PoisonShadowPartialRightRedzone(uint8_t *Shadow,
size_t Size,
- size_t RedzoneSize,
+ size_t RZSize,
size_t ShadowGranularity,
uint8_t Magic) {
- for (size_t i = 0; i < RedzoneSize;
+ for (size_t i = 0; i < RZSize;
i+= ShadowGranularity, Shadow++) {
if (i + ShadowGranularity <= Size) {
*Shadow = 0; // fully addressable
@@ -927,10 +1203,35 @@ static void PoisonShadowPartialRightRedzone(uint8_t *Shadow,
}
}
-void AddressSanitizer::PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec,
- IRBuilder<> IRB,
- Value *ShadowBase, bool DoPoison) {
- size_t ShadowRZSize = RedzoneSize >> MappingScale;
+// Workaround for bug 11395: we don't want to instrument stack in functions
+// with large assembly blobs (32-bit only), otherwise reg alloc may crash.
+// FIXME: remove once the bug 11395 is fixed.
+bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
+ if (LongSize != 32) return false;
+ CallInst *CI = dyn_cast<CallInst>(I);
+ if (!CI || !CI->isInlineAsm()) return false;
+ if (CI->getNumArgOperands() <= 5) return false;
+ // We have inline assembly with quite a few arguments.
+ return true;
+}
+
+void FunctionStackPoisoner::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+ AsanStackMallocFunc = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL));
+ AsanStackFreeFunc = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanStackFreeName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, IntptrTy, NULL));
+ AsanPoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+ AsanUnpoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+}
+
+void FunctionStackPoisoner::poisonRedZones(
+ const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase,
+ bool DoPoison) {
+ size_t ShadowRZSize = RedzoneSize() >> Mapping.Scale;
assert(ShadowRZSize >= 1 && ShadowRZSize <= 4);
Type *RZTy = Type::getIntNTy(*C, ShadowRZSize * 8);
Type *RZPtrTy = PointerType::get(RZTy, 0);
@@ -946,12 +1247,12 @@ void AddressSanitizer::PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec,
IRB.CreateStore(PoisonLeft, IRB.CreateIntToPtr(ShadowBase, RZPtrTy));
// poison all other red zones.
- uint64_t Pos = RedzoneSize;
+ uint64_t Pos = RedzoneSize();
for (size_t i = 0, n = AllocaVec.size(); i < n; i++) {
AllocaInst *AI = AllocaVec[i];
uint64_t SizeInBytes = getAllocaSizeInBytes(AI);
uint64_t AlignedSize = getAlignedAllocaSize(AI);
- assert(AlignedSize - SizeInBytes < RedzoneSize);
+ assert(AlignedSize - SizeInBytes < RedzoneSize());
Value *Ptr = NULL;
Pos += AlignedSize;
@@ -961,13 +1262,13 @@ void AddressSanitizer::PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec,
// Poison the partial redzone at right
Ptr = IRB.CreateAdd(
ShadowBase, ConstantInt::get(IntptrTy,
- (Pos >> MappingScale) - ShadowRZSize));
- size_t AddressableBytes = RedzoneSize - (AlignedSize - SizeInBytes);
+ (Pos >> Mapping.Scale) - ShadowRZSize));
+ size_t AddressableBytes = RedzoneSize() - (AlignedSize - SizeInBytes);
uint32_t Poison = 0;
if (DoPoison) {
PoisonShadowPartialRightRedzone((uint8_t*)&Poison, AddressableBytes,
- RedzoneSize,
- 1ULL << MappingScale,
+ RedzoneSize(),
+ 1ULL << Mapping.Scale,
kAsanStackPartialRedzoneMagic);
}
Value *PartialPoison = ConstantInt::get(RZTy, Poison);
@@ -976,76 +1277,23 @@ void AddressSanitizer::PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec,
// Poison the full redzone at right.
Ptr = IRB.CreateAdd(ShadowBase,
- ConstantInt::get(IntptrTy, Pos >> MappingScale));
- Value *Poison = i == AllocaVec.size() - 1 ? PoisonRight : PoisonMid;
+ ConstantInt::get(IntptrTy, Pos >> Mapping.Scale));
+ bool LastAlloca = (i == AllocaVec.size() - 1);
+ Value *Poison = LastAlloca ? PoisonRight : PoisonMid;
IRB.CreateStore(Poison, IRB.CreateIntToPtr(Ptr, RZPtrTy));
- Pos += RedzoneSize;
+ Pos += RedzoneSize();
}
}
-// Workaround for bug 11395: we don't want to instrument stack in functions
-// with large assembly blobs (32-bit only), otherwise reg alloc may crash.
-// FIXME: remove once the bug 11395 is fixed.
-bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
- if (LongSize != 32) return false;
- CallInst *CI = dyn_cast<CallInst>(I);
- if (!CI || !CI->isInlineAsm()) return false;
- if (CI->getNumArgOperands() <= 5) return false;
- // We have inline assembly with quite a few arguments.
- return true;
-}
-
-// Find all static Alloca instructions and put
-// poisoned red zones around all of them.
-// Then unpoison everything back before the function returns.
-//
-// Stack poisoning does not play well with exception handling.
-// When an exception is thrown, we essentially bypass the code
-// that unpoisones the stack. This is why the run-time library has
-// to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire
-// stack in the interceptor. This however does not work inside the
-// actual function which catches the exception. Most likely because the
-// compiler hoists the load of the shadow value somewhere too high.
-// This causes asan to report a non-existing bug on 453.povray.
-// It sounds like an LLVM bug.
-bool AddressSanitizer::poisonStackInFunction(Function &F) {
- if (!ClStack) return false;
- SmallVector<AllocaInst*, 16> AllocaVec;
- SmallVector<Instruction*, 8> RetVec;
- uint64_t TotalSize = 0;
+void FunctionStackPoisoner::poisonStack() {
+ uint64_t LocalStackSize = TotalStackSize +
+ (AllocaVec.size() + 1) * RedzoneSize();
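+  // For example (illustrative sizes, with the default 32-byte redzone): two
+  // allocas whose aligned sizes are 32 and 64 bytes need
+  // 96 + (2 + 1) * 32 == 192 bytes: a left redzone, then each alloca followed
+  // by its own redzone.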
- // Filter out Alloca instructions we want (and can) handle.
- // Collect Ret instructions.
- for (Function::iterator FI = F.begin(), FE = F.end();
- FI != FE; ++FI) {
- BasicBlock &BB = *FI;
- for (BasicBlock::iterator BI = BB.begin(), BE = BB.end();
- BI != BE; ++BI) {
- if (isa<ReturnInst>(BI)) {
- RetVec.push_back(BI);
- continue;
- }
-
- AllocaInst *AI = dyn_cast<AllocaInst>(BI);
- if (!AI) continue;
- if (AI->isArrayAllocation()) continue;
- if (!AI->isStaticAlloca()) continue;
- if (!AI->getAllocatedType()->isSized()) continue;
- if (AI->getAlignment() > RedzoneSize) continue;
- AllocaVec.push_back(AI);
- uint64_t AlignedSize = getAlignedAllocaSize(AI);
- TotalSize += AlignedSize;
- }
- }
-
- if (AllocaVec.empty()) return false;
-
- uint64_t LocalStackSize = TotalSize + (AllocaVec.size() + 1) * RedzoneSize;
-
- bool DoStackMalloc = ClUseAfterReturn
+ bool DoStackMalloc = ASan.CheckUseAfterReturn
&& LocalStackSize <= kMaxStackMallocSize;
+ assert(AllocaVec.size() > 0);
Instruction *InsBefore = AllocaVec[0];
IRBuilder<> IRB(InsBefore);
@@ -1053,7 +1301,9 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) {
Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize);
AllocaInst *MyAlloca =
new AllocaInst(ByteArrayTy, "MyAlloca", InsBefore);
- MyAlloca->setAlignment(RedzoneSize);
+ if (ClRealignStack && StackAlignment < RedzoneSize())
+ StackAlignment = RedzoneSize();
+ MyAlloca->setAlignment(StackAlignment);
assert(MyAlloca->isStaticAlloca());
Value *OrigStackBase = IRB.CreatePointerCast(MyAlloca, IntptrTy);
Value *LocalStackBase = OrigStackBase;
@@ -1063,12 +1313,24 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) {
ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase);
}
- // This string will be parsed by the run-time (DescribeStackAddress).
+ // This string will be parsed by the run-time (DescribeAddressIfStack).
SmallString<2048> StackDescriptionStorage;
raw_svector_ostream StackDescription(StackDescriptionStorage);
- StackDescription << F.getName() << " " << AllocaVec.size() << " ";
+ StackDescription << AllocaVec.size() << " ";
+
+ // Insert poison calls for lifetime intrinsics for alloca.
+ bool HavePoisonedAllocas = false;
+ for (size_t i = 0, n = AllocaPoisonCallVec.size(); i < n; i++) {
+ const AllocaPoisonCall &APC = AllocaPoisonCallVec[i];
+ IntrinsicInst *II = APC.InsBefore;
+ AllocaInst *AI = findAllocaForValue(II->getArgOperand(1));
+ assert(AI);
+ IRBuilder<> IRB(II);
+ poisonAlloca(AI, APC.Size, IRB, APC.DoPoison);
+ HavePoisonedAllocas |= APC.DoPoison;
+ }
- uint64_t Pos = RedzoneSize;
+ uint64_t Pos = RedzoneSize();
// Replace Alloca instructions with base+offset.
for (size_t i = 0, n = AllocaVec.size(); i < n; i++) {
AllocaInst *AI = AllocaVec[i];
@@ -1077,57 +1339,115 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) {
StackDescription << Pos << " " << SizeInBytes << " "
<< Name.size() << " " << Name << " ";
uint64_t AlignedSize = getAlignedAllocaSize(AI);
- assert((AlignedSize % RedzoneSize) == 0);
- AI->replaceAllUsesWith(
- IRB.CreateIntToPtr(
+ assert((AlignedSize % RedzoneSize()) == 0);
+ Value *NewAllocaPtr = IRB.CreateIntToPtr(
IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Pos)),
- AI->getType()));
- Pos += AlignedSize + RedzoneSize;
+ AI->getType());
+ replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB);
+ AI->replaceAllUsesWith(NewAllocaPtr);
+ Pos += AlignedSize + RedzoneSize();
}
assert(Pos == LocalStackSize);
- // Write the Magic value and the frame description constant to the redzone.
+ // The left-most redzone has enough space for at least 4 pointers.
+ // Write the Magic value to redzone[0].
Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy);
IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic),
BasePlus0);
- Value *BasePlus1 = IRB.CreateAdd(LocalStackBase,
- ConstantInt::get(IntptrTy, LongSize/8));
- BasePlus1 = IRB.CreateIntToPtr(BasePlus1, IntptrPtrTy);
+ // Write the frame description constant to redzone[1].
+ Value *BasePlus1 = IRB.CreateIntToPtr(
+ IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, ASan.LongSize/8)),
+ IntptrPtrTy);
GlobalVariable *StackDescriptionGlobal =
createPrivateGlobalForString(*F.getParent(), StackDescription.str());
- GlobalsCreatedByAsan.insert(StackDescriptionGlobal);
- Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
+ Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal,
+ IntptrTy);
IRB.CreateStore(Description, BasePlus1);
+ // Write the PC to redzone[2].
+ Value *BasePlus2 = IRB.CreateIntToPtr(
+ IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy,
+ 2 * ASan.LongSize/8)),
+ IntptrPtrTy);
+ IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);
// Poison the stack redzones at the entry.
- Value *ShadowBase = memToShadow(LocalStackBase, IRB);
- PoisonStack(ArrayRef<AllocaInst*>(AllocaVec), IRB, ShadowBase, true);
+ Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
+ poisonRedZones(AllocaVec, IRB, ShadowBase, true);
// Unpoison the stack before all ret instructions.
for (size_t i = 0, n = RetVec.size(); i < n; i++) {
Instruction *Ret = RetVec[i];
IRBuilder<> IRBRet(Ret);
-
// Mark the current frame as retired.
IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic),
BasePlus0);
// Unpoison the stack.
- PoisonStack(ArrayRef<AllocaInst*>(AllocaVec), IRBRet, ShadowBase, false);
-
+ poisonRedZones(AllocaVec, IRBRet, ShadowBase, false);
if (DoStackMalloc) {
+ // In use-after-return mode, mark the whole stack frame unaddressable.
IRBRet.CreateCall3(AsanStackFreeFunc, LocalStackBase,
ConstantInt::get(IntptrTy, LocalStackSize),
OrigStackBase);
+ } else if (HavePoisonedAllocas) {
+ // If we poisoned some allocas during llvm.lifetime analysis,
+ // unpoison the whole stack frame now.
+ assert(LocalStackBase == OrigStackBase);
+ poisonAlloca(LocalStackBase, LocalStackSize, IRBRet, false);
}
}
// We are done. Remove the old unused alloca instructions.
for (size_t i = 0, n = AllocaVec.size(); i < n; i++)
AllocaVec[i]->eraseFromParent();
+}
- if (ClDebugStack) {
- DEBUG(dbgs() << F);
- }
+void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
+ IRBuilder<> IRB, bool DoPoison) {
+ // For now just insert the call to ASan runtime.
+ Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
+ Value *SizeArg = ConstantInt::get(IntptrTy, Size);
+ IRB.CreateCall2(DoPoison ? AsanPoisonStackMemoryFunc
+ : AsanUnpoisonStackMemoryFunc,
+ AddrArg, SizeArg);
+}
- return true;
+// Handling llvm.lifetime intrinsics for a given %alloca:
+// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca.
+// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect
+// invalid accesses) and unpoison it for llvm.lifetime.start (the memory
+// could be poisoned by previous llvm.lifetime.end instruction, as the
+// variable may go in and out of scope several times, e.g. in loops).
+// (3) if we poisoned at least one %alloca in a function,
+// unpoison the whole stack frame at function exit.
+
+AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V))
+ // We're interested only in allocas we can handle.
+ return isInterestingAlloca(*AI) ? AI : 0;
+ // See if we've already calculated (or started to calculate) alloca for a
+ // given value.
+ AllocaForValueMapTy::iterator I = AllocaForValue.find(V);
+ if (I != AllocaForValue.end())
+ return I->second;
+ // Store 0 while we're calculating alloca for value V to avoid
+ // infinite recursion if the value references itself.
+ AllocaForValue[V] = 0;
+ AllocaInst *Res = 0;
+ if (CastInst *CI = dyn_cast<CastInst>(V))
+ Res = findAllocaForValue(CI->getOperand(0));
+ else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *IncValue = PN->getIncomingValue(i);
+ // Allow self-referencing phi-nodes.
+ if (IncValue == PN) continue;
+ AllocaInst *IncValueAI = findAllocaForValue(IncValue);
+ // AI for incoming values should exist and should all be equal.
+ if (IncValueAI == 0 || (Res != 0 && IncValueAI != Res))
+ return 0;
+ Res = IncValueAI;
+ }
+ }
+ if (Res != 0)
+ AllocaForValue[V] = Res;
+ return Res;
}
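
The cycle guard in findAllocaForValue (store 0 into the map before recursing, then overwrite it only with a definite answer) is what makes self-referencing PHI nodes safe. Reduced to a standalone sketch on a hypothetical Node chain (Node, Base and findRoot are illustrative stand-ins, not code from this patch):

    // Cache holds 0 for "in progress or unknown", so a cycle simply
    // resolves to "no root found", exactly as in findAllocaForValue.
    #include <map>
    struct Node { Node *Base; };                    // stand-in for a Value chain
    static std::map<Node *, Node *> Cache;
    Node *findRoot(Node *V) {
      std::map<Node *, Node *>::iterator I = Cache.find(V);
      if (I != Cache.end()) return I->second;       // computed, or in progress (0)
      Cache[V] = 0;                                 // guard against cycles
      Node *Res = V->Base ? findRoot(V->Base) : V;  // a node without a base is its own root
      if (Res) Cache[V] = Res;                      // cache only definite answers
      return Res;
    }
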
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BlackList.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BlackList.cpp
index ef34b8a..927982d 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/BlackList.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/BlackList.cpp
@@ -13,26 +13,26 @@
//
//===----------------------------------------------------------------------===//
-#include <utility>
-#include <string>
-
-#include "BlackList.h"
+#include "llvm/Transforms/Utils/BlackList.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Module.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/system_error.h"
+#include <string>
+#include <utility>
namespace llvm {
BlackList::BlackList(const StringRef Path) {
// Validate and open blacklist file.
- if (!Path.size()) return;
+ if (Path.empty()) return;
OwningPtr<MemoryBuffer> File;
if (error_code EC = MemoryBuffer::getFile(Path, File)) {
report_fatal_error("Can't open blacklist file: " + Path + ": " +
@@ -52,6 +52,10 @@ BlackList::BlackList(const StringRef Path) {
std::pair<StringRef, StringRef> SplitLine = I->split(":");
StringRef Prefix = SplitLine.first;
std::string Regexp = SplitLine.second;
+ if (Regexp.empty()) {
+ // Missing ':' in the line.
+ report_fatal_error("malformed blacklist line: " + SplitLine.first);
+ }
// Replace * with .*
for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos;
@@ -68,38 +72,54 @@ BlackList::BlackList(const StringRef Path) {
}
// Add this regexp into the proper group by its prefix.
- if (Regexps[Prefix].size())
+ if (!Regexps[Prefix].empty())
Regexps[Prefix] += "|";
Regexps[Prefix] += Regexp;
}
// Iterate through each of the prefixes, and create Regexs for them.
- for (StringMap<std::string>::iterator I = Regexps.begin(), E = Regexps.end();
- I != E; ++I) {
+ for (StringMap<std::string>::const_iterator I = Regexps.begin(),
+ E = Regexps.end(); I != E; ++I) {
Entries[I->getKey()] = new Regex(I->getValue());
}
}
-bool BlackList::isIn(const Function &F) {
+bool BlackList::isIn(const Function &F) const {
return isIn(*F.getParent()) || inSection("fun", F.getName());
}
-bool BlackList::isIn(const GlobalVariable &G) {
+bool BlackList::isIn(const GlobalVariable &G) const {
return isIn(*G.getParent()) || inSection("global", G.getName());
}
-bool BlackList::isIn(const Module &M) {
+bool BlackList::isIn(const Module &M) const {
return inSection("src", M.getModuleIdentifier());
}
-bool BlackList::isInInit(const GlobalVariable &G) {
- return isIn(*G.getParent()) || inSection("global-init", G.getName());
+static StringRef GetGVTypeString(const GlobalVariable &G) {
+ // Types of GlobalVariables are always pointer types.
+ Type *GType = G.getType()->getElementType();
+ // For now we support blacklisting struct types only.
+ if (StructType *SGType = dyn_cast<StructType>(GType)) {
+ if (!SGType->isLiteral())
+ return SGType->getName();
+ }
+ return "<unknown type>";
+}
+
+bool BlackList::isInInit(const GlobalVariable &G) const {
+ return (isIn(*G.getParent()) ||
+ inSection("global-init", G.getName()) ||
+ inSection("global-init-type", GetGVTypeString(G)));
}
bool BlackList::inSection(const StringRef Section,
- const StringRef Query) {
- Regex *FunctionRegex = Entries[Section];
- return FunctionRegex ? FunctionRegex->match(Query) : false;
+ const StringRef Query) const {
+ StringMap<Regex*>::const_iterator I = Entries.find(Section);
+ if (I == Entries.end()) return false;
+
+ Regex *FunctionRegex = I->getValue();
+ return FunctionRegex->match(Query);
}
} // namespace llvm
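
With the new global-init-type section above, GetGVTypeString lets a blacklist entry match globals by the name of their (non-literal) struct type instead of by symbol name. A hypothetical blacklist line such as

    global-init-type:*MyNamespace::MyClass*

would exclude every global of that struct type from initialization-order checking (the type name is purely illustrative). Note also that a line with no ':' separator is now rejected with report_fatal_error instead of quietly turning into an empty regexp.
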
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BlackList.h b/contrib/llvm/lib/Transforms/Instrumentation/BlackList.h
deleted file mode 100644
index f3c05a5..0000000
--- a/contrib/llvm/lib/Transforms/Instrumentation/BlackList.h
+++ /dev/null
@@ -1,57 +0,0 @@
-//===-- BlackList.h - blacklist for sanitizers ------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//===----------------------------------------------------------------------===//
-//
-// This is a utility class for instrumentation passes (like AddressSanitizer
-// or ThreadSanitizer) to avoid instrumenting some functions or global
-// variables based on a user-supplied blacklist.
-//
-// The blacklist disables instrumentation of various functions and global
-// variables. Each line contains a prefix, followed by a wild card expression.
-// Empty lines and lines starting with "#" are ignored.
-// ---
-// # Blacklisted items:
-// fun:*_ZN4base6subtle*
-// global:*global_with_bad_access_or_initialization*
-// global-init:*global_with_initialization_issues*
-// src:file_with_tricky_code.cc
-// ---
-// Note that the wild card is in fact an llvm::Regex, but * is automatically
-// replaced with .*
-// This is similar to the "ignore" feature of ThreadSanitizer.
-// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores
-//
-//===----------------------------------------------------------------------===//
-//
-
-#include "llvm/ADT/StringMap.h"
-
-namespace llvm {
-class Function;
-class GlobalVariable;
-class Module;
-class Regex;
-class StringRef;
-
-class BlackList {
- public:
- BlackList(const StringRef Path);
- // Returns whether either this function or it's source file are blacklisted.
- bool isIn(const Function &F);
- // Returns whether either this global or it's source file are blacklisted.
- bool isIn(const GlobalVariable &G);
- // Returns whether this module is blacklisted by filename.
- bool isIn(const Module &M);
- // Returns whether a global should be excluded from initialization checking.
- bool isInInit(const GlobalVariable &G);
- private:
- StringMap<Regex*> Entries;
-
- bool inSection(const StringRef Section, const StringRef Query);
-};
-
-} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 7810b1b..b094d42 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -13,19 +13,19 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "bounds-checking"
-#include "llvm/IRBuilder.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Pass.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstIterator.h"
#include "llvm/Support/TargetFolder.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Instrumentation.h"
using namespace llvm;
static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap",
@@ -41,7 +41,7 @@ namespace {
struct BoundsChecking : public FunctionPass {
static char ID;
- BoundsChecking(unsigned _Penalty = 5) : FunctionPass(ID), Penalty(_Penalty){
+ BoundsChecking() : FunctionPass(ID) {
initializeBoundsCheckingPass(*PassRegistry::getPassRegistry());
}
@@ -59,7 +59,6 @@ namespace {
BuilderTy *Builder;
Instruction *Inst;
BasicBlock *TrapBB;
- unsigned Penalty;
BasicBlock *getTrapBB();
void emitBranchToTrap(Value *Cmp = 0);
@@ -109,6 +108,7 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) {
else
Cmp = 0; // unconditional branch
}
+ ++ChecksAdded;
Instruction *Inst = Builder->GetInsertPoint();
BasicBlock *OldBB = Inst->getParent();
@@ -163,7 +163,6 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) {
}
emitBranchToTrap(Or);
- ++ChecksAdded;
return true;
}
@@ -208,6 +207,6 @@ bool BoundsChecking::runOnFunction(Function &F) {
return MadeChange;
}
-FunctionPass *llvm::createBoundsCheckingPass(unsigned Penalty) {
- return new BoundsChecking(Penalty);
+FunctionPass *llvm::createBoundsCheckingPass() {
+ return new BoundsChecking();
}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp
index e8ef265..a2459fb 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp
@@ -18,13 +18,13 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "insert-edge-profiling"
+#include "llvm/Transforms/Instrumentation.h"
#include "ProfilingUtils.h"
-#include "llvm/Module.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/ADT/Statistic.h"
#include <set>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index e9192e5..2edd151 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -16,21 +16,23 @@
#define DEBUG_TYPE "insert-gcov-profiling"
-#include "ProfilingUtils.h"
#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
+#include "ProfilingUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/UniqueVector.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugLoc.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/InstIterator.h"
#include "llvm/Support/PathV2.h"
#include "llvm/Support/raw_ostream.h"
@@ -39,30 +41,57 @@
#include <utility>
using namespace llvm;
+static cl::opt<std::string>
+DefaultGCOVVersion("default-gcov-version", cl::init("402*"), cl::Hidden,
+ cl::ValueRequired);
+
+GCOVOptions GCOVOptions::getDefault() {
+ GCOVOptions Options;
+ Options.EmitNotes = true;
+ Options.EmitData = true;
+ Options.UseCfgChecksum = false;
+ Options.NoRedZone = false;
+ Options.FunctionNamesInData = true;
+
+ if (DefaultGCOVVersion.size() != 4) {
+ llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") +
+ DefaultGCOVVersion);
+ }
+ memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4);
+ return Options;
+}
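
The .gcno header is written as "oncg" followed by the four version characters in reverse, so the GCOVProfiler constructors below keep a reversed, NUL-terminated copy of Options.Version. A standalone check of what the default "402*" turns into (only the strings visible in this patch are assumed):

    #include <cassert>
    #include <cstring>
    int main() {
      const char V[4] = {'4', '0', '2', '*'};      // DefaultGCOVVersion "402*"
      char R[5] = {V[3], V[2], V[1], V[0], '\0'};  // same reversal as the ctor
      assert(std::strcmp(R, "*204") == 0);         // tail of the old "oncg*204MVLL" magic
      return 0;
    }

which reproduces exactly the "*204" that the old Use402Format path hard-coded into "oncg*204MVLL".
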
+
namespace {
class GCOVProfiler : public ModulePass {
public:
static char ID;
- GCOVProfiler()
- : ModulePass(ID), EmitNotes(true), EmitData(true), Use402Format(false),
- UseExtraChecksum(false) {
+ GCOVProfiler() : ModulePass(ID), Options(GCOVOptions::getDefault()) {
+ ReversedVersion[0] = Options.Version[3];
+ ReversedVersion[1] = Options.Version[2];
+ ReversedVersion[2] = Options.Version[1];
+ ReversedVersion[3] = Options.Version[0];
+ ReversedVersion[4] = '\0';
initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
}
- GCOVProfiler(bool EmitNotes, bool EmitData, bool use402Format = false,
- bool useExtraChecksum = false)
- : ModulePass(ID), EmitNotes(EmitNotes), EmitData(EmitData),
- Use402Format(use402Format), UseExtraChecksum(useExtraChecksum) {
- assert((EmitNotes || EmitData) && "GCOVProfiler asked to do nothing?");
+ GCOVProfiler(const GCOVOptions &Options) : ModulePass(ID), Options(Options){
+ assert((Options.EmitNotes || Options.EmitData) &&
+ "GCOVProfiler asked to do nothing?");
+ ReversedVersion[0] = Options.Version[3];
+ ReversedVersion[1] = Options.Version[2];
+ ReversedVersion[2] = Options.Version[1];
+ ReversedVersion[3] = Options.Version[0];
+ ReversedVersion[4] = '\0';
initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
}
virtual const char *getPassName() const {
return "GCOV Profiler";
}
+
private:
bool runOnModule(Module &M);
- // Create the GCNO files for the Module based on DebugInfo.
- void emitGCNO();
+ // Create the .gcno files for the Module based on DebugInfo.
+ void emitProfileNotes();
// Modify the program to track transitions along edges and call into the
// profiling runtime to emit .gcda files when run.
@@ -73,6 +102,8 @@ namespace {
Constant *getIncrementIndirectCounterFunc();
Constant *getEmitFunctionFunc();
Constant *getEmitArcsFunc();
+ Constant *getDeleteWriteoutFunctionListFunc();
+ Constant *getDeleteFlushFunctionListFunc();
Constant *getEndFileFunc();
// Create or retrieve an i32 state value that is used to represent the
@@ -83,21 +114,22 @@ namespace {
// block number.
GlobalVariable *buildEdgeLookupTable(Function *F,
GlobalVariable *Counter,
- const UniqueVector<BasicBlock *> &Preds,
- const UniqueVector<BasicBlock *> &Succs);
+ const UniqueVector<BasicBlock *>&Preds,
+ const UniqueVector<BasicBlock*>&Succs);
// Add the function to write out all our counters to the global destructor
// list.
- void insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*, MDNode*> >);
+ Function *insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*,
+ MDNode*> >);
+ Function *insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> >);
void insertIndirectCounterIncrement();
- void insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> >);
std::string mangleName(DICompileUnit CU, const char *NewStem);
- bool EmitNotes;
- bool EmitData;
- bool Use402Format;
- bool UseExtraChecksum;
+ GCOVOptions Options;
+
+ // Reversed, NUL-terminated copy of Options.Version.
+ char ReversedVersion[5];
Module *M;
LLVMContext *Ctx;
@@ -108,10 +140,14 @@ char GCOVProfiler::ID = 0;
INITIALIZE_PASS(GCOVProfiler, "insert-gcov-profiling",
"Insert instrumentation for GCOV profiling", false, false)
-ModulePass *llvm::createGCOVProfilerPass(bool EmitNotes, bool EmitData,
- bool Use402Format,
- bool UseExtraChecksum) {
- return new GCOVProfiler(EmitNotes, EmitData, Use402Format, UseExtraChecksum);
+ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) {
+ return new GCOVProfiler(Options);
+}
+
+static std::string getFunctionName(DISubprogram SP) {
+ if (!SP.getLinkageName().empty())
+ return SP.getLinkageName();
+ return SP.getName();
}
namespace {
@@ -249,8 +285,8 @@ namespace {
// object users can construct, the blocks and lines will be rooted here.
class GCOVFunction : public GCOVRecord {
public:
- GCOVFunction(DISubprogram SP, raw_ostream *os,
- bool Use402Format, bool UseExtraChecksum) {
+ GCOVFunction(DISubprogram SP, raw_ostream *os, uint32_t Ident,
+ bool UseCfgChecksum) {
this->os = os;
Function *F = SP.getFunction();
@@ -262,17 +298,16 @@ namespace {
ReturnBlock = new GCOVBlock(i++, os);
writeBytes(FunctionTag, 4);
- uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(SP.getName()) +
+ uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) +
1 + lengthOfGCOVString(SP.getFilename()) + 1;
- if (UseExtraChecksum)
+ if (UseCfgChecksum)
++BlockLen;
write(BlockLen);
- uint32_t Ident = reinterpret_cast<intptr_t>((MDNode*)SP);
write(Ident);
write(0); // lineno checksum
- if (UseExtraChecksum)
+ if (UseCfgChecksum)
write(0); // cfg checksum
- writeGCOVString(SP.getName());
+ writeGCOVString(getFunctionName(SP));
writeGCOVString(SP.getFilename());
write(SP.getLineNumber());
}
@@ -347,19 +382,23 @@ std::string GCOVProfiler::mangleName(DICompileUnit CU, const char *NewStem) {
SmallString<128> Filename = CU.getFilename();
sys::path::replace_extension(Filename, NewStem);
- return sys::path::filename(Filename.str());
+ StringRef FName = sys::path::filename(Filename);
+ SmallString<128> CurPath;
+ if (sys::fs::current_path(CurPath)) return FName;
+ sys::path::append(CurPath, FName.str());
+ return CurPath.str();
}
bool GCOVProfiler::runOnModule(Module &M) {
this->M = &M;
Ctx = &M.getContext();
- if (EmitNotes) emitGCNO();
- if (EmitData) return emitProfileArcs();
+ if (Options.EmitNotes) emitProfileNotes();
+ if (Options.EmitData) return emitProfileArcs();
return false;
}
-void GCOVProfiler::emitGCNO() {
+void GCOVProfiler::emitProfileNotes() {
NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
if (!CU_Nodes) return;
@@ -372,10 +411,9 @@ void GCOVProfiler::emitGCNO() {
std::string ErrorInfo;
raw_fd_ostream out(mangleName(CU, "gcno").c_str(), ErrorInfo,
raw_fd_ostream::F_Binary);
- if (!Use402Format)
- out.write("oncg*404MVLL", 12);
- else
- out.write("oncg*204MVLL", 12);
+ out.write("oncg", 4);
+ out.write(ReversedVersion, 4);
+ out.write("MVLL", 4);
DIArray SPs = CU.getSubprograms();
for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
@@ -384,7 +422,7 @@ void GCOVProfiler::emitGCNO() {
Function *F = SP.getFunction();
if (!F) continue;
- GCOVFunction Func(SP, &out, Use402Format, UseExtraChecksum);
+ GCOVFunction Func(SP, &out, i, Options.UseCfgChecksum);
for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
GCOVBlock &Block = Func.getBlock(BB);
@@ -465,21 +503,18 @@ bool GCOVProfiler::emitProfileArcs() {
Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
Edge);
Value *Count = Builder.CreateLoad(Counter);
- Count = Builder.CreateAdd(Count,
- ConstantInt::get(Type::getInt64Ty(*Ctx),1));
+ Count = Builder.CreateAdd(Count, Builder.getInt64(1));
Builder.CreateStore(Count, Counter);
} else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- Value *Sel = Builder.CreateSelect(
- BI->getCondition(),
- ConstantInt::get(Type::getInt64Ty(*Ctx), Edge),
- ConstantInt::get(Type::getInt64Ty(*Ctx), Edge + 1));
+ Value *Sel = Builder.CreateSelect(BI->getCondition(),
+ Builder.getInt64(Edge),
+ Builder.getInt64(Edge + 1));
SmallVector<Value *, 2> Idx;
- Idx.push_back(Constant::getNullValue(Type::getInt64Ty(*Ctx)));
+ Idx.push_back(Builder.getInt64(0));
Idx.push_back(Sel);
Value *Counter = Builder.CreateInBoundsGEP(Counters, Idx);
Value *Count = Builder.CreateLoad(Counter);
- Count = Builder.CreateAdd(Count,
- ConstantInt::get(Type::getInt64Ty(*Ctx),1));
+ Count = Builder.CreateAdd(Count, Builder.getInt64(1));
Builder.CreateStore(Count, Counter);
} else {
ComplexEdgePreds.insert(BB);
@@ -496,10 +531,9 @@ bool GCOVProfiler::emitProfileArcs() {
ComplexEdgePreds, ComplexEdgeSuccs);
GlobalVariable *EdgeState = getEdgeStateValue();
- Type *Int32Ty = Type::getInt32Ty(*Ctx);
for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) {
IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator());
- Builder.CreateStore(ConstantInt::get(Int32Ty, i), EdgeState);
+ Builder.CreateStore(Builder.getInt32(i), EdgeState);
}
for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) {
// call runtime to perform increment
@@ -518,8 +552,38 @@ bool GCOVProfiler::emitProfileArcs() {
}
}
- insertCounterWriteout(CountersBySP);
- insertFlush(CountersBySP);
+ Function *WriteoutF = insertCounterWriteout(CountersBySP);
+ Function *FlushF = insertFlush(CountersBySP);
+
+ // Create a small bit of code that registers the "__llvm_gcov_writeout" to
+ // be executed at exit and the "__llvm_gcov_flush" function to be executed
+ // when "__gcov_flush" is called.
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *F = Function::Create(FTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_init", M);
+ F->setUnnamedAddr(true);
+ F->setLinkage(GlobalValue::InternalLinkage);
+ F->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ F->addFnAttr(Attribute::NoRedZone);
+
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
+ IRBuilder<> Builder(BB);
+
+ FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Type *Params[] = {
+ PointerType::get(FTy, 0),
+ PointerType::get(FTy, 0)
+ };
+ FTy = FunctionType::get(Builder.getVoidTy(), Params, false);
+
+ // Initialize the environment and register the local writeout and flush
+ // functions.
+ Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
+ Builder.CreateCall2(GCOVInit, WriteoutF, FlushF);
+ Builder.CreateRetVoid();
+
+ appendToGlobalCtors(*M, F, 0);
}
if (InsertIndCounterIncrCode)
@@ -540,13 +604,13 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
// read it. Threads and invoke make this untrue.
// emit [(succs * preds) x i64*], logically [succ x [pred x i64*]].
+ size_t TableSize = Succs.size() * Preds.size();
Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx);
- ArrayType *EdgeTableTy = ArrayType::get(
- Int64PtrTy, Succs.size() * Preds.size());
+ ArrayType *EdgeTableTy = ArrayType::get(Int64PtrTy, TableSize);
- Constant **EdgeTable = new Constant*[Succs.size() * Preds.size()];
+ OwningArrayPtr<Constant *> EdgeTable(new Constant*[TableSize]);
Constant *NullValue = Constant::getNullValue(Int64PtrTy);
- for (int i = 0, ie = Succs.size() * Preds.size(); i != ie; ++i)
+ for (size_t i = 0; i != TableSize; ++i)
EdgeTable[i] = NullValue;
unsigned Edge = 0;
@@ -556,8 +620,8 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
if (Successors > 1 && !isa<BranchInst>(TI) && !isa<ReturnInst>(TI)) {
for (int i = 0; i != Successors; ++i) {
BasicBlock *Succ = TI->getSuccessor(i);
- IRBuilder<> builder(Succ);
- Value *Counter = builder.CreateConstInBoundsGEP2_64(Counters, 0,
+ IRBuilder<> Builder(Succ);
+ Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
Edge + i);
EdgeTable[((Succs.idFor(Succ)-1) * Preds.size()) +
(Preds.idFor(BB)-1)] = cast<Constant>(Counter);
@@ -566,7 +630,7 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
Edge += Successors;
}
- ArrayRef<Constant*> V(&EdgeTable[0], Succs.size() * Preds.size());
+ ArrayRef<Constant*> V(&EdgeTable[0], TableSize);
GlobalVariable *EdgeTableGV =
new GlobalVariable(
*M, EdgeTableTy, true, GlobalValue::InternalLinkage,
@@ -577,8 +641,11 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
}
Constant *GCOVProfiler::getStartFileFunc() {
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
- Type::getInt8PtrTy(*Ctx), false);
+ Type *Args[] = {
+ Type::getInt8PtrTy(*Ctx), // const char *orig_filename
+ Type::getInt8PtrTy(*Ctx), // const char version[4]
+ };
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
return M->getOrInsertFunction("llvm_gcda_start_file", FTy);
}
@@ -594,9 +661,10 @@ Constant *GCOVProfiler::getIncrementIndirectCounterFunc() {
}
Constant *GCOVProfiler::getEmitFunctionFunc() {
- Type *Args[2] = {
+ Type *Args[3] = {
Type::getInt32Ty(*Ctx), // uint32_t ident
Type::getInt8PtrTy(*Ctx), // const char *function_name
+ Type::getInt8Ty(*Ctx), // uint8_t use_extra_checksum
};
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
@@ -607,11 +675,20 @@ Constant *GCOVProfiler::getEmitArcsFunc() {
Type::getInt32Ty(*Ctx), // uint32_t num_counters
Type::getInt64PtrTy(*Ctx), // uint64_t *counters
};
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
- Args, false);
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
}
+Constant *GCOVProfiler::getDeleteWriteoutFunctionListFunc() {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ return M->getOrInsertFunction("llvm_delete_writeout_function_list", FTy);
+}
+
+Constant *GCOVProfiler::getDeleteFlushFunctionListFunc() {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ return M->getOrInsertFunction("llvm_delete_flush_function_list", FTy);
+}
+
Constant *GCOVProfiler::getEndFileFunc() {
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
return M->getOrInsertFunction("llvm_gcda_end_file", FTy);
@@ -630,7 +707,7 @@ GlobalVariable *GCOVProfiler::getEdgeStateValue() {
return GV;
}
-void GCOVProfiler::insertCounterWriteout(
+Function *GCOVProfiler::insertCounterWriteout(
ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) {
FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
Function *WriteoutF = M->getFunction("__llvm_gcov_writeout");
@@ -638,6 +715,9 @@ void GCOVProfiler::insertCounterWriteout(
WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage,
"__llvm_gcov_writeout", M);
WriteoutF->setUnnamedAddr(true);
+ WriteoutF->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ WriteoutF->addFnAttr(Attribute::NoRedZone);
BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF);
IRBuilder<> Builder(BB);
@@ -652,48 +732,31 @@ void GCOVProfiler::insertCounterWriteout(
for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
DICompileUnit CU(CU_Nodes->getOperand(i));
std::string FilenameGcda = mangleName(CU, "gcda");
- Builder.CreateCall(StartFile,
- Builder.CreateGlobalStringPtr(FilenameGcda));
- for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator
- I = CountersBySP.begin(), E = CountersBySP.end();
- I != E; ++I) {
- DISubprogram SP(I->second);
- intptr_t ident = reinterpret_cast<intptr_t>(I->second);
- Builder.CreateCall2(EmitFunction,
- ConstantInt::get(Type::getInt32Ty(*Ctx), ident),
- Builder.CreateGlobalStringPtr(SP.getName()));
-
- GlobalVariable *GV = I->first;
+ Builder.CreateCall2(StartFile,
+ Builder.CreateGlobalStringPtr(FilenameGcda),
+ Builder.CreateGlobalStringPtr(ReversedVersion));
+ for (unsigned j = 0, e = CountersBySP.size(); j != e; ++j) {
+ DISubprogram SP(CountersBySP[j].second);
+ Builder.CreateCall3(
+ EmitFunction, Builder.getInt32(j),
+ Options.FunctionNamesInData ?
+ Builder.CreateGlobalStringPtr(getFunctionName(SP)) :
+ Constant::getNullValue(Builder.getInt8PtrTy()),
+ Builder.getInt8(Options.UseCfgChecksum));
+
+ GlobalVariable *GV = CountersBySP[j].first;
unsigned Arcs =
cast<ArrayType>(GV->getType()->getElementType())->getNumElements();
Builder.CreateCall2(EmitArcs,
- ConstantInt::get(Type::getInt32Ty(*Ctx), Arcs),
+ Builder.getInt32(Arcs),
Builder.CreateConstGEP2_64(GV, 0, 0));
}
Builder.CreateCall(EndFile);
}
}
- Builder.CreateRetVoid();
- // Create a small bit of code that registers the "__llvm_gcov_writeout"
- // function to be executed at exit.
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- Function *F = Function::Create(FTy, GlobalValue::InternalLinkage,
- "__llvm_gcov_init", M);
- F->setUnnamedAddr(true);
- F->setLinkage(GlobalValue::InternalLinkage);
- F->addFnAttr(Attributes::NoInline);
-
- BB = BasicBlock::Create(*Ctx, "entry", F);
- Builder.SetInsertPoint(BB);
-
- FTy = FunctionType::get(Type::getInt32Ty(*Ctx),
- PointerType::get(FTy, 0), false);
- Constant *AtExitFn = M->getOrInsertFunction("atexit", FTy);
- Builder.CreateCall(AtExitFn, WriteoutF);
Builder.CreateRetVoid();
-
- appendToGlobalCtors(*M, F, 0);
+ return WriteoutF;
}
void GCOVProfiler::insertIndirectCounterIncrement() {
@@ -701,11 +764,9 @@ void GCOVProfiler::insertIndirectCounterIncrement() {
cast<Function>(GCOVProfiler::getIncrementIndirectCounterFunc());
Fn->setUnnamedAddr(true);
Fn->setLinkage(GlobalValue::InternalLinkage);
- Fn->addFnAttr(Attributes::NoInline);
-
- Type *Int32Ty = Type::getInt32Ty(*Ctx);
- Type *Int64Ty = Type::getInt64Ty(*Ctx);
- Constant *NegOne = ConstantInt::get(Int32Ty, 0xffffffff);
+ Fn->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ Fn->addFnAttr(Attribute::NoRedZone);
// Create basic blocks for function.
BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", Fn);
@@ -720,26 +781,27 @@ void GCOVProfiler::insertIndirectCounterIncrement() {
Argument *Arg = Fn->arg_begin();
Arg->setName("predecessor");
Value *Pred = Builder.CreateLoad(Arg, "pred");
- Value *Cond = Builder.CreateICmpEQ(Pred, NegOne);
+ Value *Cond = Builder.CreateICmpEQ(Pred, Builder.getInt32(0xffffffff));
BranchInst::Create(Exit, PredNotNegOne, Cond, BB);
Builder.SetInsertPoint(PredNotNegOne);
// uint64_t *counter = counters[pred];
// if (!counter) return;
- Value *ZExtPred = Builder.CreateZExt(Pred, Int64Ty);
+ Value *ZExtPred = Builder.CreateZExt(Pred, Builder.getInt64Ty());
Arg = llvm::next(Fn->arg_begin());
Arg->setName("counters");
Value *GEP = Builder.CreateGEP(Arg, ZExtPred);
Value *Counter = Builder.CreateLoad(GEP, "counter");
Cond = Builder.CreateICmpEQ(Counter,
- Constant::getNullValue(Int64Ty->getPointerTo()));
+ Constant::getNullValue(
+ Builder.getInt64Ty()->getPointerTo()));
Builder.CreateCondBr(Cond, Exit, CounterEnd);
// ++*counter;
Builder.SetInsertPoint(CounterEnd);
Value *Add = Builder.CreateAdd(Builder.CreateLoad(Counter),
- ConstantInt::get(Int64Ty, 1));
+ Builder.getInt64(1));
Builder.CreateStore(Add, Counter);
Builder.CreateBr(Exit);
@@ -748,16 +810,19 @@ void GCOVProfiler::insertIndirectCounterIncrement() {
Builder.CreateRetVoid();
}
-void GCOVProfiler::
+Function *GCOVProfiler::
insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- Function *FlushF = M->getFunction("__gcov_flush");
+ Function *FlushF = M->getFunction("__llvm_gcov_flush");
if (!FlushF)
FlushF = Function::Create(FTy, GlobalValue::InternalLinkage,
- "__gcov_flush", M);
+ "__llvm_gcov_flush", M);
else
FlushF->setLinkage(GlobalValue::InternalLinkage);
FlushF->setUnnamedAddr(true);
+ FlushF->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ FlushF->addFnAttr(Attribute::NoRedZone);
BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", FlushF);
@@ -781,8 +846,10 @@ insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
if (RetTy == Type::getVoidTy(*Ctx))
Builder.CreateRetVoid();
else if (RetTy->isIntegerTy())
- // Used if __gcov_flush was implicitly declared.
+ // Used if __llvm_gcov_flush was implicitly declared.
Builder.CreateRet(ConstantInt::get(RetTy, 0));
else
- report_fatal_error("invalid return type for __gcov_flush");
+ report_fatal_error("invalid return type for __llvm_gcov_flush");
+
+ return FlushF;
}
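
Instead of registering __llvm_gcov_writeout with atexit, emitProfileArcs now appends __llvm_gcov_init to the module's global constructors, and that init function hands both the writeout and the flush function to a runtime hook. The declaration implied by the FunctionType built there is roughly the following; the real prototype lives in the profiling runtime, not in this patch:

    // Two pointers to internal void() functions, as passed by __llvm_gcov_init.
    extern "C" void llvm_gcov_init(void (*writeout)(void), void (*flush)(void));
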
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
index 1e0b4a3..8ba1025 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -21,11 +21,13 @@ using namespace llvm;
/// library.
void llvm::initializeInstrumentation(PassRegistry &Registry) {
initializeAddressSanitizerPass(Registry);
+ initializeAddressSanitizerModulePass(Registry);
initializeBoundsCheckingPass(Registry);
initializeEdgeProfilerPass(Registry);
initializeGCOVProfilerPass(Registry);
initializeOptimalEdgeProfilerPass(Registry);
initializePathProfilerPass(Registry);
+ initializeMemorySanitizerPass(Registry);
initializeThreadSanitizerPass(Registry);
}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
index a4bb5a6..363539b 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
@@ -15,10 +15,10 @@
#ifndef LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
#define LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
-#include "llvm/BasicBlock.h"
#include "llvm/ADT/EquivalenceClasses.h"
-#include <vector>
+#include "llvm/IR/BasicBlock.h"
#include <algorithm>
+#include <vector>
namespace llvm {
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
new file mode 100644
index 0000000..4e75904
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -0,0 +1,1985 @@
+//===-- MemorySanitizer.cpp - detector of uninitialized reads -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file is a part of MemorySanitizer, a detector of uninitialized
+/// reads.
+///
+/// Status: early prototype.
+///
+/// The algorithm of the tool is similar to Memcheck
+/// (http://goo.gl/QKbem). We associate a few shadow bits with every
+/// byte of the application memory, poison the shadow of the malloc-ed
+/// or alloca-ed memory, load the shadow bits on every memory read,
+/// propagate the shadow bits through some of the arithmetic
+/// instructions (including MOV), store the shadow bits on every memory
+/// write, report a bug on some other instructions (e.g. JMP) if the
+/// associated shadow is poisoned.
+///
+/// But there are differences too. The first and the major one:
+/// compiler instrumentation instead of binary instrumentation. This
+/// gives us much better register allocation, possible compiler
+/// optimizations and a fast start-up. But this brings the major issue
+/// as well: msan needs to see all program events, including system
+/// calls and reads/writes in system libraries, so we either need to
+/// compile *everything* with msan or use a binary translation
+/// component (e.g. DynamoRIO) to instrument pre-built libraries.
+/// Another difference from Memcheck is that we use 8 shadow bits per
+/// byte of application memory and use a direct shadow mapping. This
+/// greatly simplifies the instrumentation code and avoids races on
+/// shadow updates (Memcheck is single-threaded so races are not a
+/// concern there. Memcheck uses 2 shadow bits per byte with a slow
+/// path storage that uses 8 bits per byte).
+///
+/// The default value of shadow is 0, which means "clean" (not poisoned).
+///
+/// Every module initializer should call __msan_init to ensure that the
+/// shadow memory is ready. On error, __msan_warning is called. Since
+/// parameters and return values may be passed via registers, we have a
+/// specialized thread-local shadow for return values
+/// (__msan_retval_tls) and parameters (__msan_param_tls).
+///
+/// Origin tracking.
+///
+/// MemorySanitizer can track origins (allocation points) of all uninitialized
+/// values. This behavior is controlled with a flag (msan-track-origins) and is
+/// disabled by default.
+///
+/// Origins are 4-byte values created and interpreted by the runtime library.
+/// They are stored in a second shadow mapping, one 4-byte value for 4 bytes
+/// of application memory. Propagation of origins is basically a bunch of
+/// "select" instructions that pick the origin of a dirty argument, if an
+/// instruction has one.
+///
+/// Every 4 aligned, consecutive bytes of application memory have one origin
+/// value associated with them. If these bytes contain uninitialized data
+/// coming from 2 different allocations, the last store wins. Because of this,
+/// MemorySanitizer reports can show unrelated origins, but this is unlikely in
+/// practice.
+///
+/// Origins are meaningless for fully initialized values, so MemorySanitizer
+/// avoids storing origin to memory when a fully initialized value is stored.
+/// This way it avoids needlessly overwriting the origin of the 4-byte region on
+/// a short (i.e. 1 byte) clean store, and it is also good for performance.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msan"
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BlackList.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+static const uint64_t kShadowMask32 = 1ULL << 31;
+static const uint64_t kShadowMask64 = 1ULL << 46;
+static const uint64_t kOriginOffset32 = 1ULL << 30;
+static const uint64_t kOriginOffset64 = 1ULL << 45;
+static const unsigned kMinOriginAlignment = 4;
+static const unsigned kShadowTLSAlignment = 8;
+
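
Combining the constants above with the formulas documented on the ShadowMask and OriginOffset members further down (ShadowAddr = AppAddr & ~ShadowMask, OriginAddr = (ShadowAddr + OriginOffset) & ~3ULL), the 64-bit mapping reduces to the sketch below; it is illustrative only, not code from the pass:

    #include <cstdint>
    static const uint64_t kMask   = 1ULL << 46;  // kShadowMask64
    static const uint64_t kOrigin = 1ULL << 45;  // kOriginOffset64
    uint64_t shadowAddr(uint64_t App) { return App & ~kMask; }
    uint64_t originAddr(uint64_t App) { return (shadowAddr(App) + kOrigin) & ~3ULL; }
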
+/// \brief Track origins of uninitialized values.
+///
+/// Adds a section to MemorySanitizer report that points to the allocation
+/// (stack or heap) the uninitialized bits came from originally.
+static cl::opt<bool> ClTrackOrigins("msan-track-origins",
+ cl::desc("Track origins (allocation sites) of poisoned memory"),
+ cl::Hidden, cl::init(false));
+static cl::opt<bool> ClKeepGoing("msan-keep-going",
+ cl::desc("keep going after reporting a UMR"),
+ cl::Hidden, cl::init(false));
+static cl::opt<bool> ClPoisonStack("msan-poison-stack",
+ cl::desc("poison uninitialized stack variables"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
+ cl::desc("poison uninitialized stack variables with a call"),
+ cl::Hidden, cl::init(false));
+static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
+ cl::desc("poison uninitialized stack variables with the given pattern"),
+ cl::Hidden, cl::init(0xff));
+static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
+ cl::desc("poison undef temps"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmp("msan-handle-icmp",
+ cl::desc("propagate shadow through ICmpEQ and ICmpNE"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
+ cl::desc("exact handling of relational integer ICmp"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClStoreCleanOrigin("msan-store-clean-origin",
+ cl::desc("store origin for clean (fully initialized) values"),
+ cl::Hidden, cl::init(false));
+
+// This flag controls whether we check the shadow of the address
+// operand of load or store. Such bugs are very rare, since load from
+// a garbage address typically results in SEGV, but still happen
+// (e.g. only lower bits of address are garbage, or the access happens
+// early at program startup where malloc-ed memory is more likely to
+// be zeroed). As of 2012-08-28 this flag adds 20% slowdown.
+static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address",
+ cl::desc("report accesses through a pointer which has poisoned shadow"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions",
+ cl::desc("print out instructions with default strict semantics"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<std::string> ClBlacklistFile("msan-blacklist",
+ cl::desc("File containing the list of functions where MemorySanitizer "
+ "should not report bugs"), cl::Hidden);
+
+namespace {
+
+/// \brief An instrumentation pass implementing detection of uninitialized
+/// reads.
+///
+/// MemorySanitizer: instrument the code in module to find
+/// uninitialized reads.
+class MemorySanitizer : public FunctionPass {
+ public:
+ MemorySanitizer(bool TrackOrigins = false,
+ StringRef BlacklistFile = StringRef())
+ : FunctionPass(ID),
+ TrackOrigins(TrackOrigins || ClTrackOrigins),
+ TD(0),
+ WarningFn(0),
+ BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
+ : BlacklistFile) { }
+ const char *getPassName() const { return "MemorySanitizer"; }
+ bool runOnFunction(Function &F);
+ bool doInitialization(Module &M);
+ static char ID; // Pass identification, replacement for typeid.
+
+ private:
+ void initializeCallbacks(Module &M);
+
+ /// \brief Track origins (allocation points) of uninitialized values.
+ bool TrackOrigins;
+
+ DataLayout *TD;
+ LLVMContext *C;
+ Type *IntptrTy;
+ Type *OriginTy;
+ /// \brief Thread-local shadow storage for function parameters.
+ GlobalVariable *ParamTLS;
+ /// \brief Thread-local origin storage for function parameters.
+ GlobalVariable *ParamOriginTLS;
+ /// \brief Thread-local shadow storage for function return value.
+ GlobalVariable *RetvalTLS;
+ /// \brief Thread-local origin storage for function return value.
+ GlobalVariable *RetvalOriginTLS;
+ /// \brief Thread-local shadow storage for in-register va_arg function
+ /// parameters (x86_64-specific).
+ GlobalVariable *VAArgTLS;
+ /// \brief Thread-local shadow storage for va_arg overflow area
+ /// (x86_64-specific).
+ GlobalVariable *VAArgOverflowSizeTLS;
+ /// \brief Thread-local space used to pass origin value to the UMR reporting
+ /// function.
+ GlobalVariable *OriginTLS;
+
+ /// \brief The run-time callback to print a warning.
+ Value *WarningFn;
+ /// \brief Run-time helper that copies origin info for a memory range.
+ Value *MsanCopyOriginFn;
+ /// \brief Run-time helper that generates a new origin value for a stack
+ /// allocation.
+ Value *MsanSetAllocaOriginFn;
+ /// \brief Run-time helper that poisons stack on function entry.
+ Value *MsanPoisonStackFn;
+ /// \brief MSan runtime replacements for memmove, memcpy and memset.
+ Value *MemmoveFn, *MemcpyFn, *MemsetFn;
+
+ /// \brief Address mask used in application-to-shadow address calculation.
+ /// ShadowAddr is computed as ApplicationAddr & ~ShadowMask.
+ uint64_t ShadowMask;
+ /// \brief Offset of the origin shadow from the "normal" shadow.
+ /// OriginAddr is computed as (ShadowAddr + OriginOffset) & ~3ULL
+ uint64_t OriginOffset;
+ /// \brief Branch weights for error reporting.
+ MDNode *ColdCallWeights;
+ /// \brief Branch weights for origin store.
+ MDNode *OriginStoreWeights;
+ /// \brief Path to blacklist file.
+ SmallString<64> BlacklistFile;
+ /// \brief The blacklist.
+ OwningPtr<BlackList> BL;
+ /// \brief An empty volatile inline asm that prevents callback merge.
+ InlineAsm *EmptyAsm;
+
+ friend struct MemorySanitizerVisitor;
+ friend struct VarArgAMD64Helper;
+};
+} // namespace
+
+char MemorySanitizer::ID = 0;
+INITIALIZE_PASS(MemorySanitizer, "msan",
+ "MemorySanitizer: detects uninitialized reads.",
+ false, false)
+
+FunctionPass *llvm::createMemorySanitizerPass(bool TrackOrigins,
+ StringRef BlacklistFile) {
+ return new MemorySanitizer(TrackOrigins, BlacklistFile);
+}
+
+/// \brief Create a non-const global initialized with the given string.
+///
+/// Creates a writable global for Str so that we can pass it to the
+/// run-time lib. The runtime uses the first 4 bytes of the string to store the
+/// frame ID, so the string needs to be mutable.
+static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,
+ StringRef Str) {
+ Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+ return new GlobalVariable(M, StrConst->getType(), /*isConstant=*/false,
+ GlobalValue::PrivateLinkage, StrConst, "");
+}
+
+
+/// \brief Insert extern declaration of runtime-provided functions and globals.
+void MemorySanitizer::initializeCallbacks(Module &M) {
+ // Only do this once.
+ if (WarningFn)
+ return;
+
+ IRBuilder<> IRB(*C);
+ // Create the callback.
+ // FIXME: this function should have "Cold" calling conv,
+ // which is not yet implemented.
+ StringRef WarningFnName = ClKeepGoing ? "__msan_warning"
+ : "__msan_warning_noreturn";
+ WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), NULL);
+
+ MsanCopyOriginFn = M.getOrInsertFunction(
+ "__msan_copy_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy, NULL);
+ MsanSetAllocaOriginFn = M.getOrInsertFunction(
+ "__msan_set_alloca_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
+ IRB.getInt8PtrTy(), NULL);
+ MsanPoisonStackFn = M.getOrInsertFunction(
+ "__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL);
+ MemmoveFn = M.getOrInsertFunction(
+ "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy, NULL);
+ MemcpyFn = M.getOrInsertFunction(
+ "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IntptrTy, NULL);
+ MemsetFn = M.getOrInsertFunction(
+ "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
+ IntptrTy, NULL);
+
+ // Create globals.
+ RetvalTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), 8), false,
+ GlobalVariable::ExternalLinkage, 0, "__msan_retval_tls", 0,
+ GlobalVariable::GeneralDynamicTLSModel);
+ RetvalOriginTLS = new GlobalVariable(
+ M, OriginTy, false, GlobalVariable::ExternalLinkage, 0,
+ "__msan_retval_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+
+ ParamTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
+ GlobalVariable::ExternalLinkage, 0, "__msan_param_tls", 0,
+ GlobalVariable::GeneralDynamicTLSModel);
+ ParamOriginTLS = new GlobalVariable(
+ M, ArrayType::get(OriginTy, 1000), false, GlobalVariable::ExternalLinkage,
+ 0, "__msan_param_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+
+ VAArgTLS = new GlobalVariable(
+ M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
+ GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_tls", 0,
+ GlobalVariable::GeneralDynamicTLSModel);
+ VAArgOverflowSizeTLS = new GlobalVariable(
+ M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, 0,
+ "__msan_va_arg_overflow_size_tls", 0,
+ GlobalVariable::GeneralDynamicTLSModel);
+ OriginTLS = new GlobalVariable(
+ M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, 0,
+ "__msan_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+
+ // We insert an empty inline asm after __msan_report* to avoid callback merge.
+ EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
+ StringRef(""), StringRef(""),
+ /*hasSideEffects=*/true);
+}
+
+/// \brief Module-level initialization.
+///
+/// Inserts a call to __msan_init into the module's constructor list.
+bool MemorySanitizer::doInitialization(Module &M) {
+ TD = getAnalysisIfAvailable<DataLayout>();
+ if (!TD)
+ return false;
+ BL.reset(new BlackList(BlacklistFile));
+ C = &(M.getContext());
+ unsigned PtrSize = TD->getPointerSizeInBits(/* AddressSpace */0);
+ switch (PtrSize) {
+ case 64:
+ ShadowMask = kShadowMask64;
+ OriginOffset = kOriginOffset64;
+ break;
+ case 32:
+ ShadowMask = kShadowMask32;
+ OriginOffset = kOriginOffset32;
+ break;
+ default:
+ report_fatal_error("unsupported pointer size");
+ break;
+ }
+
+ IRBuilder<> IRB(*C);
+ IntptrTy = IRB.getIntPtrTy(TD);
+ OriginTy = IRB.getInt32Ty();
+
+ ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+ OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+
+ // Insert a call to __msan_init/__msan_track_origins into the module's CTORs.
+ appendToGlobalCtors(M, cast<Function>(M.getOrInsertFunction(
+ "__msan_init", IRB.getVoidTy(), NULL)), 0);
+
+ new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
+ IRB.getInt32(TrackOrigins), "__msan_track_origins");
+
+ new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
+ IRB.getInt32(ClKeepGoing), "__msan_keep_going");
+
+ return true;
+}
+
+namespace {
+
+/// \brief A helper class that handles instrumentation of VarArg
+/// functions on a particular platform.
+///
+/// Implementations are expected to insert the instrumentation
+/// necessary to propagate argument shadow through VarArg function
+/// calls. Visit* methods are called during an InstVisitor pass over
+/// the function, and should avoid creating new basic blocks. A new
+/// instance of this class is created for each instrumented function.
+struct VarArgHelper {
+ /// \brief Visit a CallSite.
+ virtual void visitCallSite(CallSite &CS, IRBuilder<> &IRB) = 0;
+
+ /// \brief Visit a va_start call.
+ virtual void visitVAStartInst(VAStartInst &I) = 0;
+
+ /// \brief Visit a va_copy call.
+ virtual void visitVACopyInst(VACopyInst &I) = 0;
+
+ /// \brief Finalize function instrumentation.
+ ///
+ /// This method is called after visiting all interesting (see above)
+ /// instructions in a function.
+ virtual void finalizeInstrumentation() = 0;
+
+ virtual ~VarArgHelper() {}
+};
+
+struct MemorySanitizerVisitor;
+
+VarArgHelper*
+CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+ MemorySanitizerVisitor &Visitor);
+
+/// This class does all the work for a given function. Store and Load
+/// instructions store and load corresponding shadow and origin
+/// values. Most instructions propagate shadow from arguments to their
+/// return values. Certain instructions (most importantly, BranchInst)
+/// test their argument shadow and print reports (with a runtime call) if it's
+/// non-zero.
+struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
+ Function &F;
+ MemorySanitizer &MS;
+ SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
+ ValueMap<Value*, Value*> ShadowMap, OriginMap;
+ bool InsertChecks;
+ bool LoadShadow;
+ OwningPtr<VarArgHelper> VAHelper;
+
+ struct ShadowOriginAndInsertPoint {
+ Instruction *Shadow;
+ Instruction *Origin;
+ Instruction *OrigIns;
+ ShadowOriginAndInsertPoint(Instruction *S, Instruction *O, Instruction *I)
+ : Shadow(S), Origin(O), OrigIns(I) { }
+ ShadowOriginAndInsertPoint() : Shadow(0), Origin(0), OrigIns(0) { }
+ };
+ SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
+ SmallVector<Instruction*, 16> StoreList;
+
+ MemorySanitizerVisitor(Function &F, MemorySanitizer &MS)
+ : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) {
+ LoadShadow = InsertChecks =
+ !MS.BL->isIn(F) &&
+ F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::SanitizeMemory);
+
+ DEBUG(if (!InsertChecks)
+ dbgs() << "MemorySanitizer is not inserting checks into '"
+ << F.getName() << "'\n");
+ }
+
+ void materializeStores() {
+ for (size_t i = 0, n = StoreList.size(); i < n; i++) {
+ StoreInst& I = *dyn_cast<StoreInst>(StoreList[i]);
+
+ IRBuilder<> IRB(&I);
+ Value *Val = I.getValueOperand();
+ Value *Addr = I.getPointerOperand();
+ Value *Shadow = getShadow(Val);
+ Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
+
+ StoreInst *NewSI =
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment());
+ DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
+ (void)NewSI;
+
+ if (ClCheckAccessAddress)
+ insertCheck(Addr, &I);
+
+ if (MS.TrackOrigins) {
+ unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment());
+ if (ClStoreCleanOrigin || isa<StructType>(Shadow->getType())) {
+ IRB.CreateAlignedStore(getOrigin(Val), getOriginPtr(Addr, IRB),
+ Alignment);
+ } else {
+ Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
+
+ Constant *Cst = dyn_cast_or_null<Constant>(ConvertedShadow);
+ // TODO(eugenis): handle non-zero constant shadow by inserting an
+ // unconditional check (we cannot simply fail compilation, as this could
+ // be dead code).
+ if (Cst)
+ continue;
+
+ Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
+ getCleanShadow(ConvertedShadow), "_mscmp");
+ Instruction *CheckTerm =
+ SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false,
+ MS.OriginStoreWeights);
+ IRBuilder<> IRBNew(CheckTerm);
+ IRBNew.CreateAlignedStore(getOrigin(Val), getOriginPtr(Addr, IRBNew),
+ Alignment);
+ }
+ }
+ }
+ }
+
+ void materializeChecks() {
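+ // Each delayed check expands to roughly:
+ //   %cmp = icmp ne <flattened shadow>, 0
+ //   br i1 %cmp, label %warn, label %cont   ; weighted as a cold branch
+ // where the warn block stores the origin (if tracked) into OriginTLS,
+ // calls the warning function, and is terminated by unreachable unless
+ // ClKeepGoing is set.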
+ for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) {
+ Instruction *Shadow = InstrumentationList[i].Shadow;
+ Instruction *OrigIns = InstrumentationList[i].OrigIns;
+ IRBuilder<> IRB(OrigIns);
+ DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n");
+ Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
+ DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n");
+ Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
+ getCleanShadow(ConvertedShadow), "_mscmp");
+ Instruction *CheckTerm =
+ SplitBlockAndInsertIfThen(cast<Instruction>(Cmp),
+ /* Unreachable */ !ClKeepGoing,
+ MS.ColdCallWeights);
+
+ IRB.SetInsertPoint(CheckTerm);
+ if (MS.TrackOrigins) {
+ Instruction *Origin = InstrumentationList[i].Origin;
+ IRB.CreateStore(Origin ? (Value*)Origin : (Value*)IRB.getInt32(0),
+ MS.OriginTLS);
+ }
+ CallInst *Call = IRB.CreateCall(MS.WarningFn);
+ Call->setDebugLoc(OrigIns->getDebugLoc());
+ IRB.CreateCall(MS.EmptyAsm);
+ DEBUG(dbgs() << " CHECK: " << *Cmp << "\n");
+ }
+ DEBUG(dbgs() << "DONE:\n" << F);
+ }
+
+ /// \brief Add MemorySanitizer instrumentation to a function.
+ bool runOnFunction() {
+ MS.initializeCallbacks(*F.getParent());
+ if (!MS.TD) return false;
+
+ // In the presence of unreachable blocks, we may see Phi nodes with
+ // incoming nodes from such blocks. Since InstVisitor skips unreachable
+ // blocks, such nodes will not have any shadow value associated with them.
+ // It's easier to remove unreachable blocks than deal with missing shadow.
+ removeUnreachableBlocks(F);
+
+ // Iterate all BBs in depth-first order and create shadow instructions
+ // for all instructions (where applicable).
+ // For PHI nodes we create dummy shadow PHIs which will be finalized later.
+ for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
+ DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) {
+ BasicBlock *BB = *DI;
+ visit(*BB);
+ }
+
+ // Finalize PHI nodes.
+ for (size_t i = 0, n = ShadowPHINodes.size(); i < n; i++) {
+ PHINode *PN = ShadowPHINodes[i];
+ PHINode *PNS = cast<PHINode>(getShadow(PN));
+ PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : 0;
+ size_t NumValues = PN->getNumIncomingValues();
+ for (size_t v = 0; v < NumValues; v++) {
+ PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v));
+ if (PNO)
+ PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v));
+ }
+ }
+
+ VAHelper->finalizeInstrumentation();
+
+ // Delayed instrumentation of StoreInst.
+ // This may add new checks to be inserted later.
+ materializeStores();
+
+ // Insert shadow value checks.
+ materializeChecks();
+
+ return true;
+ }
+
+ /// \brief Compute the shadow type that corresponds to a given Value.
+ Type *getShadowTy(Value *V) {
+ return getShadowTy(V->getType());
+ }
+
+ /// \brief Compute the shadow type that corresponds to a given Type.
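+ ///
+ /// For example, i32 maps to i32, <4 x float> maps to <4 x i32>, and a
+ /// struct like { i8, float } maps element-wise to { i8, i32 }; any other
+ /// sized type becomes a plain integer of the same bit width.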
+ Type *getShadowTy(Type *OrigTy) {
+ if (!OrigTy->isSized()) {
+ return 0;
+ }
+ // For integer type, shadow is the same as the original type.
+ // This may return weird-sized types like i1.
+ if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy))
+ return IT;
+ if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
+ uint32_t EltSize = MS.TD->getTypeSizeInBits(VT->getElementType());
+ return VectorType::get(IntegerType::get(*MS.C, EltSize),
+ VT->getNumElements());
+ }
+ if (StructType *ST = dyn_cast<StructType>(OrigTy)) {
+ SmallVector<Type*, 4> Elements;
+ for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+ Elements.push_back(getShadowTy(ST->getElementType(i)));
+ StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
+ DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
+ return Res;
+ }
+ uint32_t TypeSize = MS.TD->getTypeSizeInBits(OrigTy);
+ return IntegerType::get(*MS.C, TypeSize);
+ }
+
+ /// \brief Flatten a vector type.
+ Type *getShadowTyNoVec(Type *ty) {
+ if (VectorType *vt = dyn_cast<VectorType>(ty))
+ return IntegerType::get(*MS.C, vt->getBitWidth());
+ return ty;
+ }
+
+ /// \brief Convert a shadow value to its flattened variant.
+ Value *convertToShadowTyNoVec(Value *V, IRBuilder<> &IRB) {
+ Type *Ty = V->getType();
+ Type *NoVecTy = getShadowTyNoVec(Ty);
+ if (Ty == NoVecTy) return V;
+ return IRB.CreateBitCast(V, NoVecTy);
+ }
+
+ /// \brief Compute the shadow address that corresponds to a given application
+ /// address.
+ ///
+ /// Shadow = Addr & ~ShadowMask.
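+ ///
+ /// For example, with a hypothetical 64-bit mask of 1ULL << 46, the
+ /// application address 0x7fff00001000 maps to the shadow address
+ /// 0x3fff00001000: clearing the mask bit mirrors each application byte
+ /// onto its shadow byte.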
+ Value *getShadowPtr(Value *Addr, Type *ShadowTy,
+ IRBuilder<> &IRB) {
+ Value *ShadowLong =
+ IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask));
+ return IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0));
+ }
+
+ /// \brief Compute the origin address that corresponds to a given application
+ /// address.
+ ///
+ /// OriginAddr = (ShadowAddr + OriginOffset) & ~3ULL
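+ ///
+ /// The trailing "& ~3ULL" rounds the address down to a 4-byte boundary,
+ /// since one 32-bit origin id describes 4 bytes of application memory.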
+ Value *getOriginPtr(Value *Addr, IRBuilder<> &IRB) {
+ Value *ShadowLong =
+ IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask));
+ Value *Add =
+ IRB.CreateAdd(ShadowLong,
+ ConstantInt::get(MS.IntptrTy, MS.OriginOffset));
+ Value *SecondAnd =
+ IRB.CreateAnd(Add, ConstantInt::get(MS.IntptrTy, ~3ULL));
+ return IRB.CreateIntToPtr(SecondAnd, PointerType::get(IRB.getInt32Ty(), 0));
+ }
+
+ /// \brief Compute the shadow address for a given function argument.
+ ///
+ /// Shadow = ParamTLS+ArgOffset.
+ Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB,
+ int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+ "_msarg");
+ }
+
+ /// \brief Compute the origin address for a given function argument.
+ Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB,
+ int ArgOffset) {
+ if (!MS.TrackOrigins) return 0;
+ Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+ "_msarg_o");
+ }
+
+ /// \brief Compute the shadow address for a retval.
+ Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
+ Value *Base = IRB.CreatePointerCast(MS.RetvalTLS, MS.IntptrTy);
+ return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+ "_msret");
+ }
+
+ /// \brief Compute the origin address for a retval.
+ Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
+ // We keep a single origin for the entire retval. Might be too optimistic.
+ return MS.RetvalOriginTLS;
+ }
+
+ /// \brief Set SV to be the shadow value for V.
+ void setShadow(Value *V, Value *SV) {
+ assert(!ShadowMap.count(V) && "Values may only have one shadow");
+ ShadowMap[V] = SV;
+ }
+
+ /// \brief Set Origin to be the origin value for V.
+ void setOrigin(Value *V, Value *Origin) {
+ if (!MS.TrackOrigins) return;
+ assert(!OriginMap.count(V) && "Values may only have one origin");
+ DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
+ OriginMap[V] = Origin;
+ }
+
+ /// \brief Create a clean shadow value for a given value.
+ ///
+ /// Clean shadow (all zeroes) means all bits of the value are defined
+ /// (initialized).
+ Constant *getCleanShadow(Value *V) {
+ Type *ShadowTy = getShadowTy(V);
+ if (!ShadowTy)
+ return 0;
+ return Constant::getNullValue(ShadowTy);
+ }
+
+ /// \brief Create a dirty shadow of a given shadow type.
+ Constant *getPoisonedShadow(Type *ShadowTy) {
+ assert(ShadowTy);
+ if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
+ return Constant::getAllOnesValue(ShadowTy);
+ StructType *ST = cast<StructType>(ShadowTy);
+ SmallVector<Constant *, 4> Vals;
+ for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+ Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
+ return ConstantStruct::get(ST, Vals);
+ }
+
+ /// \brief Create a dirty shadow for a given value.
+ Constant *getPoisonedShadow(Value *V) {
+ Type *ShadowTy = getShadowTy(V);
+ if (!ShadowTy)
+ return 0;
+ return getPoisonedShadow(ShadowTy);
+ }
+
+ /// \brief Create a clean (zero) origin.
+ Value *getCleanOrigin() {
+ return Constant::getNullValue(MS.OriginTy);
+ }
+
+ /// \brief Get the shadow value for a given Value.
+ ///
+ /// This function either returns the value set earlier with setShadow,
+ /// or extracts it from ParamTLS (for function arguments).
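+ ///
+ /// For example, for f(i32 %a, double %b) the shadow of %a is loaded from
+ /// ParamTLS + 0 and the shadow of %b from ParamTLS + 8: each argument slot
+ /// is rounded up to 8 bytes, matching the layout used by visitCallSite on
+ /// the caller side.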
+ Value *getShadow(Value *V) {
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ // For instructions the shadow is already stored in the map.
+ Value *Shadow = ShadowMap[V];
+ if (!Shadow) {
+ DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
+ (void)I;
+ assert(Shadow && "No shadow for a value");
+ }
+ return Shadow;
+ }
+ if (UndefValue *U = dyn_cast<UndefValue>(V)) {
+ Value *AllOnes = ClPoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
+ DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
+ (void)U;
+ return AllOnes;
+ }
+ if (Argument *A = dyn_cast<Argument>(V)) {
+ // For arguments we compute the shadow on demand and store it in the map.
+ Value **ShadowPtr = &ShadowMap[V];
+ if (*ShadowPtr)
+ return *ShadowPtr;
+ Function *F = A->getParent();
+ IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI());
+ unsigned ArgOffset = 0;
+ for (Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+ AI != AE; ++AI) {
+ if (!AI->getType()->isSized()) {
+ DEBUG(dbgs() << "Arg is not sized\n");
+ continue;
+ }
+ unsigned Size = AI->hasByValAttr()
+ ? MS.TD->getTypeAllocSize(AI->getType()->getPointerElementType())
+ : MS.TD->getTypeAllocSize(AI->getType());
+ if (A == AI) {
+ Value *Base = getShadowPtrForArgument(AI, EntryIRB, ArgOffset);
+ if (AI->hasByValAttr()) {
+ // ByVal pointer itself has clean shadow. We copy the actual
+ // argument shadow to the underlying memory.
+ Value *Cpy = EntryIRB.CreateMemCpy(
+ getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB),
+ Base, Size, AI->getParamAlignment());
+ DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
+ (void)Cpy;
+ *ShadowPtr = getCleanShadow(V);
+ } else {
+ *ShadowPtr = EntryIRB.CreateLoad(Base);
+ }
+ DEBUG(dbgs() << " ARG: " << *AI << " ==> " <<
+ **ShadowPtr << "\n");
+ if (MS.TrackOrigins) {
+ Value* OriginPtr = getOriginPtrForArgument(AI, EntryIRB, ArgOffset);
+ setOrigin(A, EntryIRB.CreateLoad(OriginPtr));
+ }
+ }
+ ArgOffset += DataLayout::RoundUpAlignment(Size, 8);
+ }
+ assert(*ShadowPtr && "Could not find shadow for an argument");
+ return *ShadowPtr;
+ }
+ // For everything else the shadow is zero.
+ return getCleanShadow(V);
+ }
+
+ /// \brief Get the shadow for i-th argument of the instruction I.
+ Value *getShadow(Instruction *I, int i) {
+ return getShadow(I->getOperand(i));
+ }
+
+ /// \brief Get the origin for a value.
+ Value *getOrigin(Value *V) {
+ if (!MS.TrackOrigins) return 0;
+ if (isa<Instruction>(V) || isa<Argument>(V)) {
+ Value *Origin = OriginMap[V];
+ if (!Origin) {
+ DEBUG(dbgs() << "NO ORIGIN: " << *V << "\n");
+ Origin = getCleanOrigin();
+ }
+ return Origin;
+ }
+ return getCleanOrigin();
+ }
+
+ /// \brief Get the origin for i-th argument of the instruction I.
+ Value *getOrigin(Instruction *I, int i) {
+ return getOrigin(I->getOperand(i));
+ }
+
+ /// \brief Remember the place where a shadow check should be inserted.
+ ///
+ /// This location will later be instrumented with a check that will print a
+ /// UMR warning at runtime if the value is not fully defined.
+ void insertCheck(Value *Val, Instruction *OrigIns) {
+ assert(Val);
+ if (!InsertChecks) return;
+ Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
+ if (!Shadow) return;
+#ifndef NDEBUG
+ Type *ShadowTy = Shadow->getType();
+ assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) &&
+ "Can only insert checks for integer and vector shadow types");
+#endif
+ Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+ InstrumentationList.push_back(
+ ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+ }
+
+ // ------------------- Visitors.
+
+ /// \brief Instrument LoadInst
+ ///
+ /// Loads the corresponding shadow and (optionally) origin.
+ /// Optionally, checks that the load address is fully defined.
+ void visitLoadInst(LoadInst &I) {
+ assert(I.getType()->isSized() && "Load type must have size");
+ IRBuilder<> IRB(&I);
+ Type *ShadowTy = getShadowTy(&I);
+ Value *Addr = I.getPointerOperand();
+ if (LoadShadow) {
+ Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
+ setShadow(&I,
+ IRB.CreateAlignedLoad(ShadowPtr, I.getAlignment(), "_msld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress)
+ insertCheck(I.getPointerOperand(), &I);
+
+ if (MS.TrackOrigins) {
+ if (LoadShadow) {
+ unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment());
+ setOrigin(&I,
+ IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB), Alignment));
+ } else {
+ setOrigin(&I, getCleanOrigin());
+ }
+ }
+ }
+
+ /// \brief Instrument StoreInst
+ ///
+ /// Stores the corresponding shadow and (optionally) origin.
+ /// Optionally, checks that the store address is fully defined.
+ void visitStoreInst(StoreInst &I) {
+ StoreList.push_back(&I);
+ }
+
+ // Vector manipulation.
+ void visitExtractElementInst(ExtractElementInst &I) {
+ insertCheck(I.getOperand(1), &I);
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
+ "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitInsertElementInst(InsertElementInst &I) {
+ insertCheck(I.getOperand(2), &I);
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
+ I.getOperand(2), "_msprop"));
+ setOriginForNaryOp(I);
+ }
+
+ void visitShuffleVectorInst(ShuffleVectorInst &I) {
+ insertCheck(I.getOperand(2), &I);
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
+ I.getOperand(2), "_msprop"));
+ setOriginForNaryOp(I);
+ }
+
+ // Casts.
+ void visitSExtInst(SExtInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateSExt(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitZExtInst(ZExtInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateZExt(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitTruncInst(TruncInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateTrunc(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitBitCastInst(BitCastInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I)));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitPtrToIntInst(PtrToIntInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+ "_msprop_ptrtoint"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitIntToPtrInst(IntToPtrInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+ "_msprop_inttoptr"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitFPToSIInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPToUIInst(CastInst& I) { handleShadowOr(I); }
+ void visitSIToFPInst(CastInst& I) { handleShadowOr(I); }
+ void visitUIToFPInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPExtInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPTruncInst(CastInst& I) { handleShadowOr(I); }
+
+ /// \brief Propagate shadow for bitwise AND.
+ ///
+ /// This code is exact, i.e. if, for example, a bit in the left argument
+ /// is defined and 0, then neither the value nor the definedness of the
+ /// corresponding bit in the right argument affects the resulting shadow.
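+ ///
+ /// Worked example: if the left bit is a defined 0 (V1 = 0, S1 = 0) and the
+ /// right bit is poisoned (S2 = 1), then
+ /// S = (S1 & S2) | (V1 & S2) | (S1 & V2) = 0,
+ /// so the result is defined, matching the 0&p => 0 row of the table below.
+ /// A defined 1 on the left instead makes the V1 & S2 term 1, matching
+ /// 1&p => p.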
+ void visitAnd(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // "And" of 0 and a poisoned value results in unpoisoned value.
+ // 1&1 => 1; 0&1 => 0; p&1 => p;
+ // 1&0 => 0; 0&0 => 0; p&0 => 0;
+ // 1&p => p; 0&p => 0; p&p => p;
+ // S = (S1 & S2) | (V1 & S2) | (S1 & V2)
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *V1 = I.getOperand(0);
+ Value *V2 = I.getOperand(1);
+ if (V1->getType() != S1->getType()) {
+ V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+ V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+ }
+ Value *S1S2 = IRB.CreateAnd(S1, S2);
+ Value *V1S2 = IRB.CreateAnd(V1, S2);
+ Value *S1V2 = IRB.CreateAnd(S1, V2);
+ setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2)));
+ setOriginForNaryOp(I);
+ }
+
+ void visitOr(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // "Or" of 1 and a poisoned value results in unpoisoned value.
+ // 1|1 => 1; 0|1 => 1; p|1 => 1;
+ // 1|0 => 1; 0|0 => 0; p|0 => p;
+ // 1|p => 1; 0|p => p; p|p => p;
+ // S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2)
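+ // E.g. a defined 1 on the left (V1 = 1, S1 = 0) makes every term of S
+ // zero, so the result is defined regardless of S2 -- the 1|p => 1 row.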
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *V1 = IRB.CreateNot(I.getOperand(0));
+ Value *V2 = IRB.CreateNot(I.getOperand(1));
+ if (V1->getType() != S1->getType()) {
+ V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+ V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+ }
+ Value *S1S2 = IRB.CreateAnd(S1, S2);
+ Value *V1S2 = IRB.CreateAnd(V1, S2);
+ Value *S1V2 = IRB.CreateAnd(S1, V2);
+ setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2)));
+ setOriginForNaryOp(I);
+ }
+
+ /// \brief Default propagation of shadow and/or origin.
+ ///
+ /// This class implements the general case of shadow propagation, used in all
+ /// cases where we don't know and/or don't care about what the operation
+ /// actually does. It converts all input shadow values to a common type
+ /// (extending or truncating as necessary), and bitwise OR's them.
+ ///
+ /// This is much cheaper than inserting checks (i.e. requiring inputs to be
+ /// fully initialized), and less prone to false positives.
+ ///
+ /// This class also implements the general case of origin propagation. For an
+ /// N-ary operation, the result origin is set to the origin of an argument that
+ /// is not entirely initialized. If there is more than one such argument, the
+ /// rightmost of them is picked. It does not matter which one is picked if all
+ /// arguments are initialized.
+ template <bool CombineShadow>
+ class Combiner {
+ Value *Shadow;
+ Value *Origin;
+ IRBuilder<> &IRB;
+ MemorySanitizerVisitor *MSV;
+
+ public:
+ Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB) :
+ Shadow(0), Origin(0), IRB(IRB), MSV(MSV) {}
+
+ /// \brief Add a pair of shadow and origin values to the mix.
+ Combiner &Add(Value *OpShadow, Value *OpOrigin) {
+ if (CombineShadow) {
+ assert(OpShadow);
+ if (!Shadow)
+ Shadow = OpShadow;
+ else {
+ OpShadow = MSV->CreateShadowCast(IRB, OpShadow, Shadow->getType());
+ Shadow = IRB.CreateOr(Shadow, OpShadow, "_msprop");
+ }
+ }
+
+ if (MSV->MS.TrackOrigins) {
+ assert(OpOrigin);
+ if (!Origin) {
+ Origin = OpOrigin;
+ } else {
+ Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB);
+ Value *Cond = IRB.CreateICmpNE(FlatShadow,
+ MSV->getCleanShadow(FlatShadow));
+ Origin = IRB.CreateSelect(Cond, OpOrigin, Origin);
+ }
+ }
+ return *this;
+ }
+
+ /// \brief Add an application value to the mix.
+ Combiner &Add(Value *V) {
+ Value *OpShadow = MSV->getShadow(V);
+ Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : 0;
+ return Add(OpShadow, OpOrigin);
+ }
+
+ /// \brief Set the current combined values as the given instruction's shadow
+ /// and origin.
+ void Done(Instruction *I) {
+ if (CombineShadow) {
+ assert(Shadow);
+ Shadow = MSV->CreateShadowCast(IRB, Shadow, MSV->getShadowTy(I));
+ MSV->setShadow(I, Shadow);
+ }
+ if (MSV->MS.TrackOrigins) {
+ assert(Origin);
+ MSV->setOrigin(I, Origin);
+ }
+ }
+ };
+
+ typedef Combiner<true> ShadowAndOriginCombiner;
+ typedef Combiner<false> OriginCombiner;
+
+ /// \brief Propagate origin for arbitrary operation.
+ void setOriginForNaryOp(Instruction &I) {
+ if (!MS.TrackOrigins) return;
+ IRBuilder<> IRB(&I);
+ OriginCombiner OC(this, IRB);
+ for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+ OC.Add(OI->get());
+ OC.Done(&I);
+ }
+
+ size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) {
+ assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) &&
+ "Vector of pointers is not a valid shadow type");
+ return Ty->isVectorTy() ?
+ Ty->getVectorNumElements() * Ty->getScalarSizeInBits() :
+ Ty->getPrimitiveSizeInBits();
+ }
+
+ /// \brief Cast between two shadow types, extending or truncating as
+ /// necessary.
+ Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy) {
+ Type *srcTy = V->getType();
+ if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
+ return IRB.CreateIntCast(V, dstTy, false);
+ if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
+ dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
+ return IRB.CreateIntCast(V, dstTy, false);
+ size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
+ size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
+ Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
+ Value *V2 =
+ IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), false);
+ return IRB.CreateBitCast(V2, dstTy);
+ // TODO: handle struct types.
+ }
+
+ /// \brief Propagate shadow for arbitrary operation.
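+ ///
+ /// E.g. for `add i32 %a, %b` this sets the result shadow to Sa | Sb (cast
+ /// to a common type): a poisoned bit in either operand poisons the
+ /// corresponding result bit. Approximate, but much cheaper than a check
+ /// (see the Combiner comment above).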
+ void handleShadowOr(Instruction &I) {
+ IRBuilder<> IRB(&I);
+ ShadowAndOriginCombiner SC(this, IRB);
+ for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+ SC.Add(OI->get());
+ SC.Done(&I);
+ }
+
+ void visitFAdd(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFSub(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFMul(BinaryOperator &I) { handleShadowOr(I); }
+ void visitAdd(BinaryOperator &I) { handleShadowOr(I); }
+ void visitSub(BinaryOperator &I) { handleShadowOr(I); }
+ void visitXor(BinaryOperator &I) { handleShadowOr(I); }
+ void visitMul(BinaryOperator &I) { handleShadowOr(I); }
+
+ void handleDiv(Instruction &I) {
+ IRBuilder<> IRB(&I);
+ // Strict on the second argument.
+ insertCheck(I.getOperand(1), &I);
+ setShadow(&I, getShadow(&I, 0));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitUDiv(BinaryOperator &I) { handleDiv(I); }
+ void visitSDiv(BinaryOperator &I) { handleDiv(I); }
+ void visitFDiv(BinaryOperator &I) { handleDiv(I); }
+ void visitURem(BinaryOperator &I) { handleDiv(I); }
+ void visitSRem(BinaryOperator &I) { handleDiv(I); }
+ void visitFRem(BinaryOperator &I) { handleDiv(I); }
+
+ /// \brief Instrument == and != comparisons.
+ ///
+ /// Sometimes the comparison result is known even if some of the bits of the
+ /// arguments are not.
+ void handleEqualityComparison(ICmpInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *A = I.getOperand(0);
+ Value *B = I.getOperand(1);
+ Value *Sa = getShadow(A);
+ Value *Sb = getShadow(B);
+
+ // Get rid of pointers and vectors of pointers.
+ // For ints (and vectors of ints), types of A and Sa match,
+ // and this is a no-op.
+ A = IRB.CreatePointerCast(A, Sa->getType());
+ B = IRB.CreatePointerCast(B, Sb->getType());
+
+ // A == B <==> (C = A^B) == 0
+ // A != B <==> (C = A^B) != 0
+ // Sc = Sa | Sb
+ Value *C = IRB.CreateXor(A, B);
+ Value *Sc = IRB.CreateOr(Sa, Sb);
+ // Now dealing with i = (C == 0) comparison (or C != 0, does not matter now)
+ // Result is defined if one of the following is true
+ // * there is a defined 1 bit in C
+ // * C is fully defined
+ // Si = !(C & ~Sc) && Sc
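+ // E.g. for A = 0b10?? (two poisoned low bits) and a fully defined B = 0:
+ // C = A ^ B has a defined 1 in bit 3, so C & ~Sc != 0, Si is false, and
+ // the comparison is reported fully defined -- A != B holds no matter what
+ // the poisoned bits turn out to be.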
+ Value *Zero = Constant::getNullValue(Sc->getType());
+ Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
+ Value *Si =
+ IRB.CreateAnd(IRB.CreateICmpNE(Sc, Zero),
+ IRB.CreateICmpEQ(
+ IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero));
+ Si->setName("_msprop_icmp");
+ setShadow(&I, Si);
+ setOriginForNaryOp(I);
+ }
+
+ /// \brief Build the lowest possible value of V, taking into account V's
+ /// uninitialized bits.
+ Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+ bool isSigned) {
+ if (isSigned) {
+ // Split shadow into sign bit and other bits.
+ Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+ Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+ // Maximize the undefined shadow bit, minimize other undefined bits.
+ return
+ IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaOtherBits)), SaSignBit);
+ } else {
+ // Minimize undefined bits.
+ return IRB.CreateAnd(A, IRB.CreateNot(Sa));
+ }
+ }
+
+ /// \brief Build the highest possible value of V, taking into account V's
+ /// uninitialized bits.
+ Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+ bool isSigned) {
+ if (isSigned) {
+ // Split shadow into sign bit and other bits.
+ Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+ Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+ // Minimize the undefined shadow bit, maximize other undefined bits.
+ return
+ IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaSignBit)), SaOtherBits);
+ } else {
+ // Maximize undefined bits.
+ return IRB.CreateOr(A, Sa);
+ }
+ }
+
+ /// \brief Instrument relational comparisons.
+ ///
+ /// This function does exact shadow propagation for all relational
+ /// comparisons of integers, pointers and vectors of those.
+ /// FIXME: output seems suboptimal when one of the operands is a constant
+ void handleRelationalComparisonExact(ICmpInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *A = I.getOperand(0);
+ Value *B = I.getOperand(1);
+ Value *Sa = getShadow(A);
+ Value *Sb = getShadow(B);
+
+ // Get rid of pointers and vectors of pointers.
+ // For ints (and vectors of ints), types of A and Sa match,
+ // and this is a no-op.
+ A = IRB.CreatePointerCast(A, Sa->getType());
+ B = IRB.CreatePointerCast(B, Sb->getType());
+
+ // Let [a0, a1] be the interval of possible values of A, taking into account
+ // its undefined bits. Let [b0, b1] be the interval of possible values of B.
+ // Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0).
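+ // E.g. (unsigned) A = 0b1?00 spans [8, 12] and a fully defined B = 5 spans
+ // [5, 5]; for A > B both boundary comparisons agree (8 > 5 and 12 > 5), so
+ // the xor of the two checks below is 0 and the result is defined despite
+ // the poisoned bit in A.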
+ bool IsSigned = I.isSigned();
+ Value *S1 = IRB.CreateICmp(I.getPredicate(),
+ getLowestPossibleValue(IRB, A, Sa, IsSigned),
+ getHighestPossibleValue(IRB, B, Sb, IsSigned));
+ Value *S2 = IRB.CreateICmp(I.getPredicate(),
+ getHighestPossibleValue(IRB, A, Sa, IsSigned),
+ getLowestPossibleValue(IRB, B, Sb, IsSigned));
+ Value *Si = IRB.CreateXor(S1, S2);
+ setShadow(&I, Si);
+ setOriginForNaryOp(I);
+ }
+
+ /// \brief Instrument signed relational comparisons.
+ ///
+ /// Handle (x<0) and (x>=0) comparisons (essentially, sign bit tests) by
+ /// propagating the highest bit of the shadow. Everything else is delegated
+ /// to handleShadowOr().
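+ ///
+ /// E.g. for (x < 0) only the sign bit of x matters, so the result shadow
+ /// is "shadow(x) < 0": poisoned exactly when the sign bit of x is
+ /// poisoned, regardless of the lower shadow bits.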
+ void handleSignedRelationalComparison(ICmpInst &I) {
+ Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
+ Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
+ Value* op = NULL;
+ CmpInst::Predicate pre = I.getPredicate();
+ if (constOp0 && constOp0->isNullValue() &&
+ (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE)) {
+ op = I.getOperand(1);
+ } else if (constOp1 && constOp1->isNullValue() &&
+ (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) {
+ op = I.getOperand(0);
+ }
+ if (op) {
+ IRBuilder<> IRB(&I);
+ Value* Shadow =
+ IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), "_msprop_icmpslt");
+ setShadow(&I, Shadow);
+ setOrigin(&I, getOrigin(op));
+ } else {
+ handleShadowOr(I);
+ }
+ }
+
+ void visitICmpInst(ICmpInst &I) {
+ if (!ClHandleICmp) {
+ handleShadowOr(I);
+ return;
+ }
+ if (I.isEquality()) {
+ handleEqualityComparison(I);
+ return;
+ }
+
+ assert(I.isRelational());
+ if (ClHandleICmpExact) {
+ handleRelationalComparisonExact(I);
+ return;
+ }
+ if (I.isSigned()) {
+ handleSignedRelationalComparison(I);
+ return;
+ }
+
+ assert(I.isUnsigned());
+ if ((isa<Constant>(I.getOperand(0)) || isa<Constant>(I.getOperand(1)))) {
+ handleRelationalComparisonExact(I);
+ return;
+ }
+
+ handleShadowOr(I);
+ }
+
+ void visitFCmpInst(FCmpInst &I) {
+ handleShadowOr(I);
+ }
+
+ void handleShift(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // If any of the S2 bits are poisoned, the whole thing is poisoned.
+ // Otherwise perform the same shift on S1.
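+ // E.g. for `shl %x, %n` with a fully defined %n, S2Conv is 0 and the
+ // result shadow is simply shl(S1, %n); a single poisoned bit in %n makes
+ // S2Conv all-ones and poisons the entire result.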
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)),
+ S2->getType());
+ Value *V2 = I.getOperand(1);
+ Value *Shift = IRB.CreateBinOp(I.getOpcode(), S1, V2);
+ setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+ setOriginForNaryOp(I);
+ }
+
+ void visitShl(BinaryOperator &I) { handleShift(I); }
+ void visitAShr(BinaryOperator &I) { handleShift(I); }
+ void visitLShr(BinaryOperator &I) { handleShift(I); }
+
+ /// \brief Instrument llvm.memmove
+ ///
+ /// At this point we don't know if llvm.memmove will be inlined or not.
+ /// If we don't instrument it and it gets inlined,
+ /// our interceptor will not kick in and we will lose the memmove.
+ /// If we instrument the call here, but it does not get inlined,
+ /// we will memmove the shadow twice, which is bad in the case
+ /// of overlapping regions. So, we simply lower the intrinsic to a call.
+ ///
+ /// Similar situation exists for memcpy and memset.
+ void visitMemMoveInst(MemMoveInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall3(
+ MS.MemmoveFn,
+ IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false));
+ I.eraseFromParent();
+ }
+
+ // Similar to memmove: avoid copying shadow twice.
+ // This is somewhat unfortunate as it may slow down small constant memcpys.
+ // FIXME: consider doing manual inline for small constant sizes and proper
+ // alignment.
+ void visitMemCpyInst(MemCpyInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall3(
+ MS.MemcpyFn,
+ IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false));
+ I.eraseFromParent();
+ }
+
+ // Same as memcpy.
+ void visitMemSetInst(MemSetInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall3(
+ MS.MemsetFn,
+ IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false));
+ I.eraseFromParent();
+ }
+
+ void visitVAStartInst(VAStartInst &I) {
+ VAHelper->visitVAStartInst(I);
+ }
+
+ void visitVACopyInst(VACopyInst &I) {
+ VAHelper->visitVACopyInst(I);
+ }
+
+ enum IntrinsicKind {
+ IK_DoesNotAccessMemory,
+ IK_OnlyReadsMemory,
+ IK_WritesMemory
+ };
+
+ static IntrinsicKind getIntrinsicKind(Intrinsic::ID iid) {
+ const int DoesNotAccessMemory = IK_DoesNotAccessMemory;
+ const int OnlyReadsArgumentPointees = IK_OnlyReadsMemory;
+ const int OnlyReadsMemory = IK_OnlyReadsMemory;
+ const int OnlyAccessesArgumentPointees = IK_WritesMemory;
+ const int UnknownModRefBehavior = IK_WritesMemory;
+#define GET_INTRINSIC_MODREF_BEHAVIOR
+#define ModRefBehavior IntrinsicKind
+#include "llvm/IR/Intrinsics.gen"
+#undef ModRefBehavior
+#undef GET_INTRINSIC_MODREF_BEHAVIOR
+ }
+
+ /// \brief Handle vector store-like intrinsics.
+ ///
+ /// Instrument intrinsics that look like a simple SIMD store: writes memory,
+ /// has 1 pointer argument and 1 vector argument, returns void.
+ bool handleVectorStoreIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value* Addr = I.getArgOperand(0);
+ Value *Shadow = getShadow(&I, 1);
+ Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
+
+ // We don't know the pointer alignment (could be unaligned SSE store!).
+ // We have to assume the worst case.
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
+
+ if (ClCheckAccessAddress)
+ insertCheck(Addr, &I);
+
+ // FIXME: use ClStoreCleanOrigin
+ // FIXME: factor out common code from materializeStores
+ if (MS.TrackOrigins)
+ IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB));
+ return true;
+ }
+
+ /// \brief Handle vector load-like intrinsics.
+ ///
+ /// Instrument intrinsics that look like a simple SIMD load: reads memory,
+ /// has 1 pointer argument, returns a vector.
+ bool handleVectorLoadIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+
+ Type *ShadowTy = getShadowTy(&I);
+ if (LoadShadow) {
+ Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
+ // We don't know the pointer alignment (could be unaligned SSE load!).
+ // We have to assume the worst case.
+ setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, 1, "_msld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress)
+ insertCheck(Addr, &I);
+
+ if (MS.TrackOrigins) {
+ if (LoadShadow)
+ setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB)));
+ else
+ setOrigin(&I, getCleanOrigin());
+ }
+ return true;
+ }
+
+ /// \brief Handle (SIMD arithmetic)-like intrinsics.
+ ///
+ /// Instrument intrinsics with any number of arguments of the same type,
+ /// equal to the return type. The type should be simple (no aggregates or
+ /// pointers; vectors are fine).
+ /// Caller guarantees that this intrinsic does not access memory.
+ bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
+ Type *RetTy = I.getType();
+ if (!(RetTy->isIntOrIntVectorTy() ||
+ RetTy->isFPOrFPVectorTy() ||
+ RetTy->isX86_MMXTy()))
+ return false;
+
+ unsigned NumArgOperands = I.getNumArgOperands();
+
+ for (unsigned i = 0; i < NumArgOperands; ++i) {
+ Type *Ty = I.getArgOperand(i)->getType();
+ if (Ty != RetTy)
+ return false;
+ }
+
+ IRBuilder<> IRB(&I);
+ ShadowAndOriginCombiner SC(this, IRB);
+ for (unsigned i = 0; i < NumArgOperands; ++i)
+ SC.Add(I.getArgOperand(i));
+ SC.Done(&I);
+
+ return true;
+ }
+
+ /// \brief Heuristically instrument unknown intrinsics.
+ ///
+ /// The main purpose of this code is to do something reasonable with all
+ /// random intrinsics we might encounter, most importantly SIMD intrinsics.
+ /// We recognize several classes of intrinsics by their argument types and
+ /// ModRefBehavior, and apply special instrumentation when we are reasonably
+ /// sure that we know what the intrinsic does.
+ ///
+ /// We special-case intrinsics where this approach fails. See llvm.bswap
+ /// handling as an example of that.
+ bool handleUnknownIntrinsic(IntrinsicInst &I) {
+ unsigned NumArgOperands = I.getNumArgOperands();
+ if (NumArgOperands == 0)
+ return false;
+
+ Intrinsic::ID iid = I.getIntrinsicID();
+ IntrinsicKind IK = getIntrinsicKind(iid);
+ bool OnlyReadsMemory = IK == IK_OnlyReadsMemory;
+ bool WritesMemory = IK == IK_WritesMemory;
+ assert(!(OnlyReadsMemory && WritesMemory));
+
+ if (NumArgOperands == 2 &&
+ I.getArgOperand(0)->getType()->isPointerTy() &&
+ I.getArgOperand(1)->getType()->isVectorTy() &&
+ I.getType()->isVoidTy() &&
+ WritesMemory) {
+ // This looks like a vector store.
+ return handleVectorStoreIntrinsic(I);
+ }
+
+ if (NumArgOperands == 1 &&
+ I.getArgOperand(0)->getType()->isPointerTy() &&
+ I.getType()->isVectorTy() &&
+ OnlyReadsMemory) {
+ // This looks like a vector load.
+ return handleVectorLoadIntrinsic(I);
+ }
+
+ if (!OnlyReadsMemory && !WritesMemory)
+ if (maybeHandleSimpleNomemIntrinsic(I))
+ return true;
+
+ // FIXME: detect and handle SSE maskstore/maskload
+ return false;
+ }
+
+ void handleBswap(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Op = I.getArgOperand(0);
+ Type *OpType = Op->getType();
+ Function *BswapFunc = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::bswap, ArrayRef<Type*>(&OpType, 1));
+ setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
+ setOrigin(&I, getOrigin(Op));
+ }
+
+ void visitIntrinsicInst(IntrinsicInst &I) {
+ switch (I.getIntrinsicID()) {
+ case llvm::Intrinsic::bswap:
+ handleBswap(I);
+ break;
+ default:
+ if (!handleUnknownIntrinsic(I))
+ visitInstruction(I);
+ break;
+ }
+ }
+
+ void visitCallSite(CallSite CS) {
+ Instruction &I = *CS.getInstruction();
+ assert((CS.isCall() || CS.isInvoke()) && "Unknown type of CallSite");
+ if (CS.isCall()) {
+ CallInst *Call = cast<CallInst>(&I);
+
+ // For inline asm, do the usual thing: check argument shadow and mark all
+ // outputs as clean. Note that any side effects of the inline asm that are
+ // not immediately visible in its constraints are not handled.
+ if (Call->isInlineAsm()) {
+ visitInstruction(I);
+ return;
+ }
+
+ // Allow only tail calls with the same types, otherwise
+ // we may have a false positive: shadow for a non-void RetVal
+ // will get propagated to a void RetVal.
+ if (Call->isTailCall() && Call->getType() != Call->getParent()->getType())
+ Call->setTailCall(false);
+
+ assert(!isa<IntrinsicInst>(&I) && "intrinsics are handled elsewhere");
+
+ // We are going to insert code that relies on the fact that the callee
+ // will become a non-readonly function after it is instrumented by us. To
+ // prevent this code from being optimized out, mark that function
+ // non-readonly in advance.
+ if (Function *Func = Call->getCalledFunction()) {
+ // Clear out readonly/readnone attributes.
+ AttrBuilder B;
+ B.addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::ReadNone);
+ Func->removeAttributes(AttributeSet::FunctionIndex,
+ AttributeSet::get(Func->getContext(),
+ AttributeSet::FunctionIndex,
+ B));
+ }
+ }
+ IRBuilder<> IRB(&I);
+ unsigned ArgOffset = 0;
+ DEBUG(dbgs() << " CallSite: " << I << "\n");
+ for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+ ArgIt != End; ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned i = ArgIt - CS.arg_begin();
+ if (!A->getType()->isSized()) {
+ DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n");
+ continue;
+ }
+ unsigned Size = 0;
+ Value *Store = 0;
+ // Compute the Shadow for arg even if it is ByVal, because
+ // in that case getShadow() will copy the actual arg shadow to
+ // __msan_param_tls.
+ Value *ArgShadow = getShadow(A);
+ Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
+ DEBUG(dbgs() << " Arg#" << i << ": " << *A <<
+ " Shadow: " << *ArgShadow << "\n");
+ if (CS.paramHasAttr(i + 1, Attribute::ByVal)) {
+ assert(A->getType()->isPointerTy() &&
+ "ByVal argument is not a pointer!");
+ Size = MS.TD->getTypeAllocSize(A->getType()->getPointerElementType());
+ unsigned Alignment = CS.getParamAlignment(i + 1);
+ Store = IRB.CreateMemCpy(ArgShadowBase,
+ getShadowPtr(A, Type::getInt8Ty(*MS.C), IRB),
+ Size, Alignment);
+ } else {
+ Size = MS.TD->getTypeAllocSize(A->getType());
+ Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
+ kShadowTLSAlignment);
+ }
+ if (MS.TrackOrigins)
+ IRB.CreateStore(getOrigin(A),
+ getOriginPtrForArgument(A, IRB, ArgOffset));
+ (void)Store;
+ assert(Size != 0 && Store != 0);
+ DEBUG(dbgs() << " Param:" << *Store << "\n");
+ ArgOffset += DataLayout::RoundUpAlignment(Size, 8);
+ }
+ DEBUG(dbgs() << " done with call args\n");
+
+ FunctionType *FT =
+ cast<FunctionType>(CS.getCalledValue()->getType()->getContainedType(0));
+ if (FT->isVarArg()) {
+ VAHelper->visitCallSite(CS, IRB);
+ }
+
+ // Now, get the shadow for the RetVal.
+ if (!I.getType()->isSized()) return;
+ IRBuilder<> IRBBefore(&I);
+ // Until we have full dynamic coverage, make sure the retval shadow is 0.
+ Value *Base = getShadowPtrForRetval(&I, IRBBefore);
+ IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment);
+ Instruction *NextInsn = 0;
+ if (CS.isCall()) {
+ NextInsn = I.getNextNode();
+ } else {
+ BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest();
+ if (!NormalDest->getSinglePredecessor()) {
+ // FIXME: this case is tricky, so we are just conservative here.
+ // Perhaps we need to split the edge between this BB and NormalDest,
+ // but a naive attempt to use SplitEdge leads to a crash.
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ return;
+ }
+ NextInsn = NormalDest->getFirstInsertionPt();
+ assert(NextInsn &&
+ "Could not find insertion point for retval shadow load");
+ }
+ IRBuilder<> IRBAfter(NextInsn);
+ Value *RetvalShadow =
+ IRBAfter.CreateAlignedLoad(getShadowPtrForRetval(&I, IRBAfter),
+ kShadowTLSAlignment, "_msret");
+ setShadow(&I, RetvalShadow);
+ if (MS.TrackOrigins)
+ setOrigin(&I, IRBAfter.CreateLoad(getOriginPtrForRetval(IRBAfter)));
+ }
+
+ void visitReturnInst(ReturnInst &I) {
+ IRBuilder<> IRB(&I);
+ if (Value *RetVal = I.getReturnValue()) {
+ // Set the shadow for the RetVal.
+ Value *Shadow = getShadow(RetVal);
+ Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
+ DEBUG(dbgs() << "Return: " << *Shadow << "\n" << *ShadowPtr << "\n");
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+ if (MS.TrackOrigins)
+ IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
+ }
+ }
+
+ void visitPHINode(PHINode &I) {
+ IRBuilder<> IRB(&I);
+ ShadowPHINodes.push_back(&I);
+ setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(),
+ "_msphi_s"));
+ if (MS.TrackOrigins)
+ setOrigin(&I, IRB.CreatePHI(MS.OriginTy, I.getNumIncomingValues(),
+ "_msphi_o"));
+ }
+
+ void visitAllocaInst(AllocaInst &I) {
+ setShadow(&I, getCleanShadow(&I));
+ if (!ClPoisonStack) return;
+ IRBuilder<> IRB(I.getNextNode());
+ uint64_t Size = MS.TD->getTypeAllocSize(I.getAllocatedType());
+ if (ClPoisonStackWithCall) {
+ IRB.CreateCall2(MS.MsanPoisonStackFn,
+ IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
+ ConstantInt::get(MS.IntptrTy, Size));
+ } else {
+ Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB);
+ IRB.CreateMemSet(ShadowBase, IRB.getInt8(ClPoisonStackPattern),
+ Size, I.getAlignment());
+ }
+
+ if (MS.TrackOrigins) {
+ setOrigin(&I, getCleanOrigin());
+ SmallString<2048> StackDescriptionStorage;
+ raw_svector_ostream StackDescription(StackDescriptionStorage);
+ // We create a string with a description of the stack allocation and
+ // pass it into __msan_set_alloca_origin.
+ // It will be printed by the run-time if stack-originated UMR is found.
+ // The first 4 bytes of the string are set to '----' and will be replaced
+ // by __msan_va_arg_overflow_size_tls at the first call.
+ StackDescription << "----" << I.getName() << "@" << F.getName();
+ Value *Descr =
+ createPrivateNonConstGlobalForString(*F.getParent(),
+ StackDescription.str());
+ IRB.CreateCall3(MS.MsanSetAllocaOriginFn,
+ IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
+ ConstantInt::get(MS.IntptrTy, Size),
+ IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()));
+ }
+ }
+
+ void visitSelectInst(SelectInst& I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateSelect(I.getCondition(),
+ getShadow(I.getTrueValue()), getShadow(I.getFalseValue()),
+ "_msprop"));
+ if (MS.TrackOrigins) {
+ // Origins are always i32, so any vector conditions must be flattened.
+ // FIXME: consider tracking vector origins for app vectors?
+ Value *Cond = I.getCondition();
+ if (Cond->getType()->isVectorTy()) {
+ Value *ConvertedShadow = convertToShadowTyNoVec(Cond, IRB);
+ Cond = IRB.CreateICmpNE(ConvertedShadow,
+ getCleanShadow(ConvertedShadow), "_mso_select");
+ }
+ setOrigin(&I, IRB.CreateSelect(Cond,
+ getOrigin(I.getTrueValue()), getOrigin(I.getFalseValue())));
+ }
+ }
+
+ void visitLandingPadInst(LandingPadInst &I) {
+ // Do nothing.
+ // See http://code.google.com/p/memory-sanitizer/issues/detail?id=1
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &I) {
+ handleShadowOr(I);
+ }
+
+ void visitExtractValueInst(ExtractValueInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Agg = I.getAggregateOperand();
+ DEBUG(dbgs() << "ExtractValue: " << I << "\n");
+ Value *AggShadow = getShadow(Agg);
+ DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
+ DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
+ setShadow(&I, ResShadow);
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitInsertValueInst(InsertValueInst &I) {
+ IRBuilder<> IRB(&I);
+ DEBUG(dbgs() << "InsertValue: " << I << "\n");
+ Value *AggShadow = getShadow(I.getAggregateOperand());
+ Value *InsShadow = getShadow(I.getInsertedValueOperand());
+ DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n");
+ Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
+ DEBUG(dbgs() << " Res: " << *Res << "\n");
+ setShadow(&I, Res);
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void dumpInst(Instruction &I) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n";
+ } else {
+ errs() << "ZZZ " << I.getOpcodeName() << "\n";
+ }
+ errs() << "QQQ " << I << "\n";
+ }
+
+ void visitResumeInst(ResumeInst &I) {
+ DEBUG(dbgs() << "Resume: " << I << "\n");
+ // Nothing to do here.
+ }
+
+ void visitInstruction(Instruction &I) {
+ // Everything else: stop propagating and check for poisoned shadow.
+ if (ClDumpStrictInstructions)
+ dumpInst(I);
+ DEBUG(dbgs() << "DEFAULT: " << I << "\n");
+ for (size_t i = 0, n = I.getNumOperands(); i < n; i++)
+ insertCheck(I.getOperand(i), &I);
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+};
+
+/// \brief AMD64-specific implementation of VarArgHelper.
+struct VarArgAMD64Helper : public VarArgHelper {
+ // An unfortunate workaround for asymmetric lowering of va_arg stuff.
+ // See a comment in visitCallSite for more details.
+ static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7
+ static const unsigned AMD64FpEndOffset = 176;
+
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy;
+ Value *VAArgOverflowSize;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ VarArgAMD64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV)
+ : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(0), VAArgOverflowSize(0) { }
+
+ enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
+
+ ArgKind classifyArgument(Value* arg) {
+ // A very rough approximation of X86_64 argument classification rules.
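+ // E.g. double and <4 x float> classify as AK_FloatingPoint, i32 and any
+ // pointer as AK_GeneralPurpose, while i128 or a first-class aggregate
+ // falls through to AK_Memory (the stack overflow area).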
+ Type *T = arg->getType();
+ if (T->isFPOrFPVectorTy() || T->isX86_MMXTy())
+ return AK_FloatingPoint;
+ if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
+ return AK_GeneralPurpose;
+ if (T->isPointerTy())
+ return AK_GeneralPurpose;
+ return AK_Memory;
+ }
+
+ // For VarArg functions, store the argument shadow in an ABI-specific format
+ // that corresponds to va_list layout.
+ // We do this because Clang lowers va_arg in the frontend, and this pass
+ // only sees the low level code that deals with va_list internals.
+ // A much easier alternative (provided that Clang emits va_arg instructions)
+ // would have been to associate each live instance of va_list with a copy of
+ // MSanParamTLS, and extract shadow on va_arg() call in the argument list
+ // order.
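+ // E.g. for printf(fmt, i, d): the shadow of fmt (a pointer) is stored at
+ // offset 0 of the va_arg TLS area, the shadow of the int i at offset 8,
+ // and the shadow of the double d at offset 48 (the first FP slot);
+ // arguments beyond 6 GP / 8 FP registers spill past offset 176 into the
+ // overflow area.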
+ void visitCallSite(CallSite &CS, IRBuilder<> &IRB) {
+ unsigned GpOffset = 0;
+ unsigned FpOffset = AMD64GpEndOffset;
+ unsigned OverflowOffset = AMD64FpEndOffset;
+ for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+ ArgIt != End; ++ArgIt) {
+ Value *A = *ArgIt;
+ ArgKind AK = classifyArgument(A);
+ if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset)
+ AK = AK_Memory;
+ if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset)
+ AK = AK_Memory;
+ Value *Base;
+ switch (AK) {
+ case AK_GeneralPurpose:
+ Base = getShadowPtrForVAArgument(A, IRB, GpOffset);
+ GpOffset += 8;
+ break;
+ case AK_FloatingPoint:
+ Base = getShadowPtrForVAArgument(A, IRB, FpOffset);
+ FpOffset += 16;
+ break;
+ case AK_Memory:
+ uint64_t ArgSize = MS.TD->getTypeAllocSize(A->getType());
+ Base = getShadowPtrForVAArgument(A, IRB, OverflowOffset);
+ OverflowOffset += DataLayout::RoundUpAlignment(ArgSize, 8);
+ }
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+ Constant *OverflowSize =
+ ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset);
+ IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// \brief Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Value *A, IRBuilder<> &IRB,
+ int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(A), 0),
+ "_msarg");
+ }
+
+ void visitVAStartInst(VAStartInst &I) {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants.
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */24, /* alignment */8, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants.
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */24, /* alignment */8, false);
+ }
+
+ void finalizeInstrumentation() {
+ assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+ Value *CopySize =
+ IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset),
+ VAArgOverflowSize);
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+
+ Value *RegSaveAreaPtrPtr =
+ IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, 16)),
+ Type::getInt64PtrTy(*MS.C));
+ Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr =
+ MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy,
+ AMD64FpEndOffset, 16);
+
+ Value *OverflowArgAreaPtrPtr =
+ IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, 8)),
+ Type::getInt64PtrTy(*MS.C));
+ Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr);
+ Value *OverflowArgAreaShadowPtr =
+ MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB);
+ Value *SrcPtr =
+ getShadowPtrForVAArgument(VAArgTLSCopy, IRB, AMD64FpEndOffset);
+ IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16);
+ }
+ }
+};
+
+VarArgHelper* CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+ MemorySanitizerVisitor &Visitor) {
+ return new VarArgAMD64Helper(Func, Msan, Visitor);
+}
+
+} // namespace
+
+bool MemorySanitizer::runOnFunction(Function &F) {
+ MemorySanitizerVisitor Visitor(F, *this);
+
+ // Clear out readonly/readnone attributes.
+ AttrBuilder B;
+ B.addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::ReadNone);
+ F.removeAttributes(AttributeSet::FunctionIndex,
+ AttributeSet::get(F.getContext(),
+ AttributeSet::FunctionIndex, B));
+
+ return Visitor.runOnFunction();
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
index 1fe1254..b45aef65 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
@@ -13,20 +13,20 @@
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "insert-optimal-edge-profiling"
+#include "llvm/Transforms/Instrumentation.h"
+#include "MaximumSpanningTree.h"
#include "ProfilingUtils.h"
-#include "llvm/Constants.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "MaximumSpanningTree.h"
using namespace llvm;
STATISTIC(NumEdgesInserted, "The # of edges inserted.");
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp
index cc27146..7de7326 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp
@@ -45,24 +45,23 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "insert-path-profiling"
-#include "llvm/DerivedTypes.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "ProfilingUtils.h"
#include "llvm/Analysis/PathNumbering.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/InstrTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/TypeBuilder.h"
#include "llvm/Pass.h"
-#include "llvm/TypeBuilder.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Instrumentation.h"
#include <vector>
#define HASH_THRESHHOLD 100000
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp
index de57cd1..4b3de6d 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp
@@ -15,11 +15,11 @@
//===----------------------------------------------------------------------===//
#include "ProfilingUtils.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName,
GlobalValue *Array,
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 9e10fc4..299060a 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -21,31 +21,32 @@
#define DEBUG_TYPE "tsan"
-#include "BlackList.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Metadata.h"
-#include "llvm/Module.h"
-#include "llvm/Type.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BlackList.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
-static cl::opt<std::string> ClBlackListFile("tsan-blacklist",
+static cl::opt<std::string> ClBlacklistFile("tsan-blacklist",
cl::desc("Blacklist file"), cl::Hidden);
static cl::opt<bool> ClInstrumentMemoryAccesses(
"tsan-instrument-memory-accesses", cl::init(true),
@@ -56,6 +57,9 @@ static cl::opt<bool> ClInstrumentFuncEntryExit(
static cl::opt<bool> ClInstrumentAtomics(
"tsan-instrument-atomics", cl::init(true),
cl::desc("Instrument atomics"), cl::Hidden);
+static cl::opt<bool> ClInstrumentMemIntrinsics(
+ "tsan-instrument-memintrinsics", cl::init(true),
+ cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
@@ -63,6 +67,7 @@ STATISTIC(NumOmittedReadsBeforeWrite,
"Number of reads ignored due to following writes");
STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
+STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads");
STATISTIC(NumOmittedReadsFromConstantGlobals,
"Number of reads from constant globals");
STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads");
@@ -71,21 +76,29 @@ namespace {
/// ThreadSanitizer: instrument the code in module to find races.
struct ThreadSanitizer : public FunctionPass {
- ThreadSanitizer();
+ ThreadSanitizer(StringRef BlacklistFile = StringRef())
+ : FunctionPass(ID),
+ TD(0),
+ BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
+ : BlacklistFile) { }
const char *getPassName() const;
bool runOnFunction(Function &F);
bool doInitialization(Module &M);
static char ID; // Pass identification, replacement for typeid.
private:
+ void initializeCallbacks(Module &M);
bool instrumentLoadOrStore(Instruction *I);
bool instrumentAtomic(Instruction *I);
+ bool instrumentMemIntrinsic(Instruction *I);
void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local,
SmallVectorImpl<Instruction*> &All);
bool addrPointsToConstantData(Value *Addr);
int getMemoryAccessFuncIndex(Value *Addr);
DataLayout *TD;
+ Type *IntptrTy;
+ SmallString<64> BlacklistFile;
OwningPtr<BlackList> BL;
IntegerType *OrdTy;
// Callbacks to run-time library are computed in doInitialization.
@@ -102,6 +115,8 @@ struct ThreadSanitizer : public FunctionPass {
Function *TsanAtomicThreadFence;
Function *TsanAtomicSignalFence;
Function *TsanVptrUpdate;
+ Function *TsanVptrLoad;
+ Function *MemmoveFn, *MemcpyFn, *MemsetFn;
};
} // namespace
@@ -114,13 +129,8 @@ const char *ThreadSanitizer::getPassName() const {
return "ThreadSanitizer";
}
-ThreadSanitizer::ThreadSanitizer()
- : FunctionPass(ID),
- TD(NULL) {
-}
-
-FunctionPass *llvm::createThreadSanitizerPass() {
- return new ThreadSanitizer();
+FunctionPass *llvm::createThreadSanitizerPass(StringRef BlacklistFile) {
+ return new ThreadSanitizer(BlacklistFile);
}
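// Illustrative sketch of how the new BlacklistFile argument can be supplied
// when the pass is scheduled; the file name below is hypothetical, and an
// empty StringRef falls back to the -tsan-blacklist command-line option.
#include "llvm/IR/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Transforms/Instrumentation.h"

static void addTsanWithBlacklist(llvm::Module &M) {
  llvm::PassManager PM;
  // Instrument every function in M, honoring entries in the given blacklist.
  PM.add(llvm::createThreadSanitizerPass("tsan_blacklist.txt"));
  PM.run(M);
}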
static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
@@ -130,18 +140,8 @@ static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
report_fatal_error("ThreadSanitizer interface function redefined");
}
-bool ThreadSanitizer::doInitialization(Module &M) {
- TD = getAnalysisIfAvailable<DataLayout>();
- if (!TD)
- return false;
- BL.reset(new BlackList(ClBlackListFile));
-
- // Always insert a call to __tsan_init into the module's CTORs.
+void ThreadSanitizer::initializeCallbacks(Module &M) {
IRBuilder<> IRB(M.getContext());
- Value *TsanInit = M.getOrInsertFunction("__tsan_init",
- IRB.getVoidTy(), NULL);
- appendToGlobalCtors(M, cast<Function>(TsanInit), 0);
-
// Initialize the callbacks.
TsanFuncEntry = checkInterfaceFunction(M.getOrInsertFunction(
"__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
@@ -188,6 +188,8 @@ bool ThreadSanitizer::doInitialization(Module &M) {
NamePart = "_fetch_or";
else if (op == AtomicRMWInst::Xor)
NamePart = "_fetch_xor";
+ else if (op == AtomicRMWInst::Nand)
+ NamePart = "_fetch_nand";
else
continue;
SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
@@ -198,15 +200,42 @@ bool ThreadSanitizer::doInitialization(Module &M) {
SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) +
"_compare_exchange_val");
TsanAtomicCAS[i] = checkInterfaceFunction(M.getOrInsertFunction(
- AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, NULL));
+ AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, NULL));
}
TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction(
"__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(),
IRB.getInt8PtrTy(), NULL));
+ TsanVptrLoad = checkInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_vptr_read", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
TsanAtomicThreadFence = checkInterfaceFunction(M.getOrInsertFunction(
"__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, NULL));
TsanAtomicSignalFence = checkInterfaceFunction(M.getOrInsertFunction(
"__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, NULL));
+
+ MemmoveFn = checkInterfaceFunction(M.getOrInsertFunction(
+ "memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy, NULL));
+ MemcpyFn = checkInterfaceFunction(M.getOrInsertFunction(
+ "memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IntptrTy, NULL));
+ MemsetFn = checkInterfaceFunction(M.getOrInsertFunction(
+ "memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
+ IntptrTy, NULL));
+}
+
+bool ThreadSanitizer::doInitialization(Module &M) {
+ TD = getAnalysisIfAvailable<DataLayout>();
+ if (!TD)
+ return false;
+ BL.reset(new BlackList(BlacklistFile));
+
+ // Always insert a call to __tsan_init into the module's CTORs.
+ IRBuilder<> IRB(M.getContext());
+ IntptrTy = IRB.getIntPtrTy(TD);
+ Value *TsanInit = M.getOrInsertFunction("__tsan_init",
+ IRB.getVoidTy(), NULL);
+ appendToGlobalCtors(M, cast<Function>(TsanInit), 0);
+
return true;
}
@@ -297,10 +326,12 @@ static bool isAtomic(Instruction *I) {
bool ThreadSanitizer::runOnFunction(Function &F) {
if (!TD) return false;
if (BL->isIn(F)) return false;
+ initializeCallbacks(*F.getParent());
SmallVector<Instruction*, 8> RetVec;
SmallVector<Instruction*, 8> AllLoadsAndStores;
SmallVector<Instruction*, 8> LocalLoadsAndStores;
SmallVector<Instruction*, 8> AtomicAccesses;
+ SmallVector<Instruction*, 8> MemIntrinCalls;
bool Res = false;
bool HasCalls = false;
@@ -317,6 +348,8 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
else if (isa<ReturnInst>(BI))
RetVec.push_back(BI);
else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) {
+ if (isa<MemIntrinsic>(BI))
+ MemIntrinCalls.push_back(BI);
HasCalls = true;
chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores);
}
@@ -340,6 +373,11 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
Res |= instrumentAtomic(AtomicAccesses[i]);
}
+ if (ClInstrumentMemIntrinsics)
+ for (size_t i = 0, n = MemIntrinCalls.size(); i < n; ++i) {
+ Res |= instrumentMemIntrinsic(MemIntrinCalls[i]);
+ }
+
// Instrument function entry/exit points if there were instrumented accesses.
if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
@@ -378,6 +416,12 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) {
NumInstrumentedVtableWrites++;
return true;
}
+ if (!IsWrite && isVtableAccess(I)) {
+ IRB.CreateCall(TsanVptrLoad,
+ IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+ NumInstrumentedVtableReads++;
+ return true;
+ }
Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
if (IsWrite) NumInstrumentedWrites++;
@@ -391,7 +435,7 @@ static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
case NotAtomic: assert(false);
case Unordered: // Fall-through.
case Monotonic: v = 0; break;
- // case Consume: v = 1; break; // Not specified yet.
+ // case Consume: v = 1; break; // Not specified yet.
case Acquire: v = 2; break;
case Release: v = 3; break;
case AcquireRelease: v = 4; break;
@@ -400,6 +444,55 @@ static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
return IRB->getInt32(v);
}
+static ConstantInt *createFailOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
+ uint32_t v = 0;
+ switch (ord) {
+ case NotAtomic: assert(false);
+ case Unordered: // Fall-through.
+ case Monotonic: v = 0; break;
+ // case Consume: v = 1; break; // Not specified yet.
+ case Acquire: v = 2; break;
+ case Release: v = 0; break;
+ case AcquireRelease: v = 2; break;
+ case SequentiallyConsistent: v = 5; break;
+ }
+ return IRB->getInt32(v);
+}
+
+// If a memset intrinsic gets inlined by the code gen, we will miss races on it.
+// So, we either need to ensure the intrinsic is not inlined, or instrument it.
+// We do not instrument memset/memmove/memcpy intrinsics (too complicated),
+// instead we simply replace them with regular function calls, which are then
+// intercepted by the run-time.
+// Since tsan runs after all other passes, the calls should not be
+// replaced back with intrinsics. If that assumption ever becomes wrong,
+// we will need to call e.g. __tsan_memset to avoid the intrinsics.
+bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
+ IRBuilder<> IRB(I);
+ if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
+ IRB.CreateCall3(MemsetFn,
+ IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false));
+ I->eraseFromParent();
+ } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
+ IRB.CreateCall3(isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
+ IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false));
+ I->eraseFromParent();
+ }
+ return false;
+}
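// Illustrative sketch of the kind of source this rewrite targets: clang
// typically lowers these library calls to llvm.memcpy/llvm.memset intrinsics,
// and the code above turns them back into plain calls so the TSan runtime
// interceptors can observe the accesses.
#include <cstddef>
#include <cstring>

void copyAndClear(char *Dst, const char *Src, std::size_t N) {
  std::memcpy(Dst, Src, N); // emitted as llvm.memcpy, rewritten to call memcpy
  std::memset(Dst, 0, N);   // emitted as llvm.memset, rewritten to call memset
}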
+
+// Both llvm and ThreadSanitizer atomic operations are based on the C++11/C11
+// standards. For background, see the C++11 standard. A slightly older, publicly
+// available draft of the standard (not entirely up-to-date, but close enough
+// for casual browsing) is available here:
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3242.pdf
+// The following page contains more background information:
+// http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/
+
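// Worked example of the mapping (a sketch, not pass output): an acquire load
// of a 32-bit atomic is instrumented as a runtime call whose ordering operand
// comes from createOrdering above (Acquire -> 2); the callee name follows the
// __tsan_atomic<N>_* convention used by the callbacks in this file.
#include <atomic>

int loadAcquire(std::atomic<int> &X) {
  // Roughly becomes: __tsan_atomic32_load(&X, 2 /* acquire */);
  return X.load(std::memory_order_acquire);
}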
bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
IRBuilder<> IRB(I);
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
@@ -461,7 +554,8 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
IRB.CreateIntCast(CASI->getCompareOperand(), Ty, false),
IRB.CreateIntCast(CASI->getNewValOperand(), Ty, false),
- createOrdering(&IRB, CASI->getOrdering())};
+ createOrdering(&IRB, CASI->getOrdering()),
+ createFailOrdering(&IRB, CASI->getOrdering())};
CallInst *C = CallInst::Create(TsanAtomicCAS[Idx], ArrayRef<Value*>(Args));
ReplaceInstWithInst(I, C);
} else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
new file mode 100644
index 0000000..8f917ae
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -0,0 +1,262 @@
+//===- DependencyAnalysis.cpp - ObjC ARC Optimization ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines special dependency analysis routines used in Objective C
+/// ARC Optimizations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "objc-arc-dependency"
+#include "ObjCARC.h"
+#include "DependencyAnalysis.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/Support/CFG.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+/// Test whether the given instruction can result in a reference count
+/// modification (positive or negative) for the pointer's object.
+bool
+llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ InstructionClass Class) {
+ switch (Class) {
+ case IC_Autorelease:
+ case IC_AutoreleaseRV:
+ case IC_IntrinsicUser:
+ case IC_User:
+ // These operations never directly modify a reference count.
+ return false;
+ default: break;
+ }
+
+ ImmutableCallSite CS = static_cast<const Value *>(Inst);
+ assert(CS && "Only calls can alter reference counts!");
+
+ // See if AliasAnalysis can help us with the call.
+ AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS);
+ if (AliasAnalysis::onlyReadsMemory(MRB))
+ return false;
+ if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+ for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+ I != E; ++I) {
+ const Value *Op = *I;
+ if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
+ return true;
+ }
+ return false;
+ }
+
+ // Assume the worst.
+ return true;
+}
+
+/// Test whether the given instruction can "use" the given pointer's object in a
+/// way that requires the reference count to be positive.
+bool
+llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, InstructionClass Class) {
+ // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers.
+ if (Class == IC_Call)
+ return false;
+
+ // Consider various instructions which may have pointer arguments which are
+ // not "uses".
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) {
+ // Comparing a pointer with null, or any other constant, isn't really a use,
+ // because we don't care what the pointer points to, or about the values
+ // of any other dynamic reference-counted pointers.
+ if (!IsPotentialRetainableObjPtr(ICI->getOperand(1), *PA.getAA()))
+ return false;
+ } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) {
+ // For calls, just check the arguments (and not the callee operand).
+ for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(),
+ OE = CS.arg_end(); OI != OE; ++OI) {
+ const Value *Op = *OI;
+ if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
+ return true;
+ }
+ return false;
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ // Special-case stores, because we don't care about the stored value, just
+ // the store address.
+ const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand());
+ // If we can't tell what the underlying object was, assume there is a
+ // dependence.
+ return IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Op, Ptr);
+ }
+
+ // Check each operand for a match.
+ for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
+ OI != OE; ++OI) {
+ const Value *Op = *OI;
+ if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
+ return true;
+ }
+ return false;
+}
+
+/// Test if there can be dependencies on Inst through Arg. This function only
+/// tests dependencies relevant for removing pairs of calls.
+bool
+llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
+ const Value *Arg, ProvenanceAnalysis &PA) {
+ // If we've reached the definition of Arg, stop.
+ if (Inst == Arg)
+ return true;
+
+ switch (Flavor) {
+ case NeedsPositiveRetainCount: {
+ InstructionClass Class = GetInstructionClass(Inst);
+ switch (Class) {
+ case IC_AutoreleasepoolPop:
+ case IC_AutoreleasepoolPush:
+ case IC_None:
+ return false;
+ default:
+ return CanUse(Inst, Arg, PA, Class);
+ }
+ }
+
+ case AutoreleasePoolBoundary: {
+ InstructionClass Class = GetInstructionClass(Inst);
+ switch (Class) {
+ case IC_AutoreleasepoolPop:
+ case IC_AutoreleasepoolPush:
+ // These mark the end and begin of an autorelease pool scope.
+ return true;
+ default:
+ // Nothing else does this.
+ return false;
+ }
+ }
+
+ case CanChangeRetainCount: {
+ InstructionClass Class = GetInstructionClass(Inst);
+ switch (Class) {
+ case IC_AutoreleasepoolPop:
+ // Conservatively assume this can decrement any count.
+ return true;
+ case IC_AutoreleasepoolPush:
+ case IC_None:
+ return false;
+ default:
+ return CanAlterRefCount(Inst, Arg, PA, Class);
+ }
+ }
+
+ case RetainAutoreleaseDep:
+ switch (GetBasicInstructionClass(Inst)) {
+ case IC_AutoreleasepoolPop:
+ case IC_AutoreleasepoolPush:
+ // Don't merge an objc_autorelease with an objc_retain inside a different
+ // autoreleasepool scope.
+ return true;
+ case IC_Retain:
+ case IC_RetainRV:
+ // Check for a retain of the same pointer for merging.
+ return GetObjCArg(Inst) == Arg;
+ default:
+ // Nothing else matters for objc_retainAutorelease formation.
+ return false;
+ }
+
+ case RetainAutoreleaseRVDep: {
+ InstructionClass Class = GetBasicInstructionClass(Inst);
+ switch (Class) {
+ case IC_Retain:
+ case IC_RetainRV:
+ // Check for a retain of the same pointer for merging.
+ return GetObjCArg(Inst) == Arg;
+ default:
+ // Anything that can autorelease interrupts
+ // retainAutoreleaseReturnValue formation.
+ return CanInterruptRV(Class);
+ }
+ }
+
+ case RetainRVDep:
+ return CanInterruptRV(GetBasicInstructionClass(Inst));
+ }
+
+ llvm_unreachable("Invalid dependence flavor");
+}
+
+/// Walk up the CFG from StartPos (which is in StartBB) and find local and
+/// non-local dependencies on Arg.
+///
+/// TODO: Cache results?
+void
+llvm::objcarc::FindDependencies(DependenceKind Flavor,
+ const Value *Arg,
+ BasicBlock *StartBB, Instruction *StartInst,
+ SmallPtrSet<Instruction *, 4> &DependingInsts,
+ SmallPtrSet<const BasicBlock *, 4> &Visited,
+ ProvenanceAnalysis &PA) {
+ BasicBlock::iterator StartPos = StartInst;
+
+ SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
+ Worklist.push_back(std::make_pair(StartBB, StartPos));
+ do {
+ std::pair<BasicBlock *, BasicBlock::iterator> Pair =
+ Worklist.pop_back_val();
+ BasicBlock *LocalStartBB = Pair.first;
+ BasicBlock::iterator LocalStartPos = Pair.second;
+ BasicBlock::iterator StartBBBegin = LocalStartBB->begin();
+ for (;;) {
+ if (LocalStartPos == StartBBBegin) {
+ pred_iterator PI(LocalStartBB), PE(LocalStartBB, false);
+ if (PI == PE)
+ // If we've reached the function entry, produce a null dependence.
+ DependingInsts.insert(0);
+ else
+ // Add the predecessors to the worklist.
+ do {
+ BasicBlock *PredBB = *PI;
+ if (Visited.insert(PredBB))
+ Worklist.push_back(std::make_pair(PredBB, PredBB->end()));
+ } while (++PI != PE);
+ break;
+ }
+
+ Instruction *Inst = --LocalStartPos;
+ if (Depends(Flavor, Inst, Arg, PA)) {
+ DependingInsts.insert(Inst);
+ break;
+ }
+ }
+ } while (!Worklist.empty());
+
+ // Determine whether the original StartBB post-dominates all of the blocks we
+  // visited. If not, insert a sentinel indicating that most optimizations are
+ // not safe.
+ for (SmallPtrSet<const BasicBlock *, 4>::const_iterator I = Visited.begin(),
+ E = Visited.end(); I != E; ++I) {
+ const BasicBlock *BB = *I;
+ if (BB == StartBB)
+ continue;
+ const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+ for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) {
+ const BasicBlock *Succ = *SI;
+ if (Succ != StartBB && !Visited.count(Succ)) {
+ DependingInsts.insert(reinterpret_cast<Instruction *>(-1));
+ return;
+ }
+ }
+ }
+}
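// Minimal usage sketch, mirroring the call ObjCARCContract makes later in this
// patch; Autorelease, Arg, and PA are assumed to be supplied by the caller,
// and the relative includes assume this sits next to the headers above.
#include "ObjCARC.h"
#include "DependencyAnalysis.h"
#include "ProvenanceAnalysis.h"
#include "llvm/ADT/SmallPtrSet.h"

static void collectRetainAutoreleaseRVDeps(
    llvm::Instruction *Autorelease, const llvm::Value *Arg,
    llvm::objcarc::ProvenanceAnalysis &PA,
    llvm::SmallPtrSet<llvm::Instruction *, 4> &DependingInstructions,
    llvm::SmallPtrSet<const llvm::BasicBlock *, 4> &Visited) {
  // Walk up the CFG from the autorelease, recording on each path the nearest
  // instruction that blocks objc_retainAutoreleaseReturnValue formation.
  llvm::objcarc::FindDependencies(llvm::objcarc::RetainAutoreleaseRVDep, Arg,
                                  Autorelease->getParent(), Autorelease,
                                  DependingInstructions, Visited, PA);
  // A null entry means the walk reached the function entry; an all-ones
  // sentinel means StartBB does not post-dominate the visited blocks.
}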
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
new file mode 100644
index 0000000..24d358b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -0,0 +1,79 @@
+//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares special dependency analysis routines used in Objective C
+/// ARC Optimizations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H
+#define LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+
+namespace llvm {
+ class BasicBlock;
+ class Instruction;
+ class Value;
+}
+
+namespace llvm {
+namespace objcarc {
+
+class ProvenanceAnalysis;
+
+/// \enum DependenceKind
+/// \brief Defines different dependence kinds among various ARC constructs.
+///
+/// There are several kinds of dependence-like concepts in use here.
+///
+enum DependenceKind {
+ NeedsPositiveRetainCount,
+ AutoreleasePoolBoundary,
+ CanChangeRetainCount,
+ RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease.
+ RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue.
+ RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue.
+};
+
+void FindDependencies(DependenceKind Flavor,
+ const Value *Arg,
+ BasicBlock *StartBB, Instruction *StartInst,
+ SmallPtrSet<Instruction *, 4> &DependingInstructions,
+ SmallPtrSet<const BasicBlock *, 4> &Visited,
+ ProvenanceAnalysis &PA);
+
+bool
+Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg,
+ ProvenanceAnalysis &PA);
+
+/// Test whether the given instruction can "use" the given pointer's object in a
+/// way that requires the reference count to be positive.
+bool
+CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
+ InstructionClass Class);
+
+/// Test whether the given instruction can result in a reference count
+/// modification (positive or negative) for the pointer's object.
+bool
+CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, InstructionClass Class);
+
+} // namespace objcarc
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp
new file mode 100644
index 0000000..53a31b0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp
@@ -0,0 +1,48 @@
+//===-- ObjCARC.cpp -------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMObjCARCOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm-c/Core.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace llvm {
+ class PassRegistry;
+}
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+/// \brief A handy option to enable/disable all ARC Optimizations.
+bool llvm::objcarc::EnableARCOpts;
+static cl::opt<bool, true>
+EnableARCOptimizations("enable-objc-arc-opts",
+ cl::location(EnableARCOpts),
+ cl::init(true));
+
+/// initializeObjCARCOpts - Initialize all passes linked into the
+/// ObjCARCOpts library.
+void llvm::initializeObjCARCOpts(PassRegistry &Registry) {
+ initializeObjCARCAliasAnalysisPass(Registry);
+ initializeObjCARCAPElimPass(Registry);
+ initializeObjCARCExpandPass(Registry);
+ initializeObjCARCContractPass(Registry);
+ initializeObjCARCOptPass(Registry);
+}
+
+void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R) {
+ initializeObjCARCOpts(*unwrap(R));
+}
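// Illustrative sketch of driving the C-API entry point above from client code;
// LLVMGetGlobalPassRegistry comes from llvm-c/Core.h.
#include "llvm-c/Core.h"
#include "llvm-c/Initialization.h"

void clientInitObjCARCOpts(void) {
  LLVMInitializeObjCARCOpts(LLVMGetGlobalPassRegistry());
}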
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
new file mode 100644
index 0000000..39670f3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
@@ -0,0 +1,395 @@
+//===- ObjCARC.h - ObjC ARC Optimization --------------*- mode: c++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines common definitions/declarations used by the ObjC ARC
+/// Optimizer. ARC stands for Automatic Reference Counting and is a system for
+/// managing reference counts for objects in Objective C.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_OBJCARC_H
+#define LLVM_TRANSFORMS_SCALAR_OBJCARC_H
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Transforms/ObjCARC.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+namespace llvm {
+class raw_ostream;
+}
+
+namespace llvm {
+namespace objcarc {
+
+/// \brief A handy option to enable/disable all ARC Optimizations.
+extern bool EnableARCOpts;
+
+/// \brief Test if the given module looks interesting to run ARC optimization
+/// on.
+static inline bool ModuleHasARC(const Module &M) {
+ return
+ M.getNamedValue("objc_retain") ||
+ M.getNamedValue("objc_release") ||
+ M.getNamedValue("objc_autorelease") ||
+ M.getNamedValue("objc_retainAutoreleasedReturnValue") ||
+ M.getNamedValue("objc_retainBlock") ||
+ M.getNamedValue("objc_autoreleaseReturnValue") ||
+ M.getNamedValue("objc_autoreleasePoolPush") ||
+ M.getNamedValue("objc_loadWeakRetained") ||
+ M.getNamedValue("objc_loadWeak") ||
+ M.getNamedValue("objc_destroyWeak") ||
+ M.getNamedValue("objc_storeWeak") ||
+ M.getNamedValue("objc_initWeak") ||
+ M.getNamedValue("objc_moveWeak") ||
+ M.getNamedValue("objc_copyWeak") ||
+ M.getNamedValue("objc_retainedObject") ||
+ M.getNamedValue("objc_unretainedObject") ||
+ M.getNamedValue("objc_unretainedPointer") ||
+ M.getNamedValue("clang.arc.use");
+}
+
+/// \enum InstructionClass
+/// \brief A simple classification for instructions.
+enum InstructionClass {
+ IC_Retain, ///< objc_retain
+ IC_RetainRV, ///< objc_retainAutoreleasedReturnValue
+ IC_RetainBlock, ///< objc_retainBlock
+ IC_Release, ///< objc_release
+ IC_Autorelease, ///< objc_autorelease
+ IC_AutoreleaseRV, ///< objc_autoreleaseReturnValue
+ IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush
+ IC_AutoreleasepoolPop, ///< objc_autoreleasePoolPop
+ IC_NoopCast, ///< objc_retainedObject, etc.
+ IC_FusedRetainAutorelease, ///< objc_retainAutorelease
+ IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue
+ IC_LoadWeakRetained, ///< objc_loadWeakRetained (primitive)
+ IC_StoreWeak, ///< objc_storeWeak (primitive)
+ IC_InitWeak, ///< objc_initWeak (derived)
+ IC_LoadWeak, ///< objc_loadWeak (derived)
+ IC_MoveWeak, ///< objc_moveWeak (derived)
+ IC_CopyWeak, ///< objc_copyWeak (derived)
+ IC_DestroyWeak, ///< objc_destroyWeak (derived)
+ IC_StoreStrong, ///< objc_storeStrong (derived)
+ IC_IntrinsicUser, ///< clang.arc.use
+ IC_CallOrUser, ///< could call objc_release and/or "use" pointers
+ IC_Call, ///< could call objc_release
+ IC_User, ///< could "use" a pointer
+ IC_None ///< anything else
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const InstructionClass Class);
+
+/// \brief Test if the given class is a kind of user.
+inline static bool IsUser(InstructionClass Class) {
+ return Class == IC_User ||
+ Class == IC_CallOrUser ||
+ Class == IC_IntrinsicUser;
+}
+
+/// \brief Test if the given class is objc_retain or equivalent.
+static inline bool IsRetain(InstructionClass Class) {
+ return Class == IC_Retain ||
+ Class == IC_RetainRV;
+}
+
+/// \brief Test if the given class is objc_autorelease or equivalent.
+static inline bool IsAutorelease(InstructionClass Class) {
+ return Class == IC_Autorelease ||
+ Class == IC_AutoreleaseRV;
+}
+
+/// \brief Test if the given class represents instructions which return their
+/// argument verbatim.
+static inline bool IsForwarding(InstructionClass Class) {
+ return Class == IC_Retain ||
+ Class == IC_RetainRV ||
+ Class == IC_Autorelease ||
+ Class == IC_AutoreleaseRV ||
+ Class == IC_NoopCast;
+}
+
+/// \brief Test if the given class represents instructions which do nothing if
+/// passed a null pointer.
+static inline bool IsNoopOnNull(InstructionClass Class) {
+ return Class == IC_Retain ||
+ Class == IC_RetainRV ||
+ Class == IC_Release ||
+ Class == IC_Autorelease ||
+ Class == IC_AutoreleaseRV ||
+ Class == IC_RetainBlock;
+}
+
+/// \brief Test if the given class represents instructions which are always safe
+/// to mark with the "tail" keyword.
+static inline bool IsAlwaysTail(InstructionClass Class) {
+ // IC_RetainBlock may be given a stack argument.
+ return Class == IC_Retain ||
+ Class == IC_RetainRV ||
+ Class == IC_AutoreleaseRV;
+}
+
+/// \brief Test if the given class represents instructions which are never safe
+/// to mark with the "tail" keyword.
+static inline bool IsNeverTail(InstructionClass Class) {
+  /// It is never safe to tail call objc_autorelease: by tail calling
+  /// objc_autorelease we also tail call -[NSObject autorelease], which supports
+  /// fast autoreleasing and may reclaim our object from the autorelease pool,
+  /// violating the semantics of __autoreleasing types in ARC.
+ return Class == IC_Autorelease;
+}
+
+/// \brief Test if the given class represents instructions which are always safe
+/// to mark with the nounwind attribute.
+static inline bool IsNoThrow(InstructionClass Class) {
+ // objc_retainBlock is not nounwind because it calls user copy constructors
+ // which could theoretically throw.
+ return Class == IC_Retain ||
+ Class == IC_RetainRV ||
+ Class == IC_Release ||
+ Class == IC_Autorelease ||
+ Class == IC_AutoreleaseRV ||
+ Class == IC_AutoreleasepoolPush ||
+ Class == IC_AutoreleasepoolPop;
+}
+
+/// Test whether the given instruction can autorelease any pointer or cause an
+/// autoreleasepool pop.
+static inline bool
+CanInterruptRV(InstructionClass Class) {
+ switch (Class) {
+ case IC_AutoreleasepoolPop:
+ case IC_CallOrUser:
+ case IC_Call:
+ case IC_Autorelease:
+ case IC_AutoreleaseRV:
+ case IC_FusedRetainAutorelease:
+ case IC_FusedRetainAutoreleaseRV:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// \brief Determine if F is one of the special known Functions. If it isn't,
+/// return IC_CallOrUser.
+InstructionClass GetFunctionClass(const Function *F);
+
+/// \brief Determine which objc runtime call instruction class V belongs to.
+///
+/// This is similar to GetInstructionClass except that it only detects objc
+/// runtime calls. This allows it to be faster.
+///
+static inline InstructionClass GetBasicInstructionClass(const Value *V) {
+ if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+ if (const Function *F = CI->getCalledFunction())
+ return GetFunctionClass(F);
+ // Otherwise, be conservative.
+ return IC_CallOrUser;
+ }
+
+ // Otherwise, be conservative.
+ return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User;
+}
+
+/// \brief Determine what kind of construct V is.
+InstructionClass GetInstructionClass(const Value *V);
+
+/// \brief This is a wrapper around getUnderlyingObject which also knows how to
+/// look through objc_retain and objc_autorelease calls, which we know to return
+/// their argument verbatim.
+static inline const Value *GetUnderlyingObjCPtr(const Value *V) {
+ for (;;) {
+ V = GetUnderlyingObject(V);
+ if (!IsForwarding(GetBasicInstructionClass(V)))
+ break;
+ V = cast<CallInst>(V)->getArgOperand(0);
+ }
+
+ return V;
+}
+
+/// \brief This is a wrapper around Value::stripPointerCasts which also knows
+/// how to look through objc_retain and objc_autorelease calls, which we know to
+/// return their argument verbatim.
+static inline const Value *StripPointerCastsAndObjCCalls(const Value *V) {
+ for (;;) {
+ V = V->stripPointerCasts();
+ if (!IsForwarding(GetBasicInstructionClass(V)))
+ break;
+ V = cast<CallInst>(V)->getArgOperand(0);
+ }
+ return V;
+}
+
+/// \brief This is a wrapper around Value::stripPointerCasts which also knows
+/// how to look through objc_retain and objc_autorelease calls, which we know to
+/// return their argument verbatim.
+static inline Value *StripPointerCastsAndObjCCalls(Value *V) {
+ for (;;) {
+ V = V->stripPointerCasts();
+ if (!IsForwarding(GetBasicInstructionClass(V)))
+ break;
+ V = cast<CallInst>(V)->getArgOperand(0);
+ }
+ return V;
+}
+
+/// \brief Assuming the given instruction is one of the special calls such as
+/// objc_retain or objc_release, return the argument value, stripped of no-op
+/// casts and forwarding calls.
+static inline Value *GetObjCArg(Value *Inst) {
+ return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0));
+}
+
+static inline bool IsNullOrUndef(const Value *V) {
+ return isa<ConstantPointerNull>(V) || isa<UndefValue>(V);
+}
+
+static inline bool IsNoopInstruction(const Instruction *I) {
+ return isa<BitCastInst>(I) ||
+ (isa<GetElementPtrInst>(I) &&
+ cast<GetElementPtrInst>(I)->hasAllZeroIndices());
+}
+
+
+/// \brief Erase the given instruction.
+///
+/// Many ObjC calls return their argument verbatim,
+/// so if it's such a call and the return value has users, replace them with the
+/// argument value.
+///
+static inline void EraseInstruction(Instruction *CI) {
+ Value *OldArg = cast<CallInst>(CI)->getArgOperand(0);
+
+ bool Unused = CI->use_empty();
+
+ if (!Unused) {
+ // Replace the return value with the argument.
+ assert(IsForwarding(GetBasicInstructionClass(CI)) &&
+ "Can't delete non-forwarding instruction with users!");
+ CI->replaceAllUsesWith(OldArg);
+ }
+
+ CI->eraseFromParent();
+
+ if (Unused)
+ RecursivelyDeleteTriviallyDeadInstructions(OldArg);
+}
+
+/// \brief Test whether the given value is possibly a retainable object pointer.
+static inline bool IsPotentialRetainableObjPtr(const Value *Op) {
+ // Pointers to static or stack storage are not valid retainable object
+ // pointers.
+ if (isa<Constant>(Op) || isa<AllocaInst>(Op))
+ return false;
+  // Special arguments cannot be valid retainable object pointers.
+ if (const Argument *Arg = dyn_cast<Argument>(Op))
+ if (Arg->hasByValAttr() ||
+ Arg->hasNestAttr() ||
+ Arg->hasStructRetAttr())
+ return false;
+ // Only consider values with pointer types.
+ //
+  // It seems intuitive to exclude function pointer types as well, since
+  // functions are never retainable object pointers; however, clang occasionally
+  // bitcasts retainable object pointers to function-pointer type temporarily.
+ PointerType *Ty = dyn_cast<PointerType>(Op->getType());
+ if (!Ty)
+ return false;
+ // Conservatively assume anything else is a potential retainable object
+ // pointer.
+ return true;
+}
+
+static inline bool IsPotentialRetainableObjPtr(const Value *Op,
+ AliasAnalysis &AA) {
+ // First make the rudimentary check.
+ if (!IsPotentialRetainableObjPtr(Op))
+ return false;
+
+ // Objects in constant memory are not reference-counted.
+ if (AA.pointsToConstantMemory(Op))
+ return false;
+
+ // Pointers in constant memory are not pointing to reference-counted objects.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(Op))
+ if (AA.pointsToConstantMemory(LI->getPointerOperand()))
+ return false;
+
+ // Otherwise assume the worst.
+ return true;
+}
+
+/// \brief Helper for GetInstructionClass. Determines what kind of construct CS
+/// is.
+static inline InstructionClass GetCallSiteClass(ImmutableCallSite CS) {
+ for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+ I != E; ++I)
+ if (IsPotentialRetainableObjPtr(*I))
+ return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser;
+
+ return CS.onlyReadsMemory() ? IC_None : IC_Call;
+}
+
+/// \brief Return true if this value refers to a distinct and identifiable
+/// object.
+///
+/// This is similar to AliasAnalysis's isIdentifiedObject, except that it uses
+/// special knowledge of ObjC conventions.
+static inline bool IsObjCIdentifiedObject(const Value *V) {
+ // Assume that call results and arguments have their own "provenance".
+ // Constants (including GlobalVariables) and Allocas are never
+ // reference-counted.
+ if (isa<CallInst>(V) || isa<InvokeInst>(V) ||
+ isa<Argument>(V) || isa<Constant>(V) ||
+ isa<AllocaInst>(V))
+ return true;
+
+ if (const LoadInst *LI = dyn_cast<LoadInst>(V)) {
+ const Value *Pointer =
+ StripPointerCastsAndObjCCalls(LI->getPointerOperand());
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) {
+ // A constant pointer can't be pointing to an object on the heap. It may
+ // be reference-counted, but it won't be deleted.
+ if (GV->isConstant())
+ return true;
+ StringRef Name = GV->getName();
+ // These special variables are known to hold values which are not
+ // reference-counted pointers.
+ if (Name.startswith("\01L_OBJC_SELECTOR_REFERENCES_") ||
+ Name.startswith("\01L_OBJC_CLASSLIST_REFERENCES_") ||
+ Name.startswith("\01L_OBJC_CLASSLIST_SUP_REFS_$_") ||
+ Name.startswith("\01L_OBJC_METH_VAR_NAME_") ||
+ Name.startswith("\01l_objc_msgSend_fixup_"))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+} // end namespace objcarc
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_OBJCARC_H
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
new file mode 100644
index 0000000..00d9864
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
@@ -0,0 +1,175 @@
+//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file implements optimizations which remove extraneous
+/// autorelease pools.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "objc-arc-ap-elim"
+#include "ObjCARC.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+namespace {
+ /// \brief Autorelease pool elimination.
+ class ObjCARCAPElim : public ModulePass {
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual bool runOnModule(Module &M);
+
+ static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0);
+ static bool OptimizeBB(BasicBlock *BB);
+
+ public:
+ static char ID;
+ ObjCARCAPElim() : ModulePass(ID) {
+ initializeObjCARCAPElimPass(*PassRegistry::getPassRegistry());
+ }
+ };
+}
+
+char ObjCARCAPElim::ID = 0;
+INITIALIZE_PASS(ObjCARCAPElim,
+ "objc-arc-apelim",
+ "ObjC ARC autorelease pool elimination",
+ false, false)
+
+Pass *llvm::createObjCARCAPElimPass() {
+ return new ObjCARCAPElim();
+}
+
+void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+}
+
+/// Interprocedurally determine if calls made by the given call site can
+/// possibly produce autoreleases.
+bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) {
+ if (const Function *Callee = CS.getCalledFunction()) {
+ if (Callee->isDeclaration() || Callee->mayBeOverridden())
+ return true;
+ for (Function::const_iterator I = Callee->begin(), E = Callee->end();
+ I != E; ++I) {
+ const BasicBlock *BB = I;
+ for (BasicBlock::const_iterator J = BB->begin(), F = BB->end();
+ J != F; ++J)
+ if (ImmutableCallSite JCS = ImmutableCallSite(J))
+          // This recursion depth limit is arbitrary; it is just large
+          // enough to cover the known interesting test cases.
+ if (Depth < 3 &&
+ !JCS.onlyReadsMemory() &&
+ MayAutorelease(JCS, Depth + 1))
+ return true;
+ }
+ return false;
+ }
+
+ return true;
+}
+
+bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) {
+ bool Changed = false;
+
+ Instruction *Push = 0;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+ Instruction *Inst = I++;
+ switch (GetBasicInstructionClass(Inst)) {
+ case IC_AutoreleasepoolPush:
+ Push = Inst;
+ break;
+ case IC_AutoreleasepoolPop:
+ // If this pop matches a push and nothing in between can autorelease,
+ // zap the pair.
+ if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
+ Changed = true;
+ DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
+ "autorelease pair:\n"
+ " Pop: " << *Inst << "\n"
+ << " Push: " << *Push << "\n");
+ Inst->eraseFromParent();
+ Push->eraseFromParent();
+ }
+ Push = 0;
+ break;
+ case IC_CallOrUser:
+ if (MayAutorelease(ImmutableCallSite(Inst)))
+ Push = 0;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return Changed;
+}
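// Illustrative sketch of the pattern OptimizeBB removes, written in terms of
// the runtime entry points (stand-in declarations; the pass itself matches the
// equivalent IR calls):
extern "C" void *objc_autoreleasePoolPush(void);
extern "C" void objc_autoreleasePoolPop(void *pool);

void emptyPoolScope() {
  void *Pool = objc_autoreleasePoolPush();
  // ... nothing in between can autorelease ...
  objc_autoreleasePoolPop(Pool); // this push/pop pair gets zapped
}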
+
+bool ObjCARCAPElim::runOnModule(Module &M) {
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!ModuleHasARC(M))
+ return false;
+
+ // Find the llvm.global_ctors variable, as the first step in
+ // identifying the global constructors. In theory, unnecessary autorelease
+ // pools could occur anywhere, but in practice it's pretty rare. Global
+ // ctors are a place where autorelease pools get inserted automatically,
+ // so it's pretty common for them to be unnecessary, and it's pretty
+ // profitable to eliminate them.
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+ if (!GV)
+ return false;
+
+ assert(GV->hasDefinitiveInitializer() &&
+ "llvm.global_ctors is uncooperative!");
+
+ bool Changed = false;
+
+ // Dig the constructor functions out of GV's initializer.
+ ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
+ for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
+ OI != OE; ++OI) {
+ Value *Op = *OI;
+ // llvm.global_ctors is an array of pairs where the second members
+ // are constructor functions.
+ Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
+ // If the user used a constructor function with the wrong signature and
+ // it got bitcasted or whatever, look the other way.
+ if (!F)
+ continue;
+ // Only look at function definitions.
+ if (F->isDeclaration())
+ continue;
+ // Only look at functions with one basic block.
+ if (llvm::next(F->begin()) != F->end())
+ continue;
+ // Ok, a single-block constructor function definition. Try to optimize it.
+ Changed |= OptimizeBB(F->begin());
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
new file mode 100644
index 0000000..46b2de7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
@@ -0,0 +1,162 @@
+//===- ObjCARCAliasAnalysis.cpp - ObjC ARC Optimization -*- mode: c++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a simple ARC-aware AliasAnalysis using special knowledge
+/// of Objective C to enhance other optimization passes which rely on the Alias
+/// Analysis infrastructure.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "objc-arc-aa"
+#include "ObjCARC.h"
+#include "ObjCARCAliasAnalysis.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/PassSupport.h"
+
+namespace llvm {
+ class Function;
+ class Value;
+}
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+// Register this pass...
+char ObjCARCAliasAnalysis::ID = 0;
+INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa",
+ "ObjC-ARC-Based Alias Analysis", false, true, false)
+
+ImmutablePass *llvm::createObjCARCAliasAnalysisPass() {
+ return new ObjCARCAliasAnalysis();
+}
+
+void
+ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AliasAnalysis::getAnalysisUsage(AU);
+}
+
+AliasAnalysis::AliasResult
+ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) {
+ if (!EnableARCOpts)
+ return AliasAnalysis::alias(LocA, LocB);
+
+ // First, strip off no-ops, including ObjC-specific no-ops, and try making a
+ // precise alias query.
+ const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr);
+ const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr);
+ AliasResult Result =
+ AliasAnalysis::alias(Location(SA, LocA.Size, LocA.TBAATag),
+ Location(SB, LocB.Size, LocB.TBAATag));
+ if (Result != MayAlias)
+ return Result;
+
+ // If that failed, climb to the underlying object, including climbing through
+ // ObjC-specific no-ops, and try making an imprecise alias query.
+ const Value *UA = GetUnderlyingObjCPtr(SA);
+ const Value *UB = GetUnderlyingObjCPtr(SB);
+ if (UA != SA || UB != SB) {
+ Result = AliasAnalysis::alias(Location(UA), Location(UB));
+ // We can't use MustAlias or PartialAlias results here because
+ // GetUnderlyingObjCPtr may return an offsetted pointer value.
+ if (Result == NoAlias)
+ return NoAlias;
+ }
+
+ // If that failed, fail. We don't need to chain here, since that's covered
+ // by the earlier precise query.
+ return MayAlias;
+}
+
+bool
+ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc,
+ bool OrLocal) {
+ if (!EnableARCOpts)
+ return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+
+ // First, strip off no-ops, including ObjC-specific no-ops, and try making
+ // a precise alias query.
+ const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr);
+ if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.TBAATag),
+ OrLocal))
+ return true;
+
+ // If that failed, climb to the underlying object, including climbing through
+ // ObjC-specific no-ops, and try making an imprecise alias query.
+ const Value *U = GetUnderlyingObjCPtr(S);
+ if (U != S)
+ return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal);
+
+ // If that failed, fail. We don't need to chain here, since that's covered
+ // by the earlier precise query.
+ return false;
+}
+
+AliasAnalysis::ModRefBehavior
+ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
+ // We have nothing to do. Just chain to the next AliasAnalysis.
+ return AliasAnalysis::getModRefBehavior(CS);
+}
+
+AliasAnalysis::ModRefBehavior
+ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) {
+ if (!EnableARCOpts)
+ return AliasAnalysis::getModRefBehavior(F);
+
+ switch (GetFunctionClass(F)) {
+ case IC_NoopCast:
+ return DoesNotAccessMemory;
+ default:
+ break;
+ }
+
+ return AliasAnalysis::getModRefBehavior(F);
+}
+
+AliasAnalysis::ModRefResult
+ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) {
+ if (!EnableARCOpts)
+ return AliasAnalysis::getModRefInfo(CS, Loc);
+
+ switch (GetBasicInstructionClass(CS.getInstruction())) {
+ case IC_Retain:
+ case IC_RetainRV:
+ case IC_Autorelease:
+ case IC_AutoreleaseRV:
+ case IC_NoopCast:
+ case IC_AutoreleasepoolPush:
+ case IC_FusedRetainAutorelease:
+ case IC_FusedRetainAutoreleaseRV:
+ // These functions don't access any memory visible to the compiler.
+ // Note that this doesn't include objc_retainBlock, because it updates
+ // pointers when it copies block data.
+ return NoModRef;
+ default:
+ break;
+ }
+
+ return AliasAnalysis::getModRefInfo(CS, Loc);
+}
+
+AliasAnalysis::ModRefResult
+ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1,
+ ImmutableCallSite CS2) {
+ // TODO: Theoretically we could check for dependencies between objc_* calls
+ // and OnlyAccessesArgumentPointees calls or other well-behaved calls.
+ return AliasAnalysis::getModRefInfo(CS1, CS2);
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
new file mode 100644
index 0000000..7abe995
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
@@ -0,0 +1,74 @@
+//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- mode: c++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares a simple ARC-aware AliasAnalysis using special knowledge
+/// of Objective C to enhance other optimization passes which rely on the Alias
+/// Analysis infrastructure.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H
+#define LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+namespace objcarc {
+
+ /// \brief This is a simple alias analysis implementation that uses knowledge
+ /// of ARC constructs to answer queries.
+ ///
+ /// TODO: This class could be generalized to know about other ObjC-specific
+ /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing
+ /// even though their offsets are dynamic.
+ class ObjCARCAliasAnalysis : public ImmutablePass,
+ public AliasAnalysis {
+ public:
+ static char ID; // Class identification, replacement for typeinfo
+ ObjCARCAliasAnalysis() : ImmutablePass(ID) {
+ initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry());
+ }
+
+ private:
+ virtual void initializePass() {
+ InitializeAliasAnalysis(this);
+ }
+
+ /// This method is used when a pass implements an analysis interface through
+ /// multiple inheritance. If needed, it should override this to adjust the
+ /// this pointer as needed for the specified pass info.
+ virtual void *getAdjustedAnalysisPointer(const void *PI) {
+ if (PI == &AliasAnalysis::ID)
+ return static_cast<AliasAnalysis *>(this);
+ return this;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual AliasResult alias(const Location &LocA, const Location &LocB);
+ virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal);
+ virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS);
+ virtual ModRefBehavior getModRefBehavior(const Function *F);
+ virtual ModRefResult getModRefInfo(ImmutableCallSite CS,
+ const Location &Loc);
+ virtual ModRefResult getModRefInfo(ImmutableCallSite CS1,
+ ImmutableCallSite CS2);
+ };
+
+} // namespace objcarc
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
new file mode 100644
index 0000000..b96c64f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -0,0 +1,541 @@
+//===- ObjCARCContract.cpp - ObjC ARC Optimization ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines late ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file mainly deals with ``contracting'' multiple lower level
+/// operations into singular higher level operations through pattern matching.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
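// Illustrative sketch of one contraction this pass performs, written in terms
// of the runtime entry points (stand-in declarations; the pass itself rewrites
// the equivalent IR calls, see ContractAutorelease below):
typedef void *id;
extern "C" id objc_retain(id);
extern "C" id objc_autorelease(id);
extern "C" id objc_retainAutorelease(id);

id before(id p) {
  id x = objc_retain(p);
  objc_autorelease(x); // retain + autorelease of the same pointer ...
  return x;
}

id after(id p) {
  return objc_retainAutorelease(p); // ... contracts into one fused call
}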
+
+// TODO: ObjCARCContract could insert PHI nodes when uses aren't
+// dominated by single calls.
+
+#define DEBUG_TYPE "objc-arc-contract"
+#include "ObjCARC.h"
+#include "DependencyAnalysis.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+STATISTIC(NumPeeps, "Number of calls peephole-optimized");
+STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
+
+namespace {
+ /// \brief Late ARC optimizations
+ ///
+ /// These change the IR in a way that makes it difficult to be analyzed by
+ /// ObjCARCOpt, so it's run late.
+ class ObjCARCContract : public FunctionPass {
+ bool Changed;
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+ ProvenanceAnalysis PA;
+
+ /// A flag indicating whether this optimization pass should run.
+ bool Run;
+
+ /// Declarations for ObjC runtime functions, for use in creating calls to
+ /// them. These are initialized lazily to avoid cluttering up the Module
+ /// with unused declarations.
+
+ /// Declaration for objc_storeStrong().
+ Constant *StoreStrongCallee;
+ /// Declaration for objc_retainAutorelease().
+ Constant *RetainAutoreleaseCallee;
+ /// Declaration for objc_retainAutoreleaseReturnValue().
+ Constant *RetainAutoreleaseRVCallee;
+
+ /// The inline asm string to insert between calls and RetainRV calls to make
+ /// the optimization work on targets which need it.
+ const MDString *RetainRVMarker;
+
+ /// The set of inserted objc_storeStrong calls. If at the end of walking the
+ /// function we have found no alloca instructions, these calls can be marked
+ /// "tail".
+ SmallPtrSet<CallInst *, 8> StoreStrongCalls;
+
+ Constant *getStoreStrongCallee(Module *M);
+ Constant *getRetainAutoreleaseCallee(Module *M);
+ Constant *getRetainAutoreleaseRVCallee(Module *M);
+
+ bool ContractAutorelease(Function &F, Instruction *Autorelease,
+ InstructionClass Class,
+ SmallPtrSet<Instruction *, 4>
+ &DependingInstructions,
+ SmallPtrSet<const BasicBlock *, 4>
+ &Visited);
+
+ void ContractRelease(Instruction *Release,
+ inst_iterator &Iter);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual bool doInitialization(Module &M);
+ virtual bool runOnFunction(Function &F);
+
+ public:
+ static char ID;
+ ObjCARCContract() : FunctionPass(ID) {
+ initializeObjCARCContractPass(*PassRegistry::getPassRegistry());
+ }
+ };
+}
+
+char ObjCARCContract::ID = 0;
+INITIALIZE_PASS_BEGIN(ObjCARCContract,
+ "objc-arc-contract", "ObjC ARC contraction", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(ObjCARCContract,
+ "objc-arc-contract", "ObjC ARC contraction", false, false)
+
+Pass *llvm::createObjCARCContractPass() {
+ return new ObjCARCContract();
+}
+
+void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AliasAnalysis>();
+ AU.addRequired<DominatorTree>();
+ AU.setPreservesCFG();
+}
+
+Constant *ObjCARCContract::getStoreStrongCallee(Module *M) {
+ if (!StoreStrongCallee) {
+ LLVMContext &C = M->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *I8XX = PointerType::getUnqual(I8X);
+ Type *Params[] = { I8XX, I8X };
+
+ AttributeSet Attr = AttributeSet()
+ .addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind)
+ .addAttribute(M->getContext(), 1, Attribute::NoCapture);
+
+ StoreStrongCallee =
+ M->getOrInsertFunction(
+ "objc_storeStrong",
+ FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
+ Attr);
+ }
+ return StoreStrongCallee;
+}
+
+Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) {
+ if (!RetainAutoreleaseCallee) {
+ LLVMContext &C = M->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *Params[] = { I8X };
+ FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+ AttributeSet Attribute =
+ AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
+ RetainAutoreleaseCallee =
+ M->getOrInsertFunction("objc_retainAutorelease", FTy, Attribute);
+ }
+ return RetainAutoreleaseCallee;
+}
+
+Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) {
+ if (!RetainAutoreleaseRVCallee) {
+ LLVMContext &C = M->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *Params[] = { I8X };
+ FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+ AttributeSet Attribute =
+ AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
+ RetainAutoreleaseRVCallee =
+ M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy,
+ Attribute);
+ }
+ return RetainAutoreleaseRVCallee;
+}
+
+/// Merge an autorelease with a retain into a fused call.
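+///
+/// As a rough illustration (simplified IR, not taken verbatim from a test):
+///
+///   %1 = call i8* @objc_retain(i8* %x)
+///   ...
+///   %2 = call i8* @objc_autorelease(i8* %x)
+///
+/// is rewritten so that the retain becomes
+///
+///   %1 = call i8* @objc_retainAutorelease(i8* %x)
+///
+/// and the autorelease is erased (objc_retainAutoreleaseReturnValue is used
+/// instead when the autorelease is an autoreleaseRV).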
+bool
+ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease,
+ InstructionClass Class,
+ SmallPtrSet<Instruction *, 4>
+ &DependingInstructions,
+ SmallPtrSet<const BasicBlock *, 4>
+ &Visited) {
+ const Value *Arg = GetObjCArg(Autorelease);
+
+ // Check that there are no instructions between the retain and the autorelease
+ // (such as an autorelease_pop) which may change the count.
+ CallInst *Retain = 0;
+ if (Class == IC_AutoreleaseRV)
+ FindDependencies(RetainAutoreleaseRVDep, Arg,
+ Autorelease->getParent(), Autorelease,
+ DependingInstructions, Visited, PA);
+ else
+ FindDependencies(RetainAutoreleaseDep, Arg,
+ Autorelease->getParent(), Autorelease,
+ DependingInstructions, Visited, PA);
+
+ Visited.clear();
+ if (DependingInstructions.size() != 1) {
+ DependingInstructions.clear();
+ return false;
+ }
+
+ Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+ DependingInstructions.clear();
+
+ if (!Retain ||
+ GetBasicInstructionClass(Retain) != IC_Retain ||
+ GetObjCArg(Retain) != Arg)
+ return false;
+
+ Changed = true;
+ ++NumPeeps;
+
+ DEBUG(dbgs() << "ObjCARCContract::ContractAutorelease: Fusing "
+ "retain/autorelease. Erasing: " << *Autorelease << "\n"
+ " Old Retain: "
+ << *Retain << "\n");
+
+ if (Class == IC_AutoreleaseRV)
+ Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent()));
+ else
+ Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent()));
+
+ DEBUG(dbgs() << " New Retain: "
+ << *Retain << "\n");
+
+ EraseInstruction(Autorelease);
+ return true;
+}
+
+/// Attempt to merge an objc_release with a store, load, and objc_retain to form
+/// an objc_storeStrong. This can be a little tricky because the instructions
+/// don't always appear in order, and there may be unrelated intervening
+/// instructions.
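+///
+/// Roughly (an illustrative sketch, not verbatim from a test case):
+///
+///   %old = load i8** %ptr
+///   %new = call i8* @objc_retain(i8* %val)
+///   store i8* %new, i8** %ptr
+///   call void @objc_release(i8* %old)
+///
+/// becomes
+///
+///   call void @objc_storeStrong(i8** %ptr, i8* %val)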
+void ObjCARCContract::ContractRelease(Instruction *Release,
+ inst_iterator &Iter) {
+ LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release));
+ if (!Load || !Load->isSimple()) return;
+
+ // For now, require everything to be in one basic block.
+ BasicBlock *BB = Release->getParent();
+ if (Load->getParent() != BB) return;
+
+ // Walk down to find the store and the release, which may be in either order.
+ BasicBlock::iterator I = Load, End = BB->end();
+ ++I;
+ AliasAnalysis::Location Loc = AA->getLocation(Load);
+ StoreInst *Store = 0;
+ bool SawRelease = false;
+ for (; !Store || !SawRelease; ++I) {
+ if (I == End)
+ return;
+
+ Instruction *Inst = I;
+ if (Inst == Release) {
+ SawRelease = true;
+ continue;
+ }
+
+ InstructionClass Class = GetBasicInstructionClass(Inst);
+
+ // Unrelated retains are harmless.
+ if (IsRetain(Class))
+ continue;
+
+ if (Store) {
+ // The store is the point where we're going to put the objc_storeStrong,
+ // so make sure there are no uses after it.
+ if (CanUse(Inst, Load, PA, Class))
+ return;
+ } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) {
+ // We are moving the load down to the store, so check for anything
+ // else which writes to the memory between the load and the store.
+ Store = dyn_cast<StoreInst>(Inst);
+ if (!Store || !Store->isSimple()) return;
+ if (Store->getPointerOperand() != Loc.Ptr) return;
+ }
+ }
+
+ Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand());
+
+ // Walk up to find the retain.
+ I = Store;
+ BasicBlock::iterator Begin = BB->begin();
+ while (I != Begin && GetBasicInstructionClass(I) != IC_Retain)
+ --I;
+ Instruction *Retain = I;
+ if (GetBasicInstructionClass(Retain) != IC_Retain) return;
+ if (GetObjCArg(Retain) != New) return;
+
+ Changed = true;
+ ++NumStoreStrongs;
+
+ LLVMContext &C = Release->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *I8XX = PointerType::getUnqual(I8X);
+
+ Value *Args[] = { Load->getPointerOperand(), New };
+ if (Args[0]->getType() != I8XX)
+ Args[0] = new BitCastInst(Args[0], I8XX, "", Store);
+ if (Args[1]->getType() != I8X)
+ Args[1] = new BitCastInst(Args[1], I8X, "", Store);
+ CallInst *StoreStrong =
+ CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()),
+ Args, "", Store);
+ StoreStrong->setDoesNotThrow();
+ StoreStrong->setDebugLoc(Store->getDebugLoc());
+
+ // We can't set the tail flag yet, because we haven't yet determined
+ // whether there are any escaping allocas. Remember this call, so that
+ // we can set the tail flag once we know it's safe.
+ StoreStrongCalls.insert(StoreStrong);
+
+ if (&*Iter == Store) ++Iter;
+ Store->eraseFromParent();
+ Release->eraseFromParent();
+ EraseInstruction(Retain);
+ if (Load->use_empty())
+ Load->eraseFromParent();
+}
+
+bool ObjCARCContract::doInitialization(Module &M) {
+ // If nothing in the Module uses ARC, don't do anything.
+ Run = ModuleHasARC(M);
+ if (!Run)
+ return false;
+
+ // These are initialized lazily.
+ StoreStrongCallee = 0;
+ RetainAutoreleaseCallee = 0;
+ RetainAutoreleaseRVCallee = 0;
+
+ // Initialize RetainRVMarker.
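+  // The marker, when present, is a module-level named metadata node whose
+  // single operand is an MDString holding target-specific inline asm, e.g.
+  // (illustrative only):
+  //
+  //   !clang.arc.retainAutoreleasedReturnValueMarker = !{!0}
+  //   !0 = metadata !{metadata !"<target-specific marker asm>"}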
+ RetainRVMarker = 0;
+ if (NamedMDNode *NMD =
+ M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker"))
+ if (NMD->getNumOperands() == 1) {
+ const MDNode *N = NMD->getOperand(0);
+ if (N->getNumOperands() == 1)
+ if (const MDString *S = dyn_cast<MDString>(N->getOperand(0)))
+ RetainRVMarker = S;
+ }
+
+ return false;
+}
+
+bool ObjCARCContract::runOnFunction(Function &F) {
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!Run)
+ return false;
+
+ Changed = false;
+ AA = &getAnalysis<AliasAnalysis>();
+ DT = &getAnalysis<DominatorTree>();
+
+ PA.setAA(&getAnalysis<AliasAnalysis>());
+
+ // Track whether it's ok to mark objc_storeStrong calls with the "tail"
+ // keyword. Be conservative if the function has variadic arguments.
+  // Functions which "return twice" are also unsafe for the "tail" keyword,
+  // because they call setjmp, which may need to return to an earlier stack
+  // state.
+ bool TailOkForStoreStrongs = !F.isVarArg() &&
+ !F.callsFunctionThatReturnsTwice();
+
+ // For ObjC library calls which return their argument, replace uses of the
+ // argument with uses of the call return value, if it dominates the use. This
+ // reduces register pressure.
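+  //
+  // For example (illustrative, with a hypothetical @use):
+  //
+  //   %1 = call i8* @objc_retain(i8* %x)
+  //   ...
+  //   call void @use(i8* %x)    ; can be rewritten to use %1 instead of %x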
+ SmallPtrSet<Instruction *, 4> DependingInstructions;
+ SmallPtrSet<const BasicBlock *, 4> Visited;
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+
+ DEBUG(dbgs() << "ObjCARCContract: Visiting: " << *Inst << "\n");
+
+ // Only these library routines return their argument. In particular,
+ // objc_retainBlock does not necessarily return its argument.
+ InstructionClass Class = GetBasicInstructionClass(Inst);
+ switch (Class) {
+ case IC_Retain:
+ case IC_FusedRetainAutorelease:
+ case IC_FusedRetainAutoreleaseRV:
+ break;
+ case IC_Autorelease:
+ case IC_AutoreleaseRV:
+ if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited))
+ continue;
+ break;
+ case IC_RetainRV: {
+ // If we're compiling for a target which needs a special inline-asm
+ // marker to do the retainAutoreleasedReturnValue optimization,
+ // insert it now.
+ if (!RetainRVMarker)
+ break;
+ BasicBlock::iterator BBI = Inst;
+ BasicBlock *InstParent = Inst->getParent();
+
+ // Step up to see if the call immediately precedes the RetainRV call.
+ // If it's an invoke, we have to cross a block boundary. And we have
+ // to carefully dodge no-op instructions.
+ do {
+ if (&*BBI == InstParent->begin()) {
+ BasicBlock *Pred = InstParent->getSinglePredecessor();
+ if (!Pred)
+ goto decline_rv_optimization;
+ BBI = Pred->getTerminator();
+ break;
+ }
+ --BBI;
+ } while (IsNoopInstruction(BBI));
+
+ if (&*BBI == GetObjCArg(Inst)) {
+ DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for "
+ "retainAutoreleasedReturnValue optimization.\n");
+ Changed = true;
+ InlineAsm *IA =
+ InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()),
+ /*isVarArg=*/false),
+ RetainRVMarker->getString(),
+ /*Constraints=*/"", /*hasSideEffects=*/true);
+ CallInst::Create(IA, "", Inst);
+ }
+ decline_rv_optimization:
+ break;
+ }
+ case IC_InitWeak: {
+ // objc_initWeak(p, null) => *p = null
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(1))) {
+ Value *Null =
+ ConstantPointerNull::get(cast<PointerType>(CI->getType()));
+ Changed = true;
+ new StoreInst(Null, CI->getArgOperand(0), CI);
+
+ DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n"
+ << " New = " << *Null << "\n");
+
+ CI->replaceAllUsesWith(Null);
+ CI->eraseFromParent();
+ }
+ continue;
+ }
+ case IC_Release:
+ ContractRelease(Inst, I);
+ continue;
+ case IC_User:
+ // Be conservative if the function has any alloca instructions.
+ // Technically we only care about escaping alloca instructions,
+ // but this is sufficient to handle some interesting cases.
+ if (isa<AllocaInst>(Inst))
+ TailOkForStoreStrongs = false;
+ continue;
+ case IC_IntrinsicUser:
+ // Remove calls to @clang.arc.use(...).
+ Inst->eraseFromParent();
+ continue;
+ default:
+ continue;
+ }
+
+ DEBUG(dbgs() << "ObjCARCContract: Finished List.\n\n");
+
+ // Don't use GetObjCArg because we don't want to look through bitcasts
+ // and such; to do the replacement, the argument must have type i8*.
+ const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
+ for (;;) {
+ // If we're compiling bugpointed code, don't get in trouble.
+ if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
+ break;
+ // Look through the uses of the pointer.
+ for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+ UI != UE; ) {
+ Use &U = UI.getUse();
+ unsigned OperandNo = UI.getOperandNo();
+ ++UI; // Increment UI now, because we may unlink its element.
+
+ // If the call's return value dominates a use of the call's argument
+ // value, rewrite the use to use the return value. We check for
+ // reachability here because an unreachable call is considered to
+ // trivially dominate itself, which would lead us to rewriting its
+ // argument in terms of its return value, which would lead to
+ // infinite loops in GetObjCArg.
+ if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) {
+ Changed = true;
+ Instruction *Replacement = Inst;
+ Type *UseTy = U.get()->getType();
+ if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
+ // For PHI nodes, insert the bitcast in the predecessor block.
+ unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
+ BasicBlock *BB = PHI->getIncomingBlock(ValNo);
+ if (Replacement->getType() != UseTy)
+ Replacement = new BitCastInst(Replacement, UseTy, "",
+ &BB->back());
+ // While we're here, rewrite all edges for this PHI, rather
+ // than just one use at a time, to minimize the number of
+ // bitcasts we emit.
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
+ if (PHI->getIncomingBlock(i) == BB) {
+ // Keep the UI iterator valid.
+ if (&PHI->getOperandUse(
+ PHINode::getOperandNumForIncomingValue(i)) ==
+ &UI.getUse())
+ ++UI;
+ PHI->setIncomingValue(i, Replacement);
+ }
+ } else {
+ if (Replacement->getType() != UseTy)
+ Replacement = new BitCastInst(Replacement, UseTy, "",
+ cast<Instruction>(U.getUser()));
+ U.set(Replacement);
+ }
+ }
+ }
+
+ // If Arg is a no-op casted pointer, strip one level of casts and iterate.
+ if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg))
+ Arg = BI->getOperand(0);
+ else if (isa<GEPOperator>(Arg) &&
+ cast<GEPOperator>(Arg)->hasAllZeroIndices())
+ Arg = cast<GEPOperator>(Arg)->getPointerOperand();
+ else if (isa<GlobalAlias>(Arg) &&
+ !cast<GlobalAlias>(Arg)->mayBeOverridden())
+ Arg = cast<GlobalAlias>(Arg)->getAliasee();
+ else
+ break;
+ }
+ }
+
+ // If this function has no escaping allocas or suspicious vararg usage,
+ // objc_storeStrong calls can be marked with the "tail" keyword.
+ if (TailOkForStoreStrongs)
+ for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(),
+ E = StoreStrongCalls.end(); I != E; ++I)
+ (*I)->setTailCall();
+ StoreStrongCalls.clear();
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
new file mode 100644
index 0000000..39bf8f3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
@@ -0,0 +1,128 @@
+//===- ObjCARCExpand.cpp - ObjC ARC Optimization --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file deals with early optimizations which perform certain
+/// cleanup operations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "objc-arc-expand"
+
+#include "ObjCARC.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+ class Module;
+}
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+namespace {
+ /// \brief Early ARC transformations.
+ class ObjCARCExpand : public FunctionPass {
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual bool doInitialization(Module &M);
+ virtual bool runOnFunction(Function &F);
+
+ /// A flag indicating whether this optimization pass should run.
+ bool Run;
+
+ public:
+ static char ID;
+ ObjCARCExpand() : FunctionPass(ID) {
+ initializeObjCARCExpandPass(*PassRegistry::getPassRegistry());
+ }
+ };
+}
+
+char ObjCARCExpand::ID = 0;
+INITIALIZE_PASS(ObjCARCExpand,
+ "objc-arc-expand", "ObjC ARC expansion", false, false)
+
+Pass *llvm::createObjCARCExpandPass() {
+ return new ObjCARCExpand();
+}
+
+void ObjCARCExpand::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+}
+
+bool ObjCARCExpand::doInitialization(Module &M) {
+ Run = ModuleHasARC(M);
+ return false;
+}
+
+bool ObjCARCExpand::runOnFunction(Function &F) {
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!Run)
+ return false;
+
+ bool Changed = false;
+
+ DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName() << "\n");
+
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+ Instruction *Inst = &*I;
+
+ DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
+
+ switch (GetBasicInstructionClass(Inst)) {
+ case IC_Retain:
+ case IC_RetainRV:
+ case IC_Autorelease:
+ case IC_AutoreleaseRV:
+ case IC_FusedRetainAutorelease:
+ case IC_FusedRetainAutoreleaseRV: {
+ // These calls return their argument verbatim, as a low-level
+ // optimization. However, this makes high-level optimizations
+ // harder. Undo any uses of this optimization that the front-end
+ // emitted here. We'll redo them in the contract pass.
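+      //
+      // For example (illustrative): given
+      //   %1 = call i8* @objc_retain(i8* %x)
+      // any uses of %1 below are rewritten to refer to %x directly.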
+ Changed = true;
+ Value *Value = cast<CallInst>(Inst)->getArgOperand(0);
+ DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst << "\n"
+ " New = " << *Value << "\n");
+ Inst->replaceAllUsesWith(Value);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n");
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
new file mode 100644
index 0000000..92d6fc4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -0,0 +1,3026 @@
+//===- ObjCARCOpts.cpp - ObjC ARC Optimization ----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// The optimizations performed include elimination of redundant, partially
+/// redundant, and inconsequential reference count operations, elimination of
+/// redundant weak pointer operations, and numerous minor simplifications.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
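+
+// As a simple illustration (not an exhaustive description of the pass), a
+// retain/release pair on the same pointer with no intervening code that could
+// release or use the object:
+//
+//   %1 = call i8* @objc_retain(i8* %x)
+//   call void @objc_release(i8* %x)
+//
+// can be removed entirely.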
+
+#define DEBUG_TYPE "objc-arc-opts"
+#include "ObjCARC.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARCAliasAnalysis.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+/// \defgroup MiscUtils Miscellaneous utilities that are not ARC specific.
+/// @{
+
+namespace {
+  /// \brief An associative container with fast insertion-order (deterministic)
+  /// iteration over its elements, plus the special blot operation.
+ template<class KeyT, class ValueT>
+ class MapVector {
+ /// Map keys to indices in Vector.
+ typedef DenseMap<KeyT, size_t> MapTy;
+ MapTy Map;
+
+ typedef std::vector<std::pair<KeyT, ValueT> > VectorTy;
+ /// Keys and values.
+ VectorTy Vector;
+
+ public:
+ typedef typename VectorTy::iterator iterator;
+ typedef typename VectorTy::const_iterator const_iterator;
+ iterator begin() { return Vector.begin(); }
+ iterator end() { return Vector.end(); }
+ const_iterator begin() const { return Vector.begin(); }
+ const_iterator end() const { return Vector.end(); }
+
+#ifdef XDEBUG
+ ~MapVector() {
+ assert(Vector.size() >= Map.size()); // May differ due to blotting.
+ for (typename MapTy::const_iterator I = Map.begin(), E = Map.end();
+ I != E; ++I) {
+ assert(I->second < Vector.size());
+ assert(Vector[I->second].first == I->first);
+ }
+ for (typename VectorTy::const_iterator I = Vector.begin(),
+ E = Vector.end(); I != E; ++I)
+ assert(!I->first ||
+ (Map.count(I->first) &&
+ Map[I->first] == size_t(I - Vector.begin())));
+ }
+#endif
+
+ ValueT &operator[](const KeyT &Arg) {
+ std::pair<typename MapTy::iterator, bool> Pair =
+ Map.insert(std::make_pair(Arg, size_t(0)));
+ if (Pair.second) {
+ size_t Num = Vector.size();
+ Pair.first->second = Num;
+ Vector.push_back(std::make_pair(Arg, ValueT()));
+ return Vector[Num].second;
+ }
+ return Vector[Pair.first->second].second;
+ }
+
+ std::pair<iterator, bool>
+ insert(const std::pair<KeyT, ValueT> &InsertPair) {
+ std::pair<typename MapTy::iterator, bool> Pair =
+ Map.insert(std::make_pair(InsertPair.first, size_t(0)));
+ if (Pair.second) {
+ size_t Num = Vector.size();
+ Pair.first->second = Num;
+ Vector.push_back(InsertPair);
+ return std::make_pair(Vector.begin() + Num, true);
+ }
+ return std::make_pair(Vector.begin() + Pair.first->second, false);
+ }
+
+ const_iterator find(const KeyT &Key) const {
+ typename MapTy::const_iterator It = Map.find(Key);
+ if (It == Map.end()) return Vector.end();
+ return Vector.begin() + It->second;
+ }
+
+ /// This is similar to erase, but instead of removing the element from the
+ /// vector, it just zeros out the key in the vector. This leaves iterators
+ /// intact, but clients must be prepared for zeroed-out keys when iterating.
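+    /// For example (illustrative): after blot(K), iteration still visits the
+    /// slot that held K, but its key now compares equal to KeyT().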
+ void blot(const KeyT &Key) {
+ typename MapTy::iterator It = Map.find(Key);
+ if (It == Map.end()) return;
+ Vector[It->second].first = KeyT();
+ Map.erase(It);
+ }
+
+ void clear() {
+ Map.clear();
+ Vector.clear();
+ }
+ };
+}
+
+/// @}
+///
+/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC.
+/// @{
+
+/// \brief This is similar to StripPointerCastsAndObjCCalls but it stops as soon
+/// as it finds a value with multiple uses.
+static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
+ if (Arg->hasOneUse()) {
+ if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg))
+ return FindSingleUseIdentifiedObject(BC->getOperand(0));
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg))
+ if (GEP->hasAllZeroIndices())
+ return FindSingleUseIdentifiedObject(GEP->getPointerOperand());
+ if (IsForwarding(GetBasicInstructionClass(Arg)))
+ return FindSingleUseIdentifiedObject(
+ cast<CallInst>(Arg)->getArgOperand(0));
+ if (!IsObjCIdentifiedObject(Arg))
+ return 0;
+ return Arg;
+ }
+
+  // If we found an identifiable object but it has multiple uses, and those
+  // uses are all trivial, we can still consider this to be a single-use value.
+ if (IsObjCIdentifiedObject(Arg)) {
+ for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+ UI != UE; ++UI) {
+ const User *U = *UI;
+ if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg)
+ return 0;
+ }
+
+ return Arg;
+ }
+
+ return 0;
+}
+
+/// \brief Test whether the given retainable object pointer escapes.
+///
+/// This differs from regular escape analysis in that a use as an
+/// argument to a call is not considered an escape.
+///
+static bool DoesRetainableObjPtrEscape(const User *Ptr) {
+ DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Target: " << *Ptr << "\n");
+
+ // Walk the def-use chains.
+ SmallVector<const Value *, 4> Worklist;
+ Worklist.push_back(Ptr);
+ // If Ptr has any operands add them as well.
+ for (User::const_op_iterator I = Ptr->op_begin(), E = Ptr->op_end(); I != E;
+ ++I) {
+ Worklist.push_back(*I);
+ }
+
+ // Ensure we do not visit any value twice.
+ SmallPtrSet<const Value *, 8> VisitedSet;
+
+ do {
+ const Value *V = Worklist.pop_back_val();
+
+ DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Visiting: " << *V << "\n");
+
+ for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
+ UI != UE; ++UI) {
+ const User *UUser = *UI;
+
+ DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User: " << *UUser << "\n");
+
+ // Special - Use by a call (callee or argument) is not considered
+ // to be an escape.
+ switch (GetBasicInstructionClass(UUser)) {
+ case IC_StoreWeak:
+ case IC_InitWeak:
+ case IC_StoreStrong:
+ case IC_Autorelease:
+ case IC_AutoreleaseRV: {
+ DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies pointer "
+ "arguments. Pointer Escapes!\n");
+ // These special functions make copies of their pointer arguments.
+ return true;
+ }
+ case IC_IntrinsicUser:
+ // Use by the use intrinsic is not an escape.
+ continue;
+ case IC_User:
+ case IC_None:
+ // Use by an instruction which copies the value is an escape if the
+ // result is an escape.
+ if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) ||
+ isa<PHINode>(UUser) || isa<SelectInst>(UUser)) {
+
+ if (VisitedSet.insert(UUser)) {
+ DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies value. "
+ "Ptr escapes if result escapes. Adding to list.\n");
+ Worklist.push_back(UUser);
+ } else {
+ DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Already visited node."
+ "\n");
+ }
+ continue;
+ }
+ // Use by a load is not an escape.
+ if (isa<LoadInst>(UUser))
+ continue;
+ // Use by a store is not an escape if the use is the address.
+ if (const StoreInst *SI = dyn_cast<StoreInst>(UUser))
+ if (V != SI->getValueOperand())
+ continue;
+ break;
+ default:
+ // Regular calls and other stuff are not considered escapes.
+ continue;
+ }
+ // Otherwise, conservatively assume an escape.
+ DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Assuming ptr escapes.\n");
+ return true;
+ }
+ } while (!Worklist.empty());
+
+ // No escapes found.
+ DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Ptr does not escape.\n");
+ return false;
+}
+
+/// @}
+///
+/// \defgroup ARCOpt ARC Optimization.
+/// @{
+
+// TODO: On code like this:
+//
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+// stuff_that_cannot_release()
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+//
+// The second retain and autorelease can be deleted.
+
+// TODO: It should be possible to delete
+// objc_autoreleasePoolPush and objc_autoreleasePoolPop
+// pairs if nothing is actually autoreleased between them. Also, autorelease
+// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
+// after inlining) can be turned into plain release calls.
+
+// TODO: Critical-edge splitting. If the optimal insertion point is
+// a critical edge, the current algorithm has to fail, because it doesn't
+// know how to split edges. It should be possible to make the optimizer
+// think in terms of edges, rather than blocks, and then split critical
+// edges on demand.
+
+// TODO: OptimizeSequences could be generalized to be interprocedural.
+
+// TODO: Recognize that a bunch of other objc runtime calls have
+// non-escaping arguments and non-releasing arguments, and may be
+// non-autoreleasing.
+
+// TODO: Sink autorelease calls as far as possible. Unfortunately we
+// usually can't sink them past other calls, which would be the main
+// case where it would be useful.
+
+// TODO: The pointer returned from objc_loadWeakRetained is retained.
+
+// TODO: Delete release+retain pairs (rare).
+
+STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
+STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
+STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases");
+STATISTIC(NumRets, "Number of return value forwarding "
+ "retain+autoreleaes eliminated");
+STATISTIC(NumRRs, "Number of retain+release paths eliminated");
+STATISTIC(NumPeeps, "Number of calls peephole-optimized");
+
+namespace {
+ /// \enum Sequence
+ ///
+ /// \brief A sequence of states that a pointer may go through in which an
+ /// objc_retain and objc_release are actually needed.
+ enum Sequence {
+ S_None,
+ S_Retain, ///< objc_retain(x).
+ S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement.
+ S_Use, ///< any use of x.
+ S_Stop, ///< like S_Release, but code motion is stopped.
+ S_Release, ///< objc_release(x).
+ S_MovableRelease ///< objc_release(x), !clang.imprecise_release.
+ };
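+
+  // Roughly (an illustrative top-down walk): an objc_retain moves a pointer to
+  // S_Retain; a call that might decrement its reference count moves it to
+  // S_CanRelease; a subsequent use moves it to S_Use; a matching objc_release
+  // then completes the sequence.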
+
+ raw_ostream &operator<<(raw_ostream &OS, const Sequence S)
+ LLVM_ATTRIBUTE_UNUSED;
+ raw_ostream &operator<<(raw_ostream &OS, const Sequence S) {
+ switch (S) {
+ case S_None:
+ return OS << "S_None";
+ case S_Retain:
+ return OS << "S_Retain";
+ case S_CanRelease:
+ return OS << "S_CanRelease";
+ case S_Use:
+ return OS << "S_Use";
+ case S_Release:
+ return OS << "S_Release";
+ case S_MovableRelease:
+ return OS << "S_MovableRelease";
+ case S_Stop:
+ return OS << "S_Stop";
+ }
+ llvm_unreachable("Unknown sequence type.");
+ }
+}
+
+static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
+ // The easy cases.
+ if (A == B)
+ return A;
+ if (A == S_None || B == S_None)
+ return S_None;
+
+ if (A > B) std::swap(A, B);
+ if (TopDown) {
+ // Choose the side which is further along in the sequence.
+ if ((A == S_Retain || A == S_CanRelease) &&
+ (B == S_CanRelease || B == S_Use))
+ return B;
+ } else {
+ // Choose the side which is further along in the sequence.
+ if ((A == S_Use || A == S_CanRelease) &&
+ (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
+ return A;
+ // If both sides are releases, choose the more conservative one.
+ if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
+ return A;
+ if (A == S_Release && B == S_MovableRelease)
+ return A;
+ }
+
+ return S_None;
+}
+
+namespace {
+ /// \brief Unidirectional information about either a
+ /// retain-decrement-use-release sequence or release-use-decrement-retain
+  /// reverse sequence.
+ struct RRInfo {
+ /// After an objc_retain, the reference count of the referenced
+ /// object is known to be positive. Similarly, before an objc_release, the
+ /// reference count of the referenced object is known to be positive. If
+ /// there are retain-release pairs in code regions where the retain count
+ /// is known to be positive, they can be eliminated, regardless of any side
+ /// effects between them.
+ ///
+ /// Also, a retain+release pair nested within another retain+release
+ /// pair all on the known same pointer value can be eliminated, regardless
+ /// of any intervening side effects.
+ ///
+ /// KnownSafe is true when either of these conditions is satisfied.
+ bool KnownSafe;
+
+    /// True if the objc_release calls are all marked with the "tail" keyword.
+ bool IsTailCallRelease;
+
+ /// If the Calls are objc_release calls and they all have a
+ /// clang.imprecise_release tag, this is the metadata tag.
+ MDNode *ReleaseMetadata;
+
+ /// For a top-down sequence, the set of objc_retains or
+ /// objc_retainBlocks. For bottom-up, the set of objc_releases.
+ SmallPtrSet<Instruction *, 2> Calls;
+
+ /// The set of optimal insert positions for moving calls in the opposite
+ /// sequence.
+ SmallPtrSet<Instruction *, 2> ReverseInsertPts;
+
+ RRInfo() :
+ KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0) {}
+
+ void clear();
+ };
+}
+
+void RRInfo::clear() {
+ KnownSafe = false;
+ IsTailCallRelease = false;
+ ReleaseMetadata = 0;
+ Calls.clear();
+ ReverseInsertPts.clear();
+}
+
+namespace {
+ /// \brief This class summarizes several per-pointer runtime properties which
+  /// are propagated through the flow graph.
+ class PtrState {
+ /// True if the reference count is known to be incremented.
+ bool KnownPositiveRefCount;
+
+    /// True if we've seen an opportunity for partial RR elimination, such as
+ /// pushing calls into a CFG triangle or into one side of a CFG diamond.
+ bool Partial;
+
+ /// The current position in the sequence.
+ Sequence Seq : 8;
+
+ public:
+ /// Unidirectional information about the current sequence.
+ ///
+ /// TODO: Encapsulate this better.
+ RRInfo RRI;
+
+ PtrState() : KnownPositiveRefCount(false), Partial(false),
+ Seq(S_None) {}
+
+ void SetKnownPositiveRefCount() {
+ KnownPositiveRefCount = true;
+ }
+
+ void ClearKnownPositiveRefCount() {
+ KnownPositiveRefCount = false;
+ }
+
+ bool HasKnownPositiveRefCount() const {
+ return KnownPositiveRefCount;
+ }
+
+ void SetSeq(Sequence NewSeq) {
+ Seq = NewSeq;
+ }
+
+ Sequence GetSeq() const {
+ return Seq;
+ }
+
+ void ClearSequenceProgress() {
+ ResetSequenceProgress(S_None);
+ }
+
+ void ResetSequenceProgress(Sequence NewSeq) {
+ Seq = NewSeq;
+ Partial = false;
+ RRI.clear();
+ }
+
+ void Merge(const PtrState &Other, bool TopDown);
+ };
+}
+
+void
+PtrState::Merge(const PtrState &Other, bool TopDown) {
+ Seq = MergeSeqs(Seq, Other.Seq, TopDown);
+ KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
+
+ // If we're not in a sequence (anymore), drop all associated state.
+ if (Seq == S_None) {
+ Partial = false;
+ RRI.clear();
+ } else if (Partial || Other.Partial) {
+ // If we're doing a merge on a path that's previously seen a partial
+ // merge, conservatively drop the sequence, to avoid doing partial
+    // RR elimination. If the branch predicates for the two merges differ,
+ // mixing them is unsafe.
+ ClearSequenceProgress();
+ } else {
+ // Conservatively merge the ReleaseMetadata information.
+ if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata)
+ RRI.ReleaseMetadata = 0;
+
+ RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe;
+ RRI.IsTailCallRelease = RRI.IsTailCallRelease &&
+ Other.RRI.IsTailCallRelease;
+ RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end());
+
+ // Merge the insert point sets. If there are any differences,
+ // that makes this a partial merge.
+ Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size();
+ for (SmallPtrSet<Instruction *, 2>::const_iterator
+ I = Other.RRI.ReverseInsertPts.begin(),
+ E = Other.RRI.ReverseInsertPts.end(); I != E; ++I)
+ Partial |= RRI.ReverseInsertPts.insert(*I);
+ }
+}
+
+namespace {
+ /// \brief Per-BasicBlock state.
+ class BBState {
+ /// The number of unique control paths from the entry which can reach this
+ /// block.
+ unsigned TopDownPathCount;
+
+ /// The number of unique control paths to exits from this block.
+ unsigned BottomUpPathCount;
+
+ /// A type for PerPtrTopDown and PerPtrBottomUp.
+ typedef MapVector<const Value *, PtrState> MapTy;
+
+ /// The top-down traversal uses this to record information known about a
+ /// pointer at the bottom of each block.
+ MapTy PerPtrTopDown;
+
+ /// The bottom-up traversal uses this to record information known about a
+ /// pointer at the top of each block.
+ MapTy PerPtrBottomUp;
+
+ /// Effective predecessors of the current block ignoring ignorable edges and
+ /// ignored backedges.
+ SmallVector<BasicBlock *, 2> Preds;
+ /// Effective successors of the current block ignoring ignorable edges and
+ /// ignored backedges.
+ SmallVector<BasicBlock *, 2> Succs;
+
+ public:
+ BBState() : TopDownPathCount(0), BottomUpPathCount(0) {}
+
+ typedef MapTy::iterator ptr_iterator;
+ typedef MapTy::const_iterator ptr_const_iterator;
+
+ ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); }
+ ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); }
+ ptr_const_iterator top_down_ptr_begin() const {
+ return PerPtrTopDown.begin();
+ }
+ ptr_const_iterator top_down_ptr_end() const {
+ return PerPtrTopDown.end();
+ }
+
+ ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); }
+ ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); }
+ ptr_const_iterator bottom_up_ptr_begin() const {
+ return PerPtrBottomUp.begin();
+ }
+ ptr_const_iterator bottom_up_ptr_end() const {
+ return PerPtrBottomUp.end();
+ }
+
+ /// Mark this block as being an entry block, which has one path from the
+ /// entry by definition.
+ void SetAsEntry() { TopDownPathCount = 1; }
+
+ /// Mark this block as being an exit block, which has one path to an exit by
+ /// definition.
+ void SetAsExit() { BottomUpPathCount = 1; }
+
+ PtrState &getPtrTopDownState(const Value *Arg) {
+ return PerPtrTopDown[Arg];
+ }
+
+ PtrState &getPtrBottomUpState(const Value *Arg) {
+ return PerPtrBottomUp[Arg];
+ }
+
+ void clearBottomUpPointers() {
+ PerPtrBottomUp.clear();
+ }
+
+ void clearTopDownPointers() {
+ PerPtrTopDown.clear();
+ }
+
+ void InitFromPred(const BBState &Other);
+ void InitFromSucc(const BBState &Other);
+ void MergePred(const BBState &Other);
+ void MergeSucc(const BBState &Other);
+
+ /// Return the number of possible unique paths from an entry to an exit
+ /// which pass through this block. This is only valid after both the
+ /// top-down and bottom-up traversals are complete.
+ unsigned GetAllPathCount() const {
+ assert(TopDownPathCount != 0);
+ assert(BottomUpPathCount != 0);
+ return TopDownPathCount * BottomUpPathCount;
+ }
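+    // For example (illustrative): in a simple if/else diamond, the join block
+    // is reachable along two paths from the entry and reaches the exit along
+    // one path, so its GetAllPathCount() is 2.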
+
+ // Specialized CFG utilities.
+ typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator;
+ edge_iterator pred_begin() { return Preds.begin(); }
+ edge_iterator pred_end() { return Preds.end(); }
+ edge_iterator succ_begin() { return Succs.begin(); }
+ edge_iterator succ_end() { return Succs.end(); }
+
+ void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
+ void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
+
+ bool isExit() const { return Succs.empty(); }
+ };
+}
+
+void BBState::InitFromPred(const BBState &Other) {
+ PerPtrTopDown = Other.PerPtrTopDown;
+ TopDownPathCount = Other.TopDownPathCount;
+}
+
+void BBState::InitFromSucc(const BBState &Other) {
+ PerPtrBottomUp = Other.PerPtrBottomUp;
+ BottomUpPathCount = Other.BottomUpPathCount;
+}
+
+/// The top-down traversal uses this to merge information about predecessors to
+/// form the initial state for a new block.
+void BBState::MergePred(const BBState &Other) {
+ // Other.TopDownPathCount can be 0, in which case it is either dead or a
+ // loop backedge. Loop backedges are special.
+ TopDownPathCount += Other.TopDownPathCount;
+
+ // Check for overflow. If we have overflow, fall back to conservative
+ // behavior.
+ if (TopDownPathCount < Other.TopDownPathCount) {
+ clearTopDownPointers();
+ return;
+ }
+
+ // For each entry in the other set, if our set has an entry with the same key,
+ // merge the entries. Otherwise, copy the entry and merge it with an empty
+ // entry.
+ for (ptr_const_iterator MI = Other.top_down_ptr_begin(),
+ ME = Other.top_down_ptr_end(); MI != ME; ++MI) {
+ std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI);
+ Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
+ /*TopDown=*/true);
+ }
+
+ // For each entry in our set, if the other set doesn't have an entry with the
+ // same key, force it to merge with an empty entry.
+ for (ptr_iterator MI = top_down_ptr_begin(),
+ ME = top_down_ptr_end(); MI != ME; ++MI)
+ if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
+ MI->second.Merge(PtrState(), /*TopDown=*/true);
+}
+
+/// The bottom-up traversal uses this to merge information about successors to
+/// form the initial state for a new block.
+void BBState::MergeSucc(const BBState &Other) {
+ // Other.BottomUpPathCount can be 0, in which case it is either dead or a
+ // loop backedge. Loop backedges are special.
+ BottomUpPathCount += Other.BottomUpPathCount;
+
+ // Check for overflow. If we have overflow, fall back to conservative
+ // behavior.
+ if (BottomUpPathCount < Other.BottomUpPathCount) {
+ clearBottomUpPointers();
+ return;
+ }
+
+ // For each entry in the other set, if our set has an entry with the
+ // same key, merge the entries. Otherwise, copy the entry and merge
+ // it with an empty entry.
+ for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(),
+ ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) {
+ std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI);
+ Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
+ /*TopDown=*/false);
+ }
+
+ // For each entry in our set, if the other set doesn't have an entry
+ // with the same key, force it to merge with an empty entry.
+ for (ptr_iterator MI = bottom_up_ptr_begin(),
+ ME = bottom_up_ptr_end(); MI != ME; ++MI)
+ if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
+ MI->second.Merge(PtrState(), /*TopDown=*/false);
+}
+
+// Only enable ARC Annotations if we are building a debug version of
+// libObjCARCOpts.
+#ifndef NDEBUG
+#define ARC_ANNOTATIONS
+#endif
+
+// Define some macros along the lines of DEBUG and some helper functions to make
+// it cleaner to create annotations in the source code and to no-op when not
+// building in debug mode.
+#ifdef ARC_ANNOTATIONS
+
+#include "llvm/Support/CommandLine.h"
+
+/// Enable/disable ARC sequence annotations.
+static cl::opt<bool>
+EnableARCAnnotations("enable-objc-arc-annotations", cl::init(false));
+
+/// This function appends a unique ARCAnnotationProvenanceSourceMDKind id to an
+/// instruction so that we can track backwards when post processing via the llvm
+/// arc annotation processor tool. If the function is an
+static MDString *AppendMDNodeToSourcePtr(unsigned NodeId,
+ Value *Ptr) {
+ MDString *Hash = 0;
+
+  // If the pointer is the result of an instruction and does not have a source
+  // MDNode attached to it, attach a new MDNode to it. If the pointer is the
+  // result of an instruction and already has a source MDNode attached to it,
+  // return a reference to that node. Otherwise just return 0.
+ if (Instruction *Inst = dyn_cast<Instruction>(Ptr)) {
+ MDNode *Node;
+ if (!(Node = Inst->getMetadata(NodeId))) {
+      // We do not have any node. Generate and attach the hash MDString to the
+ // instruction.
+
+ // We just use an MDString to ensure that this metadata gets written out
+ // of line at the module level and to provide a very simple format
+      // encoding the information herein. Both of these make it simpler for a
+      // simple external program to parse the annotations.
+ std::string Str;
+ raw_string_ostream os(Str);
+ os << "(" << Inst->getParent()->getParent()->getName() << ",%"
+ << Inst->getName() << ")";
+
+ Hash = MDString::get(Inst->getContext(), os.str());
+ Inst->setMetadata(NodeId, MDNode::get(Inst->getContext(),Hash));
+ } else {
+ // We have a node. Grab its hash and return it.
+ assert(Node->getNumOperands() == 1 &&
+ "An ARCAnnotationProvenanceSourceMDKind can only have 1 operand.");
+ Hash = cast<MDString>(Node->getOperand(0));
+ }
+ } else if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
+ std::string str;
+ raw_string_ostream os(str);
+ os << "(" << Arg->getParent()->getName() << ",%" << Arg->getName()
+ << ")";
+ Hash = MDString::get(Arg->getContext(), os.str());
+ }
+
+ return Hash;
+}
+
+static std::string SequenceToString(Sequence A) {
+ std::string str;
+ raw_string_ostream os(str);
+ os << A;
+ return os.str();
+}
+
+/// Helper function to change a Sequence into a String object using our overload
+/// for raw_ostream so we only have printing code in one location.
+static MDString *SequenceToMDString(LLVMContext &Context,
+ Sequence A) {
+ return MDString::get(Context, SequenceToString(A));
+}
+
+/// A simple function to generate an MDNode which describes the change in state
+/// for Value *Ptr caused by Instruction *Inst.
+static void AppendMDNodeToInstForPtr(unsigned NodeId,
+ Instruction *Inst,
+ Value *Ptr,
+ MDString *PtrSourceMDNodeID,
+ Sequence OldSeq,
+ Sequence NewSeq) {
+ MDNode *Node = 0;
+ Value *tmp[3] = {PtrSourceMDNodeID,
+ SequenceToMDString(Inst->getContext(),
+ OldSeq),
+ SequenceToMDString(Inst->getContext(),
+ NewSeq)};
+ Node = MDNode::get(Inst->getContext(),
+ ArrayRef<Value*>(tmp, 3));
+
+ Inst->setMetadata(NodeId, Node);
+}
+
+/// Add llvm.ptr.annotations to the beginning of the basic block which show the
+/// state of a pointer at the entrance to the block.
+static void GenerateARCBBEntranceAnnotation(const char *Name, BasicBlock *BB,
+ Value *Ptr, Sequence Seq) {
+ Module *M = BB->getParent()->getParent();
+ LLVMContext &C = M->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *I8XX = PointerType::getUnqual(I8X);
+ Type *Params[] = {I8XX, I8XX};
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(C),
+ ArrayRef<Type*>(Params, 2),
+ /*isVarArg=*/false);
+ Constant *Callee = M->getOrInsertFunction(Name, FTy);
+
+ IRBuilder<> Builder(BB, BB->getFirstInsertionPt());
+
+ Value *PtrName;
+ StringRef Tmp = Ptr->getName();
+ if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) {
+ Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp,
+ Tmp + "_STR");
+ PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+ cast<Constant>(ActualPtrName), Tmp);
+ }
+
+ Value *S;
+ std::string SeqStr = SequenceToString(Seq);
+ if (0 == (S = M->getGlobalVariable(SeqStr, true))) {
+ Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr,
+ SeqStr + "_STR");
+ S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+ cast<Constant>(ActualPtrName), SeqStr);
+ }
+
+ Builder.CreateCall2(Callee, PtrName, S);
+}
+
+/// Add llvm.ptr.annotations to the end of the basic block which show the state
+/// of the pointer at the bottom of the block.
+static void GenerateARCBBTerminatorAnnotation(const char *Name, BasicBlock *BB,
+ Value *Ptr, Sequence Seq) {
+ Module *M = BB->getParent()->getParent();
+ LLVMContext &C = M->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *I8XX = PointerType::getUnqual(I8X);
+ Type *Params[] = {I8XX, I8XX};
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(C),
+ ArrayRef<Type*>(Params, 2),
+ /*isVarArg=*/false);
+ Constant *Callee = M->getOrInsertFunction(Name, FTy);
+
+ IRBuilder<> Builder(BB, llvm::prior(BB->end()));
+
+ Value *PtrName;
+ StringRef Tmp = Ptr->getName();
+ if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) {
+ Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp,
+ Tmp + "_STR");
+ PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+ cast<Constant>(ActualPtrName), Tmp);
+ }
+
+ Value *S;
+ std::string SeqStr = SequenceToString(Seq);
+ if (0 == (S = M->getGlobalVariable(SeqStr, true))) {
+ Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr,
+ SeqStr + "_STR");
+ S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+ cast<Constant>(ActualPtrName), SeqStr);
+ }
+ Builder.CreateCall2(Callee, PtrName, S);
+}
+
+/// Adds a source annotation to pointer and a state change annotation to Inst
+/// referencing the source annotation and the old/new state of pointer.
+static void GenerateARCAnnotation(unsigned InstMDId,
+ unsigned PtrMDId,
+ Instruction *Inst,
+ Value *Ptr,
+ Sequence OldSeq,
+ Sequence NewSeq) {
+ if (EnableARCAnnotations) {
+ // First generate the source annotation on our pointer. This will return an
+ // MDString* if Ptr actually comes from an instruction implying we can put
+ // in a source annotation. If AppendMDNodeToSourcePtr returns 0 (i.e. NULL),
+ // then we know that our pointer is from an Argument so we put a reference
+ // to the argument number.
+ //
+ // The point of this is to make it easy for the
+ // llvm-arc-annotation-processor tool to cross reference where the source
+ // pointer is in the LLVM IR since the LLVM IR parser does not submit such
+    // information via debug info for backends to use (such information is not
+    // normally needed from LLVM IR outside of non-standard cases like this
+    // one).
+ MDString *SourcePtrMDNode =
+ AppendMDNodeToSourcePtr(PtrMDId, Ptr);
+ AppendMDNodeToInstForPtr(InstMDId, Inst, Ptr, SourcePtrMDNode, OldSeq,
+ NewSeq);
+ }
+}
+
+// The actual interface for accessing the above functionality is defined via
+// some simple macros which are defined below. We do this so that the user does
+// not need to pass in the required metadata id, which results in cleaner code
+// and also provides an easy way to conditionally no-op all
+// annotation support in a non-debug build.
+
+/// Use this macro to annotate a sequence state change when processing
+/// instructions bottom up.
+#define ANNOTATE_BOTTOMUP(inst, ptr, old, new) \
+ GenerateARCAnnotation(ARCAnnotationBottomUpMDKind, \
+ ARCAnnotationProvenanceSourceMDKind, (inst), \
+ const_cast<Value*>(ptr), (old), (new))
+/// Use this macro to annotate a sequence state change when processing
+/// instructions top down.
+#define ANNOTATE_TOPDOWN(inst, ptr, old, new) \
+ GenerateARCAnnotation(ARCAnnotationTopDownMDKind, \
+ ARCAnnotationProvenanceSourceMDKind, (inst), \
+ const_cast<Value*>(ptr), (old), (new))
+
+#define ANNOTATE_BB(_states, _bb, _name, _type, _direction) \
+ do { \
+ if (EnableARCAnnotations) { \
+ for(BBState::ptr_const_iterator I = (_states)._direction##_ptr_begin(), \
+ E = (_states)._direction##_ptr_end(); I != E; ++I) { \
+ Value *Ptr = const_cast<Value*>(I->first); \
+ Sequence Seq = I->second.GetSeq(); \
+ GenerateARCBB ## _type ## Annotation(_name, (_bb), Ptr, Seq); \
+ } \
+ } \
+} while (0)
+
+#define ANNOTATE_BOTTOMUP_BBSTART(_states, _basicblock) \
+ ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbstart", \
+ Entrance, bottom_up)
+#define ANNOTATE_BOTTOMUP_BBEND(_states, _basicblock) \
+ ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbend", \
+ Terminator, bottom_up)
+#define ANNOTATE_TOPDOWN_BBSTART(_states, _basicblock) \
+ ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbstart", \
+ Entrance, top_down)
+#define ANNOTATE_TOPDOWN_BBEND(_states, _basicblock) \
+ ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbend", \
+ Terminator, top_down)
+
+#else // !ARC_ANNOTATION
+// If annotations are off, noop.
+#define ANNOTATE_BOTTOMUP(inst, ptr, old, new)
+#define ANNOTATE_TOPDOWN(inst, ptr, old, new)
+#define ANNOTATE_BOTTOMUP_BBSTART(states, basicblock)
+#define ANNOTATE_BOTTOMUP_BBEND(states, basicblock)
+#define ANNOTATE_TOPDOWN_BBSTART(states, basicblock)
+#define ANNOTATE_TOPDOWN_BBEND(states, basicblock)
+#endif // !ARC_ANNOTATION
+
+namespace {
+ /// \brief The main ARC optimization pass.
+ class ObjCARCOpt : public FunctionPass {
+ bool Changed;
+ ProvenanceAnalysis PA;
+
+ /// A flag indicating whether this optimization pass should run.
+ bool Run;
+
+ /// Declarations for ObjC runtime functions, for use in creating calls to
+ /// them. These are initialized lazily to avoid cluttering up the Module
+ /// with unused declarations.
+
+ /// Declaration for ObjC runtime function
+ /// objc_retainAutoreleasedReturnValue.
+ Constant *RetainRVCallee;
+ /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
+ Constant *AutoreleaseRVCallee;
+ /// Declaration for ObjC runtime function objc_release.
+ Constant *ReleaseCallee;
+ /// Declaration for ObjC runtime function objc_retain.
+ Constant *RetainCallee;
+ /// Declaration for ObjC runtime function objc_retainBlock.
+ Constant *RetainBlockCallee;
+ /// Declaration for ObjC runtime function objc_autorelease.
+ Constant *AutoreleaseCallee;
+
+    /// Flags which determine whether each of the interesting runtime functions
+ /// is in fact used in the current function.
+ unsigned UsedInThisFunction;
+
+ /// The Metadata Kind for clang.imprecise_release metadata.
+ unsigned ImpreciseReleaseMDKind;
+
+ /// The Metadata Kind for clang.arc.copy_on_escape metadata.
+ unsigned CopyOnEscapeMDKind;
+
+ /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata.
+ unsigned NoObjCARCExceptionsMDKind;
+
+#ifdef ARC_ANNOTATIONS
+ /// The Metadata Kind for llvm.arc.annotation.bottomup metadata.
+ unsigned ARCAnnotationBottomUpMDKind;
+ /// The Metadata Kind for llvm.arc.annotation.topdown metadata.
+ unsigned ARCAnnotationTopDownMDKind;
+ /// The Metadata Kind for llvm.arc.annotation.provenancesource metadata.
+ unsigned ARCAnnotationProvenanceSourceMDKind;
+#endif // ARC_ANNOTATIONS
+
+ Constant *getRetainRVCallee(Module *M);
+ Constant *getAutoreleaseRVCallee(Module *M);
+ Constant *getReleaseCallee(Module *M);
+ Constant *getRetainCallee(Module *M);
+ Constant *getRetainBlockCallee(Module *M);
+ Constant *getAutoreleaseCallee(Module *M);
+
+ bool IsRetainBlockOptimizable(const Instruction *Inst);
+
+ void OptimizeRetainCall(Function &F, Instruction *Retain);
+ bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
+ void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
+ InstructionClass &Class);
+ bool OptimizeRetainBlockCall(Function &F, Instruction *RetainBlock,
+ InstructionClass &Class);
+ void OptimizeIndividualCalls(Function &F);
+
+ void CheckForCFGHazards(const BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BBState &MyStates) const;
+ bool VisitInstructionBottomUp(Instruction *Inst,
+ BasicBlock *BB,
+ MapVector<Value *, RRInfo> &Retains,
+ BBState &MyStates);
+ bool VisitBottomUp(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ MapVector<Value *, RRInfo> &Retains);
+ bool VisitInstructionTopDown(Instruction *Inst,
+ DenseMap<Value *, RRInfo> &Releases,
+ BBState &MyStates);
+ bool VisitTopDown(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ DenseMap<Value *, RRInfo> &Releases);
+ bool Visit(Function &F,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ MapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases);
+
+ void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
+ MapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases,
+ SmallVectorImpl<Instruction *> &DeadInsts,
+ Module *M);
+
+ bool ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> &BBStates,
+ MapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases,
+ Module *M,
+ SmallVector<Instruction *, 4> &NewRetains,
+ SmallVector<Instruction *, 4> &NewReleases,
+ SmallVector<Instruction *, 8> &DeadInsts,
+ RRInfo &RetainsToMove,
+ RRInfo &ReleasesToMove,
+ Value *Arg,
+ bool KnownSafe,
+ bool &AnyPairsCompletelyEliminated);
+
+ bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
+ MapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases,
+ Module *M);
+
+ void OptimizeWeakCalls(Function &F);
+
+ bool OptimizeSequences(Function &F);
+
+ void OptimizeReturns(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual bool doInitialization(Module &M);
+ virtual bool runOnFunction(Function &F);
+ virtual void releaseMemory();
+
+ public:
+ static char ID;
+ ObjCARCOpt() : FunctionPass(ID) {
+ initializeObjCARCOptPass(*PassRegistry::getPassRegistry());
+ }
+ };
+}
+
+char ObjCARCOpt::ID = 0;
+INITIALIZE_PASS_BEGIN(ObjCARCOpt,
+ "objc-arc", "ObjC ARC optimization", false, false)
+INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis)
+INITIALIZE_PASS_END(ObjCARCOpt,
+ "objc-arc", "ObjC ARC optimization", false, false)
+
+Pass *llvm::createObjCARCOptPass() {
+ return new ObjCARCOpt();
+}
+
+void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<ObjCARCAliasAnalysis>();
+ AU.addRequired<AliasAnalysis>();
+ // ARC optimization doesn't currently split critical edges.
+ AU.setPreservesCFG();
+}
+
+bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) {
+ // Without the magic metadata tag, we have to assume this might be an
+ // objc_retainBlock call inserted to convert a block pointer to an id,
+ // in which case it really is needed.
+ if (!Inst->getMetadata(CopyOnEscapeMDKind))
+ return false;
+
+ // If the pointer "escapes" (not including being used in a call),
+ // the copy may be needed.
+ if (DoesRetainableObjPtrEscape(Inst))
+ return false;
+
+ // Otherwise, it's not needed.
+ return true;
+}
+
+Constant *ObjCARCOpt::getRetainRVCallee(Module *M) {
+ if (!RetainRVCallee) {
+ LLVMContext &C = M->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *Params[] = { I8X };
+ FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+ AttributeSet Attribute =
+ AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
+ RetainRVCallee =
+ M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy,
+ Attribute);
+ }
+ return RetainRVCallee;
+}
+
+Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
+ if (!AutoreleaseRVCallee) {
+ LLVMContext &C = M->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *Params[] = { I8X };
+ FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+ AttributeSet Attribute =
+ AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
+ AutoreleaseRVCallee =
+ M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy,
+ Attribute);
+ }
+ return AutoreleaseRVCallee;
+}
+
+Constant *ObjCARCOpt::getReleaseCallee(Module *M) {
+ if (!ReleaseCallee) {
+ LLVMContext &C = M->getContext();
+ Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+ AttributeSet Attribute =
+ AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
+ ReleaseCallee =
+ M->getOrInsertFunction(
+ "objc_release",
+ FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
+ Attribute);
+ }
+ return ReleaseCallee;
+}
+
+Constant *ObjCARCOpt::getRetainCallee(Module *M) {
+ if (!RetainCallee) {
+ LLVMContext &C = M->getContext();
+ Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+ AttributeSet Attribute =
+ AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
+ RetainCallee =
+ M->getOrInsertFunction(
+ "objc_retain",
+ FunctionType::get(Params[0], Params, /*isVarArg=*/false),
+ Attribute);
+ }
+ return RetainCallee;
+}
+
+Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) {
+ if (!RetainBlockCallee) {
+ LLVMContext &C = M->getContext();
+ Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+ // objc_retainBlock is not nounwind because it calls user copy constructors
+ // which could theoretically throw.
+ RetainBlockCallee =
+ M->getOrInsertFunction(
+ "objc_retainBlock",
+ FunctionType::get(Params[0], Params, /*isVarArg=*/false),
+ AttributeSet());
+ }
+ return RetainBlockCallee;
+}
+
+Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
+ if (!AutoreleaseCallee) {
+ LLVMContext &C = M->getContext();
+ Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+ AttributeSet Attribute =
+ AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
+ AutoreleaseCallee =
+ M->getOrInsertFunction(
+ "objc_autorelease",
+ FunctionType::get(Params[0], Params, /*isVarArg=*/false),
+ Attribute);
+ }
+ return AutoreleaseCallee;
+}
+
+/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
+/// return value.
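+///
+/// For example (an illustrative sketch; the value names and @something are
+/// hypothetical):
+/// \code
+/// %call = call i8* @something(...)
+/// %x = call i8* @objc_retain(i8* %call)
+/// \endcode
+/// becomes
+/// \code
+/// %call = call i8* @something(...)
+/// %x = call i8* @objc_retainAutoreleasedReturnValue(i8* %call)
+/// \endcode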
+void
+ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) {
+ ImmutableCallSite CS(GetObjCArg(Retain));
+ const Instruction *Call = CS.getInstruction();
+ if (!Call) return;
+ if (Call->getParent() != Retain->getParent()) return;
+
+ // Check that the call is next to the retain.
+ BasicBlock::const_iterator I = Call;
+ ++I;
+ while (IsNoopInstruction(I)) ++I;
+ if (&*I != Retain)
+ return;
+
+ // Turn it into an objc_retainAutoreleasedReturnValue.
+ Changed = true;
+ ++NumPeeps;
+
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainCall: Transforming "
+ "objc_retain => objc_retainAutoreleasedReturnValue"
+ " since the operand is a return value.\n"
+ " Old: "
+ << *Retain << "\n");
+
+ cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent()));
+
+ DEBUG(dbgs() << " New: "
+ << *Retain << "\n");
+}
+
+/// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
+/// not a return value. Or, if it can be paired with an
+/// objc_autoreleaseReturnValue, delete the pair and return true.
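+///
+/// Illustrative sketch of the pair-deletion case (hypothetical names):
+/// \code
+/// %x = call i8* @objc_autoreleaseReturnValue(i8* %p)
+/// %y = call i8* @objc_retainAutoreleasedReturnValue(i8* %x)
+/// \endcode
+/// Here the two calls cancel out and both can be erased.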
+bool
+ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
+ // Check for the argument being from an immediately preceding call or invoke.
+ const Value *Arg = GetObjCArg(RetainRV);
+ ImmutableCallSite CS(Arg);
+ if (const Instruction *Call = CS.getInstruction()) {
+ if (Call->getParent() == RetainRV->getParent()) {
+ BasicBlock::const_iterator I = Call;
+ ++I;
+ while (IsNoopInstruction(I)) ++I;
+ if (&*I == RetainRV)
+ return false;
+ } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ BasicBlock *RetainRVParent = RetainRV->getParent();
+ if (II->getNormalDest() == RetainRVParent) {
+ BasicBlock::const_iterator I = RetainRVParent->begin();
+ while (IsNoopInstruction(I)) ++I;
+ if (&*I == RetainRV)
+ return false;
+ }
+ }
+ }
+
+ // Check for being preceded by an objc_autoreleaseReturnValue on the same
+ // pointer. In this case, we can delete the pair.
+ BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin();
+ if (I != Begin) {
+ do --I; while (I != Begin && IsNoopInstruction(I));
+ if (GetBasicInstructionClass(I) == IC_AutoreleaseRV &&
+ GetObjCArg(I) == Arg) {
+ Changed = true;
+ ++NumPeeps;
+
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Erasing " << *I << "\n"
+ << " Erasing " << *RetainRV
+ << "\n");
+
+ EraseInstruction(I);
+ EraseInstruction(RetainRV);
+ return true;
+ }
+ }
+
+ // Turn it into a plain objc_retain.
+ Changed = true;
+ ++NumPeeps;
+
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Transforming "
+ "objc_retainAutoreleasedReturnValue => "
+ "objc_retain since the operand is not a return value.\n"
+ " Old: "
+ << *RetainRV << "\n");
+
+ cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent()));
+
+ DEBUG(dbgs() << " New: "
+ << *RetainRV << "\n");
+
+ return false;
+}
+
+/// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not
+/// used as a return value.
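+///
+/// Illustrative sketch (hypothetical names): a call such as
+/// \code
+/// %x = call i8* @objc_autoreleaseReturnValue(i8* %p)
+/// \endcode
+/// whose value does not feed a ret (or an objc_retainAutoreleasedReturnValue)
+/// is rewritten to
+/// \code
+/// %x = call i8* @objc_autorelease(i8* %p)
+/// \endcode
+/// and the tail marker is dropped.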
+void
+ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
+ InstructionClass &Class) {
+ // Check for a return of the pointer value.
+ const Value *Ptr = GetObjCArg(AutoreleaseRV);
+ SmallVector<const Value *, 2> Users;
+ Users.push_back(Ptr);
+ do {
+ Ptr = Users.pop_back_val();
+ for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end();
+ UI != UE; ++UI) {
+ const User *I = *UI;
+ if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV)
+ return;
+ if (isa<BitCastInst>(I))
+ Users.push_back(I);
+ }
+ } while (!Users.empty());
+
+ Changed = true;
+ ++NumPeeps;
+
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeAutoreleaseRVCall: Transforming "
+ "objc_autoreleaseReturnValue => "
+ "objc_autorelease since its operand is not used as a return "
+ "value.\n"
+ " Old: "
+ << *AutoreleaseRV << "\n");
+
+ CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
+ AutoreleaseRVCI->
+ setCalledFunction(getAutoreleaseCallee(F.getParent()));
+ AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
+ Class = IC_Autorelease;
+
+ DEBUG(dbgs() << " New: "
+ << *AutoreleaseRV << "\n");
+
+}
+
+/// \brief Attempt to strength reduce objc_retainBlock calls to objc_retain
+/// calls.
+///
+/// Specifically: if an objc_retainBlock call has the copy_on_escape metadata
+/// and does not escape (following the rules of block escaping), strength
+/// reduce the objc_retainBlock to an objc_retain.
+///
+/// TODO: If an objc_retainBlock call is dominated by a previous
+/// objc_retainBlock call, strength reduce the objc_retainBlock to an
+/// objc_retain.
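+///
+/// Illustrative sketch (the metadata id and names are hypothetical):
+/// \code
+/// %b = call i8* @objc_retainBlock(i8* %blk), !clang.arc.copy_on_escape !0
+/// \endcode
+/// becomes, when %blk does not escape,
+/// \code
+/// %b = call i8* @objc_retain(i8* %blk)
+/// \endcode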
+bool
+ObjCARCOpt::OptimizeRetainBlockCall(Function &F, Instruction *Inst,
+ InstructionClass &Class) {
+ assert(GetBasicInstructionClass(Inst) == Class);
+ assert(IC_RetainBlock == Class);
+
+ // If we cannot optimize Inst, return false.
+ if (!IsRetainBlockOptimizable(Inst))
+ return false;
+
+ CallInst *RetainBlock = cast<CallInst>(Inst);
+ RetainBlock->setCalledFunction(getRetainCallee(F.getParent()));
+ // Remove copy_on_escape metadata.
+ RetainBlock->setMetadata(CopyOnEscapeMDKind, 0);
+ Class = IC_Retain;
+
+ return true;
+}
+
+/// Visit each call, one at a time, and make simplifications without doing any
+/// additional analysis.
+void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
+ // Reset all the flags in preparation for recomputing them.
+ UsedInThisFunction = 0;
+
+ // Visit all objc_* calls in F.
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+
+ InstructionClass Class = GetBasicInstructionClass(Inst);
+
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Visiting: Class: "
+ << Class << "; " << *Inst << "\n");
+
+ switch (Class) {
+ default: break;
+
+ // Delete no-op casts. These function calls have special semantics, but
+ // the semantics are entirely implemented via lowering in the front-end,
+ // so by the time they reach the optimizer, they are just no-op calls
+ // which return their argument.
+ //
+ // There are gray areas here, as the ability to cast reference-counted
+ // pointers to raw void* and back allows code to break ARC assumptions;
+ // however, these are currently considered to be unimportant.
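+ // For instance (a sketch with hypothetical names), a call such as
+ //   %y = call i8* @objc_retainedObject(i8* %x)
+ // simply forwards %x, so uses of %y are replaced with %x and the call is
+ // erased.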
+ case IC_NoopCast:
+ Changed = true;
+ ++NumNoops;
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Erasing no-op cast:"
+ " " << *Inst << "\n");
+ EraseInstruction(Inst);
+ continue;
+
+ // If the pointer-to-weak-pointer is null, it's undefined behavior.
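+ // For example (a sketch), a call such as
+ //   %v = call i8* @objc_loadWeak(i8** null)
+ // is rewritten into a store of undef through the null pointer (making the
+ // undefined behavior explicit) and all uses of %v are replaced with undef.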
+ case IC_StoreWeak:
+ case IC_LoadWeak:
+ case IC_LoadWeakRetained:
+ case IC_InitWeak:
+ case IC_DestroyWeak: {
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(0))) {
+ Changed = true;
+ Type *Ty = CI->getArgOperand(0)->getType();
+ new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+ Constant::getNullValue(Ty),
+ CI);
+ llvm::Value *NewValue = UndefValue::get(CI->getType());
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null "
+ "pointer-to-weak-pointer is undefined behavior.\n"
+ " Old = " << *CI <<
+ "\n New = " <<
+ *NewValue << "\n");
+ CI->replaceAllUsesWith(NewValue);
+ CI->eraseFromParent();
+ continue;
+ }
+ break;
+ }
+ case IC_CopyWeak:
+ case IC_MoveWeak: {
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(0)) ||
+ IsNullOrUndef(CI->getArgOperand(1))) {
+ Changed = true;
+ Type *Ty = CI->getArgOperand(0)->getType();
+ new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+ Constant::getNullValue(Ty),
+ CI);
+
+ llvm::Value *NewValue = UndefValue::get(CI->getType());
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null "
+ "pointer-to-weak-pointer is undefined behavior.\n"
+ " Old = " << *CI <<
+ "\n New = " <<
+ *NewValue << "\n");
+
+ CI->replaceAllUsesWith(NewValue);
+ CI->eraseFromParent();
+ continue;
+ }
+ break;
+ }
+ case IC_RetainBlock:
+ // If we strength reduce an objc_retainBlock to an objc_retain, continue
+ // onto the objc_retain peephole optimizations. Otherwise break.
+ if (!OptimizeRetainBlockCall(F, Inst, Class))
+ break;
+ // FALLTHROUGH
+ case IC_Retain:
+ OptimizeRetainCall(F, Inst);
+ break;
+ case IC_RetainRV:
+ if (OptimizeRetainRVCall(F, Inst))
+ continue;
+ break;
+ case IC_AutoreleaseRV:
+ OptimizeAutoreleaseRVCall(F, Inst, Class);
+ break;
+ }
+
+ // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
+ if (IsAutorelease(Class) && Inst->use_empty()) {
+ CallInst *Call = cast<CallInst>(Inst);
+ const Value *Arg = Call->getArgOperand(0);
+ Arg = FindSingleUseIdentifiedObject(Arg);
+ if (Arg) {
+ Changed = true;
+ ++NumAutoreleases;
+
+ // Create the declaration lazily.
+ LLVMContext &C = Inst->getContext();
+ CallInst *NewCall =
+ CallInst::Create(getReleaseCallee(F.getParent()),
+ Call->getArgOperand(0), "", Call);
+ NewCall->setMetadata(ImpreciseReleaseMDKind,
+ MDNode::get(C, ArrayRef<Value *>()));
+
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Replacing "
+ "objc_autorelease(x) with objc_release(x) since x is "
+ "otherwise unused.\n"
+ " Old: " << *Call <<
+ "\n New: " <<
+ *NewCall << "\n");
+
+ EraseInstruction(Call);
+ Inst = NewCall;
+ Class = IC_Release;
+ }
+ }
+
+ // For functions which can never be passed stack arguments, add
+ // a tail keyword.
+ if (IsAlwaysTail(Class)) {
+ Changed = true;
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Adding tail keyword"
+ " to function since it can never be passed stack args: " << *Inst <<
+ "\n");
+ cast<CallInst>(Inst)->setTailCall();
+ }
+
+ // Ensure that functions that can never have a "tail" keyword due to the
+ // semantics of ARC truly do not do so.
+ if (IsNeverTail(Class)) {
+ Changed = true;
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Removing tail "
+ "keyword from function: " << *Inst <<
+ "\n");
+ cast<CallInst>(Inst)->setTailCall(false);
+ }
+
+ // Set nounwind as needed.
+ if (IsNoThrow(Class)) {
+ Changed = true;
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Found no throw"
+ " class. Setting nounwind on: " << *Inst << "\n");
+ cast<CallInst>(Inst)->setDoesNotThrow();
+ }
+
+ if (!IsNoopOnNull(Class)) {
+ UsedInThisFunction |= 1 << Class;
+ continue;
+ }
+
+ const Value *Arg = GetObjCArg(Inst);
+
+ // ARC calls with null are no-ops. Delete them.
+ if (IsNullOrUndef(Arg)) {
+ Changed = true;
+ ++NumNoops;
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: ARC calls with "
+ "null are no-ops. Erasing: " << *Inst << "\n");
+ EraseInstruction(Inst);
+ continue;
+ }
+
+ // Keep track of which of retain, release, autorelease, and retain_block
+ // are actually present in this function.
+ UsedInThisFunction |= 1 << Class;
+
+ // If Arg is a PHI, and one or more incoming values to the
+ // PHI are null, and the call is control-equivalent to the PHI, and there
+ // are no relevant side effects between the PHI and the call, the call
+ // could be pushed up to just those paths with non-null incoming values.
+ // For now, don't bother splitting critical edges for this.
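+ // Sketch (hypothetical IR): given
+ //   %p = phi i8* [ null, %bb1 ], [ %obj, %bb2 ]
+ //   call void @objc_release(i8* %p)
+ // the call is a no-op along the %bb1 path, so a clone can be inserted at
+ // the end of %bb2 and the original call erased.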
+ SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist;
+ Worklist.push_back(std::make_pair(Inst, Arg));
+ do {
+ std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val();
+ Inst = Pair.first;
+ Arg = Pair.second;
+
+ const PHINode *PN = dyn_cast<PHINode>(Arg);
+ if (!PN) continue;
+
+ // Determine if the PHI has any null operands, or any incoming
+ // critical edges.
+ bool HasNull = false;
+ bool HasCriticalEdges = false;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming =
+ StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
+ if (IsNullOrUndef(Incoming))
+ HasNull = true;
+ else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back())
+ .getNumSuccessors() != 1) {
+ HasCriticalEdges = true;
+ break;
+ }
+ }
+ // If we have null operands and no critical edges, optimize.
+ if (!HasCriticalEdges && HasNull) {
+ SmallPtrSet<Instruction *, 4> DependingInstructions;
+ SmallPtrSet<const BasicBlock *, 4> Visited;
+
+ // Check that there is nothing that cares about the reference
+ // count between the call and the phi.
+ switch (Class) {
+ case IC_Retain:
+ case IC_RetainBlock:
+ // These can always be moved up.
+ break;
+ case IC_Release:
+ // These can't be moved across things that care about the retain
+ // count.
+ FindDependencies(NeedsPositiveRetainCount, Arg,
+ Inst->getParent(), Inst,
+ DependingInstructions, Visited, PA);
+ break;
+ case IC_Autorelease:
+ // These can't be moved across autorelease pool scope boundaries.
+ FindDependencies(AutoreleasePoolBoundary, Arg,
+ Inst->getParent(), Inst,
+ DependingInstructions, Visited, PA);
+ break;
+ case IC_RetainRV:
+ case IC_AutoreleaseRV:
+ // Don't move these; the RV optimization depends on the autoreleaseRV
+ // being tail called, and the retainRV being immediately after a call
+ // (which might still happen if we get lucky with codegen layout, but
+ // it's not worth taking the chance).
+ continue;
+ default:
+ llvm_unreachable("Invalid dependence flavor");
+ }
+
+ if (DependingInstructions.size() == 1 &&
+ *DependingInstructions.begin() == PN) {
+ Changed = true;
+ ++NumPartialNoops;
+ // Clone the call into each predecessor that has a non-null value.
+ CallInst *CInst = cast<CallInst>(Inst);
+ Type *ParamTy = CInst->getArgOperand(0)->getType();
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming =
+ StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
+ if (!IsNullOrUndef(Incoming)) {
+ CallInst *Clone = cast<CallInst>(CInst->clone());
+ Value *Op = PN->getIncomingValue(i);
+ Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
+ if (Op->getType() != ParamTy)
+ Op = new BitCastInst(Op, ParamTy, "", InsertPos);
+ Clone->setArgOperand(0, Op);
+ Clone->insertBefore(InsertPos);
+
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Cloning "
+ << *CInst << "\n"
+ " And inserting "
+ "clone at " << *InsertPos << "\n");
+ Worklist.push_back(std::make_pair(Clone, Incoming));
+ }
+ }
+ // Erase the original call.
+ DEBUG(dbgs() << "Erasing: " << *CInst << "\n");
+ EraseInstruction(CInst);
+ continue;
+ }
+ }
+ } while (!Worklist.empty());
+ }
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Finished List.\n");
+}
+
+/// Check for critical edges, loop boundaries, irreducible control flow, or
+/// other CFG structures where moving code across the edge would result in it
+/// being executed more.
+void
+ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BBState &MyStates) const {
+ // If any top-down local-use or possible-dec has a succ which is earlier in
+ // the sequence, forget it.
+ for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(),
+ E = MyStates.top_down_ptr_end(); I != E; ++I)
+ switch (I->second.GetSeq()) {
+ default: break;
+ case S_Use: {
+ const Value *Arg = I->first;
+ const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+ bool SomeSuccHasSame = false;
+ bool AllSuccsHaveSame = true;
+ PtrState &S = I->second;
+ succ_const_iterator SI(TI), SE(TI, false);
+
+ for (; SI != SE; ++SI) {
+ Sequence SuccSSeq = S_None;
+ bool SuccSRRIKnownSafe = false;
+ // If VisitBottomUp has pointer information for this successor, take
+ // what we know about it.
+ DenseMap<const BasicBlock *, BBState>::iterator BBI =
+ BBStates.find(*SI);
+ assert(BBI != BBStates.end());
+ const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+ SuccSSeq = SuccS.GetSeq();
+ SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
+ switch (SuccSSeq) {
+ case S_None:
+ case S_CanRelease: {
+ if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
+ S.ClearSequenceProgress();
+ break;
+ }
+ continue;
+ }
+ case S_Use:
+ SomeSuccHasSame = true;
+ break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
+ AllSuccsHaveSame = false;
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ }
+ // If the state at the other end of any of the successor edges
+ // matches the current state, require all edges to match. This
+ // guards against loops in the middle of a sequence.
+ if (SomeSuccHasSame && !AllSuccsHaveSame)
+ S.ClearSequenceProgress();
+ break;
+ }
+ case S_CanRelease: {
+ const Value *Arg = I->first;
+ const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+ bool SomeSuccHasSame = false;
+ bool AllSuccsHaveSame = true;
+ PtrState &S = I->second;
+ succ_const_iterator SI(TI), SE(TI, false);
+
+ for (; SI != SE; ++SI) {
+ Sequence SuccSSeq = S_None;
+ bool SuccSRRIKnownSafe = false;
+ // If VisitBottomUp has pointer information for this successor, take
+ // what we know about it.
+ DenseMap<const BasicBlock *, BBState>::iterator BBI =
+ BBStates.find(*SI);
+ assert(BBI != BBStates.end());
+ const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+ SuccSSeq = SuccS.GetSeq();
+ SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
+ switch (SuccSSeq) {
+ case S_None: {
+ if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
+ S.ClearSequenceProgress();
+ break;
+ }
+ continue;
+ }
+ case S_CanRelease:
+ SomeSuccHasSame = true;
+ break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Use:
+ if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
+ AllSuccsHaveSame = false;
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ }
+ // If the state at the other end of any of the successor edges
+ // matches the current state, require all edges to match. This
+ // guards against loops in the middle of a sequence.
+ if (SomeSuccHasSame && !AllSuccsHaveSame)
+ S.ClearSequenceProgress();
+ break;
+ }
+ }
+}
+
+bool
+ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
+ BasicBlock *BB,
+ MapVector<Value *, RRInfo> &Retains,
+ BBState &MyStates) {
+ bool NestingDetected = false;
+ InstructionClass Class = GetInstructionClass(Inst);
+ const Value *Arg = 0;
+
+ switch (Class) {
+ case IC_Release: {
+ Arg = GetObjCArg(Inst);
+
+ PtrState &S = MyStates.getPtrBottomUpState(Arg);
+
+ // Check for a case where we see two releases in a row on the same
+ // pointer. If so, make a note; we'll circle back to revisit it after
+ // we've hopefully eliminated the second release, which may allow us to
+ // eliminate the first release too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) {
+ DEBUG(dbgs() << "ObjCARCOpt::VisitInstructionBottomUp: Found nested "
+ "releases (i.e. a release pair)\n");
+ NestingDetected = true;
+ }
+
+ MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+ Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
+ ANNOTATE_BOTTOMUP(Inst, Arg, S.GetSeq(), NewSeq);
+ S.ResetSequenceProgress(NewSeq);
+ S.RRI.ReleaseMetadata = ReleaseMetadata;
+ S.RRI.KnownSafe = S.HasKnownPositiveRefCount();
+ S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+ S.RRI.Calls.insert(Inst);
+ S.SetKnownPositiveRefCount();
+ break;
+ }
+ case IC_RetainBlock:
+ // In OptimizeIndividualCalls, we have strength reduced all optimizable
+ // objc_retainBlocks to objc_retains. Thus at this point any
+ // objc_retainBlocks that we see are not optimizable.
+ break;
+ case IC_Retain:
+ case IC_RetainRV: {
+ Arg = GetObjCArg(Inst);
+
+ PtrState &S = MyStates.getPtrBottomUpState(Arg);
+ S.SetKnownPositiveRefCount();
+
+ Sequence OldSeq = S.GetSeq();
+ switch (OldSeq) {
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Use:
+ S.RRI.ReverseInsertPts.clear();
+ // FALL THROUGH
+ case S_CanRelease:
+ // Don't do retain+release tracking for IC_RetainRV, because it's
+ // better to let it remain as the first instruction after a call.
+ if (Class != IC_RetainRV)
+ Retains[Inst] = S.RRI;
+ S.ClearSequenceProgress();
+ break;
+ case S_None:
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ ANNOTATE_BOTTOMUP(Inst, Arg, OldSeq, S.GetSeq());
+ return NestingDetected;
+ }
+ case IC_AutoreleasepoolPop:
+ // Conservatively, clear MyStates for all known pointers.
+ MyStates.clearBottomUpPointers();
+ return NestingDetected;
+ case IC_AutoreleasepoolPush:
+ case IC_None:
+ // These are irrelevant.
+ return NestingDetected;
+ default:
+ break;
+ }
+
+ // Consider any other possible effects of this instruction on each
+ // pointer being tracked.
+ for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(),
+ ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) {
+ const Value *Ptr = MI->first;
+ if (Ptr == Arg)
+ continue; // Handled above.
+ PtrState &S = MI->second;
+ Sequence Seq = S.GetSeq();
+
+ // Check for possible releases.
+ if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
+ S.ClearKnownPositiveRefCount();
+ switch (Seq) {
+ case S_Use:
+ S.SetSeq(S_CanRelease);
+ ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S.GetSeq());
+ continue;
+ case S_CanRelease:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Stop:
+ case S_None:
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ }
+
+ // Check for possible direct uses.
+ switch (Seq) {
+ case S_Release:
+ case S_MovableRelease:
+ if (CanUse(Inst, Ptr, PA, Class)) {
+ assert(S.RRI.ReverseInsertPts.empty());
+ // If this is an invoke instruction, we're scanning it as part of
+ // one of its successor blocks, since we can't insert code after it
+ // in its own block, and we don't want to split critical edges.
+ if (isa<InvokeInst>(Inst))
+ S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
+ else
+ S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
+ S.SetSeq(S_Use);
+ ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use);
+ } else if (Seq == S_Release && IsUser(Class)) {
+ // Non-movable releases depend on any possible objc pointer use.
+ S.SetSeq(S_Stop);
+ ANNOTATE_BOTTOMUP(Inst, Ptr, S_Release, S_Stop);
+ assert(S.RRI.ReverseInsertPts.empty());
+ // As above; handle invoke specially.
+ if (isa<InvokeInst>(Inst))
+ S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
+ else
+ S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
+ }
+ break;
+ case S_Stop:
+ if (CanUse(Inst, Ptr, PA, Class)) {
+ S.SetSeq(S_Use);
+ ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use);
+ }
+ break;
+ case S_CanRelease:
+ case S_Use:
+ case S_None:
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ }
+
+ return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ MapVector<Value *, RRInfo> &Retains) {
+ bool NestingDetected = false;
+ BBState &MyStates = BBStates[BB];
+
+ // Merge the states from each successor to compute the initial state
+ // for the current block.
+ BBState::edge_iterator SI(MyStates.succ_begin()),
+ SE(MyStates.succ_end());
+ if (SI != SE) {
+ const BasicBlock *Succ = *SI;
+ DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.InitFromSucc(I->second);
+ ++SI;
+ for (; SI != SE; ++SI) {
+ Succ = *SI;
+ I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.MergeSucc(I->second);
+ }
+ }
+
+ // If ARC Annotations are enabled, output the current state of pointers at the
+ // bottom of the basic block.
+ ANNOTATE_BOTTOMUP_BBEND(MyStates, BB);
+
+ // Visit all the instructions, bottom-up.
+ for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
+ Instruction *Inst = llvm::prior(I);
+
+ // Invoke instructions are visited as part of their successors (below).
+ if (isa<InvokeInst>(Inst))
+ continue;
+
+ DEBUG(dbgs() << "ObjCARCOpt::VisitBottomUp: Visiting " << *Inst << "\n");
+
+ NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
+ }
+
+ // If there's a predecessor with an invoke, visit the invoke as if it were
+ // part of this block, since we can't insert code after an invoke in its own
+ // block, and we don't want to split critical edges.
+ for (BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end()); PI != PE; ++PI) {
+ BasicBlock *Pred = *PI;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
+ NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
+ }
+
+ // If ARC Annotations are enabled, output the current state of pointers at the
+ // top of the basic block.
+ ANNOTATE_BOTTOMUP_BBSTART(MyStates, BB);
+
+ return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
+ DenseMap<Value *, RRInfo> &Releases,
+ BBState &MyStates) {
+ bool NestingDetected = false;
+ InstructionClass Class = GetInstructionClass(Inst);
+ const Value *Arg = 0;
+
+ switch (Class) {
+ case IC_RetainBlock:
+ // In OptimizeIndividualCalls, we have strength reduced all optimizable
+ // objc_retainBlocks to objc_retains. Thus at this point any
+ // objc_retainBlocks that we see are not optimizable.
+ break;
+ case IC_Retain:
+ case IC_RetainRV: {
+ Arg = GetObjCArg(Inst);
+
+ PtrState &S = MyStates.getPtrTopDownState(Arg);
+
+ // Don't do retain+release tracking for IC_RetainRV, because it's
+ // better to let it remain as the first instruction after a call.
+ if (Class != IC_RetainRV) {
+ // Check for a case where we see two retains in a row on the same
+ // pointer. If so, make a note; we'll circle back to revisit it after
+ // we've hopefully eliminated the second retain, which may allow us to
+ // eliminate the first retain too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ if (S.GetSeq() == S_Retain)
+ NestingDetected = true;
+
+ ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_Retain);
+ S.ResetSequenceProgress(S_Retain);
+ S.RRI.KnownSafe = S.HasKnownPositiveRefCount();
+ S.RRI.Calls.insert(Inst);
+ }
+
+ S.SetKnownPositiveRefCount();
+
+ // A retain can be a potential use; proceed to the generic checking
+ // code below.
+ break;
+ }
+ case IC_Release: {
+ Arg = GetObjCArg(Inst);
+
+ PtrState &S = MyStates.getPtrTopDownState(Arg);
+ S.ClearKnownPositiveRefCount();
+
+ switch (S.GetSeq()) {
+ case S_Retain:
+ case S_CanRelease:
+ S.RRI.ReverseInsertPts.clear();
+ // FALL THROUGH
+ case S_Use:
+ S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+ S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+ Releases[Inst] = S.RRI;
+ ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_None);
+ S.ClearSequenceProgress();
+ break;
+ case S_None:
+ break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
+ }
+ break;
+ }
+ case IC_AutoreleasepoolPop:
+ // Conservatively, clear MyStates for all known pointers.
+ MyStates.clearTopDownPointers();
+ return NestingDetected;
+ case IC_AutoreleasepoolPush:
+ case IC_None:
+ // These are irrelevant.
+ return NestingDetected;
+ default:
+ break;
+ }
+
+ // Consider any other possible effects of this instruction on each
+ // pointer being tracked.
+ for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(),
+ ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) {
+ const Value *Ptr = MI->first;
+ if (Ptr == Arg)
+ continue; // Handled above.
+ PtrState &S = MI->second;
+ Sequence Seq = S.GetSeq();
+
+ // Check for possible releases.
+ if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
+ S.ClearKnownPositiveRefCount();
+ switch (Seq) {
+ case S_Retain:
+ S.SetSeq(S_CanRelease);
+ ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_CanRelease);
+ assert(S.RRI.ReverseInsertPts.empty());
+ S.RRI.ReverseInsertPts.insert(Inst);
+
+ // One call can't cause a transition from S_Retain to S_CanRelease
+ // and S_CanRelease to S_Use. If we've made the first transition,
+ // we're done.
+ continue;
+ case S_Use:
+ case S_CanRelease:
+ case S_None:
+ break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
+ }
+ }
+
+ // Check for possible direct uses.
+ switch (Seq) {
+ case S_CanRelease:
+ if (CanUse(Inst, Ptr, PA, Class)) {
+ S.SetSeq(S_Use);
+ ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_Use);
+ }
+ break;
+ case S_Retain:
+ case S_Use:
+ case S_None:
+ break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
+ }
+ }
+
+ return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitTopDown(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ DenseMap<Value *, RRInfo> &Releases) {
+ bool NestingDetected = false;
+ BBState &MyStates = BBStates[BB];
+
+ // Merge the states from each predecessor to compute the initial state
+ // for the current block.
+ BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end());
+ if (PI != PE) {
+ const BasicBlock *Pred = *PI;
+ DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
+ assert(I != BBStates.end());
+ MyStates.InitFromPred(I->second);
+ ++PI;
+ for (; PI != PE; ++PI) {
+ Pred = *PI;
+ I = BBStates.find(Pred);
+ assert(I != BBStates.end());
+ MyStates.MergePred(I->second);
+ }
+ }
+
+ // If ARC Annotations are enabled, output the current state of pointers at the
+ // top of the basic block.
+ ANNOTATE_TOPDOWN_BBSTART(MyStates, BB);
+
+ // Visit all the instructions, top-down.
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ Instruction *Inst = I;
+
+ DEBUG(dbgs() << "ObjCARCOpt::VisitTopDown: Visiting " << *Inst << "\n");
+
+ NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates);
+ }
+
+ // If ARC Annotations are enabled, output the current state of pointers at the
+ // bottom of the basic block.
+ ANNOTATE_TOPDOWN_BBEND(MyStates, BB);
+
+ CheckForCFGHazards(BB, BBStates, MyStates);
+ return NestingDetected;
+}
+
+static void
+ComputePostOrders(Function &F,
+ SmallVectorImpl<BasicBlock *> &PostOrder,
+ SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder,
+ unsigned NoObjCARCExceptionsMDKind,
+ DenseMap<const BasicBlock *, BBState> &BBStates) {
+ /// The visited set, for doing DFS walks.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+
+ // Do DFS, computing the PostOrder.
+ SmallPtrSet<BasicBlock *, 16> OnStack;
+ SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack;
+
+ // Functions always have exactly one entry block, and we don't have
+ // any other block that we treat like an entry block.
+ BasicBlock *EntryBB = &F.getEntryBlock();
+ BBState &MyStates = BBStates[EntryBB];
+ MyStates.SetAsEntry();
+ TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back());
+ SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
+ Visited.insert(EntryBB);
+ OnStack.insert(EntryBB);
+ do {
+ dfs_next_succ:
+ BasicBlock *CurrBB = SuccStack.back().first;
+ TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back());
+ succ_iterator SE(TI, false);
+
+ while (SuccStack.back().second != SE) {
+ BasicBlock *SuccBB = *SuccStack.back().second++;
+ if (Visited.insert(SuccBB)) {
+ TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back());
+ SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI)));
+ BBStates[CurrBB].addSucc(SuccBB);
+ BBState &SuccStates = BBStates[SuccBB];
+ SuccStates.addPred(CurrBB);
+ OnStack.insert(SuccBB);
+ goto dfs_next_succ;
+ }
+
+ if (!OnStack.count(SuccBB)) {
+ BBStates[CurrBB].addSucc(SuccBB);
+ BBStates[SuccBB].addPred(CurrBB);
+ }
+ }
+ OnStack.erase(CurrBB);
+ PostOrder.push_back(CurrBB);
+ SuccStack.pop_back();
+ } while (!SuccStack.empty());
+
+ Visited.clear();
+
+ // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
+ // Functions may have many exits, and there may also be blocks which we
+ // treat as exits due to ignored edges.
+ SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+ BasicBlock *ExitBB = I;
+ BBState &MyStates = BBStates[ExitBB];
+ if (!MyStates.isExit())
+ continue;
+
+ MyStates.SetAsExit();
+
+ PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin()));
+ Visited.insert(ExitBB);
+ while (!PredStack.empty()) {
+ reverse_dfs_next_succ:
+ BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
+ while (PredStack.back().second != PE) {
+ BasicBlock *BB = *PredStack.back().second++;
+ if (Visited.insert(BB)) {
+ PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
+ goto reverse_dfs_next_succ;
+ }
+ }
+ ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first);
+ }
+ }
+}
+
+// Visit the function both top-down and bottom-up.
+bool
+ObjCARCOpt::Visit(Function &F,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ MapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases) {
+
+ // Use reverse-postorder traversals, because we magically know that loops
+ // will be well behaved, i.e. they won't repeatedly call retain on a single
+ // pointer without doing a release. We can't use the ReversePostOrderTraversal
+ // class here because we want the reverse-CFG postorder to consider each
+ // function exit point, and we want to ignore selected cycle edges.
+ SmallVector<BasicBlock *, 16> PostOrder;
+ SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
+ ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
+ NoObjCARCExceptionsMDKind,
+ BBStates);
+
+ // Use reverse-postorder on the reverse CFG for bottom-up.
+ bool BottomUpNestingDetected = false;
+ for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
+ ReverseCFGPostOrder.rbegin(), E = ReverseCFGPostOrder.rend();
+ I != E; ++I)
+ BottomUpNestingDetected |= VisitBottomUp(*I, BBStates, Retains);
+
+ // Use reverse-postorder for top-down.
+ bool TopDownNestingDetected = false;
+ for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
+ PostOrder.rbegin(), E = PostOrder.rend();
+ I != E; ++I)
+ TopDownNestingDetected |= VisitTopDown(*I, BBStates, Releases);
+
+ return TopDownNestingDetected && BottomUpNestingDetected;
+}
+
+/// Move the calls in RetainsToMove and ReleasesToMove.
+void ObjCARCOpt::MoveCalls(Value *Arg,
+ RRInfo &RetainsToMove,
+ RRInfo &ReleasesToMove,
+ MapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases,
+ SmallVectorImpl<Instruction *> &DeadInsts,
+ Module *M) {
+ Type *ArgTy = Arg->getType();
+ Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
+
+ // Insert the new retain and release calls.
+ for (SmallPtrSet<Instruction *, 2>::const_iterator
+ PI = ReleasesToMove.ReverseInsertPts.begin(),
+ PE = ReleasesToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
+ Instruction *InsertPt = *PI;
+ Value *MyArg = ArgTy == ParamTy ? Arg :
+ new BitCastInst(Arg, ParamTy, "", InsertPt);
+ CallInst *Call =
+ CallInst::Create(getRetainCallee(M), MyArg, "", InsertPt);
+ Call->setDoesNotThrow();
+ Call->setTailCall();
+
+ DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Retain: " << *Call
+ << "\n"
+ " At insertion point: " << *InsertPt
+ << "\n");
+ }
+ for (SmallPtrSet<Instruction *, 2>::const_iterator
+ PI = RetainsToMove.ReverseInsertPts.begin(),
+ PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
+ Instruction *InsertPt = *PI;
+ Value *MyArg = ArgTy == ParamTy ? Arg :
+ new BitCastInst(Arg, ParamTy, "", InsertPt);
+ CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg,
+ "", InsertPt);
+ // Attach a clang.imprecise_release metadata tag, if appropriate.
+ if (MDNode *M = ReleasesToMove.ReleaseMetadata)
+ Call->setMetadata(ImpreciseReleaseMDKind, M);
+ Call->setDoesNotThrow();
+ if (ReleasesToMove.IsTailCallRelease)
+ Call->setTailCall();
+
+ DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Release: " << *Call
+ << "\n"
+ " At insertion point: " << *InsertPt
+ << "\n");
+ }
+
+ // Delete the original retain and release calls.
+ for (SmallPtrSet<Instruction *, 2>::const_iterator
+ AI = RetainsToMove.Calls.begin(),
+ AE = RetainsToMove.Calls.end(); AI != AE; ++AI) {
+ Instruction *OrigRetain = *AI;
+ Retains.blot(OrigRetain);
+ DeadInsts.push_back(OrigRetain);
+ DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting retain: " << *OrigRetain <<
+ "\n");
+ }
+ for (SmallPtrSet<Instruction *, 2>::const_iterator
+ AI = ReleasesToMove.Calls.begin(),
+ AE = ReleasesToMove.Calls.end(); AI != AE; ++AI) {
+ Instruction *OrigRelease = *AI;
+ Releases.erase(OrigRelease);
+ DeadInsts.push_back(OrigRelease);
+ DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting release: " << *OrigRelease
+ << "\n");
+ }
+}
+
+bool
+ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
+ &BBStates,
+ MapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases,
+ Module *M,
+ SmallVector<Instruction *, 4> &NewRetains,
+ SmallVector<Instruction *, 4> &NewReleases,
+ SmallVector<Instruction *, 8> &DeadInsts,
+ RRInfo &RetainsToMove,
+ RRInfo &ReleasesToMove,
+ Value *Arg,
+ bool KnownSafe,
+ bool &AnyPairsCompletelyEliminated) {
+ // If a pair happens in a region where it is known that the reference count
+ // is already incremented, we can similarly ignore possible decrements.
+ bool KnownSafeTD = true, KnownSafeBU = true;
+
+ // Connect the dots between the top-down-collected RetainsToMove and
+ // bottom-up-collected ReleasesToMove to form sets of related calls.
+ // This is an iterative process so that we connect multiple releases
+ // to multiple retains if needed.
+ unsigned OldDelta = 0;
+ unsigned NewDelta = 0;
+ unsigned OldCount = 0;
+ unsigned NewCount = 0;
+ bool FirstRelease = true;
+ for (;;) {
+ for (SmallVectorImpl<Instruction *>::const_iterator
+ NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) {
+ Instruction *NewRetain = *NI;
+ MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain);
+ assert(It != Retains.end());
+ const RRInfo &NewRetainRRI = It->second;
+ KnownSafeTD &= NewRetainRRI.KnownSafe;
+ for (SmallPtrSet<Instruction *, 2>::const_iterator
+ LI = NewRetainRRI.Calls.begin(),
+ LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) {
+ Instruction *NewRetainRelease = *LI;
+ DenseMap<Value *, RRInfo>::const_iterator Jt =
+ Releases.find(NewRetainRelease);
+ if (Jt == Releases.end())
+ return false;
+ const RRInfo &NewRetainReleaseRRI = Jt->second;
+ assert(NewRetainReleaseRRI.Calls.count(NewRetain));
+ if (ReleasesToMove.Calls.insert(NewRetainRelease)) {
+ OldDelta -=
+ BBStates[NewRetainRelease->getParent()].GetAllPathCount();
+
+ // Merge the ReleaseMetadata and IsTailCallRelease values.
+ if (FirstRelease) {
+ ReleasesToMove.ReleaseMetadata =
+ NewRetainReleaseRRI.ReleaseMetadata;
+ ReleasesToMove.IsTailCallRelease =
+ NewRetainReleaseRRI.IsTailCallRelease;
+ FirstRelease = false;
+ } else {
+ if (ReleasesToMove.ReleaseMetadata !=
+ NewRetainReleaseRRI.ReleaseMetadata)
+ ReleasesToMove.ReleaseMetadata = 0;
+ if (ReleasesToMove.IsTailCallRelease !=
+ NewRetainReleaseRRI.IsTailCallRelease)
+ ReleasesToMove.IsTailCallRelease = false;
+ }
+
+ // Collect the optimal insertion points.
+ if (!KnownSafe)
+ for (SmallPtrSet<Instruction *, 2>::const_iterator
+ RI = NewRetainReleaseRRI.ReverseInsertPts.begin(),
+ RE = NewRetainReleaseRRI.ReverseInsertPts.end();
+ RI != RE; ++RI) {
+ Instruction *RIP = *RI;
+ if (ReleasesToMove.ReverseInsertPts.insert(RIP))
+ NewDelta -= BBStates[RIP->getParent()].GetAllPathCount();
+ }
+ NewReleases.push_back(NewRetainRelease);
+ }
+ }
+ }
+ NewRetains.clear();
+ if (NewReleases.empty()) break;
+
+ // Back the other way.
+ for (SmallVectorImpl<Instruction *>::const_iterator
+ NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) {
+ Instruction *NewRelease = *NI;
+ DenseMap<Value *, RRInfo>::const_iterator It =
+ Releases.find(NewRelease);
+ assert(It != Releases.end());
+ const RRInfo &NewReleaseRRI = It->second;
+ KnownSafeBU &= NewReleaseRRI.KnownSafe;
+ for (SmallPtrSet<Instruction *, 2>::const_iterator
+ LI = NewReleaseRRI.Calls.begin(),
+ LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) {
+ Instruction *NewReleaseRetain = *LI;
+ MapVector<Value *, RRInfo>::const_iterator Jt =
+ Retains.find(NewReleaseRetain);
+ if (Jt == Retains.end())
+ return false;
+ const RRInfo &NewReleaseRetainRRI = Jt->second;
+ assert(NewReleaseRetainRRI.Calls.count(NewRelease));
+ if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
+ unsigned PathCount =
+ BBStates[NewReleaseRetain->getParent()].GetAllPathCount();
+ OldDelta += PathCount;
+ OldCount += PathCount;
+
+ // Collect the optimal insertion points.
+ if (!KnownSafe)
+ for (SmallPtrSet<Instruction *, 2>::const_iterator
+ RI = NewReleaseRetainRRI.ReverseInsertPts.begin(),
+ RE = NewReleaseRetainRRI.ReverseInsertPts.end();
+ RI != RE; ++RI) {
+ Instruction *RIP = *RI;
+ if (RetainsToMove.ReverseInsertPts.insert(RIP)) {
+ PathCount = BBStates[RIP->getParent()].GetAllPathCount();
+ NewDelta += PathCount;
+ NewCount += PathCount;
+ }
+ }
+ NewRetains.push_back(NewReleaseRetain);
+ }
+ }
+ }
+ NewReleases.clear();
+ if (NewRetains.empty()) break;
+ }
+
+ // If the pointer is known incremented or nested, we can safely delete the
+ // pair regardless of what's between them.
+ if (KnownSafeTD || KnownSafeBU) {
+ RetainsToMove.ReverseInsertPts.clear();
+ ReleasesToMove.ReverseInsertPts.clear();
+ NewCount = 0;
+ } else {
+ // Determine whether the new insertion points we computed preserve the
+ // balance of retain and release calls through the program.
+ // TODO: If the fully aggressive solution isn't valid, try to find a
+ // less aggressive solution which is.
+ if (NewDelta != 0)
+ return false;
+ }
+
+ // Determine whether the original call points are balanced in the retain and
+ // release calls through the program. If not, conservatively don't touch
+ // them.
+ // TODO: It's theoretically possible to do code motion in this case, as
+ // long as the existing imbalances are maintained.
+ if (OldDelta != 0)
+ return false;
+
+ Changed = true;
+ assert(OldCount != 0 && "Unreachable code?");
+ NumRRs += OldCount - NewCount;
+ // Set to true if we completely removed any RR pairs.
+ AnyPairsCompletelyEliminated = NewCount == 0;
+
+ // We can move calls!
+ return true;
+}
+
+/// Identify pairings between the retains and releases, and delete and/or move
+/// them.
+bool
+ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
+ &BBStates,
+ MapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases,
+ Module *M) {
+ bool AnyPairsCompletelyEliminated = false;
+ RRInfo RetainsToMove;
+ RRInfo ReleasesToMove;
+ SmallVector<Instruction *, 4> NewRetains;
+ SmallVector<Instruction *, 4> NewReleases;
+ SmallVector<Instruction *, 8> DeadInsts;
+
+ // Visit each retain.
+ for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
+ E = Retains.end(); I != E; ++I) {
+ Value *V = I->first;
+ if (!V) continue; // blotted
+
+ Instruction *Retain = cast<Instruction>(V);
+
+ DEBUG(dbgs() << "ObjCARCOpt::PerformCodePlacement: Visiting: " << *Retain
+ << "\n");
+
+ Value *Arg = GetObjCArg(Retain);
+
+ // If the object being released is in static or stack storage, we know it's
+ // not being managed by ObjC reference counting, so we can delete pairs
+ // regardless of what possible decrements or uses lie between them.
+ bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
+
+ // A constant pointer can't be pointing to an object on the heap. It may
+ // be reference-counted, but it won't be deleted.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(
+ StripPointerCastsAndObjCCalls(LI->getPointerOperand())))
+ if (GV->isConstant())
+ KnownSafe = true;
+
+ // Connect the dots between the top-down-collected RetainsToMove and
+ // bottom-up-collected ReleasesToMove to form sets of related calls.
+ NewRetains.push_back(Retain);
+ bool PerformMoveCalls =
+ ConnectTDBUTraversals(BBStates, Retains, Releases, M, NewRetains,
+ NewReleases, DeadInsts, RetainsToMove,
+ ReleasesToMove, Arg, KnownSafe,
+ AnyPairsCompletelyEliminated);
+
+#ifdef ARC_ANNOTATIONS
+ // Do not move calls if ARC annotations are requested. If we were to move
+ // calls in this case, the annotations would no longer line up with the
+ // code they describe.
+ PerformMoveCalls = PerformMoveCalls && !EnableARCAnnotations;
+#endif // ARC_ANNOTATIONS
+
+ if (PerformMoveCalls) {
+ // Ok, everything checks out and we're all set. Let's move/delete some
+ // code!
+ MoveCalls(Arg, RetainsToMove, ReleasesToMove,
+ Retains, Releases, DeadInsts, M);
+ }
+
+ // Clean up state for next retain.
+ NewReleases.clear();
+ NewRetains.clear();
+ RetainsToMove.clear();
+ ReleasesToMove.clear();
+ }
+
+ // Now that we're done moving everything, we can delete the newly dead
+ // instructions, as we no longer need them as insert points.
+ while (!DeadInsts.empty())
+ EraseInstruction(DeadInsts.pop_back_val());
+
+ return AnyPairsCompletelyEliminated;
+}
+
+/// Weak pointer optimizations.
+void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
+ // First, do memdep-style RLE and S2L optimizations. We can't use memdep
+ // itself because it uses AliasAnalysis and we need to do provenance
+ // queries instead.
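+ // A sketch of the redundant-load case (hypothetical names):
+ //   %a = call i8* @objc_loadWeak(i8** %w)
+ //   %b = call i8* @objc_loadWeak(i8** %w)   ; nothing in between clobbers %w
+ // Here %b is replaced by %a and the second call is erased.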
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Visiting: " << *Inst <<
+ "\n");
+
+ InstructionClass Class = GetBasicInstructionClass(Inst);
+ if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained)
+ continue;
+
+ // Delete objc_loadWeak calls with no users.
+ if (Class == IC_LoadWeak && Inst->use_empty()) {
+ Inst->eraseFromParent();
+ continue;
+ }
+
+ // TODO: For now, just look for an earlier available version of this value
+ // within the same block. Theoretically, we could do memdep-style non-local
+ // analysis too, but that would want caching. A better approach would be to
+ // use the technique that EarlyCSE uses.
+ inst_iterator Current = llvm::prior(I);
+ BasicBlock *CurrentBB = Current.getBasicBlockIterator();
+ for (BasicBlock::iterator B = CurrentBB->begin(),
+ J = Current.getInstructionIterator();
+ J != B; --J) {
+ Instruction *EarlierInst = &*llvm::prior(J);
+ InstructionClass EarlierClass = GetInstructionClass(EarlierInst);
+ switch (EarlierClass) {
+ case IC_LoadWeak:
+ case IC_LoadWeakRetained: {
+ // If this is loading from the same pointer, replace this load's value
+ // with that one.
+ CallInst *Call = cast<CallInst>(Inst);
+ CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+ Value *Arg = Call->getArgOperand(0);
+ Value *EarlierArg = EarlierCall->getArgOperand(0);
+ switch (PA.getAA()->alias(Arg, EarlierArg)) {
+ case AliasAnalysis::MustAlias:
+ Changed = true;
+ // If the load has a builtin retain, insert a plain retain for it.
+ if (Class == IC_LoadWeakRetained) {
+ CallInst *CI =
+ CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
+ "", Call);
+ CI->setTailCall();
+ }
+ // Zap the fully redundant load.
+ Call->replaceAllUsesWith(EarlierCall);
+ Call->eraseFromParent();
+ goto clobbered;
+ case AliasAnalysis::MayAlias:
+ case AliasAnalysis::PartialAlias:
+ goto clobbered;
+ case AliasAnalysis::NoAlias:
+ break;
+ }
+ break;
+ }
+ case IC_StoreWeak:
+ case IC_InitWeak: {
+ // If this is storing to the same pointer and has the same size etc.
+ // replace this load's value with the stored value.
+ CallInst *Call = cast<CallInst>(Inst);
+ CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+ Value *Arg = Call->getArgOperand(0);
+ Value *EarlierArg = EarlierCall->getArgOperand(0);
+ switch (PA.getAA()->alias(Arg, EarlierArg)) {
+ case AliasAnalysis::MustAlias:
+ Changed = true;
+ // If the load has a builtin retain, insert a plain retain for it.
+ if (Class == IC_LoadWeakRetained) {
+ CallInst *CI =
+ CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
+ "", Call);
+ CI->setTailCall();
+ }
+ // Zap the fully redundant load.
+ Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
+ Call->eraseFromParent();
+ goto clobbered;
+ case AliasAnalysis::MayAlias:
+ case AliasAnalysis::PartialAlias:
+ goto clobbered;
+ case AliasAnalysis::NoAlias:
+ break;
+ }
+ break;
+ }
+ case IC_MoveWeak:
+ case IC_CopyWeak:
+ // TODO: Grab the copied value.
+ goto clobbered;
+ case IC_AutoreleasepoolPush:
+ case IC_None:
+ case IC_IntrinsicUser:
+ case IC_User:
+ // Weak pointers are only modified through the weak entry points
+ // (and arbitrary calls, which could call the weak entry points).
+ break;
+ default:
+ // Anything else could modify the weak pointer.
+ goto clobbered;
+ }
+ }
+ clobbered:;
+ }
+
+ // Then, for each destroyWeak with an alloca operand, check to see if
+ // the alloca and all its users can be zapped.
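+ // Sketch (hypothetical names): given
+ //   %w = alloca i8*
+ //   %0 = call i8* @objc_initWeak(i8** %w, i8* %p)
+ //   call void @objc_destroyWeak(i8** %w)
+ // an alloca used only by weak entry points can be deleted outright, with
+ // objc_initWeak/objc_storeWeak uses forwarded to their second argument.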
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+ InstructionClass Class = GetBasicInstructionClass(Inst);
+ if (Class != IC_DestroyWeak)
+ continue;
+
+ CallInst *Call = cast<CallInst>(Inst);
+ Value *Arg = Call->getArgOperand(0);
+ if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
+ for (Value::use_iterator UI = Alloca->use_begin(),
+ UE = Alloca->use_end(); UI != UE; ++UI) {
+ const Instruction *UserInst = cast<Instruction>(*UI);
+ switch (GetBasicInstructionClass(UserInst)) {
+ case IC_InitWeak:
+ case IC_StoreWeak:
+ case IC_DestroyWeak:
+ continue;
+ default:
+ goto done;
+ }
+ }
+ Changed = true;
+ for (Value::use_iterator UI = Alloca->use_begin(),
+ UE = Alloca->use_end(); UI != UE; ) {
+ CallInst *UserInst = cast<CallInst>(*UI++);
+ switch (GetBasicInstructionClass(UserInst)) {
+ case IC_InitWeak:
+ case IC_StoreWeak:
+ // These functions return their second argument.
+ UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
+ break;
+ case IC_DestroyWeak:
+ // No return value.
+ break;
+ default:
+ llvm_unreachable("alloca really is used!");
+ }
+ UserInst->eraseFromParent();
+ }
+ Alloca->eraseFromParent();
+ done:;
+ }
+ }
+
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Finished List.\n\n");
+
+}
+
+/// Identify program paths which execute sequences of retains and releases which
+/// can be eliminated.
+bool ObjCARCOpt::OptimizeSequences(Function &F) {
+ /// Releases, Retains - These are used to store the results of the main flow
+ /// analysis. These use Value* as the key instead of Instruction* so that the
+ /// map stays valid when we get around to rewriting code and calls get
+ /// replaced by arguments.
+ DenseMap<Value *, RRInfo> Releases;
+ MapVector<Value *, RRInfo> Retains;
+
+ /// This is used during the traversal of the function to track the
+ /// states for each identified object at each block.
+ DenseMap<const BasicBlock *, BBState> BBStates;
+
+ // Analyze the CFG of the function, and all instructions.
+ bool NestingDetected = Visit(F, BBStates, Retains, Releases);
+
+ // Transform.
+ return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) &&
+ NestingDetected;
+}
+
+/// Check if there is a dependent call earlier that does not have anything in
+/// between the Retain and the call that can affect the reference count of their
+/// shared pointer argument. Note that Retain need not be in BB.
+static bool
+HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain,
+ SmallPtrSet<Instruction *, 4> &DepInsts,
+ SmallPtrSet<const BasicBlock *, 4> &Visited,
+ ProvenanceAnalysis &PA) {
+ FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain,
+ DepInsts, Visited, PA);
+ if (DepInsts.size() != 1)
+ return false;
+
+ CallInst *Call =
+ dyn_cast_or_null<CallInst>(*DepInsts.begin());
+
+ // Check that the pointer is the return value of the call.
+ if (!Call || Arg != Call)
+ return false;
+
+ // Check that the call is a regular call.
+ InstructionClass Class = GetBasicInstructionClass(Call);
+ if (Class != IC_CallOrUser && Class != IC_Call)
+ return false;
+
+ return true;
+}
+
+/// Find a dependent retain that precedes the given autorelease for which there
+/// is nothing in between the two instructions that can affect the ref count of
+/// Arg.
+static CallInst *
+FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB,
+ Instruction *Autorelease,
+ SmallPtrSet<Instruction *, 4> &DepInsts,
+ SmallPtrSet<const BasicBlock *, 4> &Visited,
+ ProvenanceAnalysis &PA) {
+ FindDependencies(CanChangeRetainCount, Arg,
+ BB, Autorelease, DepInsts, Visited, PA);
+ if (DepInsts.size() != 1)
+ return 0;
+
+ CallInst *Retain =
+ dyn_cast_or_null<CallInst>(*DepInsts.begin());
+
+ // Check that we found a retain with the same argument.
+ if (!Retain ||
+ !IsRetain(GetBasicInstructionClass(Retain)) ||
+ GetObjCArg(Retain) != Arg) {
+ return 0;
+ }
+
+ return Retain;
+}
+
+/// Look for an ``autorelease'' instruction dependent on Arg such that there are
+/// no instructions dependent on Arg that need a positive ref count in between
+/// the autorelease and the ret.
+static CallInst *
+FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB,
+ ReturnInst *Ret,
+ SmallPtrSet<Instruction *, 4> &DepInsts,
+ SmallPtrSet<const BasicBlock *, 4> &V,
+ ProvenanceAnalysis &PA) {
+ FindDependencies(NeedsPositiveRetainCount, Arg,
+ BB, Ret, DepInsts, V, PA);
+ if (DepInsts.size() != 1)
+ return 0;
+
+ CallInst *Autorelease =
+ dyn_cast_or_null<CallInst>(*DepInsts.begin());
+ if (!Autorelease)
+ return 0;
+ InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease);
+ if (!IsAutorelease(AutoreleaseClass))
+ return 0;
+ if (GetObjCArg(Autorelease) != Arg)
+ return 0;
+
+ return Autorelease;
+}
+
+/// Look for this pattern:
+/// \code
+/// %call = call i8* @something(...)
+/// %2 = call i8* @objc_retain(i8* %call)
+/// %3 = call i8* @objc_autorelease(i8* %2)
+/// ret i8* %3
+/// \endcode
+/// And delete the retain and autorelease.
+void ObjCARCOpt::OptimizeReturns(Function &F) {
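+ // Only functions that return a pointer can be returning an ObjC object.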
+ if (!F.getReturnType()->isPointerTy())
+ return;
+
+ SmallPtrSet<Instruction *, 4> DependingInstructions;
+ SmallPtrSet<const BasicBlock *, 4> Visited;
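+ // These are scratch sets reused by the dependence queries below; they are
+ // cleared after each query.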
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+ BasicBlock *BB = FI;
+ ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back());
+ if (!Ret)
+ continue;
+
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Visiting: " << *Ret << "\n");
+
+
+ const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0));
+
+ // Look for an ``autorelease'' instruction that is a predecessor of Ret and
+ // dependent on Arg such that there are no instructions dependent on Arg
+ // that need a positive ref count in between the autorelease and Ret.
+ CallInst *Autorelease =
+ FindPredecessorAutoreleaseWithSafePath(Arg, BB, Ret,
+ DependingInstructions, Visited,
+ PA);
+ if (Autorelease) {
+ DependingInstructions.clear();
+ Visited.clear();
+
+ CallInst *Retain =
+ FindPredecessorRetainWithSafePath(Arg, BB, Autorelease,
+ DependingInstructions, Visited, PA);
+ if (Retain) {
+ DependingInstructions.clear();
+ Visited.clear();
+
+ // Check that there is nothing that can affect the reference count
+ // between the retain and the call. Note that Retain need not be in BB.
+ if (HasSafePathToPredecessorCall(Arg, Retain, DependingInstructions,
+ Visited, PA)) {
+ // If so, we can zap the retain and autorelease.
+ Changed = true;
+ ++NumRets;
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Erasing: " << *Retain
+ << "\n Erasing: "
+ << *Autorelease << "\n");
+ EraseInstruction(Retain);
+ EraseInstruction(Autorelease);
+ }
+ }
+ }
+
+ DependingInstructions.clear();
+ Visited.clear();
+ }
+
+ DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Finished List.\n\n");
+
+}
+
+bool ObjCARCOpt::doInitialization(Module &M) {
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ Run = ModuleHasARC(M);
+ if (!Run)
+ return false;
+
+ // Identify the imprecise release metadata kind.
+ ImpreciseReleaseMDKind =
+ M.getContext().getMDKindID("clang.imprecise_release");
+ CopyOnEscapeMDKind =
+ M.getContext().getMDKindID("clang.arc.copy_on_escape");
+ NoObjCARCExceptionsMDKind =
+ M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions");
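+ // getMDKindID returns a small integer ID; caching the IDs here avoids
+ // repeated string lookups while the pass runs over each function.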
+#ifdef ARC_ANNOTATIONS
+ ARCAnnotationBottomUpMDKind =
+ M.getContext().getMDKindID("llvm.arc.annotation.bottomup");
+ ARCAnnotationTopDownMDKind =
+ M.getContext().getMDKindID("llvm.arc.annotation.topdown");
+ ARCAnnotationProvenanceSourceMDKind =
+ M.getContext().getMDKindID("llvm.arc.annotation.provenancesource");
+#endif // ARC_ANNOTATIONS
+
+ // Intuitively, objc_retain and the other entry points would be nocapture;
+ // in practice they are not, because they return their argument value. And
+ // objc_release calls finalizers, which can have arbitrary side effects.
+
+ // These are initialized lazily.
+ RetainRVCallee = 0;
+ AutoreleaseRVCallee = 0;
+ ReleaseCallee = 0;
+ RetainCallee = 0;
+ RetainBlockCallee = 0;
+ AutoreleaseCallee = 0;
+
+ return false;
+}
+
+bool ObjCARCOpt::runOnFunction(Function &F) {
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!Run)
+ return false;
+
+ Changed = false;
+
+ DEBUG(dbgs() << "ObjCARCOpt: Visiting Function: " << F.getName() << "\n");
+
+ PA.setAA(&getAnalysis<AliasAnalysis>());
+
+ // This pass performs several distinct transformations. As a compile-time aid
+ // when compiling code that isn't ObjC, skip these if the relevant ObjC
+ // library functions aren't declared.
+
+ // Preliminary optimizations. This also computes UsedInThisFunction.
+ OptimizeIndividualCalls(F);
+
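+ // UsedInThisFunction is a bitmask with one bit per InstructionClass; the
+ // checks below skip whole phases whose calls never appear in this function.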
+ // Optimizations for weak pointers.
+ if (UsedInThisFunction & ((1 << IC_LoadWeak) |
+ (1 << IC_LoadWeakRetained) |
+ (1 << IC_StoreWeak) |
+ (1 << IC_InitWeak) |
+ (1 << IC_CopyWeak) |
+ (1 << IC_MoveWeak) |
+ (1 << IC_DestroyWeak)))
+ OptimizeWeakCalls(F);
+
+ // Optimizations for retain+release pairs.
+ if (UsedInThisFunction & ((1 << IC_Retain) |
+ (1 << IC_RetainRV) |
+ (1 << IC_RetainBlock)))
+ if (UsedInThisFunction & (1 << IC_Release))
+ // Run OptimizeSequences until it either stops making changes or
+ // no retain+release pair nesting is detected.
+ while (OptimizeSequences(F)) {}
+
+ // Optimizations if objc_autorelease is used.
+ if (UsedInThisFunction & ((1 << IC_Autorelease) |
+ (1 << IC_AutoreleaseRV)))
+ OptimizeReturns(F);
+
+ DEBUG(dbgs() << "\n");
+
+ return Changed;
+}
+
+void ObjCARCOpt::releaseMemory() {
+ PA.clear();
+}
+
+/// @}
+///
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
new file mode 100644
index 0000000..03e12d4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
@@ -0,0 +1,252 @@
+//===- ObjCARCUtil.cpp - ObjC ARC Optimization --------*- mode: c++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines several utility functions used by various ARC
+/// optimizations that are too large to live comfortably in a header file.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm/IR/Intrinsics.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS,
+ const InstructionClass Class) {
+ switch (Class) {
+ case IC_Retain:
+ return OS << "IC_Retain";
+ case IC_RetainRV:
+ return OS << "IC_RetainRV";
+ case IC_RetainBlock:
+ return OS << "IC_RetainBlock";
+ case IC_Release:
+ return OS << "IC_Release";
+ case IC_Autorelease:
+ return OS << "IC_Autorelease";
+ case IC_AutoreleaseRV:
+ return OS << "IC_AutoreleaseRV";
+ case IC_AutoreleasepoolPush:
+ return OS << "IC_AutoreleasepoolPush";
+ case IC_AutoreleasepoolPop:
+ return OS << "IC_AutoreleasepoolPop";
+ case IC_NoopCast:
+ return OS << "IC_NoopCast";
+ case IC_FusedRetainAutorelease:
+ return OS << "IC_FusedRetainAutorelease";
+ case IC_FusedRetainAutoreleaseRV:
+ return OS << "IC_FusedRetainAutoreleaseRV";
+ case IC_LoadWeakRetained:
+ return OS << "IC_LoadWeakRetained";
+ case IC_StoreWeak:
+ return OS << "IC_StoreWeak";
+ case IC_InitWeak:
+ return OS << "IC_InitWeak";
+ case IC_LoadWeak:
+ return OS << "IC_LoadWeak";
+ case IC_MoveWeak:
+ return OS << "IC_MoveWeak";
+ case IC_CopyWeak:
+ return OS << "IC_CopyWeak";
+ case IC_DestroyWeak:
+ return OS << "IC_DestroyWeak";
+ case IC_StoreStrong:
+ return OS << "IC_StoreStrong";
+ case IC_CallOrUser:
+ return OS << "IC_CallOrUser";
+ case IC_Call:
+ return OS << "IC_Call";
+ case IC_User:
+ return OS << "IC_User";
+ case IC_IntrinsicUser:
+ return OS << "IC_IntrinsicUser";
+ case IC_None:
+ return OS << "IC_None";
+ }
+ llvm_unreachable("Unknown instruction class!");
+}
+
+InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) {
+ Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+
+ // No (mandatory) arguments.
+ if (AI == AE)
+ return StringSwitch<InstructionClass>(F->getName())
+ .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush)
+ .Case("clang.arc.use", IC_IntrinsicUser)
+ .Default(IC_CallOrUser);
+
+ // One argument.
+ const Argument *A0 = AI++;
+ if (AI == AE)
+ // Argument is a pointer.
+ if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) {
+ Type *ETy = PTy->getElementType();
+ // Argument is i8*.
+ if (ETy->isIntegerTy(8))
+ return StringSwitch<InstructionClass>(F->getName())
+ .Case("objc_retain", IC_Retain)
+ .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV)
+ .Case("objc_retainBlock", IC_RetainBlock)
+ .Case("objc_release", IC_Release)
+ .Case("objc_autorelease", IC_Autorelease)
+ .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV)
+ .Case("objc_autoreleasePoolPop", IC_AutoreleasepoolPop)
+ .Case("objc_retainedObject", IC_NoopCast)
+ .Case("objc_unretainedObject", IC_NoopCast)
+ .Case("objc_unretainedPointer", IC_NoopCast)
+ .Case("objc_retain_autorelease", IC_FusedRetainAutorelease)
+ .Case("objc_retainAutorelease", IC_FusedRetainAutorelease)
+ .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV)
+ .Default(IC_CallOrUser);
+
+ // Argument is i8**
+ if (PointerType *Pte = dyn_cast<PointerType>(ETy))
+ if (Pte->getElementType()->isIntegerTy(8))
+ return StringSwitch<InstructionClass>(F->getName())
+ .Case("objc_loadWeakRetained", IC_LoadWeakRetained)
+ .Case("objc_loadWeak", IC_LoadWeak)
+ .Case("objc_destroyWeak", IC_DestroyWeak)
+ .Default(IC_CallOrUser);
+ }
+
+ // Two arguments, first is i8**.
+ const Argument *A1 = AI++;
+ if (AI == AE)
+ if (PointerType *PTy = dyn_cast<PointerType>(A0->getType()))
+ if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType()))
+ if (Pte->getElementType()->isIntegerTy(8))
+ if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) {
+ Type *ETy1 = PTy1->getElementType();
+ // Second argument is i8*
+ if (ETy1->isIntegerTy(8))
+ return StringSwitch<InstructionClass>(F->getName())
+ .Case("objc_storeWeak", IC_StoreWeak)
+ .Case("objc_initWeak", IC_InitWeak)
+ .Case("objc_storeStrong", IC_StoreStrong)
+ .Default(IC_CallOrUser);
+ // Second argument is i8**.
+ if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1))
+ if (Pte1->getElementType()->isIntegerTy(8))
+ return StringSwitch<InstructionClass>(F->getName())
+ .Case("objc_moveWeak", IC_MoveWeak)
+ .Case("objc_copyWeak", IC_CopyWeak)
+ // Ignore annotation calls. This is important to stop the
+ // optimizer from treating annotations as uses which would
+ // make the state of the pointers they are attempting to
+ // elucidate to be incorrect.
+ .Case("llvm.arc.annotation.topdown.bbstart", IC_None)
+ .Case("llvm.arc.annotation.topdown.bbend", IC_None)
+ .Case("llvm.arc.annotation.bottomup.bbstart", IC_None)
+ .Case("llvm.arc.annotation.bottomup.bbend", IC_None)
+ .Default(IC_CallOrUser);
+ }
+
+ // Anything else.
+ return IC_CallOrUser;
+}
+
+/// \brief Determine what kind of construct V is.
+InstructionClass
+llvm::objcarc::GetInstructionClass(const Value *V) {
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ // Any instruction other than a bitcast or gep with a pointer operand has a
+ // use of an objc pointer. Bitcasts, GEPs, Selects, and PHIs transfer a
+ // pointer to a subsequent use, rather than using it themselves, in this
+ // sense.
+ // As a short cut, several other opcodes are known to have no pointer
+ // operands of interest. And ret is never followed by a release, so it's
+ // not interesting to examine.
+ switch (I->getOpcode()) {
+ case Instruction::Call: {
+ const CallInst *CI = cast<CallInst>(I);
+ // Check for calls to special functions.
+ if (const Function *F = CI->getCalledFunction()) {
+ InstructionClass Class = GetFunctionClass(F);
+ if (Class != IC_CallOrUser)
+ return Class;
+
+ // None of the intrinsic functions do objc_release. For intrinsics, the
+ // only question is whether or not they may be users.
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::returnaddress: case Intrinsic::frameaddress:
+ case Intrinsic::stacksave: case Intrinsic::stackrestore:
+ case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend:
+ case Intrinsic::objectsize: case Intrinsic::prefetch:
+ case Intrinsic::stackprotector:
+ case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64:
+ case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa:
+ case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext:
+ case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline:
+ case Intrinsic::lifetime_start: case Intrinsic::lifetime_end:
+ case Intrinsic::invariant_start: case Intrinsic::invariant_end:
+ // Don't let dbg info affect our results.
+ case Intrinsic::dbg_declare: case Intrinsic::dbg_value:
+ // Short cut: Some intrinsics obviously don't use ObjC pointers.
+ return IC_None;
+ default:
+ break;
+ }
+ }
+ return GetCallSiteClass(CI);
+ }
+ case Instruction::Invoke:
+ return GetCallSiteClass(cast<InvokeInst>(I));
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::Select: case Instruction::PHI:
+ case Instruction::Ret: case Instruction::Br:
+ case Instruction::Switch: case Instruction::IndirectBr:
+ case Instruction::Alloca: case Instruction::VAArg:
+ case Instruction::Add: case Instruction::FAdd:
+ case Instruction::Sub: case Instruction::FSub:
+ case Instruction::Mul: case Instruction::FMul:
+ case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv:
+ case Instruction::SRem: case Instruction::URem: case Instruction::FRem:
+ case Instruction::Shl: case Instruction::LShr: case Instruction::AShr:
+ case Instruction::And: case Instruction::Or: case Instruction::Xor:
+ case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc:
+ case Instruction::IntToPtr: case Instruction::FCmp:
+ case Instruction::FPTrunc: case Instruction::FPExt:
+ case Instruction::FPToUI: case Instruction::FPToSI:
+ case Instruction::UIToFP: case Instruction::SIToFP:
+ case Instruction::InsertElement: case Instruction::ExtractElement:
+ case Instruction::ShuffleVector:
+ case Instruction::ExtractValue:
+ break;
+ case Instruction::ICmp:
+ // Comparing a pointer with null, or any other constant, isn't an
+ // interesting use, because we don't care what the pointer points to, or
+ // about the values of any other dynamic reference-counted pointers.
+ if (IsPotentialRetainableObjPtr(I->getOperand(1)))
+ return IC_User;
+ break;
+ default:
+ // For anything else, check all the operands.
+ // Note that this includes both operands of a Store: while the first
+ // operand isn't actually being dereferenced, it is being stored to
+ // memory where we can no longer track who might read it and dereference
+ // it, so we have to consider it potentially used.
+ for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end();
+ OI != OE; ++OI)
+ if (IsPotentialRetainableObjPtr(*OI))
+ return IC_User;
+ }
+ }
+
+ // Otherwise, it's totally inert for ARC purposes.
+ return IC_None;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
new file mode 100644
index 0000000..ae3c628
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
@@ -0,0 +1,177 @@
+//===- ProvenanceAnalysis.cpp - ObjC ARC Optimization ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if locally
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+bool ProvenanceAnalysis::relatedSelect(const SelectInst *A,
+ const Value *B) {
+ // If the values are Selects with the same condition, we can do a more precise
+ // check: just check for relations between the values on corresponding arms.
+ if (const SelectInst *SB = dyn_cast<SelectInst>(B))
+ if (A->getCondition() == SB->getCondition())
+ return related(A->getTrueValue(), SB->getTrueValue()) ||
+ related(A->getFalseValue(), SB->getFalseValue());
+
+ // Check both arms of the Select node individually.
+ return related(A->getTrueValue(), B) ||
+ related(A->getFalseValue(), B);
+}
+
+bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
+ const Value *B) {
+ // If the values are PHIs in the same block, we can do a more precise as well
+ // as efficient check: just check for relations between the values on
+ // corresponding edges.
+ if (const PHINode *PNB = dyn_cast<PHINode>(B))
+ if (PNB->getParent() == A->getParent()) {
+ for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
+ if (related(A->getIncomingValue(i),
+ PNB->getIncomingValueForBlock(A->getIncomingBlock(i))))
+ return true;
+ return false;
+ }
+
+ // Check each unique source of the PHI node against B.
+ SmallPtrSet<const Value *, 4> UniqueSrc;
+ for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) {
+ const Value *PV1 = A->getIncomingValue(i);
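+ // insert() returns true only the first time a value is seen, so each
+ // distinct incoming value is compared against B at most once.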
+ if (UniqueSrc.insert(PV1) && related(PV1, B))
+ return true;
+ }
+
+ // All of the arms checked out.
+ return false;
+}
+
+/// Test if the value of P, or any value covered by its provenance, is ever
+/// stored within the function (not counting callees).
+static bool IsStoredObjCPointer(const Value *P) {
+ SmallPtrSet<const Value *, 8> Visited;
+ SmallVector<const Value *, 8> Worklist;
+ Worklist.push_back(P);
+ Visited.insert(P);
+ do {
+ P = Worklist.pop_back_val();
+ for (Value::const_use_iterator UI = P->use_begin(), UE = P->use_end();
+ UI != UE; ++UI) {
+ const User *Ur = *UI;
+ if (isa<StoreInst>(Ur)) {
+ if (UI.getOperandNo() == 0)
+ // The pointer is stored.
+ return true;
+ // The pointer is being stored through.
+ continue;
+ }
+ if (isa<CallInst>(Ur))
+ // The pointer is passed as an argument, ignore this.
+ continue;
+ if (isa<PtrToIntInst>(P))
+ // Assume the worst.
+ return true;
+ if (Visited.insert(Ur))
+ Worklist.push_back(Ur);
+ }
+ } while (!Worklist.empty());
+
+ // Everything checked out.
+ return false;
+}
+
+bool ProvenanceAnalysis::relatedCheck(const Value *A,
+ const Value *B) {
+ // Skip past provenance pass-throughs.
+ A = GetUnderlyingObjCPtr(A);
+ B = GetUnderlyingObjCPtr(B);
+
+ // Quick check.
+ if (A == B)
+ return true;
+
+ // Ask regular AliasAnalysis, for a first approximation.
+ switch (AA->alias(A, B)) {
+ case AliasAnalysis::NoAlias:
+ return false;
+ case AliasAnalysis::MustAlias:
+ case AliasAnalysis::PartialAlias:
+ return true;
+ case AliasAnalysis::MayAlias:
+ break;
+ }
+
+ bool AIsIdentified = IsObjCIdentifiedObject(A);
+ bool BIsIdentified = IsObjCIdentifiedObject(B);
+
+ // An ObjC-Identified object can't alias a load if it is never locally stored.
+ if (AIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(B))
+ return IsStoredObjCPointer(A);
+ if (BIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
+ return IsStoredObjCPointer(B);
+ // Both pointers are identified and escapes aren't an evident problem.
+ return false;
+ }
+ } else if (BIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
+ return IsStoredObjCPointer(B);
+ }
+
+ // Special handling for PHI and Select.
+ if (const PHINode *PN = dyn_cast<PHINode>(A))
+ return relatedPHI(PN, B);
+ if (const PHINode *PN = dyn_cast<PHINode>(B))
+ return relatedPHI(PN, A);
+ if (const SelectInst *S = dyn_cast<SelectInst>(A))
+ return relatedSelect(S, B);
+ if (const SelectInst *S = dyn_cast<SelectInst>(B))
+ return relatedSelect(S, A);
+
+ // Conservative.
+ return true;
+}
+
+bool ProvenanceAnalysis::related(const Value *A,
+ const Value *B) {
+ // Begin by inserting a conservative value into the map. If the insertion
+ // fails, we have the answer already. If it succeeds, leave it there until we
+ // compute the real answer to guard against recursive queries.
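+ // Canonicalize the argument order so that (A, B) and (B, A) queries share a
+ // single cache entry.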
+ if (A > B) std::swap(A, B);
+ std::pair<CachedResultsTy::iterator, bool> Pair =
+ CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
+ if (!Pair.second)
+ return Pair.first->second;
+
+ bool Result = relatedCheck(A, B);
+ CachedResults[ValuePairTy(A, B)] = Result;
+ return Result;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
new file mode 100644
index 0000000..ec449fd8e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -0,0 +1,80 @@
+//===- ProvenanceAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if locally
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+#define LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+ class Value;
+ class AliasAnalysis;
+ class PHINode;
+ class SelectInst;
+}
+
+namespace llvm {
+namespace objcarc {
+
+/// \brief This is similar to BasicAliasAnalysis, and it uses many of the same
+/// techniques, except it uses special ObjC-specific reasoning about pointer
+/// relationships.
+///
+/// In this context ``Provenance'' is defined as the history of an object's
+/// ownership. Thus ``Provenance Analysis'' is defined by using the notion of
+/// an ``independent provenance source'' of a pointer to determine whether or
+/// not two pointers have the same provenance source and thus could
+/// potentially be related.
+class ProvenanceAnalysis {
+ AliasAnalysis *AA;
+
+ typedef std::pair<const Value *, const Value *> ValuePairTy;
+ typedef DenseMap<ValuePairTy, bool> CachedResultsTy;
+ CachedResultsTy CachedResults;
+
+ bool relatedCheck(const Value *A, const Value *B);
+ bool relatedSelect(const SelectInst *A, const Value *B);
+ bool relatedPHI(const PHINode *A, const Value *B);
+
+ void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
+ ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
+
+public:
+ ProvenanceAnalysis() {}
+
+ void setAA(AliasAnalysis *aa) { AA = aa; }
+
+ AliasAnalysis *getAA() const { return AA; }
+
+ bool related(const Value *A, const Value *B);
+
+ void clear() {
+ CachedResults.clear();
+ }
+};
+
+} // end namespace objcarc
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
index b344952..a097308 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -16,16 +16,16 @@
#define DEBUG_TYPE "adce"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/BasicBlock.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/InstIterator.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/InstIterator.h"
using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed");
diff --git a/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp
index cee5502..e755008 100644
--- a/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp
@@ -27,12 +27,12 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "block-placement"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Function.h"
+#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Support/CFG.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Transforms/Scalar.h"
#include <set>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 123ed0f..015fd2e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -15,22 +15,23 @@
#define DEBUG_TYPE "codegenprepare"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/InlineAsm.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Pass.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -38,10 +39,8 @@
#include "llvm/Support/PatternMatch.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Target/TargetLowering.h"
-#include "llvm/Transforms/Utils/AddrModeMatcher.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
@@ -106,6 +105,8 @@ namespace {
}
bool runOnFunction(Function &F);
+ const char *getPassName() const { return "CodeGen Prepare"; }
+
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<DominatorTree>();
AU.addPreserved<ProfileInfo>();
@@ -125,7 +126,7 @@ namespace {
bool MoveExtToFormExtLoad(Instruction *I);
bool OptimizeExtUses(Instruction *I);
bool OptimizeSelectInst(SelectInst *SI);
- bool DupRetToEnableTailCallOpts(ReturnInst *RI);
+ bool DupRetToEnableTailCallOpts(BasicBlock *BB);
bool PlaceDbgValues(Function &F);
};
}
@@ -148,11 +149,12 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
TLInfo = &getAnalysis<TargetLibraryInfo>();
DT = getAnalysisIfAvailable<DominatorTree>();
PFI = getAnalysisIfAvailable<ProfileInfo>();
- OptSize = F.getFnAttributes().hasAttribute(Attributes::OptimizeForSize);
+ OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize);
/// This optimization identifies DIV instructions that can be
/// profitably bypassed and carried out with a shorter, faster divide.
- if (TLI && TLI->isSlowDivBypassed()) {
+ if (!OptSize && TLI && TLI->isSlowDivBypassed()) {
const DenseMap<unsigned int, unsigned int> &BypassWidths =
TLI->getBypassSlowDivWidths();
for (Function::iterator I = F.begin(); I != F.end(); I++)
@@ -194,9 +196,20 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
WorkList.insert(*II);
}
- for (SmallPtrSet<BasicBlock*, 8>::iterator
- I = WorkList.begin(), E = WorkList.end(); I != E; ++I)
- DeleteDeadBlock(*I);
+ // Delete the dead blocks and any of their dead successors.
+ MadeChange |= !WorkList.empty();
+ while (!WorkList.empty()) {
+ BasicBlock *BB = *WorkList.begin();
+ WorkList.erase(BB);
+ SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
+
+ DeleteDeadBlock(BB);
+
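+ // A successor that just lost its last predecessor is dead as well; queue it
+ // so it is removed on a later iteration.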
+ for (SmallVectorImpl<BasicBlock*>::iterator
+ II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
+ if (pred_begin(*II) == pred_end(*II))
+ WorkList.insert(*II);
+ }
// Merge pairs of basic blocks with unconditional branches, connected by
// a single edge.
@@ -689,10 +702,14 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
/// %tmp2 = tail call i32 @f2()
/// ret i32 %tmp2
/// @endcode
-bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) {
+bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) {
if (!TLI)
return false;
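+ // Only blocks that end in a plain return are candidates for this transform.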
+ ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
+ if (!RI)
+ return false;
+
PHINode *PN = 0;
BitCastInst *BCI = 0;
Value *V = RI->getReturnValue();
@@ -706,16 +723,15 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) {
return false;
}
- BasicBlock *BB = RI->getParent();
if (PN && PN->getParent() != BB)
return false;
// It's not safe to eliminate the sign / zero extension of the return value.
// See llvm::isInTailCallPosition().
const Function *F = BB->getParent();
- Attributes CallerRetAttr = F->getAttributes().getRetAttributes();
- if (CallerRetAttr.hasAttribute(Attributes::ZExt) ||
- CallerRetAttr.hasAttribute(Attributes::SExt))
+ AttributeSet CallerAttrs = F->getAttributes();
+ if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
+ CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
return false;
// Make sure there are no instructions between the PHI and return, or that the
@@ -772,11 +788,11 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) {
// Conservatively require the attributes of the call to match those of the
// return. Ignore noalias because it doesn't affect the call sequence.
- Attributes CalleeRetAttr = CS.getAttributes().getRetAttributes();
- if (AttrBuilder(CalleeRetAttr).
- removeAttribute(Attributes::NoAlias) !=
- AttrBuilder(CallerRetAttr).
- removeAttribute(Attributes::NoAlias))
+ AttributeSet CalleeAttrs = CS.getAttributes();
+ if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
+ removeAttribute(Attribute::NoAlias) !=
+ AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
+ removeAttribute(Attribute::NoAlias))
continue;
// Make sure the call instruction is followed by an unconditional branch to
@@ -803,6 +819,629 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) {
// Memory Optimization
//===----------------------------------------------------------------------===//
+namespace {
+
+/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode
+/// which holds actual Value*'s for register values.
+struct ExtAddrMode : public TargetLowering::AddrMode {
+ Value *BaseReg;
+ Value *ScaledReg;
+ ExtAddrMode() : BaseReg(0), ScaledReg(0) {}
+ void print(raw_ostream &OS) const;
+ void dump() const;
+
+ bool operator==(const ExtAddrMode& O) const {
+ return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) &&
+ (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) &&
+ (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale);
+ }
+};
+
+static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
+ AM.print(OS);
+ return OS;
+}
+
+void ExtAddrMode::print(raw_ostream &OS) const {
+ bool NeedPlus = false;
+ OS << "[";
+ if (BaseGV) {
+ OS << (NeedPlus ? " + " : "")
+ << "GV:";
+ WriteAsOperand(OS, BaseGV, /*PrintType=*/false);
+ NeedPlus = true;
+ }
+
+ if (BaseOffs) {
+ OS << (NeedPlus ? " + " : "") << BaseOffs;
+ NeedPlus = true;
+ }
+
+ if (BaseReg) {
+ OS << (NeedPlus ? " + " : "")
+ << "Base:";
+ WriteAsOperand(OS, BaseReg, /*PrintType=*/false);
+ NeedPlus = true;
+ }
+ if (Scale) {
+ OS << (NeedPlus ? " + " : "")
+ << Scale << "*";
+ WriteAsOperand(OS, ScaledReg, /*PrintType=*/false);
+ NeedPlus = true;
+ }
+
+ OS << ']';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ExtAddrMode::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+#endif
+
+
+/// \brief A helper class for matching addressing modes.
+///
+/// This encapsulates the logic for matching the target-legal addressing modes.
+class AddressingModeMatcher {
+ SmallVectorImpl<Instruction*> &AddrModeInsts;
+ const TargetLowering &TLI;
+
+ /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
+ /// the memory instruction that we're computing this address for.
+ Type *AccessTy;
+ Instruction *MemoryInst;
+
+ /// AddrMode - This is the addressing mode that we're building up. This is
+ /// part of the return value of this addressing mode matching stuff.
+ ExtAddrMode &AddrMode;
+
+ /// IgnoreProfitability - This is set to true when we should not do
+ /// profitability checks. When true, IsProfitableToFoldIntoAddressingMode
+ /// always returns true.
+ bool IgnoreProfitability;
+
+ AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI,
+ const TargetLowering &T, Type *AT,
+ Instruction *MI, ExtAddrMode &AM)
+ : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM) {
+ IgnoreProfitability = false;
+ }
+public:
+
+ /// Match - Find the maximal addressing mode that a load/store of V can fold,
+ /// give an access type of AccessTy. This returns a list of involved
+ /// instructions in AddrModeInsts.
+ static ExtAddrMode Match(Value *V, Type *AccessTy,
+ Instruction *MemoryInst,
+ SmallVectorImpl<Instruction*> &AddrModeInsts,
+ const TargetLowering &TLI) {
+ ExtAddrMode Result;
+
+ bool Success =
+ AddressingModeMatcher(AddrModeInsts, TLI, AccessTy,
+ MemoryInst, Result).MatchAddr(V, 0);
+ (void)Success; assert(Success && "Couldn't select *anything*?");
+ return Result;
+ }
+private:
+ bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
+ bool MatchAddr(Value *V, unsigned Depth);
+ bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth);
+ bool IsProfitableToFoldIntoAddressingMode(Instruction *I,
+ ExtAddrMode &AMBefore,
+ ExtAddrMode &AMAfter);
+ bool ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
+};
+
+/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode.
+/// Return true and update AddrMode if this addr mode is legal for the target,
+/// false if not.
+bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
+ unsigned Depth) {
+ // If Scale is 1, then this is the same as adding ScaleReg to the addressing
+ // mode. Just process that directly.
+ if (Scale == 1)
+ return MatchAddr(ScaleReg, Depth);
+
+ // If the scale is 0, it takes nothing to add this.
+ if (Scale == 0)
+ return true;
+
+ // If we already have a scale of this value, we can add to it; otherwise, we
+ // need an available scale field.
+ if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
+ return false;
+
+ ExtAddrMode TestAddrMode = AddrMode;
+
+ // Add scale to turn X*4+X*3 -> X*7. This could also do things like
+ // [A+B + A*7] -> [B+A*8].
+ TestAddrMode.Scale += Scale;
+ TestAddrMode.ScaledReg = ScaleReg;
+
+ // If the new address isn't legal, bail out.
+ if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy))
+ return false;
+
+ // It was legal, so commit it.
+ AddrMode = TestAddrMode;
+
+ // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now
+ // to see if ScaleReg is actually X+C. If so, we can turn this into adding
+ // X*Scale + C*Scale to addr mode.
+ ConstantInt *CI = 0; Value *AddLHS = 0;
+ if (isa<Instruction>(ScaleReg) && // not a constant expr.
+ match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
+ TestAddrMode.ScaledReg = AddLHS;
+ TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
+
+ // If this addressing mode is legal, commit it and remember that we folded
+ // this instruction.
+ if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) {
+ AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
+ AddrMode = TestAddrMode;
+ return true;
+ }
+ }
+
+ // Otherwise, not (x+c)*scale, just return what we have.
+ return true;
+}
+
+/// MightBeFoldableInst - This is a little filter, which returns true if an
+/// addressing computation involving I might be folded into a load/store
+/// accessing it. This doesn't need to be perfect, but needs to accept at least
+/// the set of instructions that MatchOperationAddr can.
+static bool MightBeFoldableInst(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ // Don't touch identity bitcasts.
+ if (I->getType() == I->getOperand(0)->getType())
+ return false;
+ return I->getType()->isPointerTy() || I->getType()->isIntegerTy();
+ case Instruction::PtrToInt:
+ // PtrToInt is always a noop, as we know that the int type is pointer sized.
+ return true;
+ case Instruction::IntToPtr:
+ // We know the input is intptr_t, so this is foldable.
+ return true;
+ case Instruction::Add:
+ return true;
+ case Instruction::Mul:
+ case Instruction::Shl:
+ // Can only handle X*C and X << C.
+ return isa<ConstantInt>(I->getOperand(1));
+ case Instruction::GetElementPtr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// MatchOperationAddr - Given an instruction or constant expr, see if we can
+/// fold the operation into the addressing mode. If so, update the addressing
+/// mode and return true, otherwise return false without modifying AddrMode.
+bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
+ unsigned Depth) {
+ // Avoid exponential behavior on extremely deep expression trees.
+ if (Depth >= 5) return false;
+
+ switch (Opcode) {
+ case Instruction::PtrToInt:
+ // PtrToInt is always a noop, as we know that the int type is pointer sized.
+ return MatchAddr(AddrInst->getOperand(0), Depth);
+ case Instruction::IntToPtr:
+ // This inttoptr is a no-op if the integer type is pointer sized.
+ if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
+ TLI.getPointerTy())
+ return MatchAddr(AddrInst->getOperand(0), Depth);
+ return false;
+ case Instruction::BitCast:
+ // BitCast is always a noop, and we can handle it as long as it is
+ // int->int or pointer->pointer (we don't want int<->fp or something).
+ if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
+ AddrInst->getOperand(0)->getType()->isIntegerTy()) &&
+ // Don't touch identity bitcasts. These were probably put here by LSR,
+ // and we don't want to mess around with them. Assume it knows what it
+ // is doing.
+ AddrInst->getOperand(0)->getType() != AddrInst->getType())
+ return MatchAddr(AddrInst->getOperand(0), Depth);
+ return false;
+ case Instruction::Add: {
+ // Check to see if we can merge in the RHS then the LHS. If so, we win.
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+ if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
+ MatchAddr(AddrInst->getOperand(0), Depth+1))
+ return true;
+
+ // Restore the old addr mode info.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+
+ // Otherwise this was over-aggressive. Try merging in the LHS then the RHS.
+ if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
+ MatchAddr(AddrInst->getOperand(1), Depth+1))
+ return true;
+
+ // Otherwise we definitely can't merge the ADD in.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ break;
+ }
+ //case Instruction::Or:
+ // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
+ //break;
+ case Instruction::Mul:
+ case Instruction::Shl: {
+ // Can only handle X*C and X << C.
+ ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
+ if (!RHS) return false;
+ int64_t Scale = RHS->getSExtValue();
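+ // For a shift, the effective scale is 2^C rather than C itself (e.g.
+ // "shl X, 3" scales the index by 8).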
+ if (Opcode == Instruction::Shl)
+ Scale = 1LL << Scale;
+
+ return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
+ }
+ case Instruction::GetElementPtr: {
+ // Scan the GEP. We check whether it contains constant offsets and at most
+ // one variable offset.
+ int VariableOperand = -1;
+ unsigned VariableScale = 0;
+
+ int64_t ConstantOffset = 0;
+ const DataLayout *TD = TLI.getDataLayout();
+ gep_type_iterator GTI = gep_type_begin(AddrInst);
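+ // Walk the indices: struct field offsets and constant array indices are
+ // folded into ConstantOffset; at most one variable index is allowed.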
+ for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = TD->getStructLayout(STy);
+ unsigned Idx =
+ cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
+ ConstantOffset += SL->getElementOffset(Idx);
+ } else {
+ uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType());
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
+ ConstantOffset += CI->getSExtValue()*TypeSize;
+ } else if (TypeSize) { // Scales of zero don't do anything.
+ // We only allow one variable index at the moment.
+ if (VariableOperand != -1)
+ return false;
+
+ // Remember the variable index.
+ VariableOperand = i;
+ VariableScale = TypeSize;
+ }
+ }
+ }
+
+ // A common case is for the GEP to only do a constant offset. In this case,
+ // just add it to the disp field and check validity.
+ if (VariableOperand == -1) {
+ AddrMode.BaseOffs += ConstantOffset;
+ if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){
+ // Check to see if we can fold the base pointer in too.
+ if (MatchAddr(AddrInst->getOperand(0), Depth+1))
+ return true;
+ }
+ AddrMode.BaseOffs -= ConstantOffset;
+ return false;
+ }
+
+ // Save the valid addressing mode in case we can't match.
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+
+ // See if the scale and offset amount is valid for this target.
+ AddrMode.BaseOffs += ConstantOffset;
+
+ // Match the base operand of the GEP.
+ if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) {
+ // If it couldn't be matched, just stuff the value in a register.
+ if (AddrMode.HasBaseReg) {
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ return false;
+ }
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = AddrInst->getOperand(0);
+ }
+
+ // Match the remaining variable portion of the GEP.
+ if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
+ Depth)) {
+ // If it couldn't be matched, try stuffing the base into a register
+ // instead of matching it, and retrying the match of the scale.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ if (AddrMode.HasBaseReg)
+ return false;
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = AddrInst->getOperand(0);
+ AddrMode.BaseOffs += ConstantOffset;
+ if (!MatchScaledValue(AddrInst->getOperand(VariableOperand),
+ VariableScale, Depth)) {
+ // If even that didn't work, bail.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ return false;
+ }
+ }
+
+ return true;
+ }
+ }
+ return false;
+}
+
+/// MatchAddr - If we can, try to add the value of 'Addr' into the current
+/// addressing mode. If Addr can't be added to AddrMode this returns false and
+/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type
+/// or intptr_t for the target.
+///
+bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
+ // Fold in immediates if legal for the target.
+ AddrMode.BaseOffs += CI->getSExtValue();
+ if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+ return true;
+ AddrMode.BaseOffs -= CI->getSExtValue();
+ } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
+ // If this is a global variable, try to fold it into the addressing mode.
+ if (AddrMode.BaseGV == 0) {
+ AddrMode.BaseGV = GV;
+ if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+ return true;
+ AddrMode.BaseGV = 0;
+ }
+ } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+
+ // Check to see if it is possible to fold this operation.
+ if (MatchOperationAddr(I, I->getOpcode(), Depth)) {
+ // Okay, it's possible to fold this. Check to see if it is actually
+ // *profitable* to do so. We use a simple cost model to avoid increasing
+ // register pressure too much.
+ if (I->hasOneUse() ||
+ IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
+ AddrModeInsts.push_back(I);
+ return true;
+ }
+
+ // It isn't profitable to do this, roll back.
+ //cerr << "NOT FOLDING: " << *I;
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ }
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
+ if (MatchOperationAddr(CE, CE->getOpcode(), Depth))
+ return true;
+ } else if (isa<ConstantPointerNull>(Addr)) {
+ // Null pointer gets folded without affecting the addressing mode.
+ return true;
+ }
+
+ // Worst case, the target should support [reg] addressing modes. :)
+ if (!AddrMode.HasBaseReg) {
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = Addr;
+ // Still check for legality in case the target supports [imm] but not [i+r].
+ if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+ return true;
+ AddrMode.HasBaseReg = false;
+ AddrMode.BaseReg = 0;
+ }
+
+ // If the base register is already taken, see if we can do [r+r].
+ if (AddrMode.Scale == 0) {
+ AddrMode.Scale = 1;
+ AddrMode.ScaledReg = Addr;
+ if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+ return true;
+ AddrMode.Scale = 0;
+ AddrMode.ScaledReg = 0;
+ }
+ // Couldn't match.
+ return false;
+}
+
+/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified
+/// inline asm call are due to memory operands. If so, return true, otherwise
+/// return false.
+static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
+ const TargetLowering &TLI) {
+ TargetLowering::AsmOperandInfoVector TargetConstraints =
+ TLI.ParseConstraints(ImmutableCallSite(CI));
+ for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+ TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
+
+ // Compute the constraint code and ConstraintType to use.
+ TLI.ComputeConstraintToUse(OpInfo, SDValue());
+
+ // If this asm operand is our Value*, and if it isn't an indirect memory
+ // operand, we can't fold it!
+ if (OpInfo.CallOperandVal == OpVal &&
+ (OpInfo.ConstraintType != TargetLowering::C_Memory ||
+ !OpInfo.isIndirect))
+ return false;
+ }
+
+ return true;
+}
+
+/// FindAllMemoryUses - Recursively walk all the uses of I until we find a
+/// memory use. If we find an obviously non-foldable instruction, return true.
+/// Add the ultimately found memory instructions to MemoryUses.
+static bool FindAllMemoryUses(Instruction *I,
+ SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
+ SmallPtrSet<Instruction*, 16> &ConsideredInsts,
+ const TargetLowering &TLI) {
+ // If we already considered this instruction, we're done.
+ if (!ConsideredInsts.insert(I))
+ return false;
+
+ // If this is an obviously unfoldable instruction, bail out.
+ if (!MightBeFoldableInst(I))
+ return true;
+
+ // Loop over all the uses, recursively processing them.
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI) {
+ User *U = *UI;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo()));
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ unsigned opNo = UI.getOperandNo();
+ if (opNo == 0) return true; // Storing addr, not into addr.
+ MemoryUses.push_back(std::make_pair(SI, opNo));
+ continue;
+ }
+
+ if (CallInst *CI = dyn_cast<CallInst>(U)) {
+ InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
+ if (!IA) return true;
+
+ // If this is a memory operand, we're cool, otherwise bail out.
+ if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
+ return true;
+ continue;
+ }
+
+ if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts,
+ TLI))
+ return true;
+ }
+
+ return false;
+}
+
+/// ValueAlreadyLiveAtInst - Return true if Val is already known to be live at
+/// the use site that we're folding it into. If so, there is no cost to
+/// include it in the addressing mode. KnownLive1 and KnownLive2 are two values
+/// that we know are live at the instruction already.
+bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1,
+ Value *KnownLive2) {
+ // If Val is either of the known-live values, we know it is live!
+ if (Val == 0 || Val == KnownLive1 || Val == KnownLive2)
+ return true;
+
+ // All values other than instructions and arguments (e.g. constants) are live.
+ if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
+
+ // If Val is a constant sized alloca in the entry block, it is live, this is
+ // true because it is just a reference to the stack/frame pointer, which is
+ // live for the whole function.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
+ if (AI->isStaticAlloca())
+ return true;
+
+ // Check to see if this value is already used in the memory instruction's
+ // block. If so, it's already live into the block at the very least, so we
+ // can reasonably fold it.
+ return Val->isUsedInBasicBlock(MemoryInst->getParent());
+}
+
+/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing
+/// mode of the machine to fold the specified instruction into a load or store
+/// that ultimately uses it. However, the specified instruction has multiple
+/// uses. Given this, it may actually increase register pressure to fold it
+/// into the load. For example, consider this code:
+///
+/// X = ...
+/// Y = X+1
+/// use(Y) -> nonload/store
+/// Z = Y+1
+/// load Z
+///
+/// In this case, Y has multiple uses, and can be folded into the load of Z
+/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to
+/// be live at the use(Y) line. If we don't fold Y into load Z, we use one
+/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the
+/// number of computations either.
+///
+/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If
+/// X was live across 'load Z' for other reasons, we actually *would* want to
+/// fold the addressing mode in the Z case. This would make Y die earlier.
+bool AddressingModeMatcher::
+IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
+ ExtAddrMode &AMAfter) {
+ if (IgnoreProfitability) return true;
+
+ // AMBefore is the addressing mode before this instruction was folded into it,
+ // and AMAfter is the addressing mode after the instruction was folded. Get
+ // the set of registers referenced by AMAfter and subtract out those
+ // referenced by AMBefore: this is the set of values which folding in this
+ // address extends the lifetime of.
+ //
+ // Note that there are only two potential values being referenced here,
+ // BaseReg and ScaleReg (global addresses are always available, as are any
+ // folded immediates).
+ Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
+
+ // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
+ // lifetime wasn't extended by adding this instruction.
+ if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+ BaseReg = 0;
+ if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+ ScaledReg = 0;
+
+ // If folding this instruction (and its subexprs) didn't extend any live
+ // ranges, we're ok with it.
+ if (BaseReg == 0 && ScaledReg == 0)
+ return true;
+
+ // If all uses of this instruction are ultimately load/store/inlineasm's,
+ // check to see if their addressing modes will include this instruction. If
+ // so, we can fold it into all uses, so it doesn't matter if it has multiple
+ // uses.
+ SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
+ SmallPtrSet<Instruction*, 16> ConsideredInsts;
+ if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
+ return false; // Has a non-memory, non-foldable use!
+
+ // Now that we know that all uses of this instruction are part of a chain of
+ // computation involving only operations that could theoretically be folded
+ // into a memory use, loop over each of these uses and see if they could
+ // *actually* fold the instruction.
+ SmallVector<Instruction*, 32> MatchedAddrModeInsts;
+ for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
+ Instruction *User = MemoryUses[i].first;
+ unsigned OpNo = MemoryUses[i].second;
+
+ // Get the access type of this use. If the use isn't a pointer, we don't
+ // know what it accesses.
+ Value *Address = User->getOperand(OpNo);
+ if (!Address->getType()->isPointerTy())
+ return false;
+ Type *AddressAccessTy =
+ cast<PointerType>(Address->getType())->getElementType();
+
+ // Do a match against the root of this address, ignoring profitability. This
+ // will tell us if the addressing mode for the memory operation will
+ // *actually* cover the shared instruction.
+ ExtAddrMode Result;
+ AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy,
+ MemoryInst, Result);
+ Matcher.IgnoreProfitability = true;
+ bool Success = Matcher.MatchAddr(Address, 0);
+ (void)Success; assert(Success && "Couldn't select *anything*?");
+
+ // If the match didn't cover I, then it won't be shared by it.
+ if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(),
+ I) == MatchedAddrModeInsts.end())
+ return false;
+
+ MatchedAddrModeInsts.clear();
+ }
+
+ return true;
+}
+
+} // end anonymous namespace
+
/// IsNonLocalValue - Return true if the specified values are defined in a
/// different basic block than BB.
static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
@@ -1319,9 +1958,6 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
if (CallInst *CI = dyn_cast<CallInst>(I))
return OptimizeCallInst(CI);
- if (ReturnInst *RI = dyn_cast<ReturnInst>(I))
- return DupRetToEnableTailCallOpts(RI);
-
if (SelectInst *SI = dyn_cast<SelectInst>(I))
return OptimizeSelectInst(SI);
@@ -1339,6 +1975,8 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
while (CurInstIterator != BB.end())
MadeChange |= OptimizeInst(CurInstIterator++);
+ MadeChange |= DupRetToEnableTailCallOpts(&BB);
+
return MadeChange;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
index 369720b..d5a96ec 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -20,14 +20,14 @@
#define DEBUG_TYPE "constprop"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Constant.h"
-#include "llvm/Instruction.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Support/InstIterator.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include <set>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 3ec6f3d..995782e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -13,15 +13,17 @@
#define DEBUG_TYPE "correlated-value-propagation"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
STATISTIC(NumPhis, "Number of phis propagated");
@@ -97,12 +99,29 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) {
Value *Incoming = P->getIncomingValue(i);
if (isa<Constant>(Incoming)) continue;
- Constant *C = LVI->getConstantOnEdge(P->getIncomingValue(i),
- P->getIncomingBlock(i),
- BB);
- if (!C) continue;
+ Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB);
+
+    // Check whether the incoming value is a select with a constant operand
+    // that LVI proves the incoming value can never equal on this edge. In that
+    // case replace the incoming value with the select's other operand. This
+    // often allows us to remove the select later.
+ if (!V) {
+ SelectInst *SI = dyn_cast<SelectInst>(Incoming);
+ if (!SI) continue;
+
+ Constant *C = dyn_cast<Constant>(SI->getFalseValue());
+ if (!C) continue;
+
+ if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
+ P->getIncomingBlock(i), BB) !=
+ LazyValueInfo::False)
+ continue;
+
+ DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+ V = SI->getTrueValue();
+ }
- P->setIncomingValue(i, C);
+ P->setIncomingValue(i, V);
Changed = true;
}
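A minimal sketch (hypothetical code, not from the patch) of the pattern the new processPHI logic handles: the phi's incoming value is a select, and LazyValueInfo can prove on that edge that the select never yields its constant arm, so the phi can take the select's other operand directly.

// Illustrative only: on the edge where x > 0 holds, the select below can
// never produce its constant arm 0, so the phi may use x directly.
int cvp_example(int x) {
  int v;
  if (x > 0)
    v = (x != 0) ? x : 0;   // select feeding the phi
  else
    v = -1;
  return v;                 // phi(v): the first incoming value becomes plain x
}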
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
index a2e074f..e8a090a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -18,12 +18,12 @@
#define DEBUG_TYPE "dce"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Instruction.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
#include "llvm/Support/InstIterator.h"
#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 736cc05..57432c7 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -17,25 +17,25 @@
#define DEBUG_TYPE "dse"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/DataLayout.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
using namespace llvm;
STATISTIC(NumFastStores, "Number of stores deleted");
@@ -376,10 +376,10 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
// Check to see if the later store is to the entire object (either a global,
// an alloca, or a byval argument). If so, then it clearly overwrites any
// other store to the same object.
- const DataLayout &TD = *AA.getDataLayout();
+ const DataLayout *TD = AA.getDataLayout();
- const Value *UO1 = GetUnderlyingObject(P1, &TD),
- *UO2 = GetUnderlyingObject(P2, &TD);
+ const Value *UO1 = GetUnderlyingObject(P1, TD),
+ *UO2 = GetUnderlyingObject(P2, TD);
// If we can't resolve the same pointers to the same object, then we can't
// analyze them at all.
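The "later store covers the entire object" case described above is what makes stores like the first one below trivially dead; a small sketch (hypothetical code, for illustration only):

#include <cstring>

struct S { int a; int b; };

void init(S *p) {
  p->a = 1;                     // dead: the later memset covers the whole object
  std::memset(p, 0, sizeof *p); // overwrites every byte of *p
}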
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 101009d..3c08634 100644
--- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -14,18 +14,18 @@
#define DEBUG_TYPE "early-cse"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Instructions.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/ScopedHashTable.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <deque>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
index f003e06..129af8d 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -17,11 +17,6 @@
#define DEBUG_TYPE "gvn"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Metadata.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Hashing.h"
@@ -37,11 +32,16 @@
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/PatternMatch.h"
-#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -849,8 +849,8 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
return -1;
int64_t StoreOffset = 0, LoadOffset = 0;
- Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr, StoreOffset,TD);
- Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, TD);
+ Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&TD);
+ Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &TD);
if (StoreBase != LoadBase)
return -1;
@@ -945,7 +945,7 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
// then we should widen it!
int64_t LoadOffs = 0;
const Value *LoadBase =
- GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, TD);
+ GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &TD);
unsigned LoadSize = TD.getTypeStoreSize(LoadTy);
unsigned Size = MemoryDependenceAnalysis::
@@ -1526,10 +1526,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
BasicBlock *LoadBB = LI->getParent();
BasicBlock *TmpBB = LoadBB;
- bool isSinglePred = false;
bool allSingleSucc = true;
while (TmpBB->getSinglePredecessor()) {
- isSinglePred = true;
TmpBB = TmpBB->getSinglePredecessor();
if (TmpBB == LoadBB) // Infinite (unreachable) loop.
return false;
@@ -1548,28 +1546,6 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
assert(TmpBB);
LoadBB = TmpBB;
- // FIXME: It is extremely unclear what this loop is doing, other than
- // artificially restricting loadpre.
- if (isSinglePred) {
- bool isHot = false;
- for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) {
- const AvailableValueInBlock &AV = ValuesPerBlock[i];
- if (AV.isSimpleValue())
- // "Hot" Instruction is in some loop (because it dominates its dep.
- // instruction).
- if (Instruction *I = dyn_cast<Instruction>(AV.getSimpleValue()))
- if (DT->dominates(LI, I)) {
- isHot = true;
- break;
- }
- }
-
- // We are interested only in "hot" instructions. We don't want to do any
- // mis-optimizations here.
- if (!isHot)
- return false;
- }
-
// Check to see how many predecessors have the loaded value fully
// available.
DenseMap<BasicBlock*, Value*> PredLoads;
@@ -1738,7 +1714,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
return true;
}
-static void patchReplacementInstruction(Value *Repl, Instruction *I) {
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
// Patch the replacement so that it is not more restrictive than the value
// being replaced.
BinaryOperator *Op = dyn_cast<BinaryOperator>(I);
@@ -1780,8 +1756,8 @@ static void patchReplacementInstruction(Value *Repl, Instruction *I) {
}
}
-static void patchAndReplaceAllUsesWith(Value *Repl, Instruction *I) {
- patchReplacementInstruction(Repl, I);
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+ patchReplacementInstruction(I, Repl);
I->replaceAllUsesWith(Repl);
}
@@ -1943,7 +1919,7 @@ bool GVN::processLoad(LoadInst *L) {
}
// Remove it!
- patchAndReplaceAllUsesWith(AvailableVal, L);
+ patchAndReplaceAllUsesWith(L, AvailableVal);
if (DepLI->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(DepLI);
markInstructionForDeletion(L);
@@ -2284,7 +2260,7 @@ bool GVN::processInstruction(Instruction *I) {
}
// Remove it!
- patchAndReplaceAllUsesWith(repl, I);
+ patchAndReplaceAllUsesWith(I, repl);
if (MD && repl->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(repl);
markInstructionForDeletion(I);
@@ -2371,8 +2347,8 @@ bool GVN::processBlock(BasicBlock *BB) {
E = InstrsToErase.end(); I != E; ++I) {
DEBUG(dbgs() << "GVN removed: " << **I << '\n');
if (MD) MD->removeInstruction(*I);
- (*I)->eraseFromParent();
DEBUG(verifyRemoved(*I));
+ (*I)->eraseFromParent();
}
InstrsToErase.clear();
@@ -2389,7 +2365,7 @@ bool GVN::processBlock(BasicBlock *BB) {
/// control flow patterns and attempts to perform simple PRE at the join point.
bool GVN::performPRE(Function &F) {
bool Changed = false;
- DenseMap<BasicBlock*, Value*> predMap;
+ SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap;
for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) {
BasicBlock *CurrentBlock = *DI;
@@ -2445,19 +2421,22 @@ bool GVN::performPRE(Function &F) {
if (P == CurrentBlock) {
NumWithout = 2;
break;
- } else if (!DT->dominates(&F.getEntryBlock(), P)) {
+ } else if (!DT->isReachableFromEntry(P)) {
NumWithout = 2;
break;
}
Value* predV = findLeader(P, ValNo);
if (predV == 0) {
+ predMap.push_back(std::make_pair(static_cast<Value *>(0), P));
PREPred = P;
++NumWithout;
} else if (predV == CurInst) {
+ /* CurInst dominates this predecessor. */
NumWithout = 2;
+ break;
} else {
- predMap[P] = predV;
+ predMap.push_back(std::make_pair(predV, P));
++NumWith;
}
}
@@ -2504,15 +2483,14 @@ bool GVN::performPRE(Function &F) {
// the PRE predecessor. This is typically because of loads which
// are not value numbered precisely.
if (!success) {
- delete PREInstr;
DEBUG(verifyRemoved(PREInstr));
+ delete PREInstr;
continue;
}
PREInstr->insertBefore(PREPred->getTerminator());
PREInstr->setName(CurInst->getName() + ".pre");
PREInstr->setDebugLoc(CurInst->getDebugLoc());
- predMap[PREPred] = PREInstr;
VN.add(PREInstr, ValNo);
++NumGVNPRE;
@@ -2520,13 +2498,14 @@ bool GVN::performPRE(Function &F) {
addToLeaderTable(ValNo, PREInstr, PREPred);
// Create a PHI to make the value available in this block.
- pred_iterator PB = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock);
- PHINode* Phi = PHINode::Create(CurInst->getType(), std::distance(PB, PE),
+ PHINode* Phi = PHINode::Create(CurInst->getType(), predMap.size(),
CurInst->getName() + ".pre-phi",
CurrentBlock->begin());
- for (pred_iterator PI = PB; PI != PE; ++PI) {
- BasicBlock *P = *PI;
- Phi->addIncoming(predMap[P], P);
+ for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
+ if (Value *V = predMap[i].first)
+ Phi->addIncoming(V, predMap[i].second);
+ else
+ Phi->addIncoming(PREInstr, PREPred);
}
VN.add(Phi, ValNo);
@@ -2551,8 +2530,8 @@ bool GVN::performPRE(Function &F) {
DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
if (MD) MD->removeInstruction(CurInst);
- CurInst->eraseFromParent();
DEBUG(verifyRemoved(CurInst));
+ CurInst->eraseFromParent();
Changed = true;
}
}
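For reference, the textbook shape performPRE targets (an illustrative sketch, not code from the patch): an expression available on only some predecessors of a join is inserted into the missing predecessor and merged with a phi.

// "a + b" is available on the then-edge but not on the else-edge; PRE inserts
// the add in the else predecessor and joins both copies with a phi, making the
// later "a + b" fully redundant.
int pre_example(int a, int b, bool c) {
  int t = 0;
  if (c)
    t = a + b;           // available here
  // else edge: PRE inserts "a + b" here and feeds it into a phi
  return t + (a + b);    // redundant once the phi provides the value
}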
diff --git a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp
index 6301aad..5d02c68 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp
@@ -53,21 +53,28 @@
#define DEBUG_TYPE "global-merge"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Attributes.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Module.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
-#include "llvm/DataLayout.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
+static cl::opt<bool>
+EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden,
+ cl::desc("Enable global merge pass on constants"),
+ cl::init(false));
+
STATISTIC(NumMerged , "Number of globals merged");
namespace {
class GlobalMerge : public FunctionPass {
@@ -76,7 +83,24 @@ namespace {
const TargetLowering *TLI;
bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
- Module &M, bool isConst) const;
+ Module &M, bool isConst, unsigned AddrSpace) const;
+
+  /// \brief Check if the given variable has been identified as must-keep.
+ /// \pre setMustKeepGlobalVariables must have been called on the Module that
+ /// contains GV
+ bool isMustKeepGlobalVariable(const GlobalVariable *GV) const {
+ return MustKeepGlobalVariables.count(GV);
+ }
+
+  /// Collect every variable marked as "used" or used in a landing pad
+ /// instruction for this Module.
+ void setMustKeepGlobalVariables(Module &M);
+
+  /// Collect every variable marked as "used"
+ void collectUsedGlobalVariables(Module &M);
+
+  /// Keep track of the GlobalVariables that must not be merged away
+ SmallPtrSet<const GlobalVariable *, 16> MustKeepGlobalVariables;
public:
static char ID; // Pass identification, replacement for typeid.
@@ -87,6 +111,7 @@ namespace {
virtual bool doInitialization(Module &M);
virtual bool runOnFunction(Function &F);
+ virtual bool doFinalization(Module &M);
const char *getPassName() const {
return "Merge internal globals";
@@ -118,7 +143,7 @@ INITIALIZE_PASS(GlobalMerge, "global-merge",
bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
- Module &M, bool isConst) const {
+ Module &M, bool isConst, unsigned AddrSpace) const {
const DataLayout *TD = TLI->getDataLayout();
// FIXME: Infer the maximum possible offset depending on the actual users
@@ -150,7 +175,9 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);
GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst,
GlobalValue::InternalLinkage,
- MergedInit, "_MergedGlobals");
+ MergedInit, "_MergedGlobals",
+ 0, GlobalVariable::NotThreadLocal,
+ AddrSpace);
for (size_t k = i; k < j; ++k) {
Constant *Idx[2] = {
ConstantInt::get(Int32Ty, 0),
@@ -167,12 +194,51 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
return true;
}
+void GlobalMerge::collectUsedGlobalVariables(Module &M) {
+ // Extract global variables from llvm.used array
+ const GlobalVariable *GV = M.getGlobalVariable("llvm.used");
+ if (!GV || !GV->hasInitializer()) return;
+
+ // Should be an array of 'i8*'.
+ const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+ if (InitList == 0) return;
+
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+ if (const GlobalVariable *G =
+ dyn_cast<GlobalVariable>(InitList->getOperand(i)->stripPointerCasts()))
+ MustKeepGlobalVariables.insert(G);
+}
+
+void GlobalMerge::setMustKeepGlobalVariables(Module &M) {
+ collectUsedGlobalVariables(M);
+
+ for (Module::iterator IFn = M.begin(), IEndFn = M.end(); IFn != IEndFn;
+ ++IFn) {
+ for (Function::iterator IBB = IFn->begin(), IEndBB = IFn->end();
+ IBB != IEndBB; ++IBB) {
+      // Follow the invoke link to find the landing pad instruction
+ const InvokeInst *II = dyn_cast<InvokeInst>(IBB->getTerminator());
+ if (!II) continue;
+
+ const LandingPadInst *LPInst = II->getUnwindDest()->getLandingPadInst();
+ // Look for globals in the clauses of the landing pad instruction
+ for (unsigned Idx = 0, NumClauses = LPInst->getNumClauses();
+ Idx != NumClauses; ++Idx)
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(LPInst->getClause(Idx)
+ ->stripPointerCasts()))
+ MustKeepGlobalVariables.insert(GV);
+ }
+ }
+}
bool GlobalMerge::doInitialization(Module &M) {
- SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals;
+ DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals,
+ BSSGlobals;
const DataLayout *TD = TLI->getDataLayout();
unsigned MaxOffset = TLI->getMaximalGlobalOffset();
bool Changed = false;
+ setMustKeepGlobalVariables(M);
// Grab all non-const globals.
for (Module::global_iterator I = M.global_begin(),
@@ -181,6 +247,11 @@ bool GlobalMerge::doInitialization(Module &M) {
if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection())
continue;
+ PointerType *PT = dyn_cast<PointerType>(I->getType());
+ assert(PT && "Global variable is not a pointer!");
+
+ unsigned AddressSpace = PT->getAddressSpace();
+
// Ignore fancy-aligned globals for now.
unsigned Alignment = TD->getPreferredAlignment(I);
Type *Ty = I->getType()->getElementType();
@@ -192,27 +263,36 @@ bool GlobalMerge::doInitialization(Module &M) {
I->getName().startswith(".llvm."))
continue;
+ // Ignore all "required" globals:
+ if (isMustKeepGlobalVariable(I))
+ continue;
+
if (TD->getTypeAllocSize(Ty) < MaxOffset) {
if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine())
.isBSSLocal())
- BSSGlobals.push_back(I);
+ BSSGlobals[AddressSpace].push_back(I);
else if (I->isConstant())
- ConstGlobals.push_back(I);
+ ConstGlobals[AddressSpace].push_back(I);
else
- Globals.push_back(I);
+ Globals[AddressSpace].push_back(I);
}
}
- if (Globals.size() > 1)
- Changed |= doMerge(Globals, M, false);
- if (BSSGlobals.size() > 1)
- Changed |= doMerge(BSSGlobals, M, false);
+ for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator
+ I = Globals.begin(), E = Globals.end(); I != E; ++I)
+ if (I->second.size() > 1)
+ Changed |= doMerge(I->second, M, false, I->first);
+
+ for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator
+ I = BSSGlobals.begin(), E = BSSGlobals.end(); I != E; ++I)
+ if (I->second.size() > 1)
+ Changed |= doMerge(I->second, M, false, I->first);
- // FIXME: This currently breaks the EH processing due to way how the
- // typeinfo detection works. We might want to detect the TIs and ignore
- // them in the future.
- // if (ConstGlobals.size() > 1)
- // Changed |= doMerge(ConstGlobals, M, true);
+ if (EnableGlobalMergeOnConst)
+ for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator
+ I = ConstGlobals.begin(), E = ConstGlobals.end(); I != E; ++I)
+ if (I->second.size() > 1)
+ Changed |= doMerge(I->second, M, true, I->first);
return Changed;
}
@@ -221,6 +301,11 @@ bool GlobalMerge::runOnFunction(Function &F) {
return false;
}
+bool GlobalMerge::doFinalization(Module &M) {
+ MustKeepGlobalVariables.clear();
+ return false;
+}
+
Pass *llvm::createGlobalMergePass(const TargetLowering *tli) {
return new GlobalMerge(tli);
}
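Conceptually, the pass folds small internal globals of one address space into a single struct so that they can be addressed from one base register; the must-keep globals collected from llvm.used and landing-pad clauses above are excluded. A hand-written sketch of the effect (not actual pass output):

// Before: three symbols, three address computations.
static int counter_a;
static int counter_b;
static int counter_c;

// After (conceptually): one symbol; each access becomes base + constant offset.
static struct { int a, b, c; } merged_counters_example;

int sum_counters() {
  return merged_counters_example.a + merged_counters_example.b +
         merged_counters_example.c;
}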
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 310fd61..8e76c78 100644
--- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -26,28 +26,28 @@
#define DEBUG_TYPE "indvars"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/BasicBlock.h"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Type.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
STATISTIC(NumWidened , "Number of indvars widened");
@@ -535,6 +535,45 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
if (!SE->isLoopInvariant(ExitValue, L))
continue;
+    // Computing the value outside of the loop brings no benefit if:
+    //  - it is definitely used inside the loop in a way which cannot be
+    //    optimized away, or
+    //  - no use outside of the loop can take advantage of hoisting the
+    //    computation out of the loop.
+ if (ExitValue->getSCEVType()>=scMulExpr) {
+ unsigned NumHardInternalUses = 0;
+ unsigned NumSoftExternalUses = 0;
+ unsigned NumUses = 0;
+ for (Value::use_iterator IB=Inst->use_begin(), IE=Inst->use_end();
+ IB!=IE && NumUses<=6 ; ++IB) {
+ Instruction *UseInstr = cast<Instruction>(*IB);
+ unsigned Opc = UseInstr->getOpcode();
+ NumUses++;
+ if (L->contains(UseInstr)) {
+ if (Opc == Instruction::Call || Opc == Instruction::Ret)
+ NumHardInternalUses++;
+ } else {
+ if (Opc == Instruction::PHI) {
+ // Do not count the Phi as a use. LCSSA may have inserted
+ // plenty of trivial ones.
+ NumUses--;
+ for (Value::use_iterator PB=UseInstr->use_begin(),
+ PE=UseInstr->use_end();
+ PB!=PE && NumUses<=6 ; ++PB, ++NumUses) {
+ unsigned PhiOpc = cast<Instruction>(*PB)->getOpcode();
+ if (PhiOpc != Instruction::Call && PhiOpc != Instruction::Ret)
+ NumSoftExternalUses++;
+ }
+ continue;
+ }
+ if (Opc != Instruction::Call && Opc != Instruction::Ret)
+ NumSoftExternalUses++;
+ }
+ }
+ if (NumUses <= 6 && NumHardInternalUses && !NumSoftExternalUses)
+ continue;
+ }
+
Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst);
DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
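RewriteLoopExitValues replaces a post-loop use of an induction variable with its closed form; the heuristic added above skips the rewrite when the expression is only used "hard" inside the loop and no outside user would benefit. A small sketch of the profitable case (illustrative only):

// The use of i after the loop can be rewritten to the loop-invariant exit
// value (n when n > 0, otherwise 0), which can let the loop itself become dead.
int trip_count_example(int n) {
  int i = 0;
  for (; i < n; ++i) {
    // loop body with no other side effects
  }
  return i;   // rewritten to the exit value computed outside the loop
}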
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index e7ffa09..b61c5ba 100644
--- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -13,28 +13,28 @@
#define DEBUG_TYPE "jump-threading"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Pass.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
STATISTIC(NumThreads, "Number of jumps threaded");
@@ -216,19 +216,24 @@ bool JumpThreading::runOnFunction(Function &F) {
}
/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
-/// thread across it.
-static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
+/// thread across it. Stop scanning the block when passing the threshold.
+static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+ unsigned Threshold) {
/// Ignore PHI nodes, these will be flattened when duplication happens.
BasicBlock::const_iterator I = BB->getFirstNonPHI();
// FIXME: THREADING will delete values that are just used to compute the
// branch, so they shouldn't count against the duplication cost.
-
// Sum up the cost of each instruction until we get to the terminator. Don't
// include the terminator because the copy won't include it.
unsigned Size = 0;
for (; !isa<TerminatorInst>(I); ++I) {
+
+ // Stop scanning the block if we've reached the threshold.
+ if (Size > Threshold)
+ return Size;
+
// Debugger intrinsics don't incur code size.
if (isa<DbgInfoIntrinsic>(I)) continue;
@@ -244,7 +249,11 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
// as having cost of 2 total, and if they are a vector intrinsic, we model
// them as having cost 1.
if (const CallInst *CI = dyn_cast<CallInst>(I)) {
- if (!isa<IntrinsicInst>(CI))
+ if (CI->hasFnAttr(Attribute::NoDuplicate))
+ // Blocks with NoDuplicate are modelled as having infinite cost, so they
+ // are never duplicated.
+ return ~0U;
+ else if (!isa<IntrinsicInst>(CI))
Size += 3;
else if (!CI->getType()->isVectorTy())
Size += 1;
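The scan now bails out once the running size passes the threshold and treats calls marked noduplicate as infinitely expensive. A minimal sketch of that bounded accumulation, using made-up names and per-instruction costs rather than LLVM's API:

#include <vector>

constexpr unsigned kInfiniteCost = ~0u;

// Returns an over-threshold value as soon as duplication is known to be too
// expensive; kInfiniteCost marks a block that must never be duplicated.
unsigned duplicationCostSketch(const std::vector<unsigned> &instCosts,
                               unsigned threshold) {
  unsigned size = 0;
  for (unsigned cost : instCosts) {
    if (size > threshold)
      return size;              // caller only needs to see "too expensive"
    if (cost == kInfiniteCost)
      return kInfiniteCost;     // e.g. a call with the noduplicate attribute
    size += cost;
  }
  return size;
}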
@@ -1337,7 +1346,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
return false;
}
- unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+ unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, Threshold);
if (JumpThreadCost > Threshold) {
DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << JumpThreadCost << "\n");
@@ -1481,7 +1490,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
return false;
}
- unsigned DuplicationCost = getJumpThreadDuplicationCost(BB);
+ unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, Threshold);
if (DuplicationCost > Threshold) {
DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
<< "' - Cost is too high: " << DuplicationCost << "\n");
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
index 4818437..f94cd2a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -32,27 +32,28 @@
#define DEBUG_TYPE "licm"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Debug.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
using namespace llvm;
@@ -90,6 +91,8 @@ namespace {
AU.addRequired<TargetLibraryInfo>();
}
+ using llvm::Pass::doFinalization;
+
bool doFinalization() {
assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets");
return false;
@@ -437,13 +440,12 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
}
// Only these instructions are hoistable/sinkable.
- bool HoistableKind = (isa<BinaryOperator>(I) || isa<CastInst>(I) ||
- isa<SelectInst>(I) || isa<GetElementPtrInst>(I) ||
- isa<CmpInst>(I) || isa<InsertElementInst>(I) ||
- isa<ExtractElementInst>(I) ||
- isa<ShuffleVectorInst>(I));
- if (!HoistableKind)
- return false;
+ if (!isa<BinaryOperator>(I) && !isa<CastInst>(I) && !isa<SelectInst>(I) &&
+ !isa<GetElementPtrInst>(I) && !isa<CmpInst>(I) &&
+ !isa<InsertElementInst>(I) && !isa<ExtractElementInst>(I) &&
+ !isa<ShuffleVectorInst>(I) && !isa<ExtractValueInst>(I) &&
+ !isa<InsertValueInst>(I))
+ return false;
return isSafeToExecuteUnconditionally(I);
}
@@ -663,16 +665,18 @@ namespace {
AliasSetTracker &AST;
DebugLoc DL;
int Alignment;
+ MDNode *TBAATag;
public:
LoopPromoter(Value *SP,
const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
SmallPtrSet<Value*, 4> &PMA,
SmallVectorImpl<BasicBlock*> &LEB,
SmallVectorImpl<Instruction*> &LIP,
- AliasSetTracker &ast, DebugLoc dl, int alignment)
+ AliasSetTracker &ast, DebugLoc dl, int alignment,
+ MDNode *TBAATag)
: LoadAndStorePromoter(Insts, S), SomePtr(SP),
PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP),
- AST(ast), DL(dl), Alignment(alignment) {}
+ AST(ast), DL(dl), Alignment(alignment), TBAATag(TBAATag) {}
virtual bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction*> &) const {
@@ -696,6 +700,7 @@ namespace {
StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos);
NewSI->setAlignment(Alignment);
NewSI->setDebugLoc(DL);
+ if (TBAATag) NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
}
}
@@ -749,10 +754,11 @@ void LICM::PromoteAliasSet(AliasSet &AS,
// We start with an alignment of one and try to find instructions that allow
// us to prove better alignment.
unsigned Alignment = 1;
+ MDNode *TBAATag = 0;
// Check that all of the pointers in the alias set have the same type. We
// cannot (yet) promote a memory location that is loaded and stored in
- // different sizes.
+ // different sizes. While we are at it, collect alignment and TBAA info.
for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
Value *ASIV = ASI->getValue();
PointerMustAliases.insert(ASIV);
@@ -794,8 +800,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
// instruction will be executed, update the alignment.
// Larger is better, with the exception of 0 being the best alignment.
unsigned InstAlignment = store->getAlignment();
- if ((InstAlignment > Alignment || InstAlignment == 0)
- && (Alignment != 0))
+ if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0)
if (isGuaranteedToExecute(*Use)) {
GuaranteedToExecute = true;
Alignment = InstAlignment;
@@ -807,6 +812,15 @@ void LICM::PromoteAliasSet(AliasSet &AS,
} else
return; // Not a load or store.
+ // Merge the TBAA tags.
+ if (LoopUses.empty()) {
+ // On the first load/store, just take its TBAA tag.
+ TBAATag = Use->getMetadata(LLVMContext::MD_tbaa);
+ } else if (TBAATag) {
+ TBAATag = MDNode::getMostGenericTBAA(TBAATag,
+ Use->getMetadata(LLVMContext::MD_tbaa));
+ }
+
LoopUses.push_back(Use);
}
}
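Scalar promotion turns the in-loop loads and stores of a must-alias pointer into one preheader load and one exit store, and with this change the new accesses carry the merged TBAA tag of the originals. A source-level picture (illustrative only; LICM does this only when the access is guaranteed to execute):

// Before promotion: *p is loaded and stored on every iteration.
void scale(int *p, int n) {
  for (int i = 0; i < n; ++i)
    *p += i;
}

// After promotion (conceptually): one load before the loop, one store after.
void scale_promoted(int *p, int n) {
  if (n <= 0)
    return;        // mirrors the guaranteed-to-execute requirement
  int t = *p;      // preheader load, keeps the merged TBAA tag
  for (int i = 0; i < n; ++i)
    t += i;
  *p = t;          // sunk store on the exit path, same TBAA tag
}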
@@ -839,7 +853,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
SmallVector<PHINode*, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
- InsertPts, *CurAST, DL, Alignment);
+ InsertPts, *CurAST, DL, Alignment, TBAATag);
// Set up the preheader to have a definition of the value. It is the live-out
// value from the preheader that uses in the loop will use.
@@ -848,6 +862,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
Preheader->getTerminator());
PreheaderLoad->setAlignment(Alignment);
PreheaderLoad->setDebugLoc(DL);
+ if (TBAATag) PreheaderLoad->setMetadata(LLVMContext::MD_tbaa, TBAATag);
SSA.AddAvailableValue(Preheader, PreheaderLoad);
// Rewrite all the loads in the loop and remember all the definitions from
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index 3771f5a..0b62050 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -16,11 +16,11 @@
#define DEBUG_TYPE "loop-delete"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Analysis/LoopPass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallVector.h"
using namespace llvm;
STATISTIC(NumDeleted, "Number of loops deleted");
@@ -34,13 +34,9 @@ namespace {
}
// Possibly eliminate loop L if it is dead.
- bool runOnLoop(Loop* L, LPPassManager& LPM);
+ bool runOnLoop(Loop *L, LPPassManager &LPM);
- bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks,
- SmallVector<BasicBlock*, 4>& exitBlocks,
- bool &Changed, BasicBlock *Preheader);
-
- virtual void getAnalysisUsage(AnalysisUsage& AU) const {
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<DominatorTree>();
AU.addRequired<LoopInfo>();
AU.addRequired<ScalarEvolution>();
@@ -53,6 +49,12 @@ namespace {
AU.addPreservedID(LoopSimplifyID);
AU.addPreservedID(LCSSAID);
}
+
+ private:
+ bool isLoopDead(Loop *L, SmallVector<BasicBlock*, 4> &exitingBlocks,
+ SmallVector<BasicBlock*, 4> &exitBlocks,
+ bool &Changed, BasicBlock *Preheader);
+
};
}
@@ -67,18 +69,18 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_END(LoopDeletion, "loop-deletion",
"Delete dead loops", false, false)
-Pass* llvm::createLoopDeletionPass() {
+Pass *llvm::createLoopDeletionPass() {
return new LoopDeletion();
}
-/// IsLoopDead - Determined if a loop is dead. This assumes that we've already
+/// isLoopDead - Determine if a loop is dead. This assumes that we've already
/// checked for unique exit and exiting blocks, and that the code is in LCSSA
/// form.
-bool LoopDeletion::IsLoopDead(Loop* L,
- SmallVector<BasicBlock*, 4>& exitingBlocks,
- SmallVector<BasicBlock*, 4>& exitBlocks,
+bool LoopDeletion::isLoopDead(Loop *L,
+ SmallVector<BasicBlock*, 4> &exitingBlocks,
+ SmallVector<BasicBlock*, 4> &exitBlocks,
bool &Changed, BasicBlock *Preheader) {
- BasicBlock* exitBlock = exitBlocks[0];
+ BasicBlock *exitBlock = exitBlocks[0];
// Make sure that all PHI entries coming from the loop are loop invariant.
// Because the code is in LCSSA form, any values used outside of the loop
@@ -86,19 +88,19 @@ bool LoopDeletion::IsLoopDead(Loop* L,
// sufficient to guarantee that no loop-variant values are used outside
// of the loop.
BasicBlock::iterator BI = exitBlock->begin();
- while (PHINode* P = dyn_cast<PHINode>(BI)) {
- Value* incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
// Make sure all exiting blocks produce the same incoming value for the exit
// block. If there are different incoming values for different exiting
// blocks, then it is impossible to statically determine which value should
// be used.
- for (unsigned i = 1; i < exitingBlocks.size(); ++i) {
+ for (unsigned i = 1, e = exitingBlocks.size(); i < e; ++i) {
if (incoming != P->getIncomingValueForBlock(exitingBlocks[i]))
return false;
}
- if (Instruction* I = dyn_cast<Instruction>(incoming))
+ if (Instruction *I = dyn_cast<Instruction>(incoming))
if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator()))
return false;
@@ -127,10 +129,10 @@ bool LoopDeletion::IsLoopDead(Loop* L,
/// so could change the halting/non-halting nature of a program.
/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA
/// in order to make various safety checks work.
-bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
+bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) {
// We can only remove the loop if there is a preheader that we can
// branch from after removing it.
- BasicBlock* preheader = L->getLoopPreheader();
+ BasicBlock *preheader = L->getLoopPreheader();
if (!preheader)
return false;
@@ -158,19 +160,19 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
// Finally, we have to check that the loop really is dead.
bool Changed = false;
- if (!IsLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader))
+ if (!isLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader))
return Changed;
// Don't remove loops for which we can't solve the trip count.
// They could be infinite, in which case we'd be changing program behavior.
- ScalarEvolution& SE = getAnalysis<ScalarEvolution>();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolution>();
const SCEV *S = SE.getMaxBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S))
return Changed;
// Now that we know the removal is safe, remove the loop by changing the
// branch from the preheader to go to the single exit block.
- BasicBlock* exitBlock = exitBlocks[0];
+ BasicBlock *exitBlock = exitBlocks[0];
// Because we're deleting a large chunk of code at once, the sequence in which
// we remove things is very important to avoid invalidation issues. Don't
@@ -182,14 +184,14 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
SE.forgetLoop(L);
// Connect the preheader directly to the exit block.
- TerminatorInst* TI = preheader->getTerminator();
+ TerminatorInst *TI = preheader->getTerminator();
TI->replaceUsesOfWith(L->getHeader(), exitBlock);
// Rewrite phis in the exit block to get their inputs from
// the preheader instead of the exiting block.
- BasicBlock* exitingBlock = exitingBlocks[0];
+ BasicBlock *exitingBlock = exitingBlocks[0];
BasicBlock::iterator BI = exitBlock->begin();
- while (PHINode* P = dyn_cast<PHINode>(BI)) {
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
int j = P->getBasicBlockIndex(exitingBlock);
assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
P->setIncomingBlock(j, preheader);
@@ -200,7 +202,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
// Update the dominator tree and remove the instructions and blocks that will
// be deleted from the reference counting scheme.
- DominatorTree& DT = getAnalysis<DominatorTree>();
+ DominatorTree &DT = getAnalysis<DominatorTree>();
SmallVector<DomTreeNode*, 8> ChildNodes;
for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
LI != LE; ++LI) {
@@ -230,7 +232,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
// Finally, the blocks from loopinfo. This has to happen late because
// otherwise our loop iterators won't work.
- LoopInfo& loopInfo = getAnalysis<LoopInfo>();
+ LoopInfo &loopInfo = getAnalysis<LoopInfo>();
SmallPtrSet<BasicBlock*, 8> blocks;
blocks.insert(L->block_begin(), L->block_end());
for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(),
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index a44e798..8258719 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -43,18 +43,19 @@
#define DEBUG_TYPE "loop-idiom"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -63,16 +64,83 @@ STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
namespace {
+
+ class LoopIdiomRecognize;
+
+  /// This class defines some utility functions for loop idiom recognition.
+ class LIRUtil {
+ public:
+    /// Return true iff the block contains nothing but an unconditional branch
+    /// (aka a goto instruction).
+ static bool isAlmostEmpty(BasicBlock *);
+
+ static BranchInst *getBranch(BasicBlock *BB) {
+ return dyn_cast<BranchInst>(BB->getTerminator());
+ }
+
+ /// Return the condition of the branch terminating the given basic block.
+ static Value *getBrCondtion(BasicBlock *);
+
+    /// Derive the precondition block (i.e., the block that guards the loop
+ /// preheader) from the given preheader.
+ static BasicBlock *getPrecondBb(BasicBlock *PreHead);
+ };
+
+  /// This class recognizes population-count idioms in a noncountable loop.
+  /// Currently it only recognizes this pattern:
+ /// \code
+ /// while(x) {cnt++; ...; x &= x - 1; ...}
+ /// \endcode
+ class NclPopcountRecognize {
+ LoopIdiomRecognize &LIR;
+ Loop *CurLoop;
+ BasicBlock *PreCondBB;
+
+ typedef IRBuilder<> IRBuilderTy;
+
+ public:
+ explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR);
+ bool recognize();
+
+ private:
+    /// Take a glimpse of the loop to see if it is worth going ahead with
+    /// recognizing the idiom.
+ bool preliminaryScreen();
+
+    /// Check if the given conditional branch is based on a comparison
+    /// between a variable and zero, and if the variable is non-zero, the
+    /// control yields to the loop entry. If the branch matches the behavior,
+    /// the variable involved in the comparison is returned. This function will
+    /// be called to see if the precondition and postcondition of the loop
+    /// are in desirable form.
+ Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const;
+
+    /// Return true iff the idiom is detected in the loop, and 1) \p CntInst
+    /// is set to the instruction counting the population bit, 2) \p CntPhi
+    /// is set to the corresponding phi node, and 3) \p Var is set to the value
+    /// whose population bits are being counted.
+    bool detectIdiom(Instruction *&CntInst, PHINode *&CntPhi,
+                     Value *&Var) const;
+
+ /// Insert ctpop intrinsic function and some obviously dead instructions.
+ void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var);
+
+ /// Create llvm.ctpop.* intrinsic function.
+ CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL);
+ };
+
class LoopIdiomRecognize : public LoopPass {
Loop *CurLoop;
const DataLayout *TD;
DominatorTree *DT;
ScalarEvolution *SE;
TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
public:
static char ID;
explicit LoopIdiomRecognize() : LoopPass(ID) {
initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
+ TD = 0; DT = 0; SE = 0; TLI = 0; TTI = 0;
}
bool runOnLoop(Loop *L, LPPassManager &LPM);
@@ -109,7 +177,34 @@ namespace {
AU.addPreserved<DominatorTree>();
AU.addRequired<DominatorTree>();
AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetTransformInfo>();
+ }
+
+ const DataLayout *getDataLayout() {
+ return TD ? TD : TD=getAnalysisIfAvailable<DataLayout>();
+ }
+
+ DominatorTree *getDominatorTree() {
+ return DT ? DT : (DT=&getAnalysis<DominatorTree>());
+ }
+
+ ScalarEvolution *getScalarEvolution() {
+ return SE ? SE : (SE = &getAnalysis<ScalarEvolution>());
}
+
+ TargetLibraryInfo *getTargetLibraryInfo() {
+ return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>());
+ }
+
+ const TargetTransformInfo *getTargetTransformInfo() {
+ return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>());
+ }
+
+ Loop *getLoop() const { return CurLoop; }
+
+ private:
+ bool runOnNoncountableLoop();
+ bool runOnCountableLoop();
};
}
@@ -123,6 +218,7 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
false, false)
@@ -172,24 +268,393 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
deleteDeadInstruction(I, SE, TLI);
}
-bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
- CurLoop = L;
+//===----------------------------------------------------------------------===//
+//
+// Implementation of LIRUtil
+//
+//===----------------------------------------------------------------------===//
- // If the loop could not be converted to canonical form, it must have an
- // indirectbr in it, just give up.
- if (!L->getLoopPreheader())
+// This function returns true iff the given block contains nothing but a goto.
+// A typical use of this function is to check if the preheader is "almost"
+// empty, such that the generated intrinsic can be moved across the preheader
+// and placed at the end of the precondition block without breaking any data
+// dependence.
+bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
+ if (BranchInst *Br = getBranch(BB)) {
+ return Br->isUnconditional() && BB->size() == 1;
+ }
+ return false;
+}
+
+Value *LIRUtil::getBrCondtion(BasicBlock *BB) {
+ BranchInst *Br = getBranch(BB);
+ return Br ? Br->getCondition() : 0;
+}
+
+BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) {
+ if (BasicBlock *BB = PreHead->getSinglePredecessor()) {
+ BranchInst *Br = getBranch(BB);
+ return Br && Br->isConditional() ? BB : 0;
+ }
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of NclPopcountRecognize
+//
+//===----------------------------------------------------------------------===//
+
+NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR):
+ LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) {
+}
+
+bool NclPopcountRecognize::preliminaryScreen() {
+ const TargetTransformInfo *TTI = LIR.getTargetTransformInfo();
+ if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
return false;
- // Disable loop idiom recognition if the function's name is a common idiom.
- StringRef Name = L->getHeader()->getParent()->getName();
- if (Name == "memset" || Name == "memcpy")
+  // Population counting is usually conducted by a few arithmetic instructions.
+  // Such instructions can easily be "absorbed" by vacant slots in a
+  // non-compact loop. Therefore, recognizing the popcount idiom only makes
+  // sense in a compact loop.
+
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
return false;
- // The trip count of the loop must be analyzable.
- SE = &getAnalysis<ScalarEvolution>();
- if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+ BasicBlock *LoopBody = *(CurLoop->block_begin());
+ if (LoopBody->size() >= 20) {
+ // The loop is too big, bail out.
+ return false;
+ }
+
+ // It should have a preheader containing nothing but a goto instruction.
+ BasicBlock *PreHead = CurLoop->getLoopPreheader();
+ if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead))
+ return false;
+
+  // It should have a precondition block where the generated popcount intrinsic
+ // function will be inserted.
+ PreCondBB = LIRUtil::getPrecondBb(PreHead);
+ if (!PreCondBB)
+ return false;
+
+ return true;
+}
+
+Value *NclPopcountRecognize::matchCondition (BranchInst *Br,
+ BasicBlock *LoopEntry) const {
+ if (!Br || !Br->isConditional())
+ return 0;
+
+ ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition());
+ if (!Cond)
+ return 0;
+
+ ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
+ if (!CmpZero || !CmpZero->isZero())
+ return 0;
+
+ ICmpInst::Predicate Pred = Cond->getPredicate();
+ if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) ||
+ (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry))
+ return Cond->getOperand(0);
+
+ return 0;
+}
+
+bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst,
+ PHINode *&CntPhi,
+ Value *&Var) const {
+  // The following code tries to detect this idiom:
+ //
+ // if (x0 != 0)
+ // goto loop-exit // the precondition of the loop
+ // cnt0 = init-val;
+ // do {
+ // x1 = phi (x0, x2);
+ // cnt1 = phi(cnt0, cnt2);
+ //
+ // cnt2 = cnt1 + 1;
+ // ...
+ // x2 = x1 & (x1 - 1);
+ // ...
+ // } while(x != 0);
+ //
+ // loop-exit:
+ //
+
+  // step 1: Check to see if the loop-back branch matches this pattern:
+ // "if (a!=0) goto loop-entry".
+ BasicBlock *LoopEntry;
+ Instruction *DefX2, *CountInst;
+ Value *VarX1, *VarX0;
+ PHINode *PhiX, *CountPhi;
+
+ DefX2 = CountInst = 0;
+ VarX1 = VarX0 = 0;
+ PhiX = CountPhi = 0;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ {
+ if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry))
+ DefX2 = dyn_cast<Instruction>(T);
+ else
+ return false;
+ }
+
+ // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
+ {
+ if (!DefX2 || DefX2->getOpcode() != Instruction::And)
+ return false;
+
+ BinaryOperator *SubOneOp;
+
+ if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+ VarX1 = DefX2->getOperand(1);
+ else {
+ VarX1 = DefX2->getOperand(0);
+ SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+ }
+ if (!SubOneOp)
+ return false;
+
+ Instruction *SubInst = cast<Instruction>(SubOneOp);
+ ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+ if (!Dec ||
+ !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+ (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) {
+ return false;
+ }
+ }
+
+ // step 3: Check the recurrence of variable X
+ {
+ PhiX = dyn_cast<PHINode>(VarX1);
+ if (!PhiX ||
+ (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
+ return false;
+ }
+ }
+
+  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
+ {
+ CountInst = NULL;
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(),
+ IterE = LoopEntry->end(); Iter != IterE; Iter++) {
+ Instruction *Inst = Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+ if (!Phi || Phi->getParent() != LoopEntry)
+ continue;
+
+      // Check if the result of the instruction is live out of the loop.
+ bool LiveOutLoop = false;
+ for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
+ I != E; I++) {
+ if ((cast<Instruction>(*I))->getParent() != LoopEntry) {
+ LiveOutLoop = true; break;
+ }
+ }
+
+ if (LiveOutLoop) {
+ CountInst = Inst;
+ CountPhi = Phi;
+ break;
+ }
+ }
+
+ if (!CountInst)
+ return false;
+ }
+
+ // step 5: check if the precondition is in this form:
+ // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+ {
+ BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
+ Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader());
+ if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
+ return false;
+
+ CntInst = CountInst;
+ CntPhi = CountPhi;
+ Var = T;
+ }
+
+ return true;
+}
+
+void NclPopcountRecognize::transform(Instruction *CntInst,
+ PHINode *CntPhi, Value *Var) {
+
+ ScalarEvolution *SE = LIR.getScalarEvolution();
+ TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo();
+ BasicBlock *PreHead = CurLoop->getLoopPreheader();
+ BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
+ const DebugLoc DL = CntInst->getDebugLoc();
+
+  // Assume that before the transformation, the loop looks like this:
+ // if (x) // the precondition
+ // do { cnt++; x &= x - 1; } while(x);
+
+ // Step 1: Insert the ctpop instruction at the end of the precondition block
+ IRBuilderTy Builder(PreCondBr);
+ Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+ {
+ PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+ NewCount = PopCntZext =
+ Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+ if (NewCount != PopCnt)
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+ // TripCnt is exactly the number of iterations the loop has
+ TripCnt = NewCount;
+
+    // If the population counter's initial value is not zero, insert an Add.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero()) {
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+ }
+ }
+
+ // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to
+  //   "if(NewCount == 0) loop-exit". Without this change, the intrinsic
+  //   function would be partially dead code, and downstream passes will drag
+ // it back from the precondition block to the preheader.
+ {
+ ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+ Value *Opnd0 = PopCntZext;
+ Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+ if (PreCond->getOperand(0) != Var)
+ std::swap(Opnd0, Opnd1);
+
+ ICmpInst *NewPreCond =
+ cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+ PreCond->replaceAllUsesWith(NewPreCond);
+
+ deleteDeadInstruction(PreCond, *SE, TLI);
+ }
+
+ // Step 3: Note that the population count is exactly the trip count of the
+  // loop in question, which enables us to convert the loop from a noncountable
+  // loop into a countable one. The benefit is twofold:
+  //
+  //  - If the loop only counts population, the entire loop becomes dead after
+  //    the transformation. It is much easier to prove a countable loop dead
+  //    than to prove a noncountable one. (In some C dialects, an infinite loop
+  //    isn't dead even if it computes nothing useful. In general, DCE needs
+  //    to prove a noncountable loop finite before it can safely delete it.)
+ //
+ // - If the loop also performs something else, it remains alive.
+ // Since it is transformed to countable form, it can be aggressively
+ // optimized by some optimizations which are in general not applicable
+ // to a noncountable loop.
+ //
+  // After this step, this loop (conceptually) would look like the following:
+ // newcnt = __builtin_ctpop(x);
+ // t = newcnt;
+ // if (x)
+  //     do { cnt++; x &= x-1; t--; } while (t > 0);
+ BasicBlock *Body = *(CurLoop->block_begin());
+ {
+ BranchInst *LbBr = LIRUtil::getBranch(Body);
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = TripCnt->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin());
+
+ Builder.SetInsertPoint(LbCond);
+ Value *Opnd1 = cast<Value>(TcPhi);
+ Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1));
+ Instruction *TcDec =
+ cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true));
+
+ TcPhi->addIncoming(TripCnt, PreHead);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ?
+ CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0)));
+ }
+
+ // Step 4: All the references to the original population counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctpop().
+ {
+ SmallVector<Value *, 4> CntUses;
+ for (Value::use_iterator I = CntInst->use_begin(), E = CntInst->use_end();
+ I != E; I++) {
+ if (cast<Instruction>(*I)->getParent() != Body)
+ CntUses.push_back(*I);
+ }
+ for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) {
+ (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount);
+ }
+ }
+
+ // Step 5: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
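Putting the steps together, a rough source-level sketch of the rewrite
(illustrative C; the pass operates on LLVM IR, with __builtin_popcount standing
in for the llvm.ctpop call created by createPopcntIntrinsic below):

    /* Before: the noncountable idiom detected by detectIdiom() */
    cnt = init;
    if (x)
      do { cnt++; x &= x - 1; } while (x);
    use(cnt);

    /* After transform(), conceptually */
    pop    = __builtin_popcount(x);  /* Step 1: ctpop emitted in the preheader   */
    t      = pop;                    /* the rewritten loop's trip count          */
    newcnt = pop + init;             /* the Add is emitted only if init != 0     */
    if (pop)                         /* Step 2: precondition now tests the count */
      do { cnt++; x &= x - 1; t--; } while (t > 0);   /* Step 3: countable loop  */
    use(newcnt);                     /* Step 4: out-of-loop uses of cnt rewritten */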
+
+CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder,
+ Value *Val, DebugLoc DL) {
+ Value *Ops[] = { Val };
+ Type *Tys[] = { Val->getType() };
+
+ Module *M = (*(CurLoop->block_begin()))->getParent()->getParent();
+ Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+/// recognize - Detect the population count idiom in a noncountable loop. If
+/// it is detected, transform the relevant code into a call to the popcount
+/// intrinsic and return true; otherwise, return false.
+bool NclPopcountRecognize::recognize() {
+
+ if (!LIR.getTargetTransformInfo())
+ return false;
+
+ LIR.getScalarEvolution();
+
+ if (!preliminaryScreen())
return false;
- const SCEV *BECount = SE->getBackedgeTakenCount(L);
+
+ Instruction *CntInst;
+ PHINode *CntPhi;
+ Value *Val;
+ if (!detectIdiom(CntInst, CntPhi, Val))
+ return false;
+
+ transform(CntInst, CntPhi, Val);
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of LoopIdiomRecognize
+//
+//===----------------------------------------------------------------------===//
+
+bool LoopIdiomRecognize::runOnCountableLoop() {
+ const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
if (isa<SCEVCouldNotCompute>(BECount)) return false;
// If this loop executes exactly one time, then it should be peeled, not
@@ -199,24 +664,29 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
return false;
// We require target data for now.
- TD = getAnalysisIfAvailable<DataLayout>();
- if (TD == 0) return false;
+ if (!getDataLayout())
+ return false;
+
+ // set DT
+ (void)getDominatorTree();
- DT = &getAnalysis<DominatorTree>();
LoopInfo &LI = getAnalysis<LoopInfo>();
TLI = &getAnalysis<TargetLibraryInfo>();
+ // set TLI
+ (void)getTargetLibraryInfo();
+
SmallVector<BasicBlock*, 8> ExitBlocks;
CurLoop->getUniqueExitBlocks(ExitBlocks);
DEBUG(dbgs() << "loop-idiom Scanning: F["
- << L->getHeader()->getParent()->getName()
- << "] Loop %" << L->getHeader()->getName() << "\n");
+ << CurLoop->getHeader()->getParent()->getName()
+ << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
bool MadeChange = false;
// Scan all the blocks in the loop that are not in subloops.
- for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
- ++BI) {
+ for (Loop::block_iterator BI = CurLoop->block_begin(),
+ E = CurLoop->block_end(); BI != E; ++BI) {
// Ignore blocks in subloops.
if (LI.getLoopFor(*BI) != CurLoop)
continue;
@@ -226,6 +696,33 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
return MadeChange;
}
+bool LoopIdiomRecognize::runOnNoncountableLoop() {
+ NclPopcountRecognize Popcount(*this);
+ if (Popcount.recognize())
+ return true;
+
+ return false;
+}
+
+bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
+ CurLoop = L;
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it; just give up.
+ if (!L->getLoopPreheader())
+ return false;
+
+ // Disable loop idiom recognition if the function's name is a common idiom.
+ StringRef Name = L->getHeader()->getParent()->getName();
+ if (Name == "memset" || Name == "memcpy")
+ return false;
+
+ SE = &getAnalysis<ScalarEvolution>();
+ if (SE->hasLoopInvariantBackedgeTakenCount(L))
+ return runOnCountableLoop();
+ return runOnNoncountableLoop();
+}
+
/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 558f62e..a23860a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -12,17 +12,18 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "loop-instsimplify"
-#include "llvm/Instructions.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
-#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
STATISTIC(NumSimplified, "Number of redundant instructions simplified");
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
index abe07aa..e98ae95 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -13,20 +13,21 @@
#define DEBUG_TYPE "loop-rotate"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Function.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
#define MAX_HEADER_SIZE 16
@@ -51,6 +52,7 @@ namespace {
AU.addRequiredID(LCSSAID);
AU.addPreservedID(LCSSAID);
AU.addPreserved<ScalarEvolution>();
+ AU.addRequired<TargetTransformInfo>();
}
bool runOnLoop(Loop *L, LPPassManager &LPM);
@@ -59,11 +61,13 @@ namespace {
private:
LoopInfo *LI;
+ const TargetTransformInfo *TTI;
};
}
char LoopRotate::ID = 0;
INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -75,6 +79,7 @@ Pass *llvm::createLoopRotatePass() { return new LoopRotate(); }
/// the loop is rotated at least once.
bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) {
LI = &getAnalysis<LoopInfo>();
+ TTI = &getAnalysis<TargetTransformInfo>();
// Simplify the loop latch before attempting to rotate the header
// upward. Rotation may not be needed if the loop tail can be folded into the
@@ -274,10 +279,16 @@ bool LoopRotate::rotateLoop(Loop *L) {
if (OrigLatch == 0 || L->isLoopExiting(OrigLatch))
return false;
- // Check size of original header and reject loop if it is very big.
+ // Check size of original header and reject loop if it is very big or we can't
+ // duplicate blocks inside it.
{
CodeMetrics Metrics;
- Metrics.analyzeBasicBlock(OrigHeader);
+ Metrics.analyzeBasicBlock(OrigHeader, *TTI);
+ if (Metrics.notDuplicatable) {
+ DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non duplicatable"
+ << " instructions: "; L->dump());
+ return false;
+ }
if (Metrics.NumInsts > MAX_HEADER_SIZE)
return false;
}
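In effect, rotation is now gated on two TTI-aware metrics at once; a minimal
sketch of the combined check (same names as in the hunk above):

    CodeMetrics Metrics;
    Metrics.analyzeBasicBlock(OrigHeader, *TTI);
    if (Metrics.notDuplicatable || Metrics.NumInsts > MAX_HEADER_SIZE)
      return false;   // header cannot, or should not, be duplicated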
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 958348d..73e44d7 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -37,8 +37,8 @@
//
// TODO: Handle multiple loops at a time.
//
-// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
-// instead of a GlobalValue?
+// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
+// of a GlobalValue?
//
// TODO: When truncation is free, truncate ICmp users' operands to make it a
// smaller encoding (on x86 at least).
@@ -54,27 +54,27 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "loop-reduce"
-#include "llvm/AddressingMode.h"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Assembly/Writer.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
@@ -224,16 +224,24 @@ namespace {
/// computing satisfying a use. It may include broken-out immediates and scaled
/// registers.
struct Formula {
- /// AM - This is used to represent complex addressing, as well as other kinds
- /// of interesting uses.
- AddrMode AM;
+ /// Global base address used for complex addressing.
+ GlobalValue *BaseGV;
+
+ /// Base offset for complex addressing.
+ int64_t BaseOffset;
+
+ /// Whether any complex addressing has a base register.
+ bool HasBaseReg;
+
+ /// The scale of any complex addressing.
+ int64_t Scale;
/// BaseRegs - The list of "base" registers for this use. When this is
- /// non-empty, AM.HasBaseReg should be set to true.
- SmallVector<const SCEV *, 2> BaseRegs;
+ /// non-empty, HasBaseReg should be set to true.
+ SmallVector<const SCEV *, 4> BaseRegs;
/// ScaledReg - The 'scaled' register for this use. This should be non-null
- /// when AM.Scale is not zero.
+ /// when Scale is not zero.
const SCEV *ScaledReg;
/// UnfoldedOffset - An additional constant offset which added near the
@@ -241,7 +249,9 @@ struct Formula {
/// live in an add immediate field rather than a register.
int64_t UnfoldedOffset;
- Formula() : ScaledReg(0), UnfoldedOffset(0) {}
+ Formula()
+ : BaseGV(0), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(0),
+ UnfoldedOffset(0) {}
void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
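The flattened fields describe the same address shape the old AddrMode struct
did; conceptually a Formula stands for

    BaseGV + BaseOffset + Sum(BaseRegs) + Scale * ScaledReg

and legality questions about that shape are now routed through
TargetTransformInfo, roughly (a sketch over a Formula F and an access type
AccessTy):

    bool Legal = TTI.isLegalAddressingMode(AccessTy, F.BaseGV, F.BaseOffset,
                                           F.HasBaseReg, F.Scale);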
@@ -327,13 +337,13 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
const SCEV *Sum = SE.getAddExpr(Good);
if (!Sum->isZero())
BaseRegs.push_back(Sum);
- AM.HasBaseReg = true;
+ HasBaseReg = true;
}
if (!Bad.empty()) {
const SCEV *Sum = SE.getAddExpr(Bad);
if (!Sum->isZero())
BaseRegs.push_back(Sum);
- AM.HasBaseReg = true;
+ HasBaseReg = true;
}
}
@@ -349,7 +359,7 @@ unsigned Formula::getNumRegs() const {
Type *Formula::getType() const {
return !BaseRegs.empty() ? BaseRegs.front()->getType() :
ScaledReg ? ScaledReg->getType() :
- AM.BaseGV ? AM.BaseGV->getType() :
+ BaseGV ? BaseGV->getType() :
0;
}
@@ -382,29 +392,29 @@ bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
void Formula::print(raw_ostream &OS) const {
bool First = true;
- if (AM.BaseGV) {
+ if (BaseGV) {
if (!First) OS << " + "; else First = false;
- WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
+ WriteAsOperand(OS, BaseGV, /*PrintType=*/false);
}
- if (AM.BaseOffs != 0) {
+ if (BaseOffset != 0) {
if (!First) OS << " + "; else First = false;
- OS << AM.BaseOffs;
+ OS << BaseOffset;
}
for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
E = BaseRegs.end(); I != E; ++I) {
if (!First) OS << " + "; else First = false;
OS << "reg(" << **I << ')';
}
- if (AM.HasBaseReg && BaseRegs.empty()) {
+ if (HasBaseReg && BaseRegs.empty()) {
if (!First) OS << " + "; else First = false;
OS << "**error: HasBaseReg**";
- } else if (!AM.HasBaseReg && !BaseRegs.empty()) {
+ } else if (!HasBaseReg && !BaseRegs.empty()) {
if (!First) OS << " + "; else First = false;
OS << "**error: !HasBaseReg**";
}
- if (AM.Scale != 0) {
+ if (Scale != 0) {
if (!First) OS << " + "; else First = false;
- OS << AM.Scale << "*reg(";
+ OS << Scale << "*reg(";
if (ScaledReg)
OS << *ScaledReg;
else
@@ -885,7 +895,7 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
}
if (Regs.insert(Reg)) {
RateRegister(Reg, Regs, L, SE, DT);
- if (isLoser())
+ if (LoserRegs && isLoser())
LoserRegs->insert(Reg);
}
}
@@ -927,8 +937,8 @@ void Cost::RateFormula(const Formula &F,
// Tally up the non-zero immediates.
for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
E = Offsets.end(); I != E; ++I) {
- int64_t Offset = (uint64_t)*I + F.AM.BaseOffs;
- if (F.AM.BaseGV)
+ int64_t Offset = (uint64_t)*I + F.BaseOffset;
+ if (F.BaseGV)
ImmCost += 64; // Handle symbolic values conservatively.
// TODO: This should probably be the pointer size.
else if (Offset != 0)
@@ -1078,19 +1088,19 @@ namespace {
/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
struct UniquifierDenseMapInfo {
- static SmallVector<const SCEV *, 2> getEmptyKey() {
- SmallVector<const SCEV *, 2> V;
+ static SmallVector<const SCEV *, 4> getEmptyKey() {
+ SmallVector<const SCEV *, 4> V;
V.push_back(reinterpret_cast<const SCEV *>(-1));
return V;
}
- static SmallVector<const SCEV *, 2> getTombstoneKey() {
- SmallVector<const SCEV *, 2> V;
+ static SmallVector<const SCEV *, 4> getTombstoneKey() {
+ SmallVector<const SCEV *, 4> V;
V.push_back(reinterpret_cast<const SCEV *>(-2));
return V;
}
- static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) {
+ static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
unsigned Result = 0;
for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(),
E = V.end(); I != E; ++I)
@@ -1098,8 +1108,8 @@ struct UniquifierDenseMapInfo {
return Result;
}
- static bool isEqual(const SmallVector<const SCEV *, 2> &LHS,
- const SmallVector<const SCEV *, 2> &RHS) {
+ static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
+ const SmallVector<const SCEV *, 4> &RHS) {
return LHS == RHS;
}
};
@@ -1110,7 +1120,7 @@ struct UniquifierDenseMapInfo {
/// the user itself, and information about how the use may be satisfied.
/// TODO: Represent multiple users of the same expression in common?
class LSRUse {
- DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier;
+ DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
public:
/// KindType - An enum for a kind of use, indicating what types of
@@ -1169,7 +1179,7 @@ public:
/// HasFormula - Test whether this use as a formula which has the same
/// registers as the given formula.
bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
- SmallVector<const SCEV *, 2> Key = F.BaseRegs;
+ SmallVector<const SCEV *, 4> Key = F.BaseRegs;
if (F.ScaledReg) Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for uniquifying.
std::sort(Key.begin(), Key.end());
@@ -1179,7 +1189,7 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
/// InsertFormula - If the given formula has not yet been inserted, add it to
/// the list, and return true. Return false otherwise.
bool LSRUse::InsertFormula(const Formula &F) {
- SmallVector<const SCEV *, 2> Key = F.BaseRegs;
+ SmallVector<const SCEV *, 4> Key = F.BaseRegs;
if (F.ScaledReg) Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for uniquifying.
std::sort(Key.begin(), Key.end());
@@ -1270,46 +1280,42 @@ void LSRUse::dump() const {
/// isLegalUse - Test whether the use described by AM is "legal", meaning it can
/// be completely folded into the user instruction at isel time. This includes
/// address-mode folding and special icmp tricks.
-static bool isLegalUse(const AddrMode &AM,
- LSRUse::KindType Kind, Type *AccessTy,
- const TargetLowering *TLI) {
+static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind,
+ Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale) {
switch (Kind) {
case LSRUse::Address:
- // If we have low-level target information, ask the target if it can
- // completely fold this address.
- if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
+ return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);
// Otherwise, just guess that reg+reg addressing is legal.
- return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
+ //return ;
case LSRUse::ICmpZero:
// There's not even a target hook for querying whether it would be legal to
// fold a GV into an ICmp.
- if (AM.BaseGV)
+ if (BaseGV)
return false;
// ICmp only has two operands; don't allow more than two non-trivial parts.
- if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
+ if (Scale != 0 && HasBaseReg && BaseOffset != 0)
return false;
// ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
// putting the scaled register in the other operand of the icmp.
- if (AM.Scale != 0 && AM.Scale != -1)
+ if (Scale != 0 && Scale != -1)
return false;
// If we have low-level target information, ask the target if it can fold an
// integer immediate on an icmp.
- if (AM.BaseOffs != 0) {
- if (!TLI)
- return false;
+ if (BaseOffset != 0) {
// We have one of:
- // ICmpZero BaseReg + Offset => ICmp BaseReg, -Offset
- // ICmpZero -1*ScaleReg + Offset => ICmp ScaleReg, Offset
+ // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
+ // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
// Offs is the ICmp immediate.
- int64_t Offs = AM.BaseOffs;
- if (AM.Scale == 0)
- Offs = -(uint64_t)Offs; // The cast does the right thing with INT64_MIN.
- return TLI->isLegalICmpImmediate(Offs);
+ if (Scale == 0)
+ // The cast does the right thing with INT64_MIN.
+ BaseOffset = -(uint64_t)BaseOffset;
+ return TTI.isLegalICmpImmediate(BaseOffset);
}
// ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
@@ -1317,92 +1323,87 @@ static bool isLegalUse(const AddrMode &AM,
case LSRUse::Basic:
// Only handle single-register values.
- return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
+ return !BaseGV && Scale == 0 && BaseOffset == 0;
case LSRUse::Special:
// Special case Basic to handle -1 scales.
- return !AM.BaseGV && (AM.Scale == 0 || AM.Scale == -1) && AM.BaseOffs == 0;
+ return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
}
llvm_unreachable("Invalid LSRUse Kind!");
}
-static bool isLegalUse(AddrMode AM,
- int64_t MinOffset, int64_t MaxOffset,
- LSRUse::KindType Kind, Type *AccessTy,
- const TargetLowering *TLI) {
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg,
+ int64_t Scale) {
// Check for overflow.
- if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
+ if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
(MinOffset > 0))
return false;
- AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
- if (isLegalUse(AM, Kind, AccessTy, TLI)) {
- AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
- // Check for overflow.
- if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
- (MaxOffset > 0))
- return false;
- AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
- return isLegalUse(AM, Kind, AccessTy, TLI);
- }
- return false;
+ MinOffset = (uint64_t)BaseOffset + MinOffset;
+ if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
+ (MaxOffset > 0))
+ return false;
+ MaxOffset = (uint64_t)BaseOffset + MaxOffset;
+
+ return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg,
+ Scale) &&
+ isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale);
+}
+
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
+ const Formula &F) {
+ return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
+ F.BaseOffset, F.HasBaseReg, F.Scale);
}
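As a concrete (purely illustrative) instance of the range check above: with
BaseOffset = 100, MinOffset = -8 and MaxOffset = 16 neither addition overflows,
so the two endpoints actually queried are 92 and 116, i.e.

    isLegalUse(TTI, Kind, AccessTy, BaseGV, /*Offset=*/ 92,  HasBaseReg, Scale) &&
    isLegalUse(TTI, Kind, AccessTy, BaseGV, /*Offset=*/ 116, HasBaseReg, Scale)

and the use is rejected as soon as either endpoint fails to fold.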
-static bool isAlwaysFoldable(int64_t BaseOffs,
- GlobalValue *BaseGV,
- bool HasBaseReg,
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, Type *AccessTy,
- const TargetLowering *TLI) {
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg) {
// Fast-path: zero is always foldable.
- if (BaseOffs == 0 && !BaseGV) return true;
+ if (BaseOffset == 0 && !BaseGV) return true;
// Conservatively, create an address with an immediate and a
// base and a scale.
- AddrMode AM;
- AM.BaseOffs = BaseOffs;
- AM.BaseGV = BaseGV;
- AM.HasBaseReg = HasBaseReg;
- AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+ int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
// Canonicalize a scale of 1 to a base register if the formula doesn't
// already have a base register.
- if (!AM.HasBaseReg && AM.Scale == 1) {
- AM.Scale = 0;
- AM.HasBaseReg = true;
+ if (!HasBaseReg && Scale == 1) {
+ Scale = 0;
+ HasBaseReg = true;
}
- return isLegalUse(AM, Kind, AccessTy, TLI);
+ return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);
}
-static bool isAlwaysFoldable(const SCEV *S,
- int64_t MinOffset, int64_t MaxOffset,
- bool HasBaseReg,
- LSRUse::KindType Kind, Type *AccessTy,
- const TargetLowering *TLI,
- ScalarEvolution &SE) {
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+ ScalarEvolution &SE, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ Type *AccessTy, const SCEV *S, bool HasBaseReg) {
// Fast-path: zero is always foldable.
if (S->isZero()) return true;
// Conservatively, create an address with an immediate and a
// base and a scale.
- int64_t BaseOffs = ExtractImmediate(S, SE);
+ int64_t BaseOffset = ExtractImmediate(S, SE);
GlobalValue *BaseGV = ExtractSymbol(S, SE);
// If there's anything else involved, it's not foldable.
if (!S->isZero()) return false;
// Fast-path: zero is always foldable.
- if (BaseOffs == 0 && !BaseGV) return true;
+ if (BaseOffset == 0 && !BaseGV) return true;
// Conservatively, create an address with an immediate and a
// base and a scale.
- AddrMode AM;
- AM.BaseOffs = BaseOffs;
- AM.BaseGV = BaseGV;
- AM.HasBaseReg = HasBaseReg;
- AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+ int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
- return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
+ return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+ BaseOffset, HasBaseReg, Scale);
}
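A hedged worked case for the SCEV overload above: if S is (@G + 12), then
ExtractImmediate peels BaseOffset = 12, ExtractSymbol peels BaseGV = @G, the
residue is zero, and the question reduces to

    isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, /*BaseGV=*/ G,
               /*BaseOffset=*/ 12, HasBaseReg,
               /*Scale=*/ Kind == LSRUse::ICmpZero ? -1 : 1)

where G names the GlobalValue for @G; any leftover addrec or unknown term
leaves S non-zero after extraction, and the expression is then reported as not
always-foldable.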
namespace {
@@ -1502,7 +1503,7 @@ class LSRInstance {
ScalarEvolution &SE;
DominatorTree &DT;
LoopInfo &LI;
- const TargetLowering *const TLI;
+ const TargetTransformInfo &TTI;
Loop *const L;
bool Changed;
@@ -1638,7 +1639,7 @@ class LSRInstance {
Pass *P);
public:
- LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
+ LSRInstance(Loop *L, Pass *P);
bool getChanged() const { return Changed; }
@@ -1688,12 +1689,9 @@ void LSRInstance::OptimizeShadowIV() {
}
if (!DestTy) continue;
- if (TLI) {
- // If target does not support DestTy natively then do not apply
- // this transformation.
- EVT DVT = TLI->getValueType(DestTy);
- if (!TLI->isTypeLegal(DVT)) continue;
- }
+ // If target does not support DestTy natively then do not apply
+ // this transformation.
+ if (!TTI.isTypeLegal(DestTy)) continue;
PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
if (!PH) continue;
@@ -1897,15 +1895,13 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
if (ICmpInst::isTrueWhenEqual(Pred)) {
// Look for n+1, and grab n.
if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
- if (isa<ConstantInt>(BO->getOperand(1)) &&
- cast<ConstantInt>(BO->getOperand(1))->isOne() &&
- SE.getSCEV(BO->getOperand(0)) == MaxRHS)
- NewRHS = BO->getOperand(0);
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
- if (isa<ConstantInt>(BO->getOperand(1)) &&
- cast<ConstantInt>(BO->getOperand(1))->isOne() &&
- SE.getSCEV(BO->getOperand(0)) == MaxRHS)
- NewRHS = BO->getOperand(0);
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
if (!NewRHS)
return Cond;
} else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
@@ -2015,18 +2011,17 @@ LSRInstance::OptimizeLoopTermCond() {
if (C->getValue().getMinSignedBits() >= 64 ||
C->getValue().isMinSignedValue())
goto decline_post_inc;
- // Without TLI, assume that any stride might be valid, and so any
- // use might be shared.
- if (!TLI)
- goto decline_post_inc;
// Check for possible scaled-address reuse.
Type *AccessTy = getAccessType(UI->getUser());
- AddrMode AM;
- AM.Scale = C->getSExtValue();
- if (TLI->isLegalAddressingMode(AM, AccessTy))
+ int64_t Scale = C->getSExtValue();
+ if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0,
+ /*BaseOffset=*/ 0,
+ /*HasBaseReg=*/ false, Scale))
goto decline_post_inc;
- AM.Scale = -AM.Scale;
- if (TLI->isLegalAddressingMode(AM, AccessTy))
+ Scale = -Scale;
+ if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0,
+ /*BaseOffset=*/ 0,
+ /*HasBaseReg=*/ false, Scale))
goto decline_post_inc;
}
}
@@ -2096,13 +2091,13 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
return false;
// Conservatively assume HasBaseReg is true for now.
if (NewOffset < LU.MinOffset) {
- if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg,
- Kind, AccessTy, TLI))
+ if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0,
+ LU.MaxOffset - NewOffset, HasBaseReg))
return false;
NewMinOffset = NewOffset;
} else if (NewOffset > LU.MaxOffset) {
- if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg,
- Kind, AccessTy, TLI))
+ if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0,
+ NewOffset - LU.MinOffset, HasBaseReg))
return false;
NewMaxOffset = NewOffset;
}
@@ -2131,7 +2126,8 @@ LSRInstance::getUse(const SCEV *&Expr,
int64_t Offset = ExtractImmediate(Expr, SE);
// Basic uses can't accept any offset, for example.
- if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
+ if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0,
+ Offset, /*HasBaseReg=*/ true)) {
Expr = Copy;
Offset = 0;
}
@@ -2199,10 +2195,10 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
// as OrigF.
if (F.BaseRegs == OrigF.BaseRegs &&
F.ScaledReg == OrigF.ScaledReg &&
- F.AM.BaseGV == OrigF.AM.BaseGV &&
- F.AM.Scale == OrigF.AM.Scale &&
+ F.BaseGV == OrigF.BaseGV &&
+ F.Scale == OrigF.Scale &&
F.UnfoldedOffset == OrigF.UnfoldedOffset) {
- if (F.AM.BaseOffs == 0)
+ if (F.BaseOffset == 0)
return &LU;
// This is the formula where all the registers and symbols matched;
// there aren't going to be any others. Since we declined it, we
@@ -2396,7 +2392,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
/// TODO: Consider IVInc free if it's already used in another chains.
static bool
isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
- ScalarEvolution &SE, const TargetLowering *TLI) {
+ ScalarEvolution &SE, const TargetTransformInfo &TTI) {
if (StressIVChain)
return true;
@@ -2539,6 +2535,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
// Add this IV user to the end of the chain.
IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
}
+ IVChain &Chain = IVChainVec[ChainIdx];
SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
// This chain's NearUsers become FarUsers.
@@ -2556,8 +2553,19 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
for (Value::use_iterator UseIter = IVOper->use_begin(),
UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) {
Instruction *OtherUse = dyn_cast<Instruction>(*UseIter);
- if (!OtherUse || OtherUse == UserInst)
+ if (!OtherUse)
continue;
+ // Uses in the chain will no longer be uses if the chain is formed.
+ // Include the head of the chain in this iteration (not Chain.begin()).
+ IVChain::const_iterator IncIter = Chain.Incs.begin();
+ IVChain::const_iterator IncEnd = Chain.Incs.end();
+ for (; IncIter != IncEnd; ++IncIter) {
+ if (IncIter->UserInst == OtherUse)
+ break;
+ }
+ if (IncIter != IncEnd)
+ continue;
+
if (SE.isSCEVable(OtherUse->getType())
&& !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
&& IU.isIVUserOrOperand(OtherUse)) {
@@ -2654,7 +2662,7 @@ void LSRInstance::CollectChains() {
for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
UsersIdx < NChains; ++UsersIdx) {
if (!isProfitableChain(IVChainVec[UsersIdx],
- ChainUsersVec[UsersIdx].FarUsers, SE, TLI))
+ ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
continue;
// Preserve the chain at UsesIdx.
if (ChainIdx != UsersIdx)
@@ -2681,7 +2689,7 @@ void LSRInstance::FinalizeChain(IVChain &Chain) {
/// Return true if the IVInc can be folded into an addressing mode.
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
- Value *Operand, const TargetLowering *TLI) {
+ Value *Operand, const TargetTransformInfo &TTI) {
const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
if (!IncConst || !isAddressUse(UserInst, Operand))
return false;
@@ -2690,8 +2698,9 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
return false;
int64_t IncOffset = IncConst->getValue()->getSExtValue();
- if (!isAlwaysFoldable(IncOffset, /*BaseGV=*/0, /*HaseBaseReg=*/false,
- LSRUse::Address, getAccessType(UserInst), TLI))
+ if (!isAlwaysFoldable(TTI, LSRUse::Address,
+ getAccessType(UserInst), /*BaseGV=*/ 0,
+ IncOffset, /*HasBaseReg=*/ false))
return false;
return true;
@@ -2705,6 +2714,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
// by LSR.
const IVInc &Head = Chain.Incs[0];
User::op_iterator IVOpEnd = Head.UserInst->op_end();
+ // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
IVOpEnd, L, SE);
Value *IVSrc = 0;
@@ -2762,7 +2772,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
// If an IV increment can't be folded, use it as the next IV value.
if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand,
- TLI)) {
+ TTI)) {
assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
IVSrc = IVOper;
LeftOverExpr = 0;
@@ -2904,7 +2914,7 @@ LSRInstance::InsertSupplementalFormula(const SCEV *S,
LSRUse &LU, size_t LUIdx) {
Formula F;
F.BaseRegs.push_back(S);
- F.AM.HasBaseReg = true;
+ F.HasBaseReg = true;
bool Inserted = InsertFormula(LU, LUIdx, F);
assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
}
@@ -3106,9 +3116,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
// Don't pull a constant into a register if the constant could be folded
// into an immediate field.
- if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
- Base.getNumRegs() > 1,
- LU.Kind, LU.AccessTy, TLI, SE))
+ if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, *J, Base.getNumRegs() > 1))
continue;
// Collect all operands except *J.
@@ -3120,9 +3129,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
// Don't leave just a constant behind in a register if the constant could
// be folded into an immediate field.
if (InnerAddOps.size() == 1 &&
- isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset,
- Base.getNumRegs() > 1,
- LU.Kind, LU.AccessTy, TLI, SE))
+ isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
continue;
const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
@@ -3132,10 +3140,10 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
// Add the remaining pieces of the add back into the new formula.
const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
- if (TLI && InnerSumSC &&
+ if (InnerSumSC &&
SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
- TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
- InnerSumSC->getValue()->getZExtValue())) {
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ InnerSumSC->getValue()->getZExtValue())) {
F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
InnerSumSC->getValue()->getZExtValue();
F.BaseRegs.erase(F.BaseRegs.begin() + i);
@@ -3144,9 +3152,9 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
// Add J as its own register, or an unfolded immediate.
const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
- if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
- TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
- SC->getValue()->getZExtValue()))
+ if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ SC->getValue()->getZExtValue()))
F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
SC->getValue()->getZExtValue();
else
@@ -3195,7 +3203,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
Formula Base) {
// We can't add a symbolic offset if the address already contains one.
- if (Base.AM.BaseGV) return;
+ if (Base.BaseGV) return;
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
const SCEV *G = Base.BaseRegs[i];
@@ -3203,9 +3211,8 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
if (G->isZero() || !GV)
continue;
Formula F = Base;
- F.AM.BaseGV = GV;
- if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
- LU.Kind, LU.AccessTy, TLI))
+ F.BaseGV = GV;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
continue;
F.BaseRegs[i] = G;
(void)InsertFormula(LU, LUIdx, F);
@@ -3228,9 +3235,9 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
E = Worklist.end(); I != E; ++I) {
Formula F = Base;
- F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
- if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
- LU.Kind, LU.AccessTy, TLI)) {
+ F.BaseOffset = (uint64_t)Base.BaseOffset - *I;
+ if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind,
+ LU.AccessTy, F)) {
// Add the offset to the base register.
const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
// If it cancelled out, drop the base register, otherwise update it.
@@ -3248,9 +3255,8 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
if (G->isZero() || Imm == 0)
continue;
Formula F = Base;
- F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
- if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
- LU.Kind, LU.AccessTy, TLI))
+ F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
continue;
F.BaseRegs[i] = G;
(void)InsertFormula(LU, LUIdx, F);
@@ -3271,7 +3277,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
// Don't do this if there is more than one offset.
if (LU.MinOffset != LU.MaxOffset) return;
- assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
+ assert(!Base.BaseGV && "ICmpZero use is not legal!");
// Check each interesting stride.
for (SmallSetVector<int64_t, 8>::const_iterator
@@ -3279,10 +3285,10 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
int64_t Factor = *I;
// Check that the multiplication doesn't overflow.
- if (Base.AM.BaseOffs == INT64_MIN && Factor == -1)
+ if (Base.BaseOffset == INT64_MIN && Factor == -1)
continue;
- int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
- if (NewBaseOffs / Factor != Base.AM.BaseOffs)
+ int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
+ if (NewBaseOffset / Factor != Base.BaseOffset)
continue;
// Check that multiplying with the use offset doesn't overflow.
@@ -3294,14 +3300,14 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
continue;
Formula F = Base;
- F.AM.BaseOffs = NewBaseOffs;
+ F.BaseOffset = NewBaseOffset;
// Check that this scale is legal.
- if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
+ if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
continue;
// Compensate for the use having MinOffset built into it.
- F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset;
+ F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
const SCEV *FactorS = SE.getConstant(IntTy, Factor);
@@ -3342,23 +3348,23 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (!IntTy) return;
// If this Formula already has a scaled register, we can't add another one.
- if (Base.AM.Scale != 0) return;
+ if (Base.Scale != 0) return;
// Check each interesting stride.
for (SmallSetVector<int64_t, 8>::const_iterator
I = Factors.begin(), E = Factors.end(); I != E; ++I) {
int64_t Factor = *I;
- Base.AM.Scale = Factor;
- Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
+ Base.Scale = Factor;
+ Base.HasBaseReg = Base.BaseRegs.size() > 1;
// Check whether this scale is going to be legal.
- if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
- LU.Kind, LU.AccessTy, TLI)) {
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ Base)) {
// As a special-case, handle special out-of-loop Basic users specially.
// TODO: Reconsider this special case.
if (LU.Kind == LSRUse::Basic &&
- isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
- LSRUse::Special, LU.AccessTy, TLI) &&
+ isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
+ LU.AccessTy, Base) &&
LU.AllFixupsOutsideLoop)
LU.Kind = LSRUse::Special;
else
@@ -3367,7 +3373,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
// For an ICmpZero, negating a solitary base register won't lead to
// new solutions.
if (LU.Kind == LSRUse::ICmpZero &&
- !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV)
+ !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
continue;
// For each addrec base reg, apply the scale, if possible.
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
@@ -3391,11 +3397,8 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
/// GenerateTruncates - Generate reuse formulae from different IV types.
void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
- // This requires TargetLowering to tell us which truncates are free.
- if (!TLI) return;
-
// Don't bother truncating symbolic values.
- if (Base.AM.BaseGV) return;
+ if (Base.BaseGV) return;
// Determine the integer type for the base formula.
Type *DstTy = Base.getType();
@@ -3405,7 +3408,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
for (SmallSetVector<Type *, 4>::const_iterator
I = Types.begin(), E = Types.end(); I != E; ++I) {
Type *SrcTy = *I;
- if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
+ if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
Formula F = Base;
if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
@@ -3552,16 +3555,15 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
const Formula &F = LU.Formulae[L];
// Use the immediate in the scaled register.
if (F.ScaledReg == OrigReg) {
- int64_t Offs = (uint64_t)F.AM.BaseOffs +
- Imm * (uint64_t)F.AM.Scale;
+ int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
// Don't create 50 + reg(-50).
if (F.referencesReg(SE.getSCEV(
- ConstantInt::get(IntTy, -(uint64_t)Offs))))
+ ConstantInt::get(IntTy, -(uint64_t)Offset))))
continue;
Formula NewF = F;
- NewF.AM.BaseOffs = Offs;
- if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
- LU.Kind, LU.AccessTy, TLI))
+ NewF.BaseOffset = Offset;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ NewF))
continue;
NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
@@ -3570,9 +3572,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// immediate itself, then the formula isn't worthwhile.
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
if (C->getValue()->isNegative() !=
- (NewF.AM.BaseOffs < 0) &&
- (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale))
- .ule(abs64(NewF.AM.BaseOffs)))
+ (NewF.BaseOffset < 0) &&
+ (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale))
+ .ule(abs64(NewF.BaseOffset)))
continue;
// OK, looks good.
@@ -3584,11 +3586,10 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
if (BaseReg != OrigReg)
continue;
Formula NewF = F;
- NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
- if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
- LU.Kind, LU.AccessTy, TLI)) {
- if (!TLI ||
- !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+ NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+ LU.Kind, LU.AccessTy, NewF)) {
+ if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
continue;
NewF = F;
NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
@@ -3602,11 +3603,11 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
J != JE; ++J)
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
- if ((C->getValue()->getValue() + NewF.AM.BaseOffs).abs().slt(
- abs64(NewF.AM.BaseOffs)) &&
+ if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt(
+ abs64(NewF.BaseOffset)) &&
(C->getValue()->getValue() +
- NewF.AM.BaseOffs).countTrailingZeros() >=
- CountTrailingZeros_64(NewF.AM.BaseOffs))
+ NewF.BaseOffset).countTrailingZeros() >=
+ CountTrailingZeros_64(NewF.BaseOffset))
goto skip_formula;
// Ok, looks good.
@@ -3667,7 +3668,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
// Collect the best formula for each unique set of shared registers. This
// is reset for each use.
- typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo>
+ typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>
BestFormulaeTy;
BestFormulaeTy BestFormulae;
@@ -3702,7 +3703,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
dbgs() << "\n");
}
else {
- SmallVector<const SCEV *, 2> Key;
+ SmallVector<const SCEV *, 4> Key;
for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
JE = F.BaseRegs.end(); J != JE; ++J) {
const SCEV *Reg = *J;
@@ -3804,7 +3805,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
Formula NewF = F;
- NewF.AM.BaseOffs += C->getValue()->getSExtValue();
+ NewF.BaseOffset += C->getValue()->getSExtValue();
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
@@ -3817,9 +3818,9 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
}
} else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
- if (!F.AM.BaseGV) {
+ if (!F.BaseGV) {
Formula NewF = F;
- NewF.AM.BaseGV = GV;
+ NewF.BaseGV = GV;
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
@@ -3848,84 +3849,83 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
/// for expressions like A, A+1, A+2, etc., allocate a single register for
/// them.
void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
- if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
- DEBUG(dbgs() << "The search space is too complex.\n");
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
- DEBUG(dbgs() << "Narrowing the search space by assuming that uses "
- "separated by a constant offset will use the same "
- "registers.\n");
+ DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by assuming that uses separated "
+ "by a constant offset will use the same registers.\n");
- // This is especially useful for unrolled loops.
+ // This is especially useful for unrolled loops.
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
- E = LU.Formulae.end(); I != E; ++I) {
- const Formula &F = *I;
- if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) {
- if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) {
- if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs,
- /*HasBaseReg=*/false,
- LU.Kind, LU.AccessTy)) {
- DEBUG(dbgs() << " Deleting use "; LU.print(dbgs());
- dbgs() << '\n');
-
- LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
-
- // Update the relocs to reference the new use.
- for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
- E = Fixups.end(); I != E; ++I) {
- LSRFixup &Fixup = *I;
- if (Fixup.LUIdx == LUIdx) {
- Fixup.LUIdx = LUThatHas - &Uses.front();
- Fixup.Offset += F.AM.BaseOffs;
- // Add the new offset to LUThatHas' offset list.
- if (LUThatHas->Offsets.back() != Fixup.Offset) {
- LUThatHas->Offsets.push_back(Fixup.Offset);
- if (Fixup.Offset > LUThatHas->MaxOffset)
- LUThatHas->MaxOffset = Fixup.Offset;
- if (Fixup.Offset < LUThatHas->MinOffset)
- LUThatHas->MinOffset = Fixup.Offset;
- }
- DEBUG(dbgs() << "New fixup has offset "
- << Fixup.Offset << '\n');
- }
- if (Fixup.LUIdx == NumUses-1)
- Fixup.LUIdx = LUIdx;
- }
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
+ E = LU.Formulae.end(); I != E; ++I) {
+ const Formula &F = *I;
+ if (F.BaseOffset == 0 || F.Scale != 0)
+ continue;
- // Delete formulae from the new use which are no longer legal.
- bool Any = false;
- for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
- Formula &F = LUThatHas->Formulae[i];
- if (!isLegalUse(F.AM,
- LUThatHas->MinOffset, LUThatHas->MaxOffset,
- LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
- DEBUG(dbgs() << " Deleting "; F.print(dbgs());
- dbgs() << '\n');
- LUThatHas->DeleteFormula(F);
- --i;
- --e;
- Any = true;
- }
- }
- if (Any)
- LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
+ LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
+ if (!LUThatHas)
+ continue;
- // Delete the old use.
- DeleteUse(LU, LUIdx);
- --LUIdx;
- --NumUses;
- break;
- }
+ if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
+ LU.Kind, LU.AccessTy))
+ continue;
+
+ DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
+
+ LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+
+ // Update the relocs to reference the new use.
+ for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
+ E = Fixups.end(); I != E; ++I) {
+ LSRFixup &Fixup = *I;
+ if (Fixup.LUIdx == LUIdx) {
+ Fixup.LUIdx = LUThatHas - &Uses.front();
+ Fixup.Offset += F.BaseOffset;
+ // Add the new offset to LUThatHas' offset list.
+ if (LUThatHas->Offsets.back() != Fixup.Offset) {
+ LUThatHas->Offsets.push_back(Fixup.Offset);
+ if (Fixup.Offset > LUThatHas->MaxOffset)
+ LUThatHas->MaxOffset = Fixup.Offset;
+ if (Fixup.Offset < LUThatHas->MinOffset)
+ LUThatHas->MinOffset = Fixup.Offset;
}
+ DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
}
+ if (Fixup.LUIdx == NumUses-1)
+ Fixup.LUIdx = LUIdx;
}
- }
- DEBUG(dbgs() << "After pre-selection:\n";
- print_uses(dbgs()));
+ // Delete formulae from the new use which are no longer legal.
+ bool Any = false;
+ for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
+ Formula &F = LUThatHas->Formulae[i];
+ if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
+ LUThatHas->Kind, LUThatHas->AccessTy, F)) {
+ DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
+ LUThatHas->DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ }
+ }
+
+ if (Any)
+ LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
+
+ // Delete the old use.
+ DeleteUse(LU, LUIdx);
+ --LUIdx;
+ --NumUses;
+ break;
+ }
}
+
+ DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
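To make the "A, A+1, A+2" comment above concrete (illustrative values only):
after unrolling, three uses may carry the formulae

    // use0: reg(A)        fixup offset 0
    // use1: reg(A) + 4    BaseOffset = 4, Scale = 0
    // use2: reg(A) + 8    BaseOffset = 8, Scale = 0

FindUseWithSimilarFormula matches use1 and use2 against use0,
reconcileNewOffset widens use0's [MinOffset, MaxOffset] to cover {0, 4, 8}, the
fixups are re-pointed at use0 with their Offset bumped by the folded
BaseOffset, and the redundant uses are deleted, leaving a single register for A.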
/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
@@ -4308,7 +4308,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
// Expand the ScaledReg portion.
Value *ICmpScaledV = 0;
- if (F.AM.Scale != 0) {
+ if (F.Scale != 0) {
const SCEV *ScaledS = F.ScaledReg;
// If we're expanding for a post-inc user, make the post-inc adjustment.
@@ -4321,7 +4321,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
// An interesting way of "folding" with an icmp is to use a negated
// scale, which we'll implement by inserting it into the other operand
// of the icmp.
- assert(F.AM.Scale == -1 &&
+ assert(F.Scale == -1 &&
"The only scale supported by ICmpZero uses is -1!");
ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
} else {
@@ -4336,20 +4336,20 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
}
ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
ScaledS = SE.getMulExpr(ScaledS,
- SE.getConstant(ScaledS->getType(), F.AM.Scale));
+ SE.getConstant(ScaledS->getType(), F.Scale));
Ops.push_back(ScaledS);
}
}
// Expand the GV portion.
- if (F.AM.BaseGV) {
+ if (F.BaseGV) {
// Flush the operand list to suppress SCEVExpander hoisting.
if (!Ops.empty()) {
Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
Ops.clear();
Ops.push_back(SE.getUnknown(FullV));
}
- Ops.push_back(SE.getUnknown(F.AM.BaseGV));
+ Ops.push_back(SE.getUnknown(F.BaseGV));
}
// Flush the operand list to suppress SCEVExpander hoisting of both folded and
@@ -4361,7 +4361,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
}
// Expand the immediate portion.
- int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
+ int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
if (Offset != 0) {
if (LU.Kind == LSRUse::ICmpZero) {
// The other interesting way of "folding" with an ICmpZero is to use a
@@ -4402,9 +4402,9 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
if (LU.Kind == LSRUse::ICmpZero) {
ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
DeadInsts.push_back(CI->getOperand(1));
- assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
+ assert(!F.BaseGV && "ICmp does not support folding a global value and "
"a scale at the same time!");
- if (F.AM.Scale == -1) {
+ if (F.Scale == -1) {
if (ICmpScaledV->getType() != OpTy) {
Instruction *Cast =
CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
@@ -4414,7 +4414,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
}
CI->setOperand(1, ICmpScaledV);
} else {
- assert(F.AM.Scale == 0 &&
+ assert(F.Scale == 0 &&
"ICmp does not support folding a global value and "
"a scale at the same time!");
Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
@@ -4589,13 +4589,11 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
}
-LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
- : IU(P->getAnalysis<IVUsers>()),
- SE(P->getAnalysis<ScalarEvolution>()),
- DT(P->getAnalysis<DominatorTree>()),
- LI(P->getAnalysis<LoopInfo>()),
- TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {
-
+LSRInstance::LSRInstance(Loop *L, Pass *P)
+ : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()),
+ DT(P->getAnalysis<DominatorTree>()), LI(P->getAnalysis<LoopInfo>()),
+ TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false),
+ IVIncInsertPos(0) {
// If LoopSimplify form is not available, stay out of trouble.
if (!L->isLoopSimplifyForm())
return;
@@ -4678,14 +4676,14 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
#ifndef NDEBUG
// Formulae should be legal.
- for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
- E = Uses.end(); I != E; ++I) {
- const LSRUse &LU = *I;
- for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
- JE = LU.Formulae.end(); J != JE; ++J)
- assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset,
- LU.Kind, LU.AccessTy, TLI) &&
- "Illegal formula generated!");
+ for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), E = Uses.end();
+ I != E; ++I) {
+ const LSRUse &LU = *I;
+ for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
+ JE = LU.Formulae.end();
+ J != JE; ++J)
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ *J) && "Illegal formula generated!");
};
#endif
@@ -4757,13 +4755,9 @@ void LSRInstance::dump() const {
namespace {
class LoopStrengthReduce : public LoopPass {
- /// TLI - Keep a pointer of a TargetLowering to consult for determining
- /// transformation profitability.
- const TargetLowering *const TLI;
-
public:
static char ID; // Pass ID, replacement for typeid
- explicit LoopStrengthReduce(const TargetLowering *tli = 0);
+ LoopStrengthReduce();
private:
bool runOnLoop(Loop *L, LPPassManager &LPM);
@@ -4775,6 +4769,7 @@ private:
char LoopStrengthReduce::ID = 0;
INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
"Loop Strength Reduction", false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(IVUsers)
@@ -4784,14 +4779,13 @@ INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
"Loop Strength Reduction", false, false)
-Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
- return new LoopStrengthReduce(TLI);
+Pass *llvm::createLoopStrengthReducePass() {
+ return new LoopStrengthReduce();
}
-LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
- : LoopPass(ID), TLI(tli) {
- initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
- }
+LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
+ initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
+}
void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
// We split critical edges, so we change the CFG. However, we do update
@@ -4810,24 +4804,27 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequiredID(LoopSimplifyID);
AU.addRequired<IVUsers>();
AU.addPreserved<IVUsers>();
+ AU.addRequired<TargetTransformInfo>();
}
bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
bool Changed = false;
// Run the main LSR transformation.
- Changed |= LSRInstance(TLI, L, this).getChanged();
+ Changed |= LSRInstance(L, this).getChanged();
// Remove any extra phis created by processing inner loops.
Changed |= DeleteDeadPHIs(L->getHeader());
- if (EnablePhiElim) {
+ if (EnablePhiElim && L->isLoopSimplifyForm()) {
SmallVector<WeakVH, 16> DeadInsts;
SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr");
#ifndef NDEBUG
Rewriter.setDebugType(DEBUG_TYPE);
#endif
- unsigned numFolded = Rewriter.
- replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, TLI);
+ unsigned numFolded =
+ Rewriter.replaceCongruentIVs(L, &getAnalysis<DominatorTree>(),
+ DeadInsts,
+ &getAnalysis<TargetTransformInfo>());
if (numFolded) {
Changed = true;
DeleteTriviallyDeadInstructions(DeadInsts);
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 0d781ac..80d060b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -13,16 +13,17 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "loop-unroll"
-#include "llvm/IntrinsicInst.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/DataLayout.h"
#include <climits>
using namespace llvm;
@@ -90,6 +91,7 @@ namespace {
AU.addPreservedID(LCSSAID);
AU.addRequired<ScalarEvolution>();
AU.addPreserved<ScalarEvolution>();
+ AU.addRequired<TargetTransformInfo>();
// FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
// If loop unroll does not preserve dom info then LCSSA pass on next
// loop will receive invalid dom info.
@@ -101,6 +103,7 @@ namespace {
char LoopUnroll::ID = 0;
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -113,12 +116,14 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) {
/// ApproximateLoopSize - Approximate the size of the loop.
static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
- const DataLayout *TD) {
+ bool &NotDuplicatable,
+ const TargetTransformInfo &TTI) {
CodeMetrics Metrics;
for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
I != E; ++I)
- Metrics.analyzeBasicBlock(*I, TD);
+ Metrics.analyzeBasicBlock(*I, TTI);
NumCalls = Metrics.NumInlineCandidates;
+ NotDuplicatable = Metrics.notDuplicatable;
unsigned LoopSize = Metrics.NumInsts;
@@ -133,6 +138,7 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
LoopInfo *LI = &getAnalysis<LoopInfo>();
ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
+ const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
BasicBlock *Header = L->getHeader();
DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName()
@@ -145,8 +151,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// not user specified.
unsigned Threshold = CurrentThreshold;
if (!UserThreshold &&
- Header->getParent()->getFnAttributes().
- hasAttribute(Attributes::OptimizeForSize))
+ Header->getParent()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize))
Threshold = OptSizeUnrollThreshold;
// Find trip count and trip multiple if count is not available
@@ -179,10 +186,16 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// Enforce the threshold.
if (Threshold != NoThreshold) {
- const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
unsigned NumInlineCandidates;
- unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, TD);
+ bool notDuplicatable;
+ unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates,
+ notDuplicatable, TTI);
DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+ if (notDuplicatable) {
+ DEBUG(dbgs() << " Not unrolling loop which contains non duplicatable"
+ << " instructions.\n");
+ return false;
+ }
if (NumInlineCandidates != 0) {
DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
return false;
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index 047b43e..0e8199f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -28,25 +28,26 @@
#define DEBUG_TYPE "loop-unswitch"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <map>
#include <set>
@@ -101,7 +102,7 @@ namespace {
// Analyze loop. Check its size, and determine whether it is possible to
// unswitch it. Returns true if we can unswitch this loop.
- bool countLoop(const Loop* L);
+ bool countLoop(const Loop* L, const TargetTransformInfo &TTI);
// Clean all data related to given loop.
void forgetLoop(const Loop* L);
@@ -170,6 +171,7 @@ namespace {
AU.addPreservedID(LCSSAID);
AU.addPreserved<DominatorTree>();
AU.addPreserved<ScalarEvolution>();
+ AU.addRequired<TargetTransformInfo>();
}
private:
@@ -221,7 +223,7 @@ namespace {
// Analyze loop. Check its size, and determine whether it is possible to
// unswitch it. Returns true if we can unswitch this loop.
-bool LUAnalysisCache::countLoop(const Loop* L) {
+bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
std::pair<LoopPropsMapIt, bool> InsertRes =
LoopsProperties.insert(std::make_pair(L, LoopProperties()));
@@ -243,11 +245,18 @@ bool LUAnalysisCache::countLoop(const Loop* L) {
for (Loop::block_iterator I = L->block_begin(),
E = L->block_end();
I != E; ++I)
- Metrics.analyzeBasicBlock(*I);
+ Metrics.analyzeBasicBlock(*I, TTI);
Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5);
Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
+
+ if (Metrics.notDuplicatable) {
+ DEBUG(dbgs() << "NOT unswitching loop %"
+ << L->getHeader()->getName() << ", contents cannot be "
+ << "duplicated!\n");
+ return false;
+ }
}
if (!Props.CanBeUnswitchedCount) {
@@ -327,6 +336,7 @@ void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop,
char LoopUnswitch::ID = 0;
INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -417,7 +427,7 @@ bool LoopUnswitch::processCurrentLoop() {
// We have probably reached the quota of branches for this loop. If so,
// stop unswitching.
- if (!BranchesInfo.countLoop(currentLoop))
+ if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>()))
return false;
// Loop over all of the basic blocks in the loop. If we find an interior
@@ -639,7 +649,8 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) {
// Do not do non-trivial unswitch while optimizing for size.
if (OptimizeForSize ||
- F->getFnAttributes().hasAttribute(Attributes::OptimizeForSize))
+ F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize))
return false;
UnswitchNontrivialCondition(LoopCond, Val, currentLoop);
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
index 7419a65..8ced494 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -14,9 +14,9 @@
#define DEBUG_TYPE "loweratomic"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 517657cf..be0f0e8 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -14,20 +14,20 @@
#define DEBUG_TYPE "memcpyopt"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include <list>
diff --git a/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp b/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp
deleted file mode 100644
index dfdf505..0000000
--- a/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp
+++ /dev/null
@@ -1,4232 +0,0 @@
-//===- ObjCARC.cpp - ObjC ARC Optimization --------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines ObjC ARC optimizations. ARC stands for
-// Automatic Reference Counting and is a system for managing reference counts
-// for objects in Objective C.
-//
-// The optimizations performed include elimination of redundant, partially
-// redundant, and inconsequential reference count operations, elimination of
-// redundant weak pointer operations, pattern-matching and replacement of
-// low-level operations into higher-level operations, and numerous minor
-// simplifications.
-//
-// This file also defines a simple ARC-aware AliasAnalysis.
-//
-// WARNING: This file knows about certain library functions. It recognizes them
-// by name, and hardwires knowledge of their semantics.
-//
-// WARNING: This file knows about how certain Objective-C library functions are
-// used. Naive LLVM IR transformations which would otherwise be
-// behavior-preserving may break these assumptions.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "objc-arc"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/ADT/DenseMap.h"
-using namespace llvm;
-
-// A handy option to enable/disable all optimizations in this file.
-static cl::opt<bool> EnableARCOpts("enable-objc-arc-opts", cl::init(true));
-
-//===----------------------------------------------------------------------===//
-// Misc. Utilities
-//===----------------------------------------------------------------------===//
-
-namespace {
- /// MapVector - An associative container with fast insertion-order
- /// (deterministic) iteration over its elements. Plus the special
- /// blot operation.
- template<class KeyT, class ValueT>
- class MapVector {
- /// Map - Map keys to indices in Vector.
- typedef DenseMap<KeyT, size_t> MapTy;
- MapTy Map;
-
- /// Vector - Keys and values.
- typedef std::vector<std::pair<KeyT, ValueT> > VectorTy;
- VectorTy Vector;
-
- public:
- typedef typename VectorTy::iterator iterator;
- typedef typename VectorTy::const_iterator const_iterator;
- iterator begin() { return Vector.begin(); }
- iterator end() { return Vector.end(); }
- const_iterator begin() const { return Vector.begin(); }
- const_iterator end() const { return Vector.end(); }
-
-#ifdef XDEBUG
- ~MapVector() {
- assert(Vector.size() >= Map.size()); // May differ due to blotting.
- for (typename MapTy::const_iterator I = Map.begin(), E = Map.end();
- I != E; ++I) {
- assert(I->second < Vector.size());
- assert(Vector[I->second].first == I->first);
- }
- for (typename VectorTy::const_iterator I = Vector.begin(),
- E = Vector.end(); I != E; ++I)
- assert(!I->first ||
- (Map.count(I->first) &&
- Map[I->first] == size_t(I - Vector.begin())));
- }
-#endif
-
- ValueT &operator[](const KeyT &Arg) {
- std::pair<typename MapTy::iterator, bool> Pair =
- Map.insert(std::make_pair(Arg, size_t(0)));
- if (Pair.second) {
- size_t Num = Vector.size();
- Pair.first->second = Num;
- Vector.push_back(std::make_pair(Arg, ValueT()));
- return Vector[Num].second;
- }
- return Vector[Pair.first->second].second;
- }
-
- std::pair<iterator, bool>
- insert(const std::pair<KeyT, ValueT> &InsertPair) {
- std::pair<typename MapTy::iterator, bool> Pair =
- Map.insert(std::make_pair(InsertPair.first, size_t(0)));
- if (Pair.second) {
- size_t Num = Vector.size();
- Pair.first->second = Num;
- Vector.push_back(InsertPair);
- return std::make_pair(Vector.begin() + Num, true);
- }
- return std::make_pair(Vector.begin() + Pair.first->second, false);
- }
-
- const_iterator find(const KeyT &Key) const {
- typename MapTy::const_iterator It = Map.find(Key);
- if (It == Map.end()) return Vector.end();
- return Vector.begin() + It->second;
- }
-
- /// blot - This is similar to erase, but instead of removing the element
- /// from the vector, it just zeros out the key in the vector. This leaves
- /// iterators intact, but clients must be prepared for zeroed-out keys when
- /// iterating.
- void blot(const KeyT &Key) {
- typename MapTy::iterator It = Map.find(Key);
- if (It == Map.end()) return;
- Vector[It->second].first = KeyT();
- Map.erase(It);
- }
-
- void clear() {
- Map.clear();
- Vector.clear();
- }
- };
-}
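The blot() operation documented in the deleted MapVector above is easy to misread, so here is a minimal, self-contained sketch of the same idea using only standard containers; the names (OrderedMap, Entries, Index) are hypothetical and this is an illustration, independent of the code in this diff.

#include <cassert>
#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Stand-in for the "blot" idea: keep insertion order in a vector and "erase"
// by clearing the key in place, so slot indices and iteration order stay valid.
struct OrderedMap {
  std::map<std::string, std::size_t> Index;           // key -> slot in Entries
  std::vector<std::pair<std::string, int> > Entries;  // insertion-ordered data

  void insert(const std::string &K, int V) {
    if (Index.count(K)) { Entries[Index[K]].second = V; return; }
    Index[K] = Entries.size();
    Entries.push_back(std::make_pair(K, V));
  }

  // blot: forget the key but keep the slot; clients must skip empty keys.
  void blot(const std::string &K) {
    std::map<std::string, std::size_t>::iterator It = Index.find(K);
    if (It == Index.end()) return;
    Entries[It->second].first.clear();   // zeroed-out key
    Index.erase(It);
  }
};

int main() {
  OrderedMap M;
  M.insert("a", 1);
  M.insert("b", 2);
  std::size_t SlotOfB = 1;                   // position taken before the blot
  M.blot("a");
  assert(M.Entries.size() == 2);             // nothing shifted
  assert(M.Entries[SlotOfB].first == "b");   // "b" is still where it was
  assert(M.Entries[0].first.empty());        // blotted slot remains, key cleared
  return 0;
}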
-
-//===----------------------------------------------------------------------===//
-// ARC Utilities.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Intrinsics.h"
-#include "llvm/Module.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/ADT/StringSwitch.h"
-
-namespace {
- /// InstructionClass - A simple classification for instructions.
- enum InstructionClass {
- IC_Retain, ///< objc_retain
- IC_RetainRV, ///< objc_retainAutoreleasedReturnValue
- IC_RetainBlock, ///< objc_retainBlock
- IC_Release, ///< objc_release
- IC_Autorelease, ///< objc_autorelease
- IC_AutoreleaseRV, ///< objc_autoreleaseReturnValue
- IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush
- IC_AutoreleasepoolPop, ///< objc_autoreleasePoolPop
- IC_NoopCast, ///< objc_retainedObject, etc.
- IC_FusedRetainAutorelease, ///< objc_retainAutorelease
- IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue
- IC_LoadWeakRetained, ///< objc_loadWeakRetained (primitive)
- IC_StoreWeak, ///< objc_storeWeak (primitive)
- IC_InitWeak, ///< objc_initWeak (derived)
- IC_LoadWeak, ///< objc_loadWeak (derived)
- IC_MoveWeak, ///< objc_moveWeak (derived)
- IC_CopyWeak, ///< objc_copyWeak (derived)
- IC_DestroyWeak, ///< objc_destroyWeak (derived)
- IC_StoreStrong, ///< objc_storeStrong (derived)
- IC_CallOrUser, ///< could call objc_release and/or "use" pointers
- IC_Call, ///< could call objc_release
- IC_User, ///< could "use" a pointer
- IC_None ///< anything else
- };
-}
-
-/// IsPotentialUse - Test whether the given value is possibly a
-/// reference-counted pointer.
-static bool IsPotentialUse(const Value *Op) {
- // Pointers to static or stack storage are not reference-counted pointers.
- if (isa<Constant>(Op) || isa<AllocaInst>(Op))
- return false;
- // Special arguments are not reference-counted.
- if (const Argument *Arg = dyn_cast<Argument>(Op))
- if (Arg->hasByValAttr() ||
- Arg->hasNestAttr() ||
- Arg->hasStructRetAttr())
- return false;
- // Only consider values with pointer types.
- // It seems intuitive to exclude function pointer types as well, since
- // functions are never reference-counted, however clang occasionally
- // bitcasts reference-counted pointers to function-pointer type
- // temporarily.
- PointerType *Ty = dyn_cast<PointerType>(Op->getType());
- if (!Ty)
- return false;
- // Conservatively assume anything else is a potential use.
- return true;
-}
-
-/// GetCallSiteClass - Helper for GetInstructionClass. Determines what kind
-/// of construct CS is.
-static InstructionClass GetCallSiteClass(ImmutableCallSite CS) {
- for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
- I != E; ++I)
- if (IsPotentialUse(*I))
- return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser;
-
- return CS.onlyReadsMemory() ? IC_None : IC_Call;
-}
-
-/// GetFunctionClass - Determine if F is one of the special known Functions.
-/// If it isn't, return IC_CallOrUser.
-static InstructionClass GetFunctionClass(const Function *F) {
- Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
-
- // No arguments.
- if (AI == AE)
- return StringSwitch<InstructionClass>(F->getName())
- .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush)
- .Default(IC_CallOrUser);
-
- // One argument.
- const Argument *A0 = AI++;
- if (AI == AE)
- // Argument is a pointer.
- if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) {
- Type *ETy = PTy->getElementType();
- // Argument is i8*.
- if (ETy->isIntegerTy(8))
- return StringSwitch<InstructionClass>(F->getName())
- .Case("objc_retain", IC_Retain)
- .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV)
- .Case("objc_retainBlock", IC_RetainBlock)
- .Case("objc_release", IC_Release)
- .Case("objc_autorelease", IC_Autorelease)
- .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV)
- .Case("objc_autoreleasePoolPop", IC_AutoreleasepoolPop)
- .Case("objc_retainedObject", IC_NoopCast)
- .Case("objc_unretainedObject", IC_NoopCast)
- .Case("objc_unretainedPointer", IC_NoopCast)
- .Case("objc_retain_autorelease", IC_FusedRetainAutorelease)
- .Case("objc_retainAutorelease", IC_FusedRetainAutorelease)
- .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV)
- .Default(IC_CallOrUser);
-
- // Argument is i8**
- if (PointerType *Pte = dyn_cast<PointerType>(ETy))
- if (Pte->getElementType()->isIntegerTy(8))
- return StringSwitch<InstructionClass>(F->getName())
- .Case("objc_loadWeakRetained", IC_LoadWeakRetained)
- .Case("objc_loadWeak", IC_LoadWeak)
- .Case("objc_destroyWeak", IC_DestroyWeak)
- .Default(IC_CallOrUser);
- }
-
- // Two arguments, first is i8**.
- const Argument *A1 = AI++;
- if (AI == AE)
- if (PointerType *PTy = dyn_cast<PointerType>(A0->getType()))
- if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType()))
- if (Pte->getElementType()->isIntegerTy(8))
- if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) {
- Type *ETy1 = PTy1->getElementType();
- // Second argument is i8*
- if (ETy1->isIntegerTy(8))
- return StringSwitch<InstructionClass>(F->getName())
- .Case("objc_storeWeak", IC_StoreWeak)
- .Case("objc_initWeak", IC_InitWeak)
- .Case("objc_storeStrong", IC_StoreStrong)
- .Default(IC_CallOrUser);
- // Second argument is i8**.
- if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1))
- if (Pte1->getElementType()->isIntegerTy(8))
- return StringSwitch<InstructionClass>(F->getName())
- .Case("objc_moveWeak", IC_MoveWeak)
- .Case("objc_copyWeak", IC_CopyWeak)
- .Default(IC_CallOrUser);
- }
-
- // Anything else.
- return IC_CallOrUser;
-}
-
-/// GetInstructionClass - Determine what kind of construct V is.
-static InstructionClass GetInstructionClass(const Value *V) {
- if (const Instruction *I = dyn_cast<Instruction>(V)) {
- // Any instruction other than bitcast and gep with a pointer operand has a
- // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer
- // to a subsequent use, rather than using it themselves, in this sense.
- // As a short cut, several other opcodes are known to have no pointer
- // operands of interest. And ret is never followed by a release, so it's
- // not interesting to examine.
- switch (I->getOpcode()) {
- case Instruction::Call: {
- const CallInst *CI = cast<CallInst>(I);
- // Check for calls to special functions.
- if (const Function *F = CI->getCalledFunction()) {
- InstructionClass Class = GetFunctionClass(F);
- if (Class != IC_CallOrUser)
- return Class;
-
- // None of the intrinsic functions do objc_release. For intrinsics, the
- // only question is whether or not they may be users.
- switch (F->getIntrinsicID()) {
- case Intrinsic::returnaddress: case Intrinsic::frameaddress:
- case Intrinsic::stacksave: case Intrinsic::stackrestore:
- case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend:
- case Intrinsic::objectsize: case Intrinsic::prefetch:
- case Intrinsic::stackprotector:
- case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64:
- case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa:
- case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext:
- case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline:
- case Intrinsic::lifetime_start: case Intrinsic::lifetime_end:
- case Intrinsic::invariant_start: case Intrinsic::invariant_end:
- // Don't let dbg info affect our results.
- case Intrinsic::dbg_declare: case Intrinsic::dbg_value:
- // Short cut: Some intrinsics obviously don't use ObjC pointers.
- return IC_None;
- default:
- break;
- }
- }
- return GetCallSiteClass(CI);
- }
- case Instruction::Invoke:
- return GetCallSiteClass(cast<InvokeInst>(I));
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- case Instruction::Select: case Instruction::PHI:
- case Instruction::Ret: case Instruction::Br:
- case Instruction::Switch: case Instruction::IndirectBr:
- case Instruction::Alloca: case Instruction::VAArg:
- case Instruction::Add: case Instruction::FAdd:
- case Instruction::Sub: case Instruction::FSub:
- case Instruction::Mul: case Instruction::FMul:
- case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv:
- case Instruction::SRem: case Instruction::URem: case Instruction::FRem:
- case Instruction::Shl: case Instruction::LShr: case Instruction::AShr:
- case Instruction::And: case Instruction::Or: case Instruction::Xor:
- case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc:
- case Instruction::IntToPtr: case Instruction::FCmp:
- case Instruction::FPTrunc: case Instruction::FPExt:
- case Instruction::FPToUI: case Instruction::FPToSI:
- case Instruction::UIToFP: case Instruction::SIToFP:
- case Instruction::InsertElement: case Instruction::ExtractElement:
- case Instruction::ShuffleVector:
- case Instruction::ExtractValue:
- break;
- case Instruction::ICmp:
- // Comparing a pointer with null, or any other constant, isn't an
- // interesting use, because we don't care what the pointer points to, or
- // about the values of any other dynamic reference-counted pointers.
- if (IsPotentialUse(I->getOperand(1)))
- return IC_User;
- break;
- default:
- // For anything else, check all the operands.
- // Note that this includes both operands of a Store: while the first
- // operand isn't actually being dereferenced, it is being stored to
- // memory where we can no longer track who might read it and dereference
- // it, so we have to consider it potentially used.
- for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end();
- OI != OE; ++OI)
- if (IsPotentialUse(*OI))
- return IC_User;
- }
- }
-
- // Otherwise, it's totally inert for ARC purposes.
- return IC_None;
-}
-
-/// GetBasicInstructionClass - Determine what kind of construct V is. This is
-/// similar to GetInstructionClass except that it only detects objc runtime
-/// calls. This allows it to be faster.
-static InstructionClass GetBasicInstructionClass(const Value *V) {
- if (const CallInst *CI = dyn_cast<CallInst>(V)) {
- if (const Function *F = CI->getCalledFunction())
- return GetFunctionClass(F);
- // Otherwise, be conservative.
- return IC_CallOrUser;
- }
-
- // Otherwise, be conservative.
- return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User;
-}
-
-/// IsRetain - Test if the given class is objc_retain or
-/// equivalent.
-static bool IsRetain(InstructionClass Class) {
- return Class == IC_Retain ||
- Class == IC_RetainRV;
-}
-
-/// IsAutorelease - Test if the given class is objc_autorelease or
-/// equivalent.
-static bool IsAutorelease(InstructionClass Class) {
- return Class == IC_Autorelease ||
- Class == IC_AutoreleaseRV;
-}
-
-/// IsForwarding - Test if the given class represents instructions which return
-/// their argument verbatim.
-static bool IsForwarding(InstructionClass Class) {
- // objc_retainBlock technically doesn't always return its argument
- // verbatim, but it doesn't matter for our purposes here.
- return Class == IC_Retain ||
- Class == IC_RetainRV ||
- Class == IC_Autorelease ||
- Class == IC_AutoreleaseRV ||
- Class == IC_RetainBlock ||
- Class == IC_NoopCast;
-}
-
-/// IsNoopOnNull - Test if the given class represents instructions which do
-/// nothing if passed a null pointer.
-static bool IsNoopOnNull(InstructionClass Class) {
- return Class == IC_Retain ||
- Class == IC_RetainRV ||
- Class == IC_Release ||
- Class == IC_Autorelease ||
- Class == IC_AutoreleaseRV ||
- Class == IC_RetainBlock;
-}
-
-/// IsAlwaysTail - Test if the given class represents instructions which are
-/// always safe to mark with the "tail" keyword.
-static bool IsAlwaysTail(InstructionClass Class) {
- // IC_RetainBlock may be given a stack argument.
- return Class == IC_Retain ||
- Class == IC_RetainRV ||
- Class == IC_Autorelease ||
- Class == IC_AutoreleaseRV;
-}
-
-/// IsNoThrow - Test if the given class represents instructions which are always
-/// safe to mark with the nounwind attribute.
-static bool IsNoThrow(InstructionClass Class) {
- // objc_retainBlock is not nounwind because it calls user copy constructors
- // which could theoretically throw.
- return Class == IC_Retain ||
- Class == IC_RetainRV ||
- Class == IC_Release ||
- Class == IC_Autorelease ||
- Class == IC_AutoreleaseRV ||
- Class == IC_AutoreleasepoolPush ||
- Class == IC_AutoreleasepoolPop;
-}
-
-/// EraseInstruction - Erase the given instruction. Many ObjC calls return their
-/// argument verbatim, so if it's such a call and the return value has users,
-/// replace them with the argument value.
-static void EraseInstruction(Instruction *CI) {
- Value *OldArg = cast<CallInst>(CI)->getArgOperand(0);
-
- bool Unused = CI->use_empty();
-
- if (!Unused) {
- // Replace the return value with the argument.
- assert(IsForwarding(GetBasicInstructionClass(CI)) &&
- "Can't delete non-forwarding instruction with users!");
- CI->replaceAllUsesWith(OldArg);
- }
-
- CI->eraseFromParent();
-
- if (Unused)
- RecursivelyDeleteTriviallyDeadInstructions(OldArg);
-}
-
-/// GetUnderlyingObjCPtr - This is a wrapper around getUnderlyingObject which
-/// also knows how to look through objc_retain and objc_autorelease calls, which
-/// we know to return their argument verbatim.
-static const Value *GetUnderlyingObjCPtr(const Value *V) {
- for (;;) {
- V = GetUnderlyingObject(V);
- if (!IsForwarding(GetBasicInstructionClass(V)))
- break;
- V = cast<CallInst>(V)->getArgOperand(0);
- }
-
- return V;
-}
-
-/// StripPointerCastsAndObjCCalls - This is a wrapper around
-/// Value::stripPointerCasts which also knows how to look through objc_retain
-/// and objc_autorelease calls, which we know to return their argument verbatim.
-static const Value *StripPointerCastsAndObjCCalls(const Value *V) {
- for (;;) {
- V = V->stripPointerCasts();
- if (!IsForwarding(GetBasicInstructionClass(V)))
- break;
- V = cast<CallInst>(V)->getArgOperand(0);
- }
- return V;
-}
-
-/// StripPointerCastsAndObjCCalls - This is a wrapper around
-/// Value::stripPointerCasts which also knows how to look through objc_retain
-/// and objc_autorelease calls, which we know to return their argument verbatim.
-static Value *StripPointerCastsAndObjCCalls(Value *V) {
- for (;;) {
- V = V->stripPointerCasts();
- if (!IsForwarding(GetBasicInstructionClass(V)))
- break;
- V = cast<CallInst>(V)->getArgOperand(0);
- }
- return V;
-}
-
-/// GetObjCArg - Assuming the given instruction is one of the special calls such
-/// as objc_retain or objc_release, return the argument value, stripped of no-op
-/// casts and forwarding calls.
-static Value *GetObjCArg(Value *Inst) {
- return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0));
-}
-
-/// IsObjCIdentifiedObject - This is similar to AliasAnalysis'
-/// isObjCIdentifiedObject, except that it uses special knowledge of
-/// ObjC conventions...
-static bool IsObjCIdentifiedObject(const Value *V) {
- // Assume that call results and arguments have their own "provenance".
- // Constants (including GlobalVariables) and Allocas are never
- // reference-counted.
- if (isa<CallInst>(V) || isa<InvokeInst>(V) ||
- isa<Argument>(V) || isa<Constant>(V) ||
- isa<AllocaInst>(V))
- return true;
-
- if (const LoadInst *LI = dyn_cast<LoadInst>(V)) {
- const Value *Pointer =
- StripPointerCastsAndObjCCalls(LI->getPointerOperand());
- if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) {
- // A constant pointer can't be pointing to an object on the heap. It may
- // be reference-counted, but it won't be deleted.
- if (GV->isConstant())
- return true;
- StringRef Name = GV->getName();
- // These special variables are known to hold values which are not
- // reference-counted pointers.
- if (Name.startswith("\01L_OBJC_SELECTOR_REFERENCES_") ||
- Name.startswith("\01L_OBJC_CLASSLIST_REFERENCES_") ||
- Name.startswith("\01L_OBJC_CLASSLIST_SUP_REFS_$_") ||
- Name.startswith("\01L_OBJC_METH_VAR_NAME_") ||
- Name.startswith("\01l_objc_msgSend_fixup_"))
- return true;
- }
- }
-
- return false;
-}
-
-/// FindSingleUseIdentifiedObject - This is similar to
-/// StripPointerCastsAndObjCCalls but it stops as soon as it finds a value
-/// with multiple uses.
-static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
- if (Arg->hasOneUse()) {
- if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg))
- return FindSingleUseIdentifiedObject(BC->getOperand(0));
- if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg))
- if (GEP->hasAllZeroIndices())
- return FindSingleUseIdentifiedObject(GEP->getPointerOperand());
- if (IsForwarding(GetBasicInstructionClass(Arg)))
- return FindSingleUseIdentifiedObject(
- cast<CallInst>(Arg)->getArgOperand(0));
- if (!IsObjCIdentifiedObject(Arg))
- return 0;
- return Arg;
- }
-
- // If we found an identifiable object that has multiple uses, but they are
- // all trivial uses, we can still consider this to be a single-use value.
- if (IsObjCIdentifiedObject(Arg)) {
- for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
- UI != UE; ++UI) {
- const User *U = *UI;
- if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg)
- return 0;
- }
-
- return Arg;
- }
-
- return 0;
-}
-
-/// ModuleHasARC - Test if the given module looks interesting to run ARC
-/// optimization on.
-static bool ModuleHasARC(const Module &M) {
- return
- M.getNamedValue("objc_retain") ||
- M.getNamedValue("objc_release") ||
- M.getNamedValue("objc_autorelease") ||
- M.getNamedValue("objc_retainAutoreleasedReturnValue") ||
- M.getNamedValue("objc_retainBlock") ||
- M.getNamedValue("objc_autoreleaseReturnValue") ||
- M.getNamedValue("objc_autoreleasePoolPush") ||
- M.getNamedValue("objc_loadWeakRetained") ||
- M.getNamedValue("objc_loadWeak") ||
- M.getNamedValue("objc_destroyWeak") ||
- M.getNamedValue("objc_storeWeak") ||
- M.getNamedValue("objc_initWeak") ||
- M.getNamedValue("objc_moveWeak") ||
- M.getNamedValue("objc_copyWeak") ||
- M.getNamedValue("objc_retainedObject") ||
- M.getNamedValue("objc_unretainedObject") ||
- M.getNamedValue("objc_unretainedPointer");
-}
-
-/// DoesObjCBlockEscape - Test whether the given pointer, which is an
-/// Objective-C block pointer, escapes (returning true if it does). This
-/// differs from regular escape analysis in that a use as an argument to a
-/// call is not considered an escape.
-static bool DoesObjCBlockEscape(const Value *BlockPtr) {
- // Walk the def-use chains.
- SmallVector<const Value *, 4> Worklist;
- Worklist.push_back(BlockPtr);
- do {
- const Value *V = Worklist.pop_back_val();
- for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
- UI != UE; ++UI) {
- const User *UUser = *UI;
- // Special - Use by a call (callee or argument) is not considered
- // to be an escape.
- switch (GetBasicInstructionClass(UUser)) {
- case IC_StoreWeak:
- case IC_InitWeak:
- case IC_StoreStrong:
- case IC_Autorelease:
- case IC_AutoreleaseRV:
- // These special functions make copies of their pointer arguments.
- return true;
- case IC_User:
- case IC_None:
- // Use by an instruction which copies the value is an escape if the
- // result is an escape.
- if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) ||
- isa<PHINode>(UUser) || isa<SelectInst>(UUser)) {
- Worklist.push_back(UUser);
- continue;
- }
- // Use by a load is not an escape.
- if (isa<LoadInst>(UUser))
- continue;
- // Use by a store is not an escape if the use is the address.
- if (const StoreInst *SI = dyn_cast<StoreInst>(UUser))
- if (V != SI->getValueOperand())
- continue;
- break;
- default:
- // Regular calls and other stuff are not considered escapes.
- continue;
- }
- // Otherwise, conservatively assume an escape.
- return true;
- }
- } while (!Worklist.empty());
-
- // No escapes found.
- return false;
-}
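The worklist walk in DoesObjCBlockEscape follows a common shape: forwarders get pushed, benign uses are skipped, and anything else is treated as an escape. The sketch below replays that shape over a toy use graph; the enum, the graph encoding, and the escapes() helper are hypothetical stand-ins, not LLVM's def-use API.

#include <cstddef>
#include <cstdio>
#include <set>
#include <vector>

// Toy model of a def-use walk: each node's users either forward the pointer
// to another node, use it benignly, or cause a (conservative) escape.
enum UserKind { Forwards, Load, StoreAddress, StoreValue, Other };

struct ToyUser { UserKind Kind; int Result; };        // Result: forwarded node id
typedef std::vector<std::vector<ToyUser> > UseGraph;  // node id -> its users

static bool escapes(const UseGraph &G, int Start) {
  std::vector<int> Worklist(1, Start);
  std::set<int> Visited;
  Visited.insert(Start);
  while (!Worklist.empty()) {
    int V = Worklist.back();
    Worklist.pop_back();
    for (std::size_t i = 0, e = G[V].size(); i != e; ++i) {
      const ToyUser &U = G[V][i];
      switch (U.Kind) {
      case Forwards:                  // bitcast/GEP/phi/select analogue
        if (Visited.insert(U.Result).second)
          Worklist.push_back(U.Result);
        continue;
      case Load:                      // reading through the pointer is fine
      case StoreAddress:              // being only the store address is fine
        continue;
      case StoreValue:                // the pointer value itself is stored away
      case Other:
        return true;                  // conservatively an escape
      }
    }
  }
  return false;
}

int main() {
  UseGraph G(3);
  ToyUser Fwd = {Forwards, 1};
  ToyUser Ld = {Load, -1};
  ToyUser St = {StoreValue, -1};
  G[0].push_back(Fwd);                          // node 0 forwarded to node 1
  G[1].push_back(Ld);                           // node 1 only loaded from
  std::printf("escapes: %d\n", escapes(G, 0));  // prints 0
  G[1].push_back(St);                           // now the value is stored away
  std::printf("escapes: %d\n", escapes(G, 0));  // prints 1
  return 0;
}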
-
-//===----------------------------------------------------------------------===//
-// ARC AliasAnalysis.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Pass.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/Passes.h"
-
-namespace {
- /// ObjCARCAliasAnalysis - This is a simple alias analysis
- /// implementation that uses knowledge of ARC constructs to answer queries.
- ///
- /// TODO: This class could be generalized to know about other ObjC-specific
- /// tricks, such as knowing that ivars in the non-fragile ABI are non-aliasing
- /// even though their offsets are dynamic.
- class ObjCARCAliasAnalysis : public ImmutablePass,
- public AliasAnalysis {
- public:
- static char ID; // Class identification, replacement for typeinfo
- ObjCARCAliasAnalysis() : ImmutablePass(ID) {
- initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry());
- }
-
- private:
- virtual void initializePass() {
- InitializeAliasAnalysis(this);
- }
-
- /// getAdjustedAnalysisPointer - This method is used when a pass implements
- /// an analysis interface through multiple inheritance. If needed, it
- /// should override this to adjust the this pointer as needed for the
- /// specified pass info.
- virtual void *getAdjustedAnalysisPointer(const void *PI) {
- if (PI == &AliasAnalysis::ID)
- return static_cast<AliasAnalysis *>(this);
- return this;
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const;
- virtual AliasResult alias(const Location &LocA, const Location &LocB);
- virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal);
- virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS);
- virtual ModRefBehavior getModRefBehavior(const Function *F);
- virtual ModRefResult getModRefInfo(ImmutableCallSite CS,
- const Location &Loc);
- virtual ModRefResult getModRefInfo(ImmutableCallSite CS1,
- ImmutableCallSite CS2);
- };
-} // End of anonymous namespace
-
-// Register this pass...
-char ObjCARCAliasAnalysis::ID = 0;
-INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa",
- "ObjC-ARC-Based Alias Analysis", false, true, false)
-
-ImmutablePass *llvm::createObjCARCAliasAnalysisPass() {
- return new ObjCARCAliasAnalysis();
-}
-
-void
-ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- AliasAnalysis::getAnalysisUsage(AU);
-}
-
-AliasAnalysis::AliasResult
-ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) {
- if (!EnableARCOpts)
- return AliasAnalysis::alias(LocA, LocB);
-
- // First, strip off no-ops, including ObjC-specific no-ops, and try making a
- // precise alias query.
- const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr);
- const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr);
- AliasResult Result =
- AliasAnalysis::alias(Location(SA, LocA.Size, LocA.TBAATag),
- Location(SB, LocB.Size, LocB.TBAATag));
- if (Result != MayAlias)
- return Result;
-
- // If that failed, climb to the underlying object, including climbing through
- // ObjC-specific no-ops, and try making an imprecise alias query.
- const Value *UA = GetUnderlyingObjCPtr(SA);
- const Value *UB = GetUnderlyingObjCPtr(SB);
- if (UA != SA || UB != SB) {
- Result = AliasAnalysis::alias(Location(UA), Location(UB));
- // We can't use MustAlias or PartialAlias results here because
- // GetUnderlyingObjCPtr may return an offsetted pointer value.
- if (Result == NoAlias)
- return NoAlias;
- }
-
- // If that failed, fail. We don't need to chain here, since that's covered
- // by the earlier precise query.
- return MayAlias;
-}
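The control flow of the alias() override above amounts to a two-tier query: try a precise query on lightly normalized operands, then fall back to a coarser query on the underlying objects and trust only a definite NoAlias from it. A standalone sketch of that pattern, with toy query functions and a toy enum standing in for the AliasAnalysis interface, might look like this.

#include <cstdio>

// ToyAlias, preciseQuery, underlying and coarseQuery are invented for this
// sketch; only the "precise first, coarse no-only fallback" shape matters.
enum ToyAlias { No, May, Must };

static ToyAlias preciseQuery(int A, int B) { return A == B ? Must : May; }
static int underlying(int V) { return V / 10; }   // pretend: strip offsets/casts
static ToyAlias coarseQuery(int A, int B) { return A == B ? Must : No; }

static ToyAlias alias(int A, int B) {
  ToyAlias R = preciseQuery(A, B);
  if (R != May)
    return R;                       // the precise tier was conclusive
  int UA = underlying(A), UB = underlying(B);
  if (UA != A || UB != B)
    if (coarseQuery(UA, UB) == No)
      return No;                    // only a definite "no" is usable here,
                                    // since underlying() may drop offsets
  return May;                       // otherwise stay conservative
}

int main() {
  std::printf("%d %d %d\n", alias(12, 12), alias(12, 13), alias(12, 25));
  // prints "2 1 0": Must, May (same underlying object), No (different objects)
  return 0;
}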
-
-bool
-ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc,
- bool OrLocal) {
- if (!EnableARCOpts)
- return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
-
- // First, strip off no-ops, including ObjC-specific no-ops, and try making
- // a precise alias query.
- const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr);
- if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.TBAATag),
- OrLocal))
- return true;
-
- // If that failed, climb to the underlying object, including climbing through
- // ObjC-specific no-ops, and try making an imprecise alias query.
- const Value *U = GetUnderlyingObjCPtr(S);
- if (U != S)
- return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal);
-
- // If that failed, fail. We don't need to chain here, since that's covered
- // by the earlier precise query.
- return false;
-}
-
-AliasAnalysis::ModRefBehavior
-ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
- // We have nothing to do. Just chain to the next AliasAnalysis.
- return AliasAnalysis::getModRefBehavior(CS);
-}
-
-AliasAnalysis::ModRefBehavior
-ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) {
- if (!EnableARCOpts)
- return AliasAnalysis::getModRefBehavior(F);
-
- switch (GetFunctionClass(F)) {
- case IC_NoopCast:
- return DoesNotAccessMemory;
- default:
- break;
- }
-
- return AliasAnalysis::getModRefBehavior(F);
-}
-
-AliasAnalysis::ModRefResult
-ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) {
- if (!EnableARCOpts)
- return AliasAnalysis::getModRefInfo(CS, Loc);
-
- switch (GetBasicInstructionClass(CS.getInstruction())) {
- case IC_Retain:
- case IC_RetainRV:
- case IC_Autorelease:
- case IC_AutoreleaseRV:
- case IC_NoopCast:
- case IC_AutoreleasepoolPush:
- case IC_FusedRetainAutorelease:
- case IC_FusedRetainAutoreleaseRV:
- // These functions don't access any memory visible to the compiler.
- // Note that this doesn't include objc_retainBlock, because it updates
- // pointers when it copies block data.
- return NoModRef;
- default:
- break;
- }
-
- return AliasAnalysis::getModRefInfo(CS, Loc);
-}
-
-AliasAnalysis::ModRefResult
-ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1,
- ImmutableCallSite CS2) {
- // TODO: Theoretically we could check for dependencies between objc_* calls
- // and OnlyAccessesArgumentPointees calls or other well-behaved calls.
- return AliasAnalysis::getModRefInfo(CS1, CS2);
-}
-
-//===----------------------------------------------------------------------===//
-// ARC expansion.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/InstIterator.h"
-#include "llvm/Transforms/Scalar.h"
-
-namespace {
- /// ObjCARCExpand - Early ARC transformations.
- class ObjCARCExpand : public FunctionPass {
- virtual void getAnalysisUsage(AnalysisUsage &AU) const;
- virtual bool doInitialization(Module &M);
- virtual bool runOnFunction(Function &F);
-
- /// Run - A flag indicating whether this optimization pass should run.
- bool Run;
-
- public:
- static char ID;
- ObjCARCExpand() : FunctionPass(ID) {
- initializeObjCARCExpandPass(*PassRegistry::getPassRegistry());
- }
- };
-}
-
-char ObjCARCExpand::ID = 0;
-INITIALIZE_PASS(ObjCARCExpand,
- "objc-arc-expand", "ObjC ARC expansion", false, false)
-
-Pass *llvm::createObjCARCExpandPass() {
- return new ObjCARCExpand();
-}
-
-void ObjCARCExpand::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
-}
-
-bool ObjCARCExpand::doInitialization(Module &M) {
- Run = ModuleHasARC(M);
- return false;
-}
-
-bool ObjCARCExpand::runOnFunction(Function &F) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!Run)
- return false;
-
- bool Changed = false;
-
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
- Instruction *Inst = &*I;
-
- switch (GetBasicInstructionClass(Inst)) {
- case IC_Retain:
- case IC_RetainRV:
- case IC_Autorelease:
- case IC_AutoreleaseRV:
- case IC_FusedRetainAutorelease:
- case IC_FusedRetainAutoreleaseRV:
- // These calls return their argument verbatim, as a low-level
- // optimization. However, this makes high-level optimizations
- // harder. Undo any uses of this optimization that the front-end
- // emitted here. We'll redo them in the contract pass.
- Changed = true;
- Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0));
- break;
- default:
- break;
- }
- }
-
- return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// ARC autorelease pool elimination.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Constants.h"
-#include "llvm/ADT/STLExtras.h"
-
-namespace {
- /// ObjCARCAPElim - Autorelease pool elimination.
- class ObjCARCAPElim : public ModulePass {
- virtual void getAnalysisUsage(AnalysisUsage &AU) const;
- virtual bool runOnModule(Module &M);
-
- static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0);
- static bool OptimizeBB(BasicBlock *BB);
-
- public:
- static char ID;
- ObjCARCAPElim() : ModulePass(ID) {
- initializeObjCARCAPElimPass(*PassRegistry::getPassRegistry());
- }
- };
-}
-
-char ObjCARCAPElim::ID = 0;
-INITIALIZE_PASS(ObjCARCAPElim,
- "objc-arc-apelim",
- "ObjC ARC autorelease pool elimination",
- false, false)
-
-Pass *llvm::createObjCARCAPElimPass() {
- return new ObjCARCAPElim();
-}
-
-void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
-}
-
-/// MayAutorelease - Interprocedurally determine if calls made by the
-/// given call site can possibly produce autoreleases.
-bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) {
- if (const Function *Callee = CS.getCalledFunction()) {
- if (Callee->isDeclaration() || Callee->mayBeOverridden())
- return true;
- for (Function::const_iterator I = Callee->begin(), E = Callee->end();
- I != E; ++I) {
- const BasicBlock *BB = I;
- for (BasicBlock::const_iterator J = BB->begin(), F = BB->end();
- J != F; ++J)
- if (ImmutableCallSite JCS = ImmutableCallSite(J))
- // This recursion depth limit is arbitrary. It's just deep
- // enough to cover known interesting testcases.
- if (Depth < 3 &&
- !JCS.onlyReadsMemory() &&
- MayAutorelease(JCS, Depth + 1))
- return true;
- }
- return false;
- }
-
- return true;
-}
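MayAutorelease above is a depth-limited interprocedural scan: unknown callees are assumed to autorelease, and known bodies are walked recursively up to a small depth. The sketch below mirrors that shape on a toy call graph; the ToyFunction type, the graph, and the decision to stay conservative at the cutoff are assumptions of the sketch, not of the pass.

#include <cstddef>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Toy call graph: a function "may autorelease" if it has no known body, if it
// autoreleases directly, or if any callee may, up to a small recursion depth.
struct ToyFunction {
  bool HasBody;
  bool DirectlyAutoreleases;
  std::vector<std::string> Callees;
};
typedef std::map<std::string, ToyFunction> CallGraph;

static bool mayAutorelease(const CallGraph &CG, const std::string &Name,
                           unsigned Depth = 0) {
  CallGraph::const_iterator It = CG.find(Name);
  if (It == CG.end() || !It->second.HasBody)
    return true;                           // unknown body: assume the worst
  if (It->second.DirectlyAutoreleases)
    return true;
  if (Depth >= 3)
    return true;                           // arbitrary cutoff; this sketch
                                           // stays conservative past it
  for (std::size_t i = 0, e = It->second.Callees.size(); i != e; ++i)
    if (mayAutorelease(CG, It->second.Callees[i], Depth + 1))
      return true;
  return false;
}

int main() {
  CallGraph CG;
  ToyFunction Leaf = {true, false, std::vector<std::string>()};
  ToyFunction Caller = {true, false, std::vector<std::string>(1, "leaf")};
  ToyFunction Wrapper = {true, false, std::vector<std::string>(1, "external")};
  CG["leaf"] = Leaf;
  CG["caller"] = Caller;
  CG["wrapper"] = Wrapper;                              // "external" has no entry
  std::printf("%d\n", mayAutorelease(CG, "caller"));    // prints 0
  std::printf("%d\n", mayAutorelease(CG, "wrapper"));   // prints 1
  return 0;
}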
-
-bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) {
- bool Changed = false;
-
- Instruction *Push = 0;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
- Instruction *Inst = I++;
- switch (GetBasicInstructionClass(Inst)) {
- case IC_AutoreleasepoolPush:
- Push = Inst;
- break;
- case IC_AutoreleasepoolPop:
- // If this pop matches a push and nothing in between can autorelease,
- // zap the pair.
- if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
- Changed = true;
- Inst->eraseFromParent();
- Push->eraseFromParent();
- }
- Push = 0;
- break;
- case IC_CallOrUser:
- if (MayAutorelease(ImmutableCallSite(Inst)))
- Push = 0;
- break;
- default:
- break;
- }
- }
-
- return Changed;
-}
-
-bool ObjCARCAPElim::runOnModule(Module &M) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!ModuleHasARC(M))
- return false;
-
- // Find the llvm.global_ctors variable, as the first step in
- // identifying the global constructors. In theory, unnecessary autorelease
- // pools could occur anywhere, but in practice it's pretty rare. Global
- // ctors are a place where autorelease pools get inserted automatically,
- // so it's pretty common for them to be unnecessary, and it's pretty
- // profitable to eliminate them.
- GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
- if (!GV)
- return false;
-
- assert(GV->hasDefinitiveInitializer() &&
- "llvm.global_ctors is uncooperative!");
-
- bool Changed = false;
-
- // Dig the constructor functions out of GV's initializer.
- ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
- for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
- OI != OE; ++OI) {
- Value *Op = *OI;
- // llvm.global_ctors is an array of pairs where the second members
- // are constructor functions.
- Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
- // If the user used a constructor function with the wrong signature and
- // it got bitcasted or whatever, look the other way.
- if (!F)
- continue;
- // Only look at function definitions.
- if (F->isDeclaration())
- continue;
- // Only look at functions with one basic block.
- if (llvm::next(F->begin()) != F->end())
- continue;
- // Ok, a single-block constructor function definition. Try to optimize it.
- Changed |= OptimizeBB(F->begin());
- }
-
- return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// ARC optimization.
-//===----------------------------------------------------------------------===//
-
-// TODO: On code like this:
-//
-// objc_retain(%x)
-// stuff_that_cannot_release()
-// objc_autorelease(%x)
-// stuff_that_cannot_release()
-// objc_retain(%x)
-// stuff_that_cannot_release()
-// objc_autorelease(%x)
-//
-// The second retain and autorelease can be deleted.
-
-// TODO: It should be possible to delete
-// objc_autoreleasePoolPush and objc_autoreleasePoolPop
-// pairs if nothing is actually autoreleased between them. Also, autorelease
-// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
-// after inlining) can be turned into plain release calls.
-
-// TODO: Critical-edge splitting. If the optimal insertion point is
-// a critical edge, the current algorithm has to fail, because it doesn't
-// know how to split edges. It should be possible to make the optimizer
-// think in terms of edges, rather than blocks, and then split critical
-// edges on demand.
-
-// TODO: OptimizeSequences could be generalized to be interprocedural.
-
-// TODO: Recognize that a bunch of other objc runtime calls have
-// non-escaping arguments and non-releasing arguments, and may be
-// non-autoreleasing.
-
-// TODO: Sink autorelease calls as far as possible. Unfortunately we
-// usually can't sink them past other calls, which would be the main
-// case where it would be useful.
-
-// TODO: The pointer returned from objc_loadWeakRetained is retained.
-
-// TODO: Delete release+retain pairs (rare).
-
-#include "llvm/LLVMContext.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallPtrSet.h"
-
-STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
-STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
-STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases");
-STATISTIC(NumRets, "Number of return value forwarding "
- "retain+autoreleaes eliminated");
-STATISTIC(NumRRs, "Number of retain+release paths eliminated");
-STATISTIC(NumPeeps, "Number of calls peephole-optimized");
-
-namespace {
- /// ProvenanceAnalysis - This is similar to BasicAliasAnalysis, and it
- /// uses many of the same techniques, except it uses special ObjC-specific
- /// reasoning about pointer relationships.
- class ProvenanceAnalysis {
- AliasAnalysis *AA;
-
- typedef std::pair<const Value *, const Value *> ValuePairTy;
- typedef DenseMap<ValuePairTy, bool> CachedResultsTy;
- CachedResultsTy CachedResults;
-
- bool relatedCheck(const Value *A, const Value *B);
- bool relatedSelect(const SelectInst *A, const Value *B);
- bool relatedPHI(const PHINode *A, const Value *B);
-
- void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
- ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
-
- public:
- ProvenanceAnalysis() {}
-
- void setAA(AliasAnalysis *aa) { AA = aa; }
-
- AliasAnalysis *getAA() const { return AA; }
-
- bool related(const Value *A, const Value *B);
-
- void clear() {
- CachedResults.clear();
- }
- };
-}
-
-bool ProvenanceAnalysis::relatedSelect(const SelectInst *A, const Value *B) {
- // If the values are Selects with the same condition, we can do a more precise
- // check: just check for relations between the values on corresponding arms.
- if (const SelectInst *SB = dyn_cast<SelectInst>(B))
- if (A->getCondition() == SB->getCondition())
- return related(A->getTrueValue(), SB->getTrueValue()) ||
- related(A->getFalseValue(), SB->getFalseValue());
-
- // Check both arms of the Select node individually.
- return related(A->getTrueValue(), B) ||
- related(A->getFalseValue(), B);
-}
-
-bool ProvenanceAnalysis::relatedPHI(const PHINode *A, const Value *B) {
- // If the values are PHIs in the same block, we can do a more precise as well
- // as efficient check: just check for relations between the values on
- // corresponding edges.
- if (const PHINode *PNB = dyn_cast<PHINode>(B))
- if (PNB->getParent() == A->getParent()) {
- for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
- if (related(A->getIncomingValue(i),
- PNB->getIncomingValueForBlock(A->getIncomingBlock(i))))
- return true;
- return false;
- }
-
- // Check each unique source of the PHI node against B.
- SmallPtrSet<const Value *, 4> UniqueSrc;
- for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) {
- const Value *PV1 = A->getIncomingValue(i);
- if (UniqueSrc.insert(PV1) && related(PV1, B))
- return true;
- }
-
- // All of the arms checked out.
- return false;
-}
-
-/// isStoredObjCPointer - Test if the value of P, or any value covered by its
-/// provenance, is ever stored within the function (not counting callees).
-static bool isStoredObjCPointer(const Value *P) {
- SmallPtrSet<const Value *, 8> Visited;
- SmallVector<const Value *, 8> Worklist;
- Worklist.push_back(P);
- Visited.insert(P);
- do {
- P = Worklist.pop_back_val();
- for (Value::const_use_iterator UI = P->use_begin(), UE = P->use_end();
- UI != UE; ++UI) {
- const User *Ur = *UI;
- if (isa<StoreInst>(Ur)) {
- if (UI.getOperandNo() == 0)
- // The pointer is stored.
- return true;
- // The pointer is stored through.
- continue;
- }
- if (isa<CallInst>(Ur))
- // The pointer is passed as an argument, ignore this.
- continue;
- if (isa<PtrToIntInst>(P))
- // Assume the worst.
- return true;
- if (Visited.insert(Ur))
- Worklist.push_back(Ur);
- }
- } while (!Worklist.empty());
-
- // Everything checked out.
- return false;
-}
-
-bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) {
- // Skip past provenance pass-throughs.
- A = GetUnderlyingObjCPtr(A);
- B = GetUnderlyingObjCPtr(B);
-
- // Quick check.
- if (A == B)
- return true;
-
- // Ask regular AliasAnalysis, for a first approximation.
- switch (AA->alias(A, B)) {
- case AliasAnalysis::NoAlias:
- return false;
- case AliasAnalysis::MustAlias:
- case AliasAnalysis::PartialAlias:
- return true;
- case AliasAnalysis::MayAlias:
- break;
- }
-
- bool AIsIdentified = IsObjCIdentifiedObject(A);
- bool BIsIdentified = IsObjCIdentifiedObject(B);
-
- // An ObjC-Identified object can't alias a load if it is never locally stored.
- if (AIsIdentified) {
- // Check for an obvious escape.
- if (isa<LoadInst>(B))
- return isStoredObjCPointer(A);
- if (BIsIdentified) {
- // Check for an obvious escape.
- if (isa<LoadInst>(A))
- return isStoredObjCPointer(B);
- // Both pointers are identified and escapes aren't an evident problem.
- return false;
- }
- } else if (BIsIdentified) {
- // Check for an obvious escape.
- if (isa<LoadInst>(A))
- return isStoredObjCPointer(B);
- }
-
- // Special handling for PHI and Select.
- if (const PHINode *PN = dyn_cast<PHINode>(A))
- return relatedPHI(PN, B);
- if (const PHINode *PN = dyn_cast<PHINode>(B))
- return relatedPHI(PN, A);
- if (const SelectInst *S = dyn_cast<SelectInst>(A))
- return relatedSelect(S, B);
- if (const SelectInst *S = dyn_cast<SelectInst>(B))
- return relatedSelect(S, A);
-
- // Conservative.
- return true;
-}
-
-bool ProvenanceAnalysis::related(const Value *A, const Value *B) {
- // Begin by inserting a conservative value into the map. If the insertion
- // fails, we have the answer already. If it succeeds, leave it there until we
- // compute the real answer to guard against recursive queries.
- if (A > B) std::swap(A, B);
- std::pair<CachedResultsTy::iterator, bool> Pair =
- CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
- if (!Pair.second)
- return Pair.first->second;
-
- bool Result = relatedCheck(A, B);
- CachedResults[ValuePairTy(A, B)] = Result;
- return Result;
-}
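One detail of related() above deserves emphasis: the cache is seeded with a conservative placeholder before the real work, so a re-entrant query for the same canonicalized pair terminates with the safe answer instead of recursing forever. Here is a self-contained sketch of just that memoization pattern, with a deliberately toy relation and hypothetical names.

#include <algorithm>
#include <cstdio>
#include <map>
#include <utility>

typedef std::pair<int, int> Key;
static std::map<Key, bool> Cache;

static bool relatedUncached(int A, int B);   // forward declaration

static bool related(int A, int B) {
  if (A > B) std::swap(A, B);                // canonicalize the symmetric query
  std::pair<std::map<Key, bool>::iterator, bool> Ins =
      Cache.insert(std::make_pair(Key(A, B), true));  // conservative placeholder
  if (!Ins.second)
    return Ins.first->second;                // cached (or in-flight) answer
  bool Result = relatedUncached(A, B);
  Cache[Key(A, B)] = Result;                 // overwrite the placeholder
  return Result;
}

// A toy "relation" that re-enters related(); with the placeholder in place the
// recursion bottoms out instead of looping.
static bool relatedUncached(int A, int B) {
  if (A == B) return true;
  if (B - A > 1) return related(A, B - 1);
  return false;
}

int main() {
  std::printf("%d %d\n", (int)related(3, 3), (int)related(1, 4));  // prints "1 0"
  return 0;
}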
-
-namespace {
- // Sequence - A sequence of states that a pointer may go through in which an
- // objc_retain and objc_release are actually needed.
- enum Sequence {
- S_None,
- S_Retain, ///< objc_retain(x)
- S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement
- S_Use, ///< any use of x
- S_Stop, ///< like S_Release, but code motion is stopped
- S_Release, ///< objc_release(x)
- S_MovableRelease ///< objc_release(x), !clang.imprecise_release
- };
-}
-
-static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
- // The easy cases.
- if (A == B)
- return A;
- if (A == S_None || B == S_None)
- return S_None;
-
- if (A > B) std::swap(A, B);
- if (TopDown) {
- // Choose the side which is further along in the sequence.
- if ((A == S_Retain || A == S_CanRelease) &&
- (B == S_CanRelease || B == S_Use))
- return B;
- } else {
- // Choose the side which is further along in the sequence.
- if ((A == S_Use || A == S_CanRelease) &&
- (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
- return A;
- // If both sides are releases, choose the more conservative one.
- if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
- return A;
- if (A == S_Release && B == S_MovableRelease)
- return A;
- }
-
- return S_None;
-}
-
-namespace {
- /// RRInfo - Unidirectional information about either a
- /// retain-decrement-use-release sequence or release-use-decrement-retain
- /// reverse sequence.
- struct RRInfo {
- /// KnownSafe - After an objc_retain, the reference count of the referenced
- /// object is known to be positive. Similarly, before an objc_release, the
- /// reference count of the referenced object is known to be positive. If
- /// there are retain-release pairs in code regions where the retain count
- /// is known to be positive, they can be eliminated, regardless of any side
- /// effects between them.
- ///
- /// Also, a retain+release pair nested within another retain+release
- /// pair all on the known same pointer value can be eliminated, regardless
- /// of any intervening side effects.
- ///
- /// KnownSafe is true when either of these conditions is satisfied.
- bool KnownSafe;
-
- /// IsRetainBlock - True if the Calls are objc_retainBlock calls (as
- /// opposed to objc_retain calls).
- bool IsRetainBlock;
-
- /// IsTailCallRelease - True if the objc_release calls are all marked
- /// with the "tail" keyword.
- bool IsTailCallRelease;
-
- /// ReleaseMetadata - If the Calls are objc_release calls and they all have
- /// a clang.imprecise_release tag, this is the metadata tag.
- MDNode *ReleaseMetadata;
-
- /// Calls - For a top-down sequence, the set of objc_retains or
- /// objc_retainBlocks. For bottom-up, the set of objc_releases.
- SmallPtrSet<Instruction *, 2> Calls;
-
- /// ReverseInsertPts - The set of optimal insert positions for
- /// moving calls in the opposite sequence.
- SmallPtrSet<Instruction *, 2> ReverseInsertPts;
-
- RRInfo() :
- KnownSafe(false), IsRetainBlock(false),
- IsTailCallRelease(false),
- ReleaseMetadata(0) {}
-
- void clear();
- };
-}
-
-void RRInfo::clear() {
- KnownSafe = false;
- IsRetainBlock = false;
- IsTailCallRelease = false;
- ReleaseMetadata = 0;
- Calls.clear();
- ReverseInsertPts.clear();
-}
-
-namespace {
- /// PtrState - This class summarizes several per-pointer runtime properties
-  /// which are propagated through the flow graph.
- class PtrState {
- /// KnownPositiveRefCount - True if the reference count is known to
- /// be incremented.
- bool KnownPositiveRefCount;
-
-    /// Partial - True if we've seen an opportunity for partial RR elimination,
- /// such as pushing calls into a CFG triangle or into one side of a
- /// CFG diamond.
- bool Partial;
-
- /// Seq - The current position in the sequence.
- Sequence Seq : 8;
-
- public:
- /// RRI - Unidirectional information about the current sequence.
- /// TODO: Encapsulate this better.
- RRInfo RRI;
-
- PtrState() : KnownPositiveRefCount(false), Partial(false),
- Seq(S_None) {}
-
- void SetKnownPositiveRefCount() {
- KnownPositiveRefCount = true;
- }
-
- void ClearRefCount() {
- KnownPositiveRefCount = false;
- }
-
- bool IsKnownIncremented() const {
- return KnownPositiveRefCount;
- }
-
- void SetSeq(Sequence NewSeq) {
- Seq = NewSeq;
- }
-
- Sequence GetSeq() const {
- return Seq;
- }
-
- void ClearSequenceProgress() {
- ResetSequenceProgress(S_None);
- }
-
- void ResetSequenceProgress(Sequence NewSeq) {
- Seq = NewSeq;
- Partial = false;
- RRI.clear();
- }
-
- void Merge(const PtrState &Other, bool TopDown);
- };
-}
-
-void
-PtrState::Merge(const PtrState &Other, bool TopDown) {
- Seq = MergeSeqs(Seq, Other.Seq, TopDown);
- KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
-
- // We can't merge a plain objc_retain with an objc_retainBlock.
- if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock)
- Seq = S_None;
-
- // If we're not in a sequence (anymore), drop all associated state.
- if (Seq == S_None) {
- Partial = false;
- RRI.clear();
- } else if (Partial || Other.Partial) {
- // If we're doing a merge on a path that's previously seen a partial
- // merge, conservatively drop the sequence, to avoid doing partial
-    // RR elimination. If the branch predicates for the two merges differ,
- // mixing them is unsafe.
- ClearSequenceProgress();
- } else {
- // Conservatively merge the ReleaseMetadata information.
- if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata)
- RRI.ReleaseMetadata = 0;
-
- RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe;
- RRI.IsTailCallRelease = RRI.IsTailCallRelease &&
- Other.RRI.IsTailCallRelease;
- RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end());
-
- // Merge the insert point sets. If there are any differences,
- // that makes this a partial merge.
- Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size();
- for (SmallPtrSet<Instruction *, 2>::const_iterator
- I = Other.RRI.ReverseInsertPts.begin(),
- E = Other.RRI.ReverseInsertPts.end(); I != E; ++I)
- Partial |= RRI.ReverseInsertPts.insert(*I);
- }
-}
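The last few lines of Merge are what make a merge "partial": the two sides contributed different reverse insert-point sets. The same rule as a minimal standalone sketch over std::set<int> (a stand-in for the instruction sets):

#include <set>

static bool MergeInsertPts(std::set<int> &Mine, const std::set<int> &Other) {
  // Different sizes already prove the two sides saw different insert points.
  bool Partial = Mine.size() != Other.size();
  for (std::set<int>::const_iterator I = Other.begin(), E = Other.end();
       I != E; ++I)
    Partial |= Mine.insert(*I).second;  // newly inserted => sides disagreed
  return Partial;
}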
-
-namespace {
- /// BBState - Per-BasicBlock state.
- class BBState {
- /// TopDownPathCount - The number of unique control paths from the entry
- /// which can reach this block.
- unsigned TopDownPathCount;
-
- /// BottomUpPathCount - The number of unique control paths to exits
- /// from this block.
- unsigned BottomUpPathCount;
-
- /// MapTy - A type for PerPtrTopDown and PerPtrBottomUp.
- typedef MapVector<const Value *, PtrState> MapTy;
-
- /// PerPtrTopDown - The top-down traversal uses this to record information
- /// known about a pointer at the bottom of each block.
- MapTy PerPtrTopDown;
-
- /// PerPtrBottomUp - The bottom-up traversal uses this to record information
- /// known about a pointer at the top of each block.
- MapTy PerPtrBottomUp;
-
-    /// Preds, Succs - Effective predecessors and successors of the current
-    /// block (these ignore ignorable edges and ignored backedges).
- SmallVector<BasicBlock *, 2> Preds;
- SmallVector<BasicBlock *, 2> Succs;
-
- public:
- BBState() : TopDownPathCount(0), BottomUpPathCount(0) {}
-
- typedef MapTy::iterator ptr_iterator;
- typedef MapTy::const_iterator ptr_const_iterator;
-
- ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); }
- ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); }
- ptr_const_iterator top_down_ptr_begin() const {
- return PerPtrTopDown.begin();
- }
- ptr_const_iterator top_down_ptr_end() const {
- return PerPtrTopDown.end();
- }
-
- ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); }
- ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); }
- ptr_const_iterator bottom_up_ptr_begin() const {
- return PerPtrBottomUp.begin();
- }
- ptr_const_iterator bottom_up_ptr_end() const {
- return PerPtrBottomUp.end();
- }
-
- /// SetAsEntry - Mark this block as being an entry block, which has one
- /// path from the entry by definition.
- void SetAsEntry() { TopDownPathCount = 1; }
-
- /// SetAsExit - Mark this block as being an exit block, which has one
- /// path to an exit by definition.
- void SetAsExit() { BottomUpPathCount = 1; }
-
- PtrState &getPtrTopDownState(const Value *Arg) {
- return PerPtrTopDown[Arg];
- }
-
- PtrState &getPtrBottomUpState(const Value *Arg) {
- return PerPtrBottomUp[Arg];
- }
-
- void clearBottomUpPointers() {
- PerPtrBottomUp.clear();
- }
-
- void clearTopDownPointers() {
- PerPtrTopDown.clear();
- }
-
- void InitFromPred(const BBState &Other);
- void InitFromSucc(const BBState &Other);
- void MergePred(const BBState &Other);
- void MergeSucc(const BBState &Other);
-
- /// GetAllPathCount - Return the number of possible unique paths from an
- /// entry to an exit which pass through this block. This is only valid
- /// after both the top-down and bottom-up traversals are complete.
- unsigned GetAllPathCount() const {
- assert(TopDownPathCount != 0);
- assert(BottomUpPathCount != 0);
- return TopDownPathCount * BottomUpPathCount;
- }
-
- // Specialized CFG utilities.
- typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator;
- edge_iterator pred_begin() { return Preds.begin(); }
- edge_iterator pred_end() { return Preds.end(); }
- edge_iterator succ_begin() { return Succs.begin(); }
- edge_iterator succ_end() { return Succs.end(); }
-
- void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
- void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
-
- bool isExit() const { return Succs.empty(); }
- };
-}
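A worked example of the path counts above on a diamond CFG, entry -> {left, right} -> exit (standalone sketch using plain unsigned arithmetic rather than BBState):

#include <cassert>

static void DiamondPathCountExample() {
  // Top-down: the entry has one path from the entry; left and right inherit
  // it; the exit merges both sides, 1 + 1 = 2.
  unsigned EntryTD = 1, LeftTD = 1, RightTD = 1, ExitTD = LeftTD + RightTD;
  // Bottom-up: the exit has one path to an exit; left and right inherit it;
  // the entry merges both sides, 1 + 1 = 2.
  unsigned ExitBU = 1, LeftBU = 1, RightBU = 1, EntryBU = LeftBU + RightBU;
  // GetAllPathCount() multiplies the two: 2 entry-to-exit paths pass through
  // the entry and the exit, and 1 through each of the side blocks.
  assert(EntryTD * EntryBU == 2 && ExitTD * ExitBU == 2);
  assert(LeftTD * LeftBU == 1 && RightTD * RightBU == 1);
}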
-
-void BBState::InitFromPred(const BBState &Other) {
- PerPtrTopDown = Other.PerPtrTopDown;
- TopDownPathCount = Other.TopDownPathCount;
-}
-
-void BBState::InitFromSucc(const BBState &Other) {
- PerPtrBottomUp = Other.PerPtrBottomUp;
- BottomUpPathCount = Other.BottomUpPathCount;
-}
-
-/// MergePred - The top-down traversal uses this to merge information about
-/// predecessors to form the initial state for a new block.
-void BBState::MergePred(const BBState &Other) {
- // Other.TopDownPathCount can be 0, in which case it is either dead or a
- // loop backedge. Loop backedges are special.
- TopDownPathCount += Other.TopDownPathCount;
-
- // Check for overflow. If we have overflow, fall back to conservative behavior.
- if (TopDownPathCount < Other.TopDownPathCount) {
- clearTopDownPointers();
- return;
- }
-
- // For each entry in the other set, if our set has an entry with the same key,
- // merge the entries. Otherwise, copy the entry and merge it with an empty
- // entry.
- for (ptr_const_iterator MI = Other.top_down_ptr_begin(),
- ME = Other.top_down_ptr_end(); MI != ME; ++MI) {
- std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI);
- Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
- /*TopDown=*/true);
- }
-
- // For each entry in our set, if the other set doesn't have an entry with the
- // same key, force it to merge with an empty entry.
- for (ptr_iterator MI = top_down_ptr_begin(),
- ME = top_down_ptr_end(); MI != ME; ++MI)
- if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
- MI->second.Merge(PtrState(), /*TopDown=*/true);
-}
-
-/// MergeSucc - The bottom-up traversal uses this to merge information about
-/// successors to form the initial state for a new block.
-void BBState::MergeSucc(const BBState &Other) {
- // Other.BottomUpPathCount can be 0, in which case it is either dead or a
- // loop backedge. Loop backedges are special.
- BottomUpPathCount += Other.BottomUpPathCount;
-
- // Check for overflow. If we have overflow, fall back to conservative behavior.
- if (BottomUpPathCount < Other.BottomUpPathCount) {
- clearBottomUpPointers();
- return;
- }
-
- // For each entry in the other set, if our set has an entry with the
- // same key, merge the entries. Otherwise, copy the entry and merge
- // it with an empty entry.
- for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(),
- ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) {
- std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI);
- Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
- /*TopDown=*/false);
- }
-
- // For each entry in our set, if the other set doesn't have an entry
- // with the same key, force it to merge with an empty entry.
- for (ptr_iterator MI = bottom_up_ptr_begin(),
- ME = bottom_up_ptr_end(); MI != ME; ++MI)
- if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
- MI->second.Merge(PtrState(), /*TopDown=*/false);
-}
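Both merge routines guard the path-count accumulation against unsigned wraparound before touching the per-pointer maps. The check in isolation (standalone sketch):

static bool AddPathCount(unsigned &Count, unsigned Incoming) {
  unsigned Before = Count;
  Count += Incoming;
  // Unsigned addition wrapped around exactly when the sum ends up smaller
  // than the running total was; the callers react by clearing their maps.
  return Count >= Before;
}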
-
-namespace {
- /// ObjCARCOpt - The main ARC optimization pass.
- class ObjCARCOpt : public FunctionPass {
- bool Changed;
- ProvenanceAnalysis PA;
-
- /// Run - A flag indicating whether this optimization pass should run.
- bool Run;
-
- /// RetainRVCallee, etc. - Declarations for ObjC runtime
- /// functions, for use in creating calls to them. These are initialized
- /// lazily to avoid cluttering up the Module with unused declarations.
- Constant *RetainRVCallee, *AutoreleaseRVCallee, *ReleaseCallee,
- *RetainCallee, *RetainBlockCallee, *AutoreleaseCallee;
-
-    /// UsedInThisFunction - Flags which determine whether each of the
-    /// interesting runtime functions is in fact used in the current function.
- unsigned UsedInThisFunction;
-
- /// ImpreciseReleaseMDKind - The Metadata Kind for clang.imprecise_release
- /// metadata.
- unsigned ImpreciseReleaseMDKind;
-
- /// CopyOnEscapeMDKind - The Metadata Kind for clang.arc.copy_on_escape
- /// metadata.
- unsigned CopyOnEscapeMDKind;
-
- /// NoObjCARCExceptionsMDKind - The Metadata Kind for
- /// clang.arc.no_objc_arc_exceptions metadata.
- unsigned NoObjCARCExceptionsMDKind;
-
- Constant *getRetainRVCallee(Module *M);
- Constant *getAutoreleaseRVCallee(Module *M);
- Constant *getReleaseCallee(Module *M);
- Constant *getRetainCallee(Module *M);
- Constant *getRetainBlockCallee(Module *M);
- Constant *getAutoreleaseCallee(Module *M);
-
- bool IsRetainBlockOptimizable(const Instruction *Inst);
-
- void OptimizeRetainCall(Function &F, Instruction *Retain);
- bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
- void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV);
- void OptimizeIndividualCalls(Function &F);
-
- void CheckForCFGHazards(const BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BBState &MyStates) const;
- bool VisitInstructionBottomUp(Instruction *Inst,
- BasicBlock *BB,
- MapVector<Value *, RRInfo> &Retains,
- BBState &MyStates);
- bool VisitBottomUp(BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- MapVector<Value *, RRInfo> &Retains);
- bool VisitInstructionTopDown(Instruction *Inst,
- DenseMap<Value *, RRInfo> &Releases,
- BBState &MyStates);
- bool VisitTopDown(BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- DenseMap<Value *, RRInfo> &Releases);
- bool Visit(Function &F,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases);
-
- void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
- MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases,
- SmallVectorImpl<Instruction *> &DeadInsts,
- Module *M);
-
- bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
- MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases,
- Module *M);
-
- void OptimizeWeakCalls(Function &F);
-
- bool OptimizeSequences(Function &F);
-
- void OptimizeReturns(Function &F);
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const;
- virtual bool doInitialization(Module &M);
- virtual bool runOnFunction(Function &F);
- virtual void releaseMemory();
-
- public:
- static char ID;
- ObjCARCOpt() : FunctionPass(ID) {
- initializeObjCARCOptPass(*PassRegistry::getPassRegistry());
- }
- };
-}
-
-char ObjCARCOpt::ID = 0;
-INITIALIZE_PASS_BEGIN(ObjCARCOpt,
- "objc-arc", "ObjC ARC optimization", false, false)
-INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis)
-INITIALIZE_PASS_END(ObjCARCOpt,
- "objc-arc", "ObjC ARC optimization", false, false)
-
-Pass *llvm::createObjCARCOptPass() {
- return new ObjCARCOpt();
-}
-
-void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<ObjCARCAliasAnalysis>();
- AU.addRequired<AliasAnalysis>();
- // ARC optimization doesn't currently split critical edges.
- AU.setPreservesCFG();
-}
-
-bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) {
- // Without the magic metadata tag, we have to assume this might be an
- // objc_retainBlock call inserted to convert a block pointer to an id,
- // in which case it really is needed.
- if (!Inst->getMetadata(CopyOnEscapeMDKind))
- return false;
-
- // If the pointer "escapes" (not including being used in a call),
- // the copy may be needed.
- if (DoesObjCBlockEscape(Inst))
- return false;
-
- // Otherwise, it's not needed.
- return true;
-}
-
-Constant *ObjCARCOpt::getRetainRVCallee(Module *M) {
- if (!RetainRVCallee) {
- LLVMContext &C = M->getContext();
- Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- Type *Params[] = { I8X };
- FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes =
- AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::get(C, Attributes::NoUnwind));
- RetainRVCallee =
- M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy,
- Attributes);
- }
- return RetainRVCallee;
-}
-
-Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
- if (!AutoreleaseRVCallee) {
- LLVMContext &C = M->getContext();
- Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- Type *Params[] = { I8X };
- FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes =
- AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::get(C, Attributes::NoUnwind));
- AutoreleaseRVCallee =
- M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy,
- Attributes);
- }
- return AutoreleaseRVCallee;
-}
-
-Constant *ObjCARCOpt::getReleaseCallee(Module *M) {
- if (!ReleaseCallee) {
- LLVMContext &C = M->getContext();
- Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
- AttrListPtr Attributes =
- AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::get(C, Attributes::NoUnwind));
- ReleaseCallee =
- M->getOrInsertFunction(
- "objc_release",
- FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
- Attributes);
- }
- return ReleaseCallee;
-}
-
-Constant *ObjCARCOpt::getRetainCallee(Module *M) {
- if (!RetainCallee) {
- LLVMContext &C = M->getContext();
- Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
- AttrListPtr Attributes =
- AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::get(C, Attributes::NoUnwind));
- RetainCallee =
- M->getOrInsertFunction(
- "objc_retain",
- FunctionType::get(Params[0], Params, /*isVarArg=*/false),
- Attributes);
- }
- return RetainCallee;
-}
-
-Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) {
- if (!RetainBlockCallee) {
- LLVMContext &C = M->getContext();
- Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
- // objc_retainBlock is not nounwind because it calls user copy constructors
- // which could theoretically throw.
- RetainBlockCallee =
- M->getOrInsertFunction(
- "objc_retainBlock",
- FunctionType::get(Params[0], Params, /*isVarArg=*/false),
- AttrListPtr());
- }
- return RetainBlockCallee;
-}
-
-Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
- if (!AutoreleaseCallee) {
- LLVMContext &C = M->getContext();
- Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
- AttrListPtr Attributes =
- AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::get(C, Attributes::NoUnwind));
- AutoreleaseCallee =
- M->getOrInsertFunction(
- "objc_autorelease",
- FunctionType::get(Params[0], Params, /*isVarArg=*/false),
- Attributes);
- }
- return AutoreleaseCallee;
-}
-
-/// IsPotentialUse - Test whether the given value is possibly a
-/// reference-counted pointer, including tests which utilize AliasAnalysis.
-static bool IsPotentialUse(const Value *Op, AliasAnalysis &AA) {
- // First make the rudimentary check.
- if (!IsPotentialUse(Op))
- return false;
-
- // Objects in constant memory are not reference-counted.
- if (AA.pointsToConstantMemory(Op))
- return false;
-
- // Pointers in constant memory are not pointing to reference-counted objects.
- if (const LoadInst *LI = dyn_cast<LoadInst>(Op))
- if (AA.pointsToConstantMemory(LI->getPointerOperand()))
- return false;
-
- // Otherwise assume the worst.
- return true;
-}
-
-/// CanAlterRefCount - Test whether the given instruction can result in a
-/// reference count modification (positive or negative) for the pointer's
-/// object.
-static bool
-CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, InstructionClass Class) {
- switch (Class) {
- case IC_Autorelease:
- case IC_AutoreleaseRV:
- case IC_User:
- // These operations never directly modify a reference count.
- return false;
- default: break;
- }
-
- ImmutableCallSite CS = static_cast<const Value *>(Inst);
- assert(CS && "Only calls can alter reference counts!");
-
- // See if AliasAnalysis can help us with the call.
- AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS);
- if (AliasAnalysis::onlyReadsMemory(MRB))
- return false;
- if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
- for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
- I != E; ++I) {
- const Value *Op = *I;
- if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
- return true;
- }
- return false;
- }
-
- // Assume the worst.
- return true;
-}
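After the instruction-class filter, CanAlterRefCount is a small decision tree over AliasAnalysis facts. Restated over precomputed booleans (a simplified standalone sketch; the parameters stand in for the onlyReadsMemory / onlyAccessesArgPointees queries above):

#include <cstddef>
#include <vector>

static bool MayAlterRefCount(bool OnlyReadsMemory, bool OnlyAccessesArgs,
                             const std::vector<bool> &ArgMayBeTrackedPtr) {
  if (OnlyReadsMemory)
    return false;                   // a read-only call cannot release anything
  if (OnlyAccessesArgs) {
    for (std::size_t i = 0, e = ArgMayBeTrackedPtr.size(); i != e; ++i)
      if (ArgMayBeTrackedPtr[i])    // some argument may alias the tracked ptr
        return true;
    return false;
  }
  return true;                      // otherwise assume the worst
}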
-
-/// CanUse - Test whether the given instruction can "use" the given pointer's
-/// object in a way that requires the reference count to be positive.
-static bool
-CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
- InstructionClass Class) {
- // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers.
- if (Class == IC_Call)
- return false;
-
- // Consider various instructions which may have pointer arguments which are
- // not "uses".
- if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) {
- // Comparing a pointer with null, or any other constant, isn't really a use,
- // because we don't care what the pointer points to, or about the values
- // of any other dynamic reference-counted pointers.
- if (!IsPotentialUse(ICI->getOperand(1), *PA.getAA()))
- return false;
- } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) {
- // For calls, just check the arguments (and not the callee operand).
- for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(),
- OE = CS.arg_end(); OI != OE; ++OI) {
- const Value *Op = *OI;
- if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
- return true;
- }
- return false;
- } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- // Special-case stores, because we don't care about the stored value, just
- // the store address.
- const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand());
- // If we can't tell what the underlying object was, assume there is a
- // dependence.
- return IsPotentialUse(Op, *PA.getAA()) && PA.related(Op, Ptr);
- }
-
- // Check each operand for a match.
- for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
- OI != OE; ++OI) {
- const Value *Op = *OI;
- if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
- return true;
- }
- return false;
-}
-
-/// CanInterruptRV - Test whether the given instruction can autorelease
-/// any pointer or cause an autoreleasepool pop.
-static bool
-CanInterruptRV(InstructionClass Class) {
- switch (Class) {
- case IC_AutoreleasepoolPop:
- case IC_CallOrUser:
- case IC_Call:
- case IC_Autorelease:
- case IC_AutoreleaseRV:
- case IC_FusedRetainAutorelease:
- case IC_FusedRetainAutoreleaseRV:
- return true;
- default:
- return false;
- }
-}
-
-namespace {
- /// DependenceKind - There are several kinds of dependence-like concepts in
- /// use here.
- enum DependenceKind {
- NeedsPositiveRetainCount,
- AutoreleasePoolBoundary,
- CanChangeRetainCount,
- RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease.
- RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue.
- RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue.
- };
-}
-
-/// Depends - Test if there can be dependencies on Inst through Arg. This
-/// function only tests dependencies relevant for removing pairs of calls.
-static bool
-Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg,
- ProvenanceAnalysis &PA) {
- // If we've reached the definition of Arg, stop.
- if (Inst == Arg)
- return true;
-
- switch (Flavor) {
- case NeedsPositiveRetainCount: {
- InstructionClass Class = GetInstructionClass(Inst);
- switch (Class) {
- case IC_AutoreleasepoolPop:
- case IC_AutoreleasepoolPush:
- case IC_None:
- return false;
- default:
- return CanUse(Inst, Arg, PA, Class);
- }
- }
-
- case AutoreleasePoolBoundary: {
- InstructionClass Class = GetInstructionClass(Inst);
- switch (Class) {
- case IC_AutoreleasepoolPop:
- case IC_AutoreleasepoolPush:
- // These mark the end and begin of an autorelease pool scope.
- return true;
- default:
- // Nothing else does this.
- return false;
- }
- }
-
- case CanChangeRetainCount: {
- InstructionClass Class = GetInstructionClass(Inst);
- switch (Class) {
- case IC_AutoreleasepoolPop:
- // Conservatively assume this can decrement any count.
- return true;
- case IC_AutoreleasepoolPush:
- case IC_None:
- return false;
- default:
- return CanAlterRefCount(Inst, Arg, PA, Class);
- }
- }
-
- case RetainAutoreleaseDep:
- switch (GetBasicInstructionClass(Inst)) {
- case IC_AutoreleasepoolPop:
- case IC_AutoreleasepoolPush:
- // Don't merge an objc_autorelease with an objc_retain inside a different
- // autoreleasepool scope.
- return true;
- case IC_Retain:
- case IC_RetainRV:
- // Check for a retain of the same pointer for merging.
- return GetObjCArg(Inst) == Arg;
- default:
- // Nothing else matters for objc_retainAutorelease formation.
- return false;
- }
-
- case RetainAutoreleaseRVDep: {
- InstructionClass Class = GetBasicInstructionClass(Inst);
- switch (Class) {
- case IC_Retain:
- case IC_RetainRV:
- // Check for a retain of the same pointer for merging.
- return GetObjCArg(Inst) == Arg;
- default:
- // Anything that can autorelease interrupts
- // retainAutoreleaseReturnValue formation.
- return CanInterruptRV(Class);
- }
- }
-
- case RetainRVDep:
- return CanInterruptRV(GetBasicInstructionClass(Inst));
- }
-
- llvm_unreachable("Invalid dependence flavor");
-}
-
-/// FindDependencies - Walk up the CFG from StartPos (which is in StartBB) and
-/// find local and non-local dependencies on Arg.
-/// TODO: Cache results?
-static void
-FindDependencies(DependenceKind Flavor,
- const Value *Arg,
- BasicBlock *StartBB, Instruction *StartInst,
- SmallPtrSet<Instruction *, 4> &DependingInstructions,
- SmallPtrSet<const BasicBlock *, 4> &Visited,
- ProvenanceAnalysis &PA) {
- BasicBlock::iterator StartPos = StartInst;
-
- SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
- Worklist.push_back(std::make_pair(StartBB, StartPos));
- do {
- std::pair<BasicBlock *, BasicBlock::iterator> Pair =
- Worklist.pop_back_val();
- BasicBlock *LocalStartBB = Pair.first;
- BasicBlock::iterator LocalStartPos = Pair.second;
- BasicBlock::iterator StartBBBegin = LocalStartBB->begin();
- for (;;) {
- if (LocalStartPos == StartBBBegin) {
- pred_iterator PI(LocalStartBB), PE(LocalStartBB, false);
- if (PI == PE)
- // If we've reached the function entry, produce a null dependence.
- DependingInstructions.insert(0);
- else
- // Add the predecessors to the worklist.
- do {
- BasicBlock *PredBB = *PI;
- if (Visited.insert(PredBB))
- Worklist.push_back(std::make_pair(PredBB, PredBB->end()));
- } while (++PI != PE);
- break;
- }
-
- Instruction *Inst = --LocalStartPos;
- if (Depends(Flavor, Inst, Arg, PA)) {
- DependingInstructions.insert(Inst);
- break;
- }
- }
- } while (!Worklist.empty());
-
- // Determine whether the original StartBB post-dominates all of the blocks we
-  // visited. If not, insert a sentinel indicating that most optimizations are
- // not safe.
- for (SmallPtrSet<const BasicBlock *, 4>::const_iterator I = Visited.begin(),
- E = Visited.end(); I != E; ++I) {
- const BasicBlock *BB = *I;
- if (BB == StartBB)
- continue;
- const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
- for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) {
- const BasicBlock *Succ = *SI;
- if (Succ != StartBB && !Visited.count(Succ)) {
- DependingInstructions.insert(reinterpret_cast<Instruction *>(-1));
- return;
- }
- }
- }
-}
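FindDependencies is a backwards walk over the CFG driven by a worklist of (block, position) pairs. The same skeleton over a hypothetical Block type, with plain containers and without the post-dominance sentinel (standalone sketch):

#include <cstddef>
#include <set>
#include <utility>
#include <vector>

struct Block {
  std::vector<int> Insts;        // instructions, entry first
  std::vector<Block *> Preds;
};

template <class DependsFn>
static void FindDeps(Block *Start, std::size_t StartIdx, DependsFn Depends,
                     std::set<const int *> &Deps, std::set<Block *> &Visited) {
  std::vector<std::pair<Block *, std::size_t> > Worklist;
  Worklist.push_back(std::make_pair(Start, StartIdx));
  while (!Worklist.empty()) {
    Block *BB = Worklist.back().first;
    std::size_t Idx = Worklist.back().second;
    Worklist.pop_back();
    for (;;) {
      if (Idx == 0) {                      // reached the top of the block
        if (BB->Preds.empty())
          Deps.insert((const int *)0);     // function entry: null dependence
        else
          for (std::size_t i = 0, e = BB->Preds.size(); i != e; ++i)
            if (Visited.insert(BB->Preds[i]).second)
              Worklist.push_back(
                  std::make_pair(BB->Preds[i], BB->Preds[i]->Insts.size()));
        break;
      }
      const int &Inst = BB->Insts[--Idx];  // step back one instruction
      if (Depends(Inst)) {
        Deps.insert(&Inst);                // this path stops at the dependence
        break;
      }
    }
  }
}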
-
-static bool isNullOrUndef(const Value *V) {
- return isa<ConstantPointerNull>(V) || isa<UndefValue>(V);
-}
-
-static bool isNoopInstruction(const Instruction *I) {
- return isa<BitCastInst>(I) ||
- (isa<GetElementPtrInst>(I) &&
- cast<GetElementPtrInst>(I)->hasAllZeroIndices());
-}
-
-/// OptimizeRetainCall - Turn objc_retain into
-/// objc_retainAutoreleasedReturnValue if the operand is a return value.
-void
-ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) {
- ImmutableCallSite CS(GetObjCArg(Retain));
- const Instruction *Call = CS.getInstruction();
- if (!Call) return;
- if (Call->getParent() != Retain->getParent()) return;
-
- // Check that the call is next to the retain.
- BasicBlock::const_iterator I = Call;
- ++I;
- while (isNoopInstruction(I)) ++I;
- if (&*I != Retain)
- return;
-
-  // Turn it into an objc_retainAutoreleasedReturnValue.
- Changed = true;
- ++NumPeeps;
- cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent()));
-}
-
-/// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into
-/// objc_retain if the operand is not a return value. Or, if it can be paired
-/// with an objc_autoreleaseReturnValue, delete the pair and return true.
-bool
-ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
- // Check for the argument being from an immediately preceding call or invoke.
- const Value *Arg = GetObjCArg(RetainRV);
- ImmutableCallSite CS(Arg);
- if (const Instruction *Call = CS.getInstruction()) {
- if (Call->getParent() == RetainRV->getParent()) {
- BasicBlock::const_iterator I = Call;
- ++I;
- while (isNoopInstruction(I)) ++I;
- if (&*I == RetainRV)
- return false;
- } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- BasicBlock *RetainRVParent = RetainRV->getParent();
- if (II->getNormalDest() == RetainRVParent) {
- BasicBlock::const_iterator I = RetainRVParent->begin();
- while (isNoopInstruction(I)) ++I;
- if (&*I == RetainRV)
- return false;
- }
- }
- }
-
- // Check for being preceded by an objc_autoreleaseReturnValue on the same
- // pointer. In this case, we can delete the pair.
- BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin();
- if (I != Begin) {
- do --I; while (I != Begin && isNoopInstruction(I));
- if (GetBasicInstructionClass(I) == IC_AutoreleaseRV &&
- GetObjCArg(I) == Arg) {
- Changed = true;
- ++NumPeeps;
- EraseInstruction(I);
- EraseInstruction(RetainRV);
- return true;
- }
- }
-
-  // Turn it into a plain objc_retain.
- Changed = true;
- ++NumPeeps;
- cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent()));
- return false;
-}
-
-/// OptimizeAutoreleaseRVCall - Turn objc_autoreleaseReturnValue into
-/// objc_autorelease if the result is not used as a return value.
-void
-ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) {
- // Check for a return of the pointer value.
- const Value *Ptr = GetObjCArg(AutoreleaseRV);
- SmallVector<const Value *, 2> Users;
- Users.push_back(Ptr);
- do {
- Ptr = Users.pop_back_val();
- for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end();
- UI != UE; ++UI) {
- const User *I = *UI;
- if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV)
- return;
- if (isa<BitCastInst>(I))
- Users.push_back(I);
- }
- } while (!Users.empty());
-
- Changed = true;
- ++NumPeeps;
- cast<CallInst>(AutoreleaseRV)->
- setCalledFunction(getAutoreleaseCallee(F.getParent()));
-}
-
-/// OptimizeIndividualCalls - Visit each call, one at a time, and make
-/// simplifications without doing any additional analysis.
-void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
- // Reset all the flags in preparation for recomputing them.
- UsedInThisFunction = 0;
-
- // Visit all objc_* calls in F.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
- InstructionClass Class = GetBasicInstructionClass(Inst);
-
- switch (Class) {
- default: break;
-
- // Delete no-op casts. These function calls have special semantics, but
- // the semantics are entirely implemented via lowering in the front-end,
- // so by the time they reach the optimizer, they are just no-op calls
- // which return their argument.
- //
-    // There are gray areas here, as the ability to cast reference-counted
-    // pointers to raw void* and back allows code to break ARC assumptions;
-    // however, these are currently considered to be unimportant.
- case IC_NoopCast:
- Changed = true;
- ++NumNoops;
- EraseInstruction(Inst);
- continue;
-
- // If the pointer-to-weak-pointer is null, it's undefined behavior.
- case IC_StoreWeak:
- case IC_LoadWeak:
- case IC_LoadWeakRetained:
- case IC_InitWeak:
- case IC_DestroyWeak: {
- CallInst *CI = cast<CallInst>(Inst);
- if (isNullOrUndef(CI->getArgOperand(0))) {
- Changed = true;
- Type *Ty = CI->getArgOperand(0)->getType();
- new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
- Constant::getNullValue(Ty),
- CI);
- CI->replaceAllUsesWith(UndefValue::get(CI->getType()));
- CI->eraseFromParent();
- continue;
- }
- break;
- }
- case IC_CopyWeak:
- case IC_MoveWeak: {
- CallInst *CI = cast<CallInst>(Inst);
- if (isNullOrUndef(CI->getArgOperand(0)) ||
- isNullOrUndef(CI->getArgOperand(1))) {
- Changed = true;
- Type *Ty = CI->getArgOperand(0)->getType();
- new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
- Constant::getNullValue(Ty),
- CI);
- CI->replaceAllUsesWith(UndefValue::get(CI->getType()));
- CI->eraseFromParent();
- continue;
- }
- break;
- }
- case IC_Retain:
- OptimizeRetainCall(F, Inst);
- break;
- case IC_RetainRV:
- if (OptimizeRetainRVCall(F, Inst))
- continue;
- break;
- case IC_AutoreleaseRV:
- OptimizeAutoreleaseRVCall(F, Inst);
- break;
- }
-
- // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
- if (IsAutorelease(Class) && Inst->use_empty()) {
- CallInst *Call = cast<CallInst>(Inst);
- const Value *Arg = Call->getArgOperand(0);
- Arg = FindSingleUseIdentifiedObject(Arg);
- if (Arg) {
- Changed = true;
- ++NumAutoreleases;
-
- // Create the declaration lazily.
- LLVMContext &C = Inst->getContext();
- CallInst *NewCall =
- CallInst::Create(getReleaseCallee(F.getParent()),
- Call->getArgOperand(0), "", Call);
- NewCall->setMetadata(ImpreciseReleaseMDKind,
- MDNode::get(C, ArrayRef<Value *>()));
- EraseInstruction(Call);
- Inst = NewCall;
- Class = IC_Release;
- }
- }
-
-    // For functions which can never be passed stack arguments, add
-    // the "tail" keyword.
- if (IsAlwaysTail(Class)) {
- Changed = true;
- cast<CallInst>(Inst)->setTailCall();
- }
-
- // Set nounwind as needed.
- if (IsNoThrow(Class)) {
- Changed = true;
- cast<CallInst>(Inst)->setDoesNotThrow();
- }
-
- if (!IsNoopOnNull(Class)) {
- UsedInThisFunction |= 1 << Class;
- continue;
- }
-
- const Value *Arg = GetObjCArg(Inst);
-
- // ARC calls with null are no-ops. Delete them.
- if (isNullOrUndef(Arg)) {
- Changed = true;
- ++NumNoops;
- EraseInstruction(Inst);
- continue;
- }
-
- // Keep track of which of retain, release, autorelease, and retain_block
- // are actually present in this function.
- UsedInThisFunction |= 1 << Class;
-
- // If Arg is a PHI, and one or more incoming values to the
- // PHI are null, and the call is control-equivalent to the PHI, and there
- // are no relevant side effects between the PHI and the call, the call
- // could be pushed up to just those paths with non-null incoming values.
- // For now, don't bother splitting critical edges for this.
- SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist;
- Worklist.push_back(std::make_pair(Inst, Arg));
- do {
- std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val();
- Inst = Pair.first;
- Arg = Pair.second;
-
- const PHINode *PN = dyn_cast<PHINode>(Arg);
- if (!PN) continue;
-
- // Determine if the PHI has any null operands, or any incoming
- // critical edges.
- bool HasNull = false;
- bool HasCriticalEdges = false;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *Incoming =
- StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
- if (isNullOrUndef(Incoming))
- HasNull = true;
- else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back())
- .getNumSuccessors() != 1) {
- HasCriticalEdges = true;
- break;
- }
- }
- // If we have null operands and no critical edges, optimize.
- if (!HasCriticalEdges && HasNull) {
- SmallPtrSet<Instruction *, 4> DependingInstructions;
- SmallPtrSet<const BasicBlock *, 4> Visited;
-
- // Check that there is nothing that cares about the reference
- // count between the call and the phi.
- switch (Class) {
- case IC_Retain:
- case IC_RetainBlock:
- // These can always be moved up.
- break;
- case IC_Release:
- // These can't be moved across things that care about the retain
- // count.
- FindDependencies(NeedsPositiveRetainCount, Arg,
- Inst->getParent(), Inst,
- DependingInstructions, Visited, PA);
- break;
- case IC_Autorelease:
- // These can't be moved across autorelease pool scope boundaries.
- FindDependencies(AutoreleasePoolBoundary, Arg,
- Inst->getParent(), Inst,
- DependingInstructions, Visited, PA);
- break;
- case IC_RetainRV:
- case IC_AutoreleaseRV:
- // Don't move these; the RV optimization depends on the autoreleaseRV
- // being tail called, and the retainRV being immediately after a call
- // (which might still happen if we get lucky with codegen layout, but
- // it's not worth taking the chance).
- continue;
- default:
- llvm_unreachable("Invalid dependence flavor");
- }
-
- if (DependingInstructions.size() == 1 &&
- *DependingInstructions.begin() == PN) {
- Changed = true;
- ++NumPartialNoops;
- // Clone the call into each predecessor that has a non-null value.
- CallInst *CInst = cast<CallInst>(Inst);
- Type *ParamTy = CInst->getArgOperand(0)->getType();
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *Incoming =
- StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
- if (!isNullOrUndef(Incoming)) {
- CallInst *Clone = cast<CallInst>(CInst->clone());
- Value *Op = PN->getIncomingValue(i);
- Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
- if (Op->getType() != ParamTy)
- Op = new BitCastInst(Op, ParamTy, "", InsertPos);
- Clone->setArgOperand(0, Op);
- Clone->insertBefore(InsertPos);
- Worklist.push_back(std::make_pair(Clone, Incoming));
- }
- }
- // Erase the original call.
- EraseInstruction(CInst);
- continue;
- }
- }
- } while (!Worklist.empty());
- }
-}
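UsedInThisFunction, updated throughout the loop above, is just a bitmask with one bit per InstructionClass. A minimal sketch of setting and querying such a mask (ClassA and ClassB are placeholders for InstructionClass values, not part of the pass):

static bool UsesBothClasses(unsigned UsedInThisFunction,
                            unsigned ClassA, unsigned ClassB) {
  // Mirrors the `UsedInThisFunction |= 1 << Class` updates above.
  unsigned Mask = (1u << ClassA) | (1u << ClassB);
  return (UsedInThisFunction & Mask) == Mask;
}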
-
-/// CheckForCFGHazards - Check for critical edges, loop boundaries, irreducible
-/// control flow, or other CFG structures where moving code across the edge
-/// would result in it being executed more often.
-void
-ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BBState &MyStates) const {
-  // If any top-down local use (S_Use) or possible decrement (S_CanRelease)
-  // has a successor which is earlier in the sequence, forget it.
- for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(),
- E = MyStates.top_down_ptr_end(); I != E; ++I)
- switch (I->second.GetSeq()) {
- default: break;
- case S_Use: {
- const Value *Arg = I->first;
- const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
- bool SomeSuccHasSame = false;
- bool AllSuccsHaveSame = true;
- PtrState &S = I->second;
- succ_const_iterator SI(TI), SE(TI, false);
-
- // If the terminator is an invoke marked with the
- // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be
- // ignored, for ARC purposes.
- if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind))
- --SE;
-
- for (; SI != SE; ++SI) {
- Sequence SuccSSeq = S_None;
- bool SuccSRRIKnownSafe = false;
- // If VisitBottomUp has pointer information for this successor, take
- // what we know about it.
- DenseMap<const BasicBlock *, BBState>::iterator BBI =
- BBStates.find(*SI);
- assert(BBI != BBStates.end());
- const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
- SuccSSeq = SuccS.GetSeq();
- SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
- switch (SuccSSeq) {
- case S_None:
- case S_CanRelease: {
- if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
- S.ClearSequenceProgress();
- break;
- }
- continue;
- }
- case S_Use:
- SomeSuccHasSame = true;
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
- AllSuccsHaveSame = false;
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
- }
- // If the state at the other end of any of the successor edges
- // matches the current state, require all edges to match. This
- // guards against loops in the middle of a sequence.
- if (SomeSuccHasSame && !AllSuccsHaveSame)
- S.ClearSequenceProgress();
- break;
- }
- case S_CanRelease: {
- const Value *Arg = I->first;
- const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
- bool SomeSuccHasSame = false;
- bool AllSuccsHaveSame = true;
- PtrState &S = I->second;
- succ_const_iterator SI(TI), SE(TI, false);
-
- // If the terminator is an invoke marked with the
- // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be
- // ignored, for ARC purposes.
- if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind))
- --SE;
-
- for (; SI != SE; ++SI) {
- Sequence SuccSSeq = S_None;
- bool SuccSRRIKnownSafe = false;
- // If VisitBottomUp has pointer information for this successor, take
- // what we know about it.
- DenseMap<const BasicBlock *, BBState>::iterator BBI =
- BBStates.find(*SI);
- assert(BBI != BBStates.end());
- const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
- SuccSSeq = SuccS.GetSeq();
- SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
- switch (SuccSSeq) {
- case S_None: {
- if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
- S.ClearSequenceProgress();
- break;
- }
- continue;
- }
- case S_CanRelease:
- SomeSuccHasSame = true;
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- case S_Use:
- if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
- AllSuccsHaveSame = false;
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
- }
- // If the state at the other end of any of the successor edges
- // matches the current state, require all edges to match. This
- // guards against loops in the middle of a sequence.
- if (SomeSuccHasSame && !AllSuccsHaveSame)
- S.ClearSequenceProgress();
- break;
- }
- }
-}
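Stripped of the KnownSafe escape hatches, the S_Use case above reduces to a small predicate over the successors' bottom-up states. A standalone sketch (it assumes the Sequence enum from this file is in scope):

#include <cstddef>
#include <vector>

static bool KeepTopDownUseState(const std::vector<Sequence> &SuccSeqs) {
  bool SomeSuccHasSame = false, AllSuccsHaveSame = true;
  for (std::size_t i = 0, e = SuccSeqs.size(); i != e; ++i) {
    if (SuccSeqs[i] == S_Use)
      SomeSuccHasSame = true;
    else if (SuccSeqs[i] == S_Stop || SuccSeqs[i] == S_Release ||
             SuccSeqs[i] == S_MovableRelease)
      AllSuccsHaveSame = false;
    // S_None and S_CanRelease clear the sequence outright in the real code
    // unless one side is KnownSafe; this sketch ignores that path.
  }
  // A partial match indicates a loop in the middle of the sequence: keep the
  // state only if no successor, or every successor, is also in S_Use.
  return !(SomeSuccHasSame && !AllSuccsHaveSame);
}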
-
-bool
-ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
- BasicBlock *BB,
- MapVector<Value *, RRInfo> &Retains,
- BBState &MyStates) {
- bool NestingDetected = false;
- InstructionClass Class = GetInstructionClass(Inst);
- const Value *Arg = 0;
-
- switch (Class) {
- case IC_Release: {
- Arg = GetObjCArg(Inst);
-
- PtrState &S = MyStates.getPtrBottomUpState(Arg);
-
-    // If we see two releases in a row on the same pointer, make a note;
-    // we'll circle back to revisit it after we've hopefully eliminated the
-    // second release, which may allow us to eliminate the first release too.
- // Theoretically we could implement removal of nested retain+release
- // pairs by making PtrState hold a stack of states, but this is
- // simple and avoids adding overhead for the non-nested case.
- if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease)
- NestingDetected = true;
-
- MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
- S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release);
- S.RRI.ReleaseMetadata = ReleaseMetadata;
- S.RRI.KnownSafe = S.IsKnownIncremented();
- S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
- S.RRI.Calls.insert(Inst);
-
- S.SetKnownPositiveRefCount();
- break;
- }
- case IC_RetainBlock:
- // An objc_retainBlock call with just a use may need to be kept,
- // because it may be copying a block from the stack to the heap.
- if (!IsRetainBlockOptimizable(Inst))
- break;
- // FALLTHROUGH
- case IC_Retain:
- case IC_RetainRV: {
- Arg = GetObjCArg(Inst);
-
- PtrState &S = MyStates.getPtrBottomUpState(Arg);
- S.SetKnownPositiveRefCount();
-
- switch (S.GetSeq()) {
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- case S_Use:
- S.RRI.ReverseInsertPts.clear();
- // FALL THROUGH
- case S_CanRelease:
- // Don't do retain+release tracking for IC_RetainRV, because it's
- // better to let it remain as the first instruction after a call.
- if (Class != IC_RetainRV) {
- S.RRI.IsRetainBlock = Class == IC_RetainBlock;
- Retains[Inst] = S.RRI;
- }
- S.ClearSequenceProgress();
- break;
- case S_None:
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
- return NestingDetected;
- }
- case IC_AutoreleasepoolPop:
- // Conservatively, clear MyStates for all known pointers.
- MyStates.clearBottomUpPointers();
- return NestingDetected;
- case IC_AutoreleasepoolPush:
- case IC_None:
- // These are irrelevant.
- return NestingDetected;
- default:
- break;
- }
-
- // Consider any other possible effects of this instruction on each
- // pointer being tracked.
- for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(),
- ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) {
- const Value *Ptr = MI->first;
- if (Ptr == Arg)
- continue; // Handled above.
- PtrState &S = MI->second;
- Sequence Seq = S.GetSeq();
-
- // Check for possible releases.
- if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
- S.ClearRefCount();
- switch (Seq) {
- case S_Use:
- S.SetSeq(S_CanRelease);
- continue;
- case S_CanRelease:
- case S_Release:
- case S_MovableRelease:
- case S_Stop:
- case S_None:
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
- }
-
- // Check for possible direct uses.
- switch (Seq) {
- case S_Release:
- case S_MovableRelease:
- if (CanUse(Inst, Ptr, PA, Class)) {
- assert(S.RRI.ReverseInsertPts.empty());
- // If this is an invoke instruction, we're scanning it as part of
- // one of its successor blocks, since we can't insert code after it
- // in its own block, and we don't want to split critical edges.
- if (isa<InvokeInst>(Inst))
- S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
- else
- S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
- S.SetSeq(S_Use);
- } else if (Seq == S_Release &&
- (Class == IC_User || Class == IC_CallOrUser)) {
- // Non-movable releases depend on any possible objc pointer use.
- S.SetSeq(S_Stop);
- assert(S.RRI.ReverseInsertPts.empty());
- // As above; handle invoke specially.
- if (isa<InvokeInst>(Inst))
- S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
- else
- S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
- }
- break;
- case S_Stop:
- if (CanUse(Inst, Ptr, PA, Class))
- S.SetSeq(S_Use);
- break;
- case S_CanRelease:
- case S_Use:
- case S_None:
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
- }
-
- return NestingDetected;
-}
-
-bool
-ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- MapVector<Value *, RRInfo> &Retains) {
- bool NestingDetected = false;
- BBState &MyStates = BBStates[BB];
-
- // Merge the states from each successor to compute the initial state
- // for the current block.
- BBState::edge_iterator SI(MyStates.succ_begin()),
- SE(MyStates.succ_end());
- if (SI != SE) {
- const BasicBlock *Succ = *SI;
- DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
- assert(I != BBStates.end());
- MyStates.InitFromSucc(I->second);
- ++SI;
- for (; SI != SE; ++SI) {
- Succ = *SI;
- I = BBStates.find(Succ);
- assert(I != BBStates.end());
- MyStates.MergeSucc(I->second);
- }
- }
-
- // Visit all the instructions, bottom-up.
- for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
- Instruction *Inst = llvm::prior(I);
-
- // Invoke instructions are visited as part of their successors (below).
- if (isa<InvokeInst>(Inst))
- continue;
-
- NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
- }
-
- // If there's a predecessor with an invoke, visit the invoke as if it were
- // part of this block, since we can't insert code after an invoke in its own
- // block, and we don't want to split critical edges.
- for (BBState::edge_iterator PI(MyStates.pred_begin()),
- PE(MyStates.pred_end()); PI != PE; ++PI) {
- BasicBlock *Pred = *PI;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
- NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
- }
-
- return NestingDetected;
-}
-
-bool
-ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
- DenseMap<Value *, RRInfo> &Releases,
- BBState &MyStates) {
- bool NestingDetected = false;
- InstructionClass Class = GetInstructionClass(Inst);
- const Value *Arg = 0;
-
- switch (Class) {
- case IC_RetainBlock:
- // An objc_retainBlock call with just a use may need to be kept,
- // because it may be copying a block from the stack to the heap.
- if (!IsRetainBlockOptimizable(Inst))
- break;
- // FALLTHROUGH
- case IC_Retain:
- case IC_RetainRV: {
- Arg = GetObjCArg(Inst);
-
- PtrState &S = MyStates.getPtrTopDownState(Arg);
-
- // Don't do retain+release tracking for IC_RetainRV, because it's
- // better to let it remain as the first instruction after a call.
- if (Class != IC_RetainRV) {
-      // If we see two retains in a row on the same pointer, make a note;
-      // we'll circle back to revisit it after we've hopefully eliminated the
-      // second retain, which may allow us to eliminate the first retain too.
- // Theoretically we could implement removal of nested retain+release
- // pairs by making PtrState hold a stack of states, but this is
- // simple and avoids adding overhead for the non-nested case.
- if (S.GetSeq() == S_Retain)
- NestingDetected = true;
-
- S.ResetSequenceProgress(S_Retain);
- S.RRI.IsRetainBlock = Class == IC_RetainBlock;
- S.RRI.KnownSafe = S.IsKnownIncremented();
- S.RRI.Calls.insert(Inst);
- }
-
- S.SetKnownPositiveRefCount();
-
-    // A retain can be a potential use; proceed to the generic checking
- // code below.
- break;
- }
- case IC_Release: {
- Arg = GetObjCArg(Inst);
-
- PtrState &S = MyStates.getPtrTopDownState(Arg);
- S.ClearRefCount();
-
- switch (S.GetSeq()) {
- case S_Retain:
- case S_CanRelease:
- S.RRI.ReverseInsertPts.clear();
- // FALL THROUGH
- case S_Use:
- S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
- S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
- Releases[Inst] = S.RRI;
- S.ClearSequenceProgress();
- break;
- case S_None:
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in release state!");
- }
- break;
- }
- case IC_AutoreleasepoolPop:
- // Conservatively, clear MyStates for all known pointers.
- MyStates.clearTopDownPointers();
- return NestingDetected;
- case IC_AutoreleasepoolPush:
- case IC_None:
- // These are irrelevant.
- return NestingDetected;
- default:
- break;
- }
-
- // Consider any other possible effects of this instruction on each
- // pointer being tracked.
- for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(),
- ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) {
- const Value *Ptr = MI->first;
- if (Ptr == Arg)
- continue; // Handled above.
- PtrState &S = MI->second;
- Sequence Seq = S.GetSeq();
-
- // Check for possible releases.
- if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
- S.ClearRefCount();
- switch (Seq) {
- case S_Retain:
- S.SetSeq(S_CanRelease);
- assert(S.RRI.ReverseInsertPts.empty());
- S.RRI.ReverseInsertPts.insert(Inst);
-
- // One call can't cause a transition from S_Retain to S_CanRelease
- // and S_CanRelease to S_Use. If we've made the first transition,
- // we're done.
- continue;
- case S_Use:
- case S_CanRelease:
- case S_None:
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in release state!");
- }
- }
-
- // Check for possible direct uses.
- switch (Seq) {
- case S_CanRelease:
- if (CanUse(Inst, Ptr, PA, Class))
- S.SetSeq(S_Use);
- break;
- case S_Retain:
- case S_Use:
- case S_None:
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in release state!");
- }
- }
-
- return NestingDetected;
-}
-
-bool
-ObjCARCOpt::VisitTopDown(BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- DenseMap<Value *, RRInfo> &Releases) {
- bool NestingDetected = false;
- BBState &MyStates = BBStates[BB];
-
- // Merge the states from each predecessor to compute the initial state
- // for the current block.
- BBState::edge_iterator PI(MyStates.pred_begin()),
- PE(MyStates.pred_end());
- if (PI != PE) {
- const BasicBlock *Pred = *PI;
- DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
- assert(I != BBStates.end());
- MyStates.InitFromPred(I->second);
- ++PI;
- for (; PI != PE; ++PI) {
- Pred = *PI;
- I = BBStates.find(Pred);
- assert(I != BBStates.end());
- MyStates.MergePred(I->second);
- }
- }
-
- // Visit all the instructions, top-down.
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
- Instruction *Inst = I;
- NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates);
- }
-
- CheckForCFGHazards(BB, BBStates, MyStates);
- return NestingDetected;
-}
-
-static void
-ComputePostOrders(Function &F,
- SmallVectorImpl<BasicBlock *> &PostOrder,
- SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder,
- unsigned NoObjCARCExceptionsMDKind,
- DenseMap<const BasicBlock *, BBState> &BBStates) {
- /// Visited - The visited set, for doing DFS walks.
- SmallPtrSet<BasicBlock *, 16> Visited;
-
- // Do DFS, computing the PostOrder.
- SmallPtrSet<BasicBlock *, 16> OnStack;
- SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack;
-
- // Functions always have exactly one entry block, and we don't have
- // any other block that we treat like an entry block.
- BasicBlock *EntryBB = &F.getEntryBlock();
- BBState &MyStates = BBStates[EntryBB];
- MyStates.SetAsEntry();
- TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back());
- SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
- Visited.insert(EntryBB);
- OnStack.insert(EntryBB);
- do {
- dfs_next_succ:
- BasicBlock *CurrBB = SuccStack.back().first;
- TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back());
- succ_iterator SE(TI, false);
-
- // If the terminator is an invoke marked with the
- // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be
- // ignored, for ARC purposes.
- if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind))
- --SE;
-
- while (SuccStack.back().second != SE) {
- BasicBlock *SuccBB = *SuccStack.back().second++;
- if (Visited.insert(SuccBB)) {
- TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back());
- SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI)));
- BBStates[CurrBB].addSucc(SuccBB);
- BBState &SuccStates = BBStates[SuccBB];
- SuccStates.addPred(CurrBB);
- OnStack.insert(SuccBB);
- goto dfs_next_succ;
- }
-
- if (!OnStack.count(SuccBB)) {
- BBStates[CurrBB].addSucc(SuccBB);
- BBStates[SuccBB].addPred(CurrBB);
- }
- }
- OnStack.erase(CurrBB);
- PostOrder.push_back(CurrBB);
- SuccStack.pop_back();
- } while (!SuccStack.empty());
-
- Visited.clear();
-
- // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
-  // Functions may have many exits, and there are also blocks which we treat
- // as exits due to ignored edges.
- SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
- BasicBlock *ExitBB = I;
- BBState &MyStates = BBStates[ExitBB];
- if (!MyStates.isExit())
- continue;
-
- MyStates.SetAsExit();
-
- PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin()));
- Visited.insert(ExitBB);
- while (!PredStack.empty()) {
- reverse_dfs_next_succ:
- BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
- while (PredStack.back().second != PE) {
- BasicBlock *BB = *PredStack.back().second++;
- if (Visited.insert(BB)) {
- PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
- goto reverse_dfs_next_succ;
- }
- }
- ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first);
- }
- }
-}
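The forward half above is an iterative DFS that emits a block once all of its successors have been finished. The same skeleton over a hypothetical Node type, without the ignored-edge and BBState bookkeeping (standalone sketch):

#include <cstddef>
#include <set>
#include <utility>
#include <vector>

struct Node {
  std::vector<Node *> Succs;
};

static void PostOrderDFS(Node *Entry, std::vector<Node *> &PostOrder) {
  std::set<Node *> Visited;
  Visited.insert(Entry);
  // Each stack entry is (node, index of the next successor to try).
  std::vector<std::pair<Node *, std::size_t> > Stack;
  Stack.push_back(std::make_pair(Entry, std::size_t(0)));
  while (!Stack.empty()) {
    Node *N = Stack.back().first;
    std::size_t &NextSucc = Stack.back().second;
    if (NextSucc != N->Succs.size()) {
      Node *S = N->Succs[NextSucc++];
      if (Visited.insert(S).second)                      // unvisited successor
        Stack.push_back(std::make_pair(S, std::size_t(0)));
    } else {
      PostOrder.push_back(N);                            // successors all done
      Stack.pop_back();
    }
  }
}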
-
-/// Visit - Visit the function both top-down and bottom-up.
-bool
-ObjCARCOpt::Visit(Function &F,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases) {
-
- // Use reverse-postorder traversals, because we magically know that loops
- // will be well behaved, i.e. they won't repeatedly call retain on a single
- // pointer without doing a release. We can't use the ReversePostOrderTraversal
- // class here because we want the reverse-CFG postorder to consider each
- // function exit point, and we want to ignore selected cycle edges.
- SmallVector<BasicBlock *, 16> PostOrder;
- SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
- ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
- NoObjCARCExceptionsMDKind,
- BBStates);
-
- // Use reverse-postorder on the reverse CFG for bottom-up.
- bool BottomUpNestingDetected = false;
- for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
- ReverseCFGPostOrder.rbegin(), E = ReverseCFGPostOrder.rend();
- I != E; ++I)
- BottomUpNestingDetected |= VisitBottomUp(*I, BBStates, Retains);
-
- // Use reverse-postorder for top-down.
- bool TopDownNestingDetected = false;
- for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
- PostOrder.rbegin(), E = PostOrder.rend();
- I != E; ++I)
- TopDownNestingDetected |= VisitTopDown(*I, BBStates, Releases);
-
- return TopDownNestingDetected && BottomUpNestingDetected;
-}
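Visit walks both post-orders back to front, i.e. as reverse-postorders. Reusing Node and PostOrderDFS from the sketch above, the equivalent helper is simply (illustrative):

#include <algorithm>

static void ReversePostOrder(Node *Entry, std::vector<Node *> &RPO) {
  PostOrderDFS(Entry, RPO);
  std::reverse(RPO.begin(), RPO.end()); // reverse-postorder = reversed post-order
}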
-
-/// MoveCalls - Move the calls in RetainsToMove and ReleasesToMove.
-void ObjCARCOpt::MoveCalls(Value *Arg,
- RRInfo &RetainsToMove,
- RRInfo &ReleasesToMove,
- MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases,
- SmallVectorImpl<Instruction *> &DeadInsts,
- Module *M) {
- Type *ArgTy = Arg->getType();
- Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
-
- // Insert the new retain and release calls.
- for (SmallPtrSet<Instruction *, 2>::const_iterator
- PI = ReleasesToMove.ReverseInsertPts.begin(),
- PE = ReleasesToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
- Instruction *InsertPt = *PI;
- Value *MyArg = ArgTy == ParamTy ? Arg :
- new BitCastInst(Arg, ParamTy, "", InsertPt);
- CallInst *Call =
- CallInst::Create(RetainsToMove.IsRetainBlock ?
- getRetainBlockCallee(M) : getRetainCallee(M),
- MyArg, "", InsertPt);
- Call->setDoesNotThrow();
- if (RetainsToMove.IsRetainBlock)
- Call->setMetadata(CopyOnEscapeMDKind,
- MDNode::get(M->getContext(), ArrayRef<Value *>()));
- else
- Call->setTailCall();
- }
- for (SmallPtrSet<Instruction *, 2>::const_iterator
- PI = RetainsToMove.ReverseInsertPts.begin(),
- PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
- Instruction *InsertPt = *PI;
- Value *MyArg = ArgTy == ParamTy ? Arg :
- new BitCastInst(Arg, ParamTy, "", InsertPt);
- CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg,
- "", InsertPt);
- // Attach a clang.imprecise_release metadata tag, if appropriate.
- if (MDNode *M = ReleasesToMove.ReleaseMetadata)
- Call->setMetadata(ImpreciseReleaseMDKind, M);
- Call->setDoesNotThrow();
- if (ReleasesToMove.IsTailCallRelease)
- Call->setTailCall();
- }
-
- // Delete the original retain and release calls.
- for (SmallPtrSet<Instruction *, 2>::const_iterator
- AI = RetainsToMove.Calls.begin(),
- AE = RetainsToMove.Calls.end(); AI != AE; ++AI) {
- Instruction *OrigRetain = *AI;
- Retains.blot(OrigRetain);
- DeadInsts.push_back(OrigRetain);
- }
- for (SmallPtrSet<Instruction *, 2>::const_iterator
- AI = ReleasesToMove.Calls.begin(),
- AE = ReleasesToMove.Calls.end(); AI != AE; ++AI) {
- Instruction *OrigRelease = *AI;
- Releases.erase(OrigRelease);
- DeadInsts.push_back(OrigRelease);
- }
-}
-
-/// PerformCodePlacement - Identify pairings between the retains and releases,
-/// and delete and/or move them.
-bool
-ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
- &BBStates,
- MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases,
- Module *M) {
- bool AnyPairsCompletelyEliminated = false;
- RRInfo RetainsToMove;
- RRInfo ReleasesToMove;
- SmallVector<Instruction *, 4> NewRetains;
- SmallVector<Instruction *, 4> NewReleases;
- SmallVector<Instruction *, 8> DeadInsts;
-
- // Visit each retain.
- for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
- E = Retains.end(); I != E; ++I) {
- Value *V = I->first;
- if (!V) continue; // blotted
-
- Instruction *Retain = cast<Instruction>(V);
- Value *Arg = GetObjCArg(Retain);
-
- // If the object being released is in static or stack storage, we know it's
- // not being managed by ObjC reference counting, so we can delete pairs
- // regardless of what possible decrements or uses lie between them.
- bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
-
- // A constant pointer can't be pointing to an object on the heap. It may
- // be reference-counted, but it won't be deleted.
- if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
- if (const GlobalVariable *GV =
- dyn_cast<GlobalVariable>(
- StripPointerCastsAndObjCCalls(LI->getPointerOperand())))
- if (GV->isConstant())
- KnownSafe = true;
-
- // If a pair happens in a region where it is known that the reference count
- // is already incremented, we can similarly ignore possible decrements.
- bool KnownSafeTD = true, KnownSafeBU = true;
-
- // Connect the dots between the top-down-collected RetainsToMove and
- // bottom-up-collected ReleasesToMove to form sets of related calls.
- // This is an iterative process so that we connect multiple releases
- // to multiple retains if needed.
- unsigned OldDelta = 0;
- unsigned NewDelta = 0;
- unsigned OldCount = 0;
- unsigned NewCount = 0;
- bool FirstRelease = true;
- bool FirstRetain = true;
- NewRetains.push_back(Retain);
- for (;;) {
- for (SmallVectorImpl<Instruction *>::const_iterator
- NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) {
- Instruction *NewRetain = *NI;
- MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain);
- assert(It != Retains.end());
- const RRInfo &NewRetainRRI = It->second;
- KnownSafeTD &= NewRetainRRI.KnownSafe;
- for (SmallPtrSet<Instruction *, 2>::const_iterator
- LI = NewRetainRRI.Calls.begin(),
- LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) {
- Instruction *NewRetainRelease = *LI;
- DenseMap<Value *, RRInfo>::const_iterator Jt =
- Releases.find(NewRetainRelease);
- if (Jt == Releases.end())
- goto next_retain;
- const RRInfo &NewRetainReleaseRRI = Jt->second;
- assert(NewRetainReleaseRRI.Calls.count(NewRetain));
- if (ReleasesToMove.Calls.insert(NewRetainRelease)) {
- OldDelta -=
- BBStates[NewRetainRelease->getParent()].GetAllPathCount();
-
- // Merge the ReleaseMetadata and IsTailCallRelease values.
- if (FirstRelease) {
- ReleasesToMove.ReleaseMetadata =
- NewRetainReleaseRRI.ReleaseMetadata;
- ReleasesToMove.IsTailCallRelease =
- NewRetainReleaseRRI.IsTailCallRelease;
- FirstRelease = false;
- } else {
- if (ReleasesToMove.ReleaseMetadata !=
- NewRetainReleaseRRI.ReleaseMetadata)
- ReleasesToMove.ReleaseMetadata = 0;
- if (ReleasesToMove.IsTailCallRelease !=
- NewRetainReleaseRRI.IsTailCallRelease)
- ReleasesToMove.IsTailCallRelease = false;
- }
-
- // Collect the optimal insertion points.
- if (!KnownSafe)
- for (SmallPtrSet<Instruction *, 2>::const_iterator
- RI = NewRetainReleaseRRI.ReverseInsertPts.begin(),
- RE = NewRetainReleaseRRI.ReverseInsertPts.end();
- RI != RE; ++RI) {
- Instruction *RIP = *RI;
- if (ReleasesToMove.ReverseInsertPts.insert(RIP))
- NewDelta -= BBStates[RIP->getParent()].GetAllPathCount();
- }
- NewReleases.push_back(NewRetainRelease);
- }
- }
- }
- NewRetains.clear();
- if (NewReleases.empty()) break;
-
- // Back the other way.
- for (SmallVectorImpl<Instruction *>::const_iterator
- NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) {
- Instruction *NewRelease = *NI;
- DenseMap<Value *, RRInfo>::const_iterator It =
- Releases.find(NewRelease);
- assert(It != Releases.end());
- const RRInfo &NewReleaseRRI = It->second;
- KnownSafeBU &= NewReleaseRRI.KnownSafe;
- for (SmallPtrSet<Instruction *, 2>::const_iterator
- LI = NewReleaseRRI.Calls.begin(),
- LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) {
- Instruction *NewReleaseRetain = *LI;
- MapVector<Value *, RRInfo>::const_iterator Jt =
- Retains.find(NewReleaseRetain);
- if (Jt == Retains.end())
- goto next_retain;
- const RRInfo &NewReleaseRetainRRI = Jt->second;
- assert(NewReleaseRetainRRI.Calls.count(NewRelease));
- if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
- unsigned PathCount =
- BBStates[NewReleaseRetain->getParent()].GetAllPathCount();
- OldDelta += PathCount;
- OldCount += PathCount;
-
- // Merge the IsRetainBlock values.
- if (FirstRetain) {
- RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock;
- FirstRetain = false;
- } else if (ReleasesToMove.IsRetainBlock !=
- NewReleaseRetainRRI.IsRetainBlock)
- // It's not possible to merge the sequences if one uses
- // objc_retain and the other uses objc_retainBlock.
- goto next_retain;
-
- // Collect the optimal insertion points.
- if (!KnownSafe)
- for (SmallPtrSet<Instruction *, 2>::const_iterator
- RI = NewReleaseRetainRRI.ReverseInsertPts.begin(),
- RE = NewReleaseRetainRRI.ReverseInsertPts.end();
- RI != RE; ++RI) {
- Instruction *RIP = *RI;
- if (RetainsToMove.ReverseInsertPts.insert(RIP)) {
- PathCount = BBStates[RIP->getParent()].GetAllPathCount();
- NewDelta += PathCount;
- NewCount += PathCount;
- }
- }
- NewRetains.push_back(NewReleaseRetain);
- }
- }
- }
- NewReleases.clear();
- if (NewRetains.empty()) break;
- }
-
- // If the pointer is known incremented or nested, we can safely delete the
- // pair regardless of what's between them.
- if (KnownSafeTD || KnownSafeBU) {
- RetainsToMove.ReverseInsertPts.clear();
- ReleasesToMove.ReverseInsertPts.clear();
- NewCount = 0;
- } else {
- // Determine whether the new insertion points we computed preserve the
- // balance of retain and release calls through the program.
- // TODO: If the fully aggressive solution isn't valid, try to find a
- // less aggressive solution which is.
- if (NewDelta != 0)
- goto next_retain;
- }
-
- // Determine whether the original call points are balanced in the retain and
- // release calls through the program. If not, conservatively don't touch
- // them.
- // TODO: It's theoretically possible to do code motion in this case, as
- // long as the existing imbalances are maintained.
- if (OldDelta != 0)
- goto next_retain;
-
- // Ok, everything checks out and we're all set. Let's move some code!
- Changed = true;
- assert(OldCount != 0 && "Unreachable code?");
- AnyPairsCompletelyEliminated = NewCount == 0;
- NumRRs += OldCount - NewCount;
- MoveCalls(Arg, RetainsToMove, ReleasesToMove,
- Retains, Releases, DeadInsts, M);
-
- next_retain:
- NewReleases.clear();
- NewRetains.clear();
- RetainsToMove.clear();
- ReleasesToMove.clear();
- }
-
- // Now that we're done moving everything, we can delete the newly dead
- // instructions, as we no longer need them as insert points.
- while (!DeadInsts.empty())
- EraseInstruction(DeadInsts.pop_back_val());
-
- return AnyPairsCompletelyEliminated;
-}
-
-/// OptimizeWeakCalls - Weak pointer optimizations.
-void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
- // First, do memdep-style RLE and S2L optimizations. We can't use memdep
- // itself because it uses AliasAnalysis and we need to do provenance
- // queries instead.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
- InstructionClass Class = GetBasicInstructionClass(Inst);
- if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained)
- continue;
-
- // Delete objc_loadWeak calls with no users.
- if (Class == IC_LoadWeak && Inst->use_empty()) {
- Inst->eraseFromParent();
- continue;
- }
-
- // TODO: For now, just look for an earlier available version of this value
- // within the same block. Theoretically, we could do memdep-style non-local
- // analysis too, but that would want caching. A better approach would be to
- // use the technique that EarlyCSE uses.
- inst_iterator Current = llvm::prior(I);
- BasicBlock *CurrentBB = Current.getBasicBlockIterator();
- for (BasicBlock::iterator B = CurrentBB->begin(),
- J = Current.getInstructionIterator();
- J != B; --J) {
- Instruction *EarlierInst = &*llvm::prior(J);
- InstructionClass EarlierClass = GetInstructionClass(EarlierInst);
- switch (EarlierClass) {
- case IC_LoadWeak:
- case IC_LoadWeakRetained: {
- // If this is loading from the same pointer, replace this load's value
- // with that one.
- CallInst *Call = cast<CallInst>(Inst);
- CallInst *EarlierCall = cast<CallInst>(EarlierInst);
- Value *Arg = Call->getArgOperand(0);
- Value *EarlierArg = EarlierCall->getArgOperand(0);
- switch (PA.getAA()->alias(Arg, EarlierArg)) {
- case AliasAnalysis::MustAlias:
- Changed = true;
- // If the load has a builtin retain, insert a plain retain for it.
- if (Class == IC_LoadWeakRetained) {
- CallInst *CI =
- CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
- "", Call);
- CI->setTailCall();
- }
- // Zap the fully redundant load.
- Call->replaceAllUsesWith(EarlierCall);
- Call->eraseFromParent();
- goto clobbered;
- case AliasAnalysis::MayAlias:
- case AliasAnalysis::PartialAlias:
- goto clobbered;
- case AliasAnalysis::NoAlias:
- break;
- }
- break;
- }
- case IC_StoreWeak:
- case IC_InitWeak: {
- // If this is storing to the same pointer and has the same size etc.
- // replace this load's value with the stored value.
- CallInst *Call = cast<CallInst>(Inst);
- CallInst *EarlierCall = cast<CallInst>(EarlierInst);
- Value *Arg = Call->getArgOperand(0);
- Value *EarlierArg = EarlierCall->getArgOperand(0);
- switch (PA.getAA()->alias(Arg, EarlierArg)) {
- case AliasAnalysis::MustAlias:
- Changed = true;
- // If the load has a builtin retain, insert a plain retain for it.
- if (Class == IC_LoadWeakRetained) {
- CallInst *CI =
- CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
- "", Call);
- CI->setTailCall();
- }
- // Zap the fully redundant load.
- Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
- Call->eraseFromParent();
- goto clobbered;
- case AliasAnalysis::MayAlias:
- case AliasAnalysis::PartialAlias:
- goto clobbered;
- case AliasAnalysis::NoAlias:
- break;
- }
- break;
- }
- case IC_MoveWeak:
- case IC_CopyWeak:
- // TODO: Grab the copied value.
- goto clobbered;
- case IC_AutoreleasepoolPush:
- case IC_None:
- case IC_User:
- // Weak pointers are only modified through the weak entry points
- // (and arbitrary calls, which could call the weak entry points).
- break;
- default:
- // Anything else could modify the weak pointer.
- goto clobbered;
- }
- }
- clobbered:;
- }
-
- // Then, for each destroyWeak with an alloca operand, check to see if
- // the alloca and all its users can be zapped.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
- InstructionClass Class = GetBasicInstructionClass(Inst);
- if (Class != IC_DestroyWeak)
- continue;
-
- CallInst *Call = cast<CallInst>(Inst);
- Value *Arg = Call->getArgOperand(0);
- if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
- for (Value::use_iterator UI = Alloca->use_begin(),
- UE = Alloca->use_end(); UI != UE; ++UI) {
- const Instruction *UserInst = cast<Instruction>(*UI);
- switch (GetBasicInstructionClass(UserInst)) {
- case IC_InitWeak:
- case IC_StoreWeak:
- case IC_DestroyWeak:
- continue;
- default:
- goto done;
- }
- }
- Changed = true;
- for (Value::use_iterator UI = Alloca->use_begin(),
- UE = Alloca->use_end(); UI != UE; ) {
- CallInst *UserInst = cast<CallInst>(*UI++);
- switch (GetBasicInstructionClass(UserInst)) {
- case IC_InitWeak:
- case IC_StoreWeak:
- // These functions return their second argument.
- UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
- break;
- case IC_DestroyWeak:
- // No return value.
- break;
- default:
- llvm_unreachable("alloca really is used!");
- }
- UserInst->eraseFromParent();
- }
- Alloca->eraseFromParent();
- done:;
- }
- }
-}
-
-/// OptimizeSequences - Identify program paths which execute sequences of
-/// retains and releases which can be eliminated.
-bool ObjCARCOpt::OptimizeSequences(Function &F) {
- /// Releases, Retains - These are used to store the results of the main flow
- /// analysis. These use Value* as the key instead of Instruction* so that the
- /// map stays valid when we get around to rewriting code and calls get
- /// replaced by arguments.
- DenseMap<Value *, RRInfo> Releases;
- MapVector<Value *, RRInfo> Retains;
-
- /// BBStates - This is used during the traversal of the function to track the
- /// states for each identified object at each block.
- DenseMap<const BasicBlock *, BBState> BBStates;
-
- // Analyze the CFG of the function, and all instructions.
- bool NestingDetected = Visit(F, BBStates, Retains, Releases);
-
- // Transform.
- return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) &&
- NestingDetected;
-}
-
-/// OptimizeReturns - Look for this pattern:
-/// \code
-/// %call = call i8* @something(...)
-/// %2 = call i8* @objc_retain(i8* %call)
-/// %3 = call i8* @objc_autorelease(i8* %2)
-/// ret i8* %3
-/// \endcode
-/// And delete the retain and autorelease.
-///
-/// Otherwise if it's just this:
-/// \code
-/// %3 = call i8* @objc_autorelease(i8* %2)
-/// ret i8* %3
-/// \endcode
-/// convert the autorelease to autoreleaseRV.
-void ObjCARCOpt::OptimizeReturns(Function &F) {
- if (!F.getReturnType()->isPointerTy())
- return;
-
- SmallPtrSet<Instruction *, 4> DependingInstructions;
- SmallPtrSet<const BasicBlock *, 4> Visited;
- for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
- BasicBlock *BB = FI;
- ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back());
- if (!Ret) continue;
-
- const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0));
- FindDependencies(NeedsPositiveRetainCount, Arg,
- BB, Ret, DependingInstructions, Visited, PA);
- if (DependingInstructions.size() != 1)
- goto next_block;
-
- {
- CallInst *Autorelease =
- dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
- if (!Autorelease)
- goto next_block;
- InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease);
- if (!IsAutorelease(AutoreleaseClass))
- goto next_block;
- if (GetObjCArg(Autorelease) != Arg)
- goto next_block;
-
- DependingInstructions.clear();
- Visited.clear();
-
- // Check that there is nothing that can affect the reference
- // count between the autorelease and the retain.
- FindDependencies(CanChangeRetainCount, Arg,
- BB, Autorelease, DependingInstructions, Visited, PA);
- if (DependingInstructions.size() != 1)
- goto next_block;
-
- {
- CallInst *Retain =
- dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
-
- // Check that we found a retain with the same argument.
- if (!Retain ||
- !IsRetain(GetBasicInstructionClass(Retain)) ||
- GetObjCArg(Retain) != Arg)
- goto next_block;
-
- DependingInstructions.clear();
- Visited.clear();
-
- // Convert the autorelease to an autoreleaseRV, since it's
- // returning the value.
- if (AutoreleaseClass == IC_Autorelease) {
- Autorelease->setCalledFunction(getAutoreleaseRVCallee(F.getParent()));
- AutoreleaseClass = IC_AutoreleaseRV;
- }
-
- // Check that there is nothing that can affect the reference
- // count between the retain and the call.
- // Note that Retain need not be in BB.
- FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain,
- DependingInstructions, Visited, PA);
- if (DependingInstructions.size() != 1)
- goto next_block;
-
- {
- CallInst *Call =
- dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
-
- // Check that the pointer is the return value of the call.
- if (!Call || Arg != Call)
- goto next_block;
-
- // Check that the call is a regular call.
- InstructionClass Class = GetBasicInstructionClass(Call);
- if (Class != IC_CallOrUser && Class != IC_Call)
- goto next_block;
-
- // If so, we can zap the retain and autorelease.
- Changed = true;
- ++NumRets;
- EraseInstruction(Retain);
- EraseInstruction(Autorelease);
- }
- }
- }
-
- next_block:
- DependingInstructions.clear();
- Visited.clear();
- }
-}
-
-bool ObjCARCOpt::doInitialization(Module &M) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- Run = ModuleHasARC(M);
- if (!Run)
- return false;
-
- // Identify the imprecise release metadata kind.
- ImpreciseReleaseMDKind =
- M.getContext().getMDKindID("clang.imprecise_release");
- CopyOnEscapeMDKind =
- M.getContext().getMDKindID("clang.arc.copy_on_escape");
- NoObjCARCExceptionsMDKind =
- M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions");
-
- // Intuitively, objc_retain and others are nocapture, however in practice
- // they are not, because they return their argument value. And objc_release
- // calls finalizers which can have arbitrary side effects.
-
- // These are initialized lazily.
- RetainRVCallee = 0;
- AutoreleaseRVCallee = 0;
- ReleaseCallee = 0;
- RetainCallee = 0;
- RetainBlockCallee = 0;
- AutoreleaseCallee = 0;
-
- return false;
-}
-
-bool ObjCARCOpt::runOnFunction(Function &F) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!Run)
- return false;
-
- Changed = false;
-
- PA.setAA(&getAnalysis<AliasAnalysis>());
-
- // This pass performs several distinct transformations. As a compile-time aid
- // when compiling code that isn't ObjC, skip these if the relevant ObjC
- // library functions aren't declared.
-
- // Preliminary optimizations. This also computes UsedInThisFunction.
- OptimizeIndividualCalls(F);
-
- // Optimizations for weak pointers.
- if (UsedInThisFunction & ((1 << IC_LoadWeak) |
- (1 << IC_LoadWeakRetained) |
- (1 << IC_StoreWeak) |
- (1 << IC_InitWeak) |
- (1 << IC_CopyWeak) |
- (1 << IC_MoveWeak) |
- (1 << IC_DestroyWeak)))
- OptimizeWeakCalls(F);
-
- // Optimizations for retain+release pairs.
- if (UsedInThisFunction & ((1 << IC_Retain) |
- (1 << IC_RetainRV) |
- (1 << IC_RetainBlock)))
- if (UsedInThisFunction & (1 << IC_Release))
- // Run OptimizeSequences until it either stops making changes or
- // no retain+release pair nesting is detected.
- while (OptimizeSequences(F)) {}
-
- // Optimizations if objc_autorelease is used.
- if (UsedInThisFunction & ((1 << IC_Autorelease) |
- (1 << IC_AutoreleaseRV)))
- OptimizeReturns(F);
-
- return Changed;
-}
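
The runOnFunction driver above gates each sub-optimization on a bitmask of the ObjC call classes recorded by the preliminary scan, so a function that never touches weak references or retain/release skips those phases outright. A rough stand-alone sketch of that gating pattern follows; the enum values and phase bodies are invented for illustration and are not taken from this file.

#include <cstdio>

enum InstClass { IC_Retain, IC_Release, IC_LoadWeak, IC_StoreWeak };

// Run only the phases whose call classes were actually observed.
static void runPhases(unsigned UsedInThisFunction) {
  // Weak-pointer optimizations need at least one weak entry point.
  if (UsedInThisFunction & ((1u << IC_LoadWeak) | (1u << IC_StoreWeak)))
    std::puts("optimize weak calls");

  // Retain/release pairing needs both a retain-like and a release call.
  if ((UsedInThisFunction & (1u << IC_Retain)) &&
      (UsedInThisFunction & (1u << IC_Release)))
    std::puts("optimize retain/release sequences");
}

int main() {
  unsigned Used = 0;
  Used |= 1u << IC_Retain;  // pretend the preliminary scan saw objc_retain
  Used |= 1u << IC_Release; // ...and objc_release
  runPhases(Used);          // prints only the retain/release phase
  return 0;
}

The point is simply that a single unsigned bitmask lets whole phases be skipped for functions that never call the relevant runtime entry points.
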
-
-void ObjCARCOpt::releaseMemory() {
- PA.clear();
-}
-
-//===----------------------------------------------------------------------===//
-// ARC contraction.
-//===----------------------------------------------------------------------===//
-
-// TODO: ObjCARCContract could insert PHI nodes when uses aren't
-// dominated by single calls.
-
-#include "llvm/Operator.h"
-#include "llvm/InlineAsm.h"
-#include "llvm/Analysis/Dominators.h"
-
-STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
-
-namespace {
- /// ObjCARCContract - Late ARC optimizations. These change the IR in a way
- /// that makes it difficult to be analyzed by ObjCARCOpt, so it's run late.
- class ObjCARCContract : public FunctionPass {
- bool Changed;
- AliasAnalysis *AA;
- DominatorTree *DT;
- ProvenanceAnalysis PA;
-
- /// Run - A flag indicating whether this optimization pass should run.
- bool Run;
-
- /// StoreStrongCallee, etc. - Declarations for ObjC runtime
- /// functions, for use in creating calls to them. These are initialized
- /// lazily to avoid cluttering up the Module with unused declarations.
- Constant *StoreStrongCallee,
- *RetainAutoreleaseCallee, *RetainAutoreleaseRVCallee;
-
- /// RetainRVMarker - The inline asm string to insert between calls and
- /// RetainRV calls to make the optimization work on targets which need it.
- const MDString *RetainRVMarker;
-
- /// StoreStrongCalls - The set of inserted objc_storeStrong calls. If
- /// at the end of walking the function we have found no alloca
- /// instructions, these calls can be marked "tail".
- SmallPtrSet<CallInst *, 8> StoreStrongCalls;
-
- Constant *getStoreStrongCallee(Module *M);
- Constant *getRetainAutoreleaseCallee(Module *M);
- Constant *getRetainAutoreleaseRVCallee(Module *M);
-
- bool ContractAutorelease(Function &F, Instruction *Autorelease,
- InstructionClass Class,
- SmallPtrSet<Instruction *, 4>
- &DependingInstructions,
- SmallPtrSet<const BasicBlock *, 4>
- &Visited);
-
- void ContractRelease(Instruction *Release,
- inst_iterator &Iter);
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const;
- virtual bool doInitialization(Module &M);
- virtual bool runOnFunction(Function &F);
-
- public:
- static char ID;
- ObjCARCContract() : FunctionPass(ID) {
- initializeObjCARCContractPass(*PassRegistry::getPassRegistry());
- }
- };
-}
-
-char ObjCARCContract::ID = 0;
-INITIALIZE_PASS_BEGIN(ObjCARCContract,
- "objc-arc-contract", "ObjC ARC contraction", false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
-INITIALIZE_PASS_END(ObjCARCContract,
- "objc-arc-contract", "ObjC ARC contraction", false, false)
-
-Pass *llvm::createObjCARCContractPass() {
- return new ObjCARCContract();
-}
-
-void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AliasAnalysis>();
- AU.addRequired<DominatorTree>();
- AU.setPreservesCFG();
-}
-
-Constant *ObjCARCContract::getStoreStrongCallee(Module *M) {
- if (!StoreStrongCallee) {
- LLVMContext &C = M->getContext();
- Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- Type *I8XX = PointerType::getUnqual(I8X);
- Type *Params[] = { I8XX, I8X };
-
- AttrListPtr Attributes = AttrListPtr()
- .addAttr(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::get(C, Attributes::NoUnwind))
- .addAttr(M->getContext(), 1, Attributes::get(C, Attributes::NoCapture));
-
- StoreStrongCallee =
- M->getOrInsertFunction(
- "objc_storeStrong",
- FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
- Attributes);
- }
- return StoreStrongCallee;
-}
-
-Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) {
- if (!RetainAutoreleaseCallee) {
- LLVMContext &C = M->getContext();
- Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- Type *Params[] = { I8X };
- FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes =
- AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::get(C, Attributes::NoUnwind));
- RetainAutoreleaseCallee =
- M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes);
- }
- return RetainAutoreleaseCallee;
-}
-
-Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) {
- if (!RetainAutoreleaseRVCallee) {
- LLVMContext &C = M->getContext();
- Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- Type *Params[] = { I8X };
- FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes =
- AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::get(C, Attributes::NoUnwind));
- RetainAutoreleaseRVCallee =
- M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy,
- Attributes);
- }
- return RetainAutoreleaseRVCallee;
-}
-
-/// ContractAutorelease - Merge an autorelease with a retain into a fused call.
-bool
-ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease,
- InstructionClass Class,
- SmallPtrSet<Instruction *, 4>
- &DependingInstructions,
- SmallPtrSet<const BasicBlock *, 4>
- &Visited) {
- const Value *Arg = GetObjCArg(Autorelease);
-
- // Check that there are no instructions between the retain and the autorelease
- // (such as an autorelease_pop) which may change the count.
- CallInst *Retain = 0;
- if (Class == IC_AutoreleaseRV)
- FindDependencies(RetainAutoreleaseRVDep, Arg,
- Autorelease->getParent(), Autorelease,
- DependingInstructions, Visited, PA);
- else
- FindDependencies(RetainAutoreleaseDep, Arg,
- Autorelease->getParent(), Autorelease,
- DependingInstructions, Visited, PA);
-
- Visited.clear();
- if (DependingInstructions.size() != 1) {
- DependingInstructions.clear();
- return false;
- }
-
- Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
- DependingInstructions.clear();
-
- if (!Retain ||
- GetBasicInstructionClass(Retain) != IC_Retain ||
- GetObjCArg(Retain) != Arg)
- return false;
-
- Changed = true;
- ++NumPeeps;
-
- if (Class == IC_AutoreleaseRV)
- Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent()));
- else
- Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent()));
-
- EraseInstruction(Autorelease);
- return true;
-}
-
-/// ContractRelease - Attempt to merge an objc_release with a store, load, and
-/// objc_retain to form an objc_storeStrong. This can be a little tricky because
-/// the instructions don't always appear in order, and there may be unrelated
-/// intervening instructions.
-void ObjCARCContract::ContractRelease(Instruction *Release,
- inst_iterator &Iter) {
- LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release));
- if (!Load || !Load->isSimple()) return;
-
- // For now, require everything to be in one basic block.
- BasicBlock *BB = Release->getParent();
- if (Load->getParent() != BB) return;
-
- // Walk down to find the store and the release, which may be in either order.
- BasicBlock::iterator I = Load, End = BB->end();
- ++I;
- AliasAnalysis::Location Loc = AA->getLocation(Load);
- StoreInst *Store = 0;
- bool SawRelease = false;
- for (; !Store || !SawRelease; ++I) {
- if (I == End)
- return;
-
- Instruction *Inst = I;
- if (Inst == Release) {
- SawRelease = true;
- continue;
- }
-
- InstructionClass Class = GetBasicInstructionClass(Inst);
-
- // Unrelated retains are harmless.
- if (IsRetain(Class))
- continue;
-
- if (Store) {
- // The store is the point where we're going to put the objc_storeStrong,
- // so make sure there are no uses after it.
- if (CanUse(Inst, Load, PA, Class))
- return;
- } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) {
- // We are moving the load down to the store, so check for anything
- // else which writes to the memory between the load and the store.
- Store = dyn_cast<StoreInst>(Inst);
- if (!Store || !Store->isSimple()) return;
- if (Store->getPointerOperand() != Loc.Ptr) return;
- }
- }
-
- Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand());
-
- // Walk up to find the retain.
- I = Store;
- BasicBlock::iterator Begin = BB->begin();
- while (I != Begin && GetBasicInstructionClass(I) != IC_Retain)
- --I;
- Instruction *Retain = I;
- if (GetBasicInstructionClass(Retain) != IC_Retain) return;
- if (GetObjCArg(Retain) != New) return;
-
- Changed = true;
- ++NumStoreStrongs;
-
- LLVMContext &C = Release->getContext();
- Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- Type *I8XX = PointerType::getUnqual(I8X);
-
- Value *Args[] = { Load->getPointerOperand(), New };
- if (Args[0]->getType() != I8XX)
- Args[0] = new BitCastInst(Args[0], I8XX, "", Store);
- if (Args[1]->getType() != I8X)
- Args[1] = new BitCastInst(Args[1], I8X, "", Store);
- CallInst *StoreStrong =
- CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()),
- Args, "", Store);
- StoreStrong->setDoesNotThrow();
- StoreStrong->setDebugLoc(Store->getDebugLoc());
-
- // We can't set the tail flag yet, because we haven't yet determined
- // whether there are any escaping allocas. Remember this call, so that
- // we can set the tail flag once we know it's safe.
- StoreStrongCalls.insert(StoreStrong);
-
- if (&*Iter == Store) ++Iter;
- Store->eraseFromParent();
- Release->eraseFromParent();
- EraseInstruction(Retain);
- if (Load->use_empty())
- Load->eraseFromParent();
-}
-
-bool ObjCARCContract::doInitialization(Module &M) {
- // If nothing in the Module uses ARC, don't do anything.
- Run = ModuleHasARC(M);
- if (!Run)
- return false;
-
- // These are initialized lazily.
- StoreStrongCallee = 0;
- RetainAutoreleaseCallee = 0;
- RetainAutoreleaseRVCallee = 0;
-
- // Initialize RetainRVMarker.
- RetainRVMarker = 0;
- if (NamedMDNode *NMD =
- M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker"))
- if (NMD->getNumOperands() == 1) {
- const MDNode *N = NMD->getOperand(0);
- if (N->getNumOperands() == 1)
- if (const MDString *S = dyn_cast<MDString>(N->getOperand(0)))
- RetainRVMarker = S;
- }
-
- return false;
-}
-
-bool ObjCARCContract::runOnFunction(Function &F) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!Run)
- return false;
-
- Changed = false;
- AA = &getAnalysis<AliasAnalysis>();
- DT = &getAnalysis<DominatorTree>();
-
- PA.setAA(&getAnalysis<AliasAnalysis>());
-
- // Track whether it's ok to mark objc_storeStrong calls with the "tail"
- // keyword. Be conservative if the function has variadic arguments.
- // It seems that functions which "return twice" are also unsafe for the
- // "tail" argument, because they are setjmp, which could need to
- // return to an earlier stack state.
- bool TailOkForStoreStrongs = !F.isVarArg() &&
- !F.callsFunctionThatReturnsTwice();
-
- // For ObjC library calls which return their argument, replace uses of the
- // argument with uses of the call return value, if it dominates the use. This
- // reduces register pressure.
- SmallPtrSet<Instruction *, 4> DependingInstructions;
- SmallPtrSet<const BasicBlock *, 4> Visited;
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
-
- // Only these library routines return their argument. In particular,
- // objc_retainBlock does not necessarily return its argument.
- InstructionClass Class = GetBasicInstructionClass(Inst);
- switch (Class) {
- case IC_Retain:
- case IC_FusedRetainAutorelease:
- case IC_FusedRetainAutoreleaseRV:
- break;
- case IC_Autorelease:
- case IC_AutoreleaseRV:
- if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited))
- continue;
- break;
- case IC_RetainRV: {
- // If we're compiling for a target which needs a special inline-asm
- // marker to do the retainAutoreleasedReturnValue optimization,
- // insert it now.
- if (!RetainRVMarker)
- break;
- BasicBlock::iterator BBI = Inst;
- BasicBlock *InstParent = Inst->getParent();
-
- // Step up to see if the call immediately precedes the RetainRV call.
- // If it's an invoke, we have to cross a block boundary. And we have
- // to carefully dodge no-op instructions.
- do {
- if (&*BBI == InstParent->begin()) {
- BasicBlock *Pred = InstParent->getSinglePredecessor();
- if (!Pred)
- goto decline_rv_optimization;
- BBI = Pred->getTerminator();
- break;
- }
- --BBI;
- } while (isNoopInstruction(BBI));
-
- if (&*BBI == GetObjCArg(Inst)) {
- Changed = true;
- InlineAsm *IA =
- InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()),
- /*isVarArg=*/false),
- RetainRVMarker->getString(),
- /*Constraints=*/"", /*hasSideEffects=*/true);
- CallInst::Create(IA, "", Inst);
- }
- decline_rv_optimization:
- break;
- }
- case IC_InitWeak: {
- // objc_initWeak(p, null) => *p = null
- CallInst *CI = cast<CallInst>(Inst);
- if (isNullOrUndef(CI->getArgOperand(1))) {
- Value *Null =
- ConstantPointerNull::get(cast<PointerType>(CI->getType()));
- Changed = true;
- new StoreInst(Null, CI->getArgOperand(0), CI);
- CI->replaceAllUsesWith(Null);
- CI->eraseFromParent();
- }
- continue;
- }
- case IC_Release:
- ContractRelease(Inst, I);
- continue;
- case IC_User:
- // Be conservative if the function has any alloca instructions.
- // Technically we only care about escaping alloca instructions,
- // but this is sufficient to handle some interesting cases.
- if (isa<AllocaInst>(Inst))
- TailOkForStoreStrongs = false;
- continue;
- default:
- continue;
- }
-
- // Don't use GetObjCArg because we don't want to look through bitcasts
- // and such; to do the replacement, the argument must have type i8*.
- const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
- for (;;) {
- // If we're compiling bugpointed code, don't get in trouble.
- if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
- break;
- // Look through the uses of the pointer.
- for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
- UI != UE; ) {
- Use &U = UI.getUse();
- unsigned OperandNo = UI.getOperandNo();
- ++UI; // Increment UI now, because we may unlink its element.
-
- // If the call's return value dominates a use of the call's argument
- // value, rewrite the use to use the return value. We check for
- // reachability here because an unreachable call is considered to
- // trivially dominate itself, which would lead us to rewriting its
- // argument in terms of its return value, which would lead to
- // infinite loops in GetObjCArg.
- if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) {
- Changed = true;
- Instruction *Replacement = Inst;
- Type *UseTy = U.get()->getType();
- if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
- // For PHI nodes, insert the bitcast in the predecessor block.
- unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
- BasicBlock *BB = PHI->getIncomingBlock(ValNo);
- if (Replacement->getType() != UseTy)
- Replacement = new BitCastInst(Replacement, UseTy, "",
- &BB->back());
- // While we're here, rewrite all edges for this PHI, rather
- // than just one use at a time, to minimize the number of
- // bitcasts we emit.
- for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
- if (PHI->getIncomingBlock(i) == BB) {
- // Keep the UI iterator valid.
- if (&PHI->getOperandUse(
- PHINode::getOperandNumForIncomingValue(i)) ==
- &UI.getUse())
- ++UI;
- PHI->setIncomingValue(i, Replacement);
- }
- } else {
- if (Replacement->getType() != UseTy)
- Replacement = new BitCastInst(Replacement, UseTy, "",
- cast<Instruction>(U.getUser()));
- U.set(Replacement);
- }
- }
- }
-
- // If Arg is a no-op casted pointer, strip one level of casts and iterate.
- if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg))
- Arg = BI->getOperand(0);
- else if (isa<GEPOperator>(Arg) &&
- cast<GEPOperator>(Arg)->hasAllZeroIndices())
- Arg = cast<GEPOperator>(Arg)->getPointerOperand();
- else if (isa<GlobalAlias>(Arg) &&
- !cast<GlobalAlias>(Arg)->mayBeOverridden())
- Arg = cast<GlobalAlias>(Arg)->getAliasee();
- else
- break;
- }
- }
-
- // If this function has no escaping allocas or suspicious vararg usage,
- // objc_storeStrong calls can be marked with the "tail" keyword.
- if (TailOkForStoreStrongs)
- for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(),
- E = StoreStrongCalls.end(); I != E; ++I)
- (*I)->setTailCall();
- StoreStrongCalls.clear();
-
- return Changed;
-}
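
ComputePostOrders, used by the Visit routine deleted above, walks the CFG iteratively with an explicit stack of (block, iterator) pairs rather than recursion, and its callers then consume the recorded postorders in reverse. The same explicit-stack postorder technique, reduced to a toy adjacency-list graph with made-up structure (illustrative only, not LLVM code):

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Toy CFG: 0 -> {1, 2}, 1 -> {3}, 2 -> {3}, 3 -> {}.
  std::vector<std::vector<int> > Succs = {{1, 2}, {3}, {3}, {}};
  std::vector<bool> Visited(Succs.size(), false);
  std::vector<int> PostOrder;

  // Explicit DFS stack of (node, index of next successor to try) pairs,
  // mirroring the (block, iterator) pairs used by ComputePostOrders.
  std::vector<std::pair<int, unsigned> > Stack;
  Stack.push_back(std::make_pair(0, 0u));
  Visited[0] = true;

  while (!Stack.empty()) {
    int Node = Stack.back().first;
    bool Descended = false;
    while (Stack.back().second < Succs[Node].size()) {
      int S = Succs[Node][Stack.back().second++];
      if (!Visited[S]) {
        Visited[S] = true;
        Stack.push_back(std::make_pair(S, 0u));
        Descended = true;
        break;
      }
    }
    if (!Descended) {
      // All successors done: the node is finished, record it in postorder.
      PostOrder.push_back(Node);
      Stack.pop_back();
    }
  }

  for (unsigned i = 0, e = PostOrder.size(); i != e; ++i)
    std::printf("%d ", PostOrder[i]); // prints "3 1 2 0"
  std::printf("\n");
  return 0;
}

Walking the recorded vector backwards yields the reverse-postorder visit that the top-down and bottom-up passes above iterate over.
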
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 7a40797..7ee4027 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -22,24 +22,24 @@
#define DEBUG_TYPE "reassociate"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Pass.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
@@ -110,6 +110,55 @@ namespace {
}
};
};
+
+ /// Utility class representing a non-constant Xor-operand. We classify
+ /// non-constant Xor-Operands into two categories:
+ /// C1) The operand is in the form "X & C", where C is a constant and C != ~0
+ /// C2)
+ /// C2.1) The operand is in the form of "X | C", where C is a non-zero
+ /// constant.
+ /// C2.2) Any operand E which doesn't fall into C1 or C2.1; we view such an
+ /// operand as "E | 0".
+ class XorOpnd {
+ public:
+ XorOpnd(Value *V);
+ const XorOpnd &operator=(const XorOpnd &That);
+
+ bool isInvalid() const { return SymbolicPart == 0; }
+ bool isOrExpr() const { return isOr; }
+ Value *getValue() const { return OrigVal; }
+ Value *getSymbolicPart() const { return SymbolicPart; }
+ unsigned getSymbolicRank() const { return SymbolicRank; }
+ const APInt &getConstPart() const { return ConstPart; }
+
+ void Invalidate() { SymbolicPart = OrigVal = 0; }
+ void setSymbolicRank(unsigned R) { SymbolicRank = R; }
+
+ // Sort the XorOpnd-Pointer in ascending order of symbolic-value-rank.
+ // The purpose is twofold:
+ // 1) Cluster together the operands sharing the same symbolic-value.
+ // 2) An operand having a smaller symbolic-value-rank is permuted earlier,
+ // which could potentially shorten the critical path and expose more
+ // loop-invariants. Note that values' ranks are basically defined in RPO
+ // order (FIXME). So, if Rank(X) < Rank(Y) < Rank(Z), X is defined earlier
+ // than Y, which is defined earlier than Z. Permuting "X | 1", "Y & 2", "Z"
+ // in the order X-Y-Z is better than any other order.
+ class PtrSortFunctor {
+ ArrayRef<XorOpnd> A;
+
+ public:
+ PtrSortFunctor(ArrayRef<XorOpnd> Array) : A(Array) {}
+ bool operator()(unsigned LHSIndex, unsigned RHSIndex) {
+ return A[LHSIndex].getSymbolicRank() < A[RHSIndex].getSymbolicRank();
+ }
+ };
+ private:
+ Value *OrigVal;
+ Value *SymbolicPart;
+ APInt ConstPart;
+ unsigned SymbolicRank;
+ bool isOr;
+ };
}
namespace {
@@ -137,6 +186,11 @@ namespace {
Value *OptimizeExpression(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops);
Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
+ Value *OptimizeXor(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
+ bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt &ConstOpnd,
+ Value *&Res);
+ bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
+ APInt &ConstOpnd, Value *&Res);
bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
SmallVectorImpl<Factor> &Factors);
Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder,
@@ -148,6 +202,42 @@ namespace {
};
}
+XorOpnd::XorOpnd(Value *V) {
+ assert(!isa<ConstantInt>(V) && "No ConstantInt");
+ OrigVal = V;
+ Instruction *I = dyn_cast<Instruction>(V);
+ SymbolicRank = 0;
+
+ if (I && (I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::And)) {
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ if (isa<ConstantInt>(V0))
+ std::swap(V0, V1);
+
+ if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) {
+ ConstPart = C->getValue();
+ SymbolicPart = V0;
+ isOr = (I->getOpcode() == Instruction::Or);
+ return;
+ }
+ }
+
+ // view the operand as "V | 0"
+ SymbolicPart = V;
+ ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth());
+ isOr = true;
+}
+
+const XorOpnd &XorOpnd::operator=(const XorOpnd &That) {
+ OrigVal = That.OrigVal;
+ SymbolicPart = That.SymbolicPart;
+ ConstPart = That.ConstPart;
+ SymbolicRank = That.SymbolicRank;
+ isOr = That.isOr;
+ return *this;
+}
+
char Reassociate::ID = 0;
INITIALIZE_PASS(Reassociate, "reassociate",
"Reassociate expressions", false, false)
@@ -423,10 +513,6 @@ static bool LinearizeExprTree(BinaryOperator *I,
assert(Instruction::isAssociative(Opcode) &&
Instruction::isCommutative(Opcode) &&
"Expected an associative and commutative operation!");
- // If we see an absorbing element then the entire expression must be equal to
- // it. For example, if this is a multiplication expression and zero occurs as
- // an operand somewhere in it then the result of the expression must be zero.
- Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType());
// Visit all operands of the expression, keeping track of their weight (the
// number of paths from the expression root to the operand, or if you like
@@ -474,13 +560,6 @@ static bool LinearizeExprTree(BinaryOperator *I,
DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
assert(!Op->use_empty() && "No uses, so how did we get to it?!");
- // If the expression contains an absorbing element then there is no need
- // to analyze it further: it must evaluate to the absorbing element.
- if (Op == Absorber && !Weight.isMinValue()) {
- Ops.push_back(std::make_pair(Absorber, APInt(Bitwidth, 1)));
- return MadeChange;
- }
-
// If this is a binary operation of the right kind with only one use then
// add its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
@@ -1051,6 +1130,241 @@ static Value *OptimizeAndOrXor(unsigned Opcode,
return 0;
}
+/// Helper function of CombineXorOpnd(). It creates a bitwise-and
+/// instruction with the given two operands, and returns the resulting
+/// instruction. There are two special cases: 1) if the constant operand is 0,
+/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
+/// be returned.
+static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
+ const APInt &ConstOpnd) {
+ if (ConstOpnd != 0) {
+ if (!ConstOpnd.isAllOnesValue()) {
+ LLVMContext &Ctx = Opnd->getType()->getContext();
+ Instruction *I;
+ I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd),
+ "and.ra", InsertBefore);
+ I->setDebugLoc(InsertBefore->getDebugLoc());
+ return I;
+ }
+ return Opnd;
+ }
+ return 0;
+}
+
+// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
+// into "R ^ C", where C would be 0, and R is a symbolic value.
+//
+// If it was successful, true is returned, and "R" and "C" are returned
+// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
+// and both "Res" and "ConstOpnd" remain unchanged.
+//
+bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+ APInt &ConstOpnd, Value *&Res) {
+ // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
+ // = ((x | c1) ^ c1) ^ (c1 ^ c2)
+ // = (x & ~c1) ^ (c1 ^ c2)
+ // It is useful only when c1 == c2.
+ if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) {
+ if (!Opnd1->getValue()->hasOneUse())
+ return false;
+
+ const APInt &C1 = Opnd1->getConstPart();
+ if (C1 != ConstOpnd)
+ return false;
+
+ Value *X = Opnd1->getSymbolicPart();
+ Res = createAndInstr(I, X, ~C1);
+ // ConstOpnd was C2, now C1 ^ C2.
+ ConstOpnd ^= C1;
+
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ return true;
+ }
+ return false;
+}
+
+
+// Helper function of OptimizeXor(). It tries to simplify
+// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
+// symbolic value.
+//
+// If it was successful, true is returned, and "R" and "C" are returned
+// via "Res" and "ConstOpnd", respectively (if the entire expression is
+// evaluated to a constant, Res is set to NULL); otherwise, false is
+// returned, and both "Res" and "ConstOpnd" remain unchanged.
+bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
+ APInt &ConstOpnd, Value *&Res) {
+ Value *X = Opnd1->getSymbolicPart();
+ if (X != Opnd2->getSymbolicPart())
+ return false;
+
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+
+ // This many instructions become dead. (At least "Opnd1 ^ Opnd2" will die.)
+ int DeadInstNum = 1;
+ if (Opnd1->getValue()->hasOneUse())
+ DeadInstNum++;
+ if (Opnd2->getValue()->hasOneUse())
+ DeadInstNum++;
+
+ // Xor-Rule 2:
+ // (x | c1) ^ (x & c2)
+ // = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
+ // = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1
+ // = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3
+ //
+ if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
+ if (Opnd2->isOrExpr())
+ std::swap(Opnd1, Opnd2);
+
+ APInt C3((~C1) ^ C2);
+
+ // Do not increase code size!
+ if (C3 != 0 && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C1;
+
+ } else if (Opnd1->isOrExpr()) {
+ // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
+ //
+ APInt C3 = C1 ^ C2;
+
+ // Do not increase code size
+ if (C3 != 0 && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C3;
+ } else {
+ // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
+ //
+ APInt C3 = C1 ^ C2;
+ Res = createAndInstr(I, X, C3);
+ }
+
+ // Put the original operands in the Redo list; hope they will be deleted
+ // as dead code.
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
+ RedoInsts.insert(T);
+
+ return true;
+}
+
+/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
+/// to a single Value, it is returned, otherwise the Ops list is mutated as
+/// necessary.
+Value *Reassociate::OptimizeXor(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
+ return V;
+
+ if (Ops.size() == 1)
+ return 0;
+
+ SmallVector<XorOpnd, 8> Opnds;
+ SmallVector<unsigned, 8> OpndIndices;
+ Type *Ty = Ops[0].Op->getType();
+ APInt ConstOpnd(Ty->getIntegerBitWidth(), 0);
+
+ // Step 1: Convert ValueEntry to XorOpnd
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ Value *V = Ops[i].Op;
+ if (!isa<ConstantInt>(V)) {
+ XorOpnd O(V);
+ O.setSymbolicRank(getRank(O.getSymbolicPart()));
+ Opnds.push_back(O);
+ OpndIndices.push_back(Opnds.size() - 1);
+ } else
+ ConstOpnd ^= cast<ConstantInt>(V)->getValue();
+ }
+
+ // Step 2: Sort the Xor-Operands in a way such that the operands containing
+ // the same symbolic value cluster together. For instance, the input operand
+ // sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
+ // ("x | 123", "x & 789", "y & 456").
+ std::sort(OpndIndices.begin(), OpndIndices.end(),
+ XorOpnd::PtrSortFunctor(Opnds));
+
+ // Step 3: Combine adjacent operands
+ XorOpnd *PrevOpnd = 0;
+ bool Changed = false;
+ for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
+ XorOpnd *CurrOpnd = &Opnds[OpndIndices[i]];
+ // The combined value
+ Value *CV;
+
+ // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
+ if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+ Changed = true;
+ if (CV)
+ *CurrOpnd = XorOpnd(CV);
+ else {
+ CurrOpnd->Invalidate();
+ continue;
+ }
+ }
+
+ if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
+ PrevOpnd = CurrOpnd;
+ continue;
+ }
+
+ // Step 3.2: When previous and current operands share the same symbolic
+ // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd"
+ //
+ if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
+ // Remove previous operand
+ PrevOpnd->Invalidate();
+ if (CV) {
+ *CurrOpnd = XorOpnd(CV);
+ PrevOpnd = CurrOpnd;
+ } else {
+ CurrOpnd->Invalidate();
+ PrevOpnd = 0;
+ }
+ Changed = true;
+ }
+ }
+
+ // Step 4: Reassemble the Ops
+ if (Changed) {
+ Ops.clear();
+ for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
+ XorOpnd &O = Opnds[i];
+ if (O.isInvalid())
+ continue;
+ ValueEntry VE(getRank(O.getValue()), O.getValue());
+ Ops.push_back(VE);
+ }
+ if (ConstOpnd != 0) {
+ Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd);
+ ValueEntry VE(getRank(C), C);
+ Ops.push_back(VE);
+ }
+ int Sz = Ops.size();
+ if (Sz == 1)
+ return Ops.back().Op;
+ else if (Sz == 0) {
+ assert(ConstOpnd == 0);
+ return ConstantInt::get(Ty->getContext(), ConstOpnd);
+ }
+ }
+
+ return 0;
+}
+
/// OptimizeAdd - Optimize a series of operands to an 'add' instruction. This
/// optimizes based on identities. If it can be reduced to a single Value, it
/// is returned, otherwise the Ops list is mutated as necessary.
@@ -1442,11 +1756,15 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
default: break;
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor:
if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
return Result;
break;
+ case Instruction::Xor:
+ if (Value *Result = OptimizeXor(I, Ops))
+ return Result;
+ break;
+
case Instruction::Add:
if (Value *Result = OptimizeAdd(I, Ops))
return Result;
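
The Xor-Rules cited in the CombineXorOpnd comments above are ordinary Boolean identities over the constant parts, so they can be verified exhaustively at a small bit width. A throwaway brute-force check using only the standard library (a sanity sketch, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned c1 = 0; c1 < 256; ++c1)
      for (unsigned c2 = 0; c2 < 256; ++c2) {
        // Compare both sides truncated to 8 bits.
        uint8_t r1a = (x | c1) ^ c2;
        uint8_t r1b = (x & ~c1) ^ (c1 ^ c2);
        assert(r1a == r1b); // Xor-Rule 1

        uint8_t r2a = (x | c1) ^ (x & c2);
        uint8_t r2b = (x & (~c1 ^ c2)) ^ c1;
        assert(r2a == r2b); // Xor-Rule 2

        uint8_t r3a = (x | c1) ^ (x | c2);
        uint8_t r3b = (x & (c1 ^ c2)) ^ (c1 ^ c2);
        assert(r3a == r3b); // Xor-Rule 3

        uint8_t r4a = (x & c1) ^ (x & c2);
        uint8_t r4b = x & (c1 ^ c2);
        assert(r4a == r4b); // Xor-Rule 4
      }
  return 0; // reaching here means all four identities hold for 8 bits
}

The same identities hold at any width, which is what lets OptimizeXor fold pairs of "X op C" operands sharing a symbolic value into a single and/xor against a combined constant.
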
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
index ea1de63..07f540a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -18,15 +18,15 @@
#define DEBUG_TYPE "reg2mem"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Pass.h"
-#include "llvm/Function.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/BasicBlock.h"
-#include "llvm/Instructions.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <list>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
index 686520e..e30a274 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -19,26 +19,26 @@
#define DEBUG_TYPE "sccp"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/Pass.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/InstVisitor.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
@@ -271,13 +271,6 @@ public:
return I->second;
}
- /*LatticeVal getStructLatticeValueFor(Value *V, unsigned i) const {
- DenseMap<std::pair<Value*, unsigned>, LatticeVal>::const_iterator I =
- StructValueState.find(std::make_pair(V, i));
- assert(I != StructValueState.end() && "V is not in valuemap!");
- return I->second;
- }*/
-
/// getTrackedRetVals - Get the inferred return value map.
///
const DenseMap<Function*, LatticeVal> &getTrackedRetVals() {
@@ -710,9 +703,6 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
markConstant(&PN, OperandVal); // Acquire operand value
}
-
-
-
void SCCPSolver::visitReturnInst(ReturnInst &I) {
if (I.getNumOperands() == 0) return; // ret void
@@ -1185,7 +1175,7 @@ void SCCPSolver::Solve() {
DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
// "I" got into the work list because it either made the transition from
- // bottom to constant
+ // bottom to constant, or to overdefined.
//
// Anything on this worklist that is overdefined need not be visited
// since all of its users will have already been marked as overdefined
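
The comment reworded above refers to SCCP's three-point lattice: a value starts at bottom (unknown), may be raised to a single constant, and collapses to overdefined once conflicting facts meet. A tiny stand-alone sketch of that merge rule, loosely modeled on SCCP's LatticeVal but with invented names and not the real solver API:

#include <cassert>

// Toy three-point lattice: unknown (bottom) -> constant -> overdefined.
struct ToyLattice {
  enum State { Unknown, Constant, Overdefined } St;
  int Val; // meaningful only when St == Constant

  ToyLattice() : St(Unknown), Val(0) {}

  // Merge another lattice value into this one. Returns true when this value
  // changed, i.e. it made a bottom-to-constant or to-overdefined transition
  // and would belong back on a worklist.
  bool merge(const ToyLattice &Other) {
    if (Other.St == Unknown || St == Overdefined)
      return false;
    if (St == Unknown) {
      *this = Other;
      return true;
    }
    if (Other.St == Overdefined || Other.Val != Val) {
      St = Overdefined;
      return true;
    }
    return false; // same constant: no change
  }
};

int main() {
  ToyLattice V, C1, C2;
  C1.St = ToyLattice::Constant; C1.Val = 7;
  C2.St = ToyLattice::Constant; C2.Val = 9;

  assert(V.merge(C1));  // bottom -> constant 7: changed
  assert(!V.merge(C1)); // same constant again: no change
  assert(V.merge(C2));  // conflicting constant -> overdefined: changed
  assert(V.St == ToyLattice::Overdefined);
  return 0;
}
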
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
index 640ea31..f6bb365 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -25,44 +25,47 @@
#define DEBUG_TYPE "sroa"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/DIBuilder.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Operator.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/InstVisitor.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
-STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
-STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
+STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
+STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions");
+STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses found");
+STATISTIC(MaxPartitionUsesPerAlloca, "Maximum number of partition uses");
+STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
+STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
-STATISTIC(NumDeleted, "Number of instructions deleted");
-STATISTIC(NumVectorized, "Number of vectorized aggregates");
+STATISTIC(NumDeleted, "Number of instructions deleted");
+STATISTIC(NumVectorized, "Number of vectorized aggregates");
/// Hidden option to force the pass to not use DomTree and mem2reg, instead
/// forming SSA values through the SSAUpdater infrastructure.
@@ -70,112 +73,167 @@ static cl::opt<bool>
ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);
namespace {
-/// \brief Alloca partitioning representation.
-///
-/// This class represents a partitioning of an alloca into slices, and
-/// information about the nature of uses of each slice of the alloca. The goal
-/// is that this information is sufficient to decide if and how to split the
-/// alloca apart and replace slices with scalars. It is also intended that this
-/// structure can capture the relevant information needed both to decide about
-/// and to enact these transformations.
-class AllocaPartitioning {
+/// \brief A custom IRBuilder inserter which prefixes all names if they are
+/// preserved.
+template <bool preserveNames = true>
+class IRBuilderPrefixedInserter :
+ public IRBuilderDefaultInserter<preserveNames> {
+ std::string Prefix;
+
public:
- /// \brief A common base class for representing a half-open byte range.
- struct ByteRange {
- /// \brief The beginning offset of the range.
- uint64_t BeginOffset;
+ void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
- /// \brief The ending offset, not included in the range.
- uint64_t EndOffset;
+protected:
+ void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
+ BasicBlock::iterator InsertPt) const {
+ IRBuilderDefaultInserter<preserveNames>::InsertHelper(
+ I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt);
+ }
+};
- ByteRange() : BeginOffset(), EndOffset() {}
- ByteRange(uint64_t BeginOffset, uint64_t EndOffset)
- : BeginOffset(BeginOffset), EndOffset(EndOffset) {}
+// Specialization for not preserving the name is trivial.
+template <>
+class IRBuilderPrefixedInserter<false> :
+ public IRBuilderDefaultInserter<false> {
+public:
+ void SetNamePrefix(const Twine &P) {}
+};
- /// \brief Support for ordering ranges.
- ///
- /// This provides an ordering over ranges such that start offsets are
- /// always increasing, and within equal start offsets, the end offsets are
- /// decreasing. Thus the spanning range comes first in a cluster with the
- /// same start position.
- bool operator<(const ByteRange &RHS) const {
- if (BeginOffset < RHS.BeginOffset) return true;
- if (BeginOffset > RHS.BeginOffset) return false;
- if (EndOffset > RHS.EndOffset) return true;
- return false;
- }
+/// \brief Provide a typedef for IRBuilder that drops names in release builds.
+#ifndef NDEBUG
+typedef llvm::IRBuilder<true, ConstantFolder,
+ IRBuilderPrefixedInserter<true> > IRBuilderTy;
+#else
+typedef llvm::IRBuilder<false, ConstantFolder,
+ IRBuilderPrefixedInserter<false> > IRBuilderTy;
+#endif
+}
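
The typedef above gives the rewriter a builder whose inserter prefixes every value name in +Asserts builds and drops names entirely otherwise. A minimal, illustrative sketch of how it is meant to be driven; the insertion point, prefix string, and operands are hypothetical, and A and B are assumed to be integer values of the same type:

// Illustrative only: using the IRBuilderTy defined above. SetNamePrefix comes
// from the inserter the builder inherits; in a -Asserts build the names (and
// the prefix) simply vanish.
static void emitPrefixedAdd(Instruction *InsertPt, Value *A, Value *B) {
  IRBuilderTy IRB(InsertPt);                // insert before InsertPt
  IRB.SetNamePrefix("alloca.0.");           // hypothetical partition prefix
  Value *Sum = IRB.CreateAdd(A, B, "sum");  // named "alloca.0.sum" in +Asserts
  (void)Sum;
}
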
- /// \brief Support comparison with a single offset to allow binary searches.
- friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) {
- return LHS.BeginOffset < RHSOffset;
- }
+namespace {
+/// \brief A common base class for representing a half-open byte range.
+struct ByteRange {
+ /// \brief The beginning offset of the range.
+ uint64_t BeginOffset;
- friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
- const ByteRange &RHS) {
- return LHSOffset < RHS.BeginOffset;
- }
+ /// \brief The ending offset, not included in the range.
+ uint64_t EndOffset;
- bool operator==(const ByteRange &RHS) const {
- return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
- }
- bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); }
- };
+ ByteRange() : BeginOffset(), EndOffset() {}
+ ByteRange(uint64_t BeginOffset, uint64_t EndOffset)
+ : BeginOffset(BeginOffset), EndOffset(EndOffset) {}
- /// \brief A partition of an alloca.
+ /// \brief Support for ordering ranges.
///
- /// This structure represents a contiguous partition of the alloca. These are
- /// formed by examining the uses of the alloca. During formation, they may
- /// overlap but once an AllocaPartitioning is built, the Partitions within it
- /// are all disjoint.
- struct Partition : public ByteRange {
- /// \brief Whether this partition is splittable into smaller partitions.
- ///
- /// We flag partitions as splittable when they are formed entirely due to
- /// accesses by trivially splittable operations such as memset and memcpy.
- bool IsSplittable;
+ /// This provides an ordering over ranges such that start offsets are
+ /// always increasing, and within equal start offsets, the end offsets are
+ /// decreasing. Thus the spanning range comes first in a cluster with the
+ /// same start position.
+ bool operator<(const ByteRange &RHS) const {
+ if (BeginOffset < RHS.BeginOffset) return true;
+ if (BeginOffset > RHS.BeginOffset) return false;
+ if (EndOffset > RHS.EndOffset) return true;
+ return false;
+ }
- /// \brief Test whether a partition has been marked as dead.
- bool isDead() const {
- if (BeginOffset == UINT64_MAX) {
- assert(EndOffset == UINT64_MAX);
- return true;
- }
- return false;
- }
+ /// \brief Support comparison with a single offset to allow binary searches.
+ friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) {
+ return LHS.BeginOffset < RHSOffset;
+ }
+
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
+ const ByteRange &RHS) {
+ return LHSOffset < RHS.BeginOffset;
+ }
+
+ bool operator==(const ByteRange &RHS) const {
+ return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
+ }
+ bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); }
+};
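
To make the ordering above concrete, here is a small in-file sketch with made-up offsets (assumes <algorithm> and <vector> are available): after sorting, a range that spans a cluster sorts ahead of the narrower ranges it contains, which is what later walks over clusters rely on.

// Sketch of the ByteRange ordering with made-up offsets.
static void byteRangeOrderingExample() {
  std::vector<ByteRange> Rs;
  Rs.push_back(ByteRange(8, 12));
  Rs.push_back(ByteRange(0, 16)); // spans everything below
  Rs.push_back(ByteRange(0, 4));
  std::sort(Rs.begin(), Rs.end());
  assert(Rs[0] == ByteRange(0, 16)); // the spanning range comes first
  assert(Rs[1] == ByteRange(0, 4));  // then narrower ranges, begin ascending
  assert(Rs[2] == ByteRange(8, 12));
}
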
- /// \brief Kill a partition.
- /// This is accomplished by setting both its beginning and end offset to
- /// the maximum possible value.
- void kill() {
- assert(!isDead() && "He's Dead, Jim!");
- BeginOffset = EndOffset = UINT64_MAX;
+/// \brief A partition of an alloca.
+///
+/// This structure represents a contiguous partition of the alloca. These are
+/// formed by examining the uses of the alloca. During formation, they may
+/// overlap but once an AllocaPartitioning is built, the Partitions within it
+/// are all disjoint.
+struct Partition : public ByteRange {
+ /// \brief Whether this partition is splittable into smaller partitions.
+ ///
+ /// We flag partitions as splittable when they are formed entirely due to
+ /// accesses by trivially splittable operations such as memset and memcpy.
+ bool IsSplittable;
+
+ /// \brief Test whether a partition has been marked as dead.
+ bool isDead() const {
+ if (BeginOffset == UINT64_MAX) {
+ assert(EndOffset == UINT64_MAX);
+ return true;
}
+ return false;
+ }
- Partition() : ByteRange(), IsSplittable() {}
- Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable)
- : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {}
- };
+ /// \brief Kill a partition.
+ /// This is accomplished by setting both its beginning and end offset to
+ /// the maximum possible value.
+ void kill() {
+ assert(!isDead() && "He's Dead, Jim!");
+ BeginOffset = EndOffset = UINT64_MAX;
+ }
+
+ Partition() : ByteRange(), IsSplittable() {}
+ Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable)
+ : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {}
+};
+
+/// \brief A particular use of a partition of the alloca.
+///
+/// This structure is used to associate uses of a partition with it. They
+/// mark the range of bytes which are referenced by a particular instruction,
+/// and include a handle to the user itself and the pointer value in use.
+/// The bounds of these uses are determined by intersecting the bounds of the
+/// memory use itself with a particular partition. As a consequence there is
+/// intentional overlap between various uses of the same partition.
+class PartitionUse : public ByteRange {
+ /// \brief Combined storage for both the Use* and split state.
+ PointerIntPair<Use*, 1, bool> UsePtrAndIsSplit;
+
+public:
+ PartitionUse() : ByteRange(), UsePtrAndIsSplit() {}
+ PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U,
+ bool IsSplit)
+ : ByteRange(BeginOffset, EndOffset), UsePtrAndIsSplit(U, IsSplit) {}
- /// \brief A particular use of a partition of the alloca.
+ /// \brief The use in question. Provides access to both user and used value.
///
- /// This structure is used to associate uses of a partition with it. They
- /// mark the range of bytes which are referenced by a particular instruction,
- /// and includes a handle to the user itself and the pointer value in use.
- /// The bounds of these uses are determined by intersecting the bounds of the
- /// memory use itself with a particular partition. As a consequence there is
- /// intentionally overlap between various uses of the same partition.
- struct PartitionUse : public ByteRange {
- /// \brief The use in question. Provides access to both user and used value.
- ///
- /// Note that this may be null if the partition use is *dead*, that is, it
- /// should be ignored.
- Use *U;
+ /// Note that this may be null if the partition use is *dead*, that is, it
+ /// should be ignored.
+ Use *getUse() const { return UsePtrAndIsSplit.getPointer(); }
- PartitionUse() : ByteRange(), U() {}
- PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U)
- : ByteRange(BeginOffset, EndOffset), U(U) {}
- };
+ /// \brief Set the use for this partition use range.
+ void setUse(Use *U) { UsePtrAndIsSplit.setPointer(U); }
+
+ /// \brief Whether this use is split across multiple partitions.
+ bool isSplit() const { return UsePtrAndIsSplit.getInt(); }
+};
+}
+namespace llvm {
+template <> struct isPodLike<Partition> : llvm::true_type {};
+template <> struct isPodLike<PartitionUse> : llvm::true_type {};
+}
+
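
A brief sketch of the packed storage PartitionUse now uses (the Use pointer argument is a placeholder): PointerIntPair keeps the split flag in the pointer's spare low bits, so clearing the use to mark the entry dead does not disturb the flag.

// Sketch only; "SomeUse" stands in for whatever Use* the builder recorded.
static void partitionUseExample(Use *SomeUse) {
  PartitionUse PU(/*BeginOffset=*/0, /*EndOffset=*/8, SomeUse,
                  /*IsSplit=*/true);
  assert(PU.getUse() == SomeUse && PU.isSplit());
  PU.setUse(0);          // a null use marks the entry dead; rewriters skip it
  assert(!PU.getUse());
  assert(PU.isSplit());  // the split bit is stored separately from the pointer
}
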
+namespace {
+/// \brief Alloca partitioning representation.
+///
+/// This class represents a partitioning of an alloca into slices, and
+/// information about the nature of uses of each slice of the alloca. The goal
+/// is that this information is sufficient to decide if and how to split the
+/// alloca apart and replace slices with scalars. It is also intended that this
+/// structure can capture the relevant information needed both to decide about
+/// and to enact these transformations.
+class AllocaPartitioning {
+public:
/// \brief Construct a partitioning of a particular alloca.
///
/// Construction does most of the work for partitioning the alloca. This
@@ -334,7 +392,7 @@ private:
class UseBuilder;
friend class AllocaPartitioning::UseBuilder;
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Handle to alloca instruction to simplify method interfaces.
AllocaInst &AI;
#endif
@@ -404,106 +462,17 @@ private:
};
}
-template <typename DerivedT, typename RetT>
-class AllocaPartitioning::BuilderBase
- : public InstVisitor<DerivedT, RetT> {
-public:
- BuilderBase(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
- : TD(TD),
- AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
- P(P) {
- enqueueUsers(AI, 0);
- }
-
-protected:
- const DataLayout &TD;
- const uint64_t AllocSize;
- AllocaPartitioning &P;
-
- SmallPtrSet<Use *, 8> VisitedUses;
+static Value *foldSelectInst(SelectInst &SI) {
+ // If the condition being selected on is a constant or the same value is
+ // being selected between, fold the select. Yes this does (rarely) happen
+ // early on.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
+ return SI.getOperand(1+CI->isZero());
+ if (SI.getOperand(1) == SI.getOperand(2))
+ return SI.getOperand(1);
- struct OffsetUse {
- Use *U;
- int64_t Offset;
- };
- SmallVector<OffsetUse, 8> Queue;
-
- // The active offset and use while visiting.
- Use *U;
- int64_t Offset;
-
- void enqueueUsers(Instruction &I, int64_t UserOffset) {
- for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
- UI != UE; ++UI) {
- if (VisitedUses.insert(&UI.getUse())) {
- OffsetUse OU = { &UI.getUse(), UserOffset };
- Queue.push_back(OU);
- }
- }
- }
-
- bool computeConstantGEPOffset(GetElementPtrInst &GEPI, int64_t &GEPOffset) {
- GEPOffset = Offset;
- for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI);
- GTI != GTE; ++GTI) {
- ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
- if (!OpC)
- return false;
- if (OpC->isZero())
- continue;
-
- // Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = dyn_cast<StructType>(*GTI)) {
- unsigned ElementIdx = OpC->getZExtValue();
- const StructLayout *SL = TD.getStructLayout(STy);
- uint64_t ElementOffset = SL->getElementOffset(ElementIdx);
- // Check that we can continue to model this GEP in a signed 64-bit offset.
- if (ElementOffset > INT64_MAX ||
- (GEPOffset >= 0 &&
- ((uint64_t)GEPOffset + ElementOffset) > INT64_MAX)) {
- DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
- << "what can be represented in an int64_t!\n"
- << " alloca: " << P.AI << "\n");
- return false;
- }
- if (GEPOffset < 0)
- GEPOffset = ElementOffset + (uint64_t)-GEPOffset;
- else
- GEPOffset += ElementOffset;
- continue;
- }
-
- APInt Index = OpC->getValue().sextOrTrunc(TD.getPointerSizeInBits());
- Index *= APInt(Index.getBitWidth(),
- TD.getTypeAllocSize(GTI.getIndexedType()));
- Index += APInt(Index.getBitWidth(), (uint64_t)GEPOffset,
- /*isSigned*/true);
- // Check if the result can be stored in our int64_t offset.
- if (!Index.isSignedIntN(sizeof(GEPOffset) * 8)) {
- DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
- << "what can be represented in an int64_t!\n"
- << " alloca: " << P.AI << "\n");
- return false;
- }
-
- GEPOffset = Index.getSExtValue();
- }
- return true;
- }
-
- Value *foldSelectInst(SelectInst &SI) {
- // If the condition being selected on is a constant or the same value is
- // being selected between, fold the select. Yes this does (rarely) happen
- // early on.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
- return SI.getOperand(1+CI->isZero());
- if (SI.getOperand(1) == SI.getOperand(2)) {
- assert(*U == SI.getOperand(1));
- return SI.getOperand(1);
- }
- return 0;
- }
-};
+ return 0;
+}
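
A small in-file sketch of the two shapes the helper folds; the insertion point and operands are hypothetical, A and B must share a type, and Cond must be an i1 value.

static void foldSelectInstExample(Instruction *IP, Value *A, Value *B,
                                  Value *Cond) {
  LLVMContext &C = IP->getContext();
  // Constant condition: "select i1 true, A, B" folds to A.
  SelectInst *S1 = SelectInst::Create(ConstantInt::getTrue(C), A, B, "", IP);
  assert(foldSelectInst(*S1) == A); (void)S1;
  // Identical operands: "select i1 Cond, A, A" folds to A whatever Cond is.
  SelectInst *S2 = SelectInst::Create(Cond, A, A, "", IP);
  assert(foldSelectInst(*S2) == A); (void)S2;
  // Any other select returns null and is visited normally.
}
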
/// \brief Builder for the alloca partitioning.
///
@@ -511,67 +480,45 @@ protected:
/// of an alloca and splitting the partitions for each load and store at each
/// offset.
class AllocaPartitioning::PartitionBuilder
- : public BuilderBase<PartitionBuilder, bool> {
- friend class InstVisitor<PartitionBuilder, bool>;
+ : public PtrUseVisitor<PartitionBuilder> {
+ friend class PtrUseVisitor<PartitionBuilder>;
+ friend class InstVisitor<PartitionBuilder>;
+ typedef PtrUseVisitor<PartitionBuilder> Base;
+
+ const uint64_t AllocSize;
+ AllocaPartitioning &P;
SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap;
public:
- PartitionBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
- : BuilderBase<PartitionBuilder, bool>(TD, AI, P) {}
-
- /// \brief Run the builder over the allocation.
- bool operator()() {
- // Note that we have to re-evaluate size on each trip through the loop as
- // the queue grows at the tail.
- for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
- U = Queue[Idx].U;
- Offset = Queue[Idx].Offset;
- if (!visit(cast<Instruction>(U->getUser())))
- return false;
- }
- return true;
- }
+ PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P)
+ : PtrUseVisitor<PartitionBuilder>(DL),
+ AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())),
+ P(P) {}
private:
- bool markAsEscaping(Instruction &I) {
- P.PointerEscapingInstr = &I;
- return false;
- }
-
- void insertUse(Instruction &I, int64_t Offset, uint64_t Size,
+ void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
bool IsSplittable = false) {
- // Completely skip uses which have a zero size or don't overlap the
- // allocation.
- if (Size == 0 ||
- (Offset >= 0 && (uint64_t)Offset >= AllocSize) ||
- (Offset < 0 && (uint64_t)-Offset >= Size)) {
+ // Completely skip uses which have a zero size or start either before or
+ // past the end of the allocation.
+ if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) {
DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
- << " which starts past the end of the " << AllocSize
- << " byte alloca:\n"
+ << " which has zero size or starts outside of the "
+ << AllocSize << " byte alloca:\n"
<< " alloca: " << P.AI << "\n"
<< " use: " << I << "\n");
return;
}
- // Clamp the start to the beginning of the allocation.
- if (Offset < 0) {
- DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
- << " to start at the beginning of the alloca:\n"
- << " alloca: " << P.AI << "\n"
- << " use: " << I << "\n");
- Size -= (uint64_t)-Offset;
- Offset = 0;
- }
-
- uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;
+ uint64_t BeginOffset = Offset.getZExtValue();
+ uint64_t EndOffset = BeginOffset + Size;
// Clamp the end offset to the end of the allocation. Note that this is
// formulated to handle even the case where "BeginOffset + Size" overflows.
- // NOTE! This may appear superficially to be something we could ignore
- // entirely, but that is not so! There may be PHI-node uses where some
- // instructions are dead but not others. We can't completely ignore the
- // PHI node, and so have to record at least the information here.
+ // This may appear superficially to be something we could ignore entirely,
+ // but that is not so! There may be widened loads or PHI-node uses where
+ // some instructions are dead but not others. We can't completely ignore
+ // them, and so have to record at least the information here.
assert(AllocSize >= BeginOffset); // Established above.
if (Size > AllocSize - BeginOffset) {
DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
@@ -585,9 +532,41 @@ private:
P.Partitions.push_back(New);
}
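
The clamp above is written to be overflow-safe: comparing Size against AllocSize - BeginOffset never wraps, whereas computing BeginOffset + Size could for a huge memset/memcpy length. A standalone sketch with made-up numbers:

#include <cassert>
#include <cstdint>

// Returns the clamped end offset of a use [BeginOffset, BeginOffset + Size)
// inside an alloca of AllocSize bytes; assumes BeginOffset <= AllocSize.
uint64_t clampedEndOffset(uint64_t AllocSize, uint64_t BeginOffset,
                          uint64_t Size) {
  assert(BeginOffset <= AllocSize);
  if (Size > AllocSize - BeginOffset) // safe: no addition, so no overflow
    return AllocSize;                 // clamp to the end of the alloca
  return BeginOffset + Size;          // fits, so the sum cannot wrap
}
// e.g. clampedEndOffset(16, 8, UINT64_MAX) == 16 rather than a wrapped value.
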
- bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset,
- bool IsVolatile) {
- uint64_t Size = TD.getTypeStoreSize(Ty);
+ void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
+ uint64_t Size, bool IsVolatile) {
+ // We allow splitting of loads and stores where the type is an integer type
+    // and the access covers the entire alloca. This prevents us from
+    // splitting over-eagerly.
+ // FIXME: In the great blue eventually, we should eagerly split all integer
+ // loads and stores, and then have a separate step that merges adjacent
+ // alloca partitions into a single partition suitable for integer widening.
+ // Or we should skip the merge step and rely on GVN and other passes to
+ // merge adjacent loads and stores that survive mem2reg.
+ bool IsSplittable =
+ Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;
+
+ insertUse(I, Offset, Size, IsSplittable);
+ }
+
+ void visitLoadInst(LoadInst &LI) {
+ assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+ "All simple FCA loads should have been pre-split");
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&LI);
+
+ uint64_t Size = DL.getTypeStoreSize(LI.getType());
+ return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ Value *ValOp = SI.getValueOperand();
+ if (ValOp == *U)
+ return PI.setEscapedAndAborted(&SI);
+ if (!IsOffsetKnown)
+ return PI.setAborted(&SI);
+
+ uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
// If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
@@ -596,73 +575,52 @@ private:
// risk of overflow.
// FIXME: We should instead consider the pointer to have escaped if this
// function is being instrumented for addressing bugs or race conditions.
- if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
- Size > (AllocSize - (uint64_t)Offset)) {
- DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte "
- << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset
+ if (Offset.isNegative() || Size > AllocSize ||
+ Offset.ugt(AllocSize - Size)) {
+ DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
<< " which extends past the end of the " << AllocSize
<< " byte alloca:\n"
<< " alloca: " << P.AI << "\n"
- << " use: " << I << "\n");
- return true;
+ << " use: " << SI << "\n");
+ return;
}
- // We allow splitting of loads and stores where the type is an integer type
- // and which cover the entire alloca. Such integer loads and stores
- // often require decomposition into fine grained loads and stores.
- bool IsSplittable = false;
- if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
- IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8;
-
- insertUse(I, Offset, Size, IsSplittable);
- return true;
- }
-
- bool visitBitCastInst(BitCastInst &BC) {
- enqueueUsers(BC, Offset);
- return true;
- }
-
- bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- int64_t GEPOffset;
- if (!computeConstantGEPOffset(GEPI, GEPOffset))
- return markAsEscaping(GEPI);
-
- enqueueUsers(GEPI, GEPOffset);
- return true;
- }
-
- bool visitLoadInst(LoadInst &LI) {
- assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
- "All simple FCA loads should have been pre-split");
- return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile());
- }
-
- bool visitStoreInst(StoreInst &SI) {
- Value *ValOp = SI.getValueOperand();
- if (ValOp == *U)
- return markAsEscaping(SI);
-
assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
"All simple FCA stores should have been pre-split");
- return handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile());
+ handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
}
- bool visitMemSetInst(MemSetInst &II) {
+ void visitMemSetInst(MemSetInst &II) {
assert(II.getRawDest() == *U && "Pointer use is not the destination?");
ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
- uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
- insertUse(II, Offset, Size, Length);
- return true;
+ if ((Length && Length->getValue() == 0) ||
+ (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return;
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ insertUse(II, Offset,
+ Length ? Length->getLimitedValue()
+ : AllocSize - Offset.getLimitedValue(),
+ (bool)Length);
}
- bool visitMemTransferInst(MemTransferInst &II) {
+ void visitMemTransferInst(MemTransferInst &II) {
ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
- uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
- if (!Size)
+ if ((Length && Length->getValue() == 0) ||
+ (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
// Zero-length mem transfer intrinsics can be ignored entirely.
- return true;
+ return;
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ uint64_t RawOffset = Offset.getLimitedValue();
+ uint64_t Size = Length ? Length->getLimitedValue()
+ : AllocSize - RawOffset;
MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
@@ -670,12 +628,12 @@ private:
Offsets.IsSplittable = Length;
if (*U == II.getRawDest()) {
- Offsets.DestBegin = Offset;
- Offsets.DestEnd = Offset + Size;
+ Offsets.DestBegin = RawOffset;
+ Offsets.DestEnd = RawOffset + Size;
}
if (*U == II.getRawSource()) {
- Offsets.SourceBegin = Offset;
- Offsets.SourceEnd = Offset + Size;
+ Offsets.SourceBegin = RawOffset;
+ Offsets.SourceEnd = RawOffset + Size;
}
// If we have set up end offsets for both the source and the destination,
@@ -688,7 +646,7 @@ private:
// In that case, we can completely elide the transfer.
if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) {
P.Partitions[PrevIdx].kill();
- return true;
+ return;
}
// Otherwise we have an offset transfer within the same alloca. We can't
@@ -701,7 +659,7 @@ private:
// For non-volatile transfers this is a no-op.
if (!II.isVolatile())
- return true;
+ return;
// Otherwise just suppress splitting.
Offsets.IsSplittable = false;
@@ -721,23 +679,25 @@ private:
"Already have intrinsic in map but haven't seen both ends");
(void)Inserted;
}
-
- return true;
}
// Disable SRoA for any intrinsics except for lifetime invariants.
- // FIXME: What about debug instrinsics? This matches old behavior, but
+ // FIXME: What about debug intrinsics? This matches old behavior, but
// doesn't make sense.
- bool visitIntrinsicInst(IntrinsicInst &II) {
+ void visitIntrinsicInst(IntrinsicInst &II) {
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
II.getIntrinsicID() == Intrinsic::lifetime_end) {
ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
- uint64_t Size = std::min(AllocSize - Offset, Length->getLimitedValue());
+ uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
+ Length->getLimitedValue());
insertUse(II, Offset, Size, true);
- return true;
+ return;
}
- return markAsEscaping(II);
+ Base::visitIntrinsicInst(II);
}
Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
@@ -757,14 +717,14 @@ private:
llvm::tie(UsedI, I) = Uses.pop_back_val();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- Size = std::max(Size, TD.getTypeStoreSize(LI->getType()));
+ Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
continue;
}
if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
Value *Op = SI->getOperand(0);
if (Op == UsedI)
return SI;
- Size = std::max(Size, TD.getTypeStoreSize(Op->getType()));
+ Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
continue;
}
@@ -785,54 +745,62 @@ private:
return 0;
}
- bool visitPHINode(PHINode &PN) {
+ void visitPHINode(PHINode &PN) {
+ if (PN.use_empty())
+ return;
+ if (!IsOffsetKnown)
+ return PI.setAborted(&PN);
+
// See if we already have computed info on this node.
std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
if (PHIInfo.first) {
PHIInfo.second = true;
insertUse(PN, Offset, PHIInfo.first);
- return true;
+ return;
}
// Check for an unsafe use of the PHI node.
- if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
- return markAsEscaping(*EscapingI);
+ if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
+ return PI.setAborted(UnsafeI);
insertUse(PN, Offset, PHIInfo.first);
- return true;
}
- bool visitSelectInst(SelectInst &SI) {
+ void visitSelectInst(SelectInst &SI) {
+ if (SI.use_empty())
+ return;
if (Value *Result = foldSelectInst(SI)) {
if (Result == *U)
// If the result of the constant fold will be the pointer, recurse
// through the select as if we had RAUW'ed it.
- enqueueUsers(SI, Offset);
+ enqueueUsers(SI);
- return true;
+ return;
}
+ if (!IsOffsetKnown)
+ return PI.setAborted(&SI);
// See if we already have computed info on this node.
std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
if (SelectInfo.first) {
SelectInfo.second = true;
insertUse(SI, Offset, SelectInfo.first);
- return true;
+ return;
}
    // Check for an unsafe use of the select.
- if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
- return markAsEscaping(*EscapingI);
+ if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
+ return PI.setAborted(UnsafeI);
insertUse(SI, Offset, SelectInfo.first);
- return true;
}
/// \brief Disable SROA entirely if there are unhandled users of the alloca.
- bool visitInstruction(Instruction &I) { return markAsEscaping(I); }
+ void visitInstruction(Instruction &I) {
+ PI.setAborted(&I);
+ }
};
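
Both builders now follow the PtrUseVisitor pattern: the base class walks the pointer's transitive users, tracks a constant byte Offset where it can (IsOffsetKnown), and records escapes or aborts through PI. A hedged, minimal sketch of that shape, using only the members exercised above; the visitor name and the debug output are illustrative:

class PointerOffsetPrinter : public PtrUseVisitor<PointerOffsetPrinter> {
  friend class PtrUseVisitor<PointerOffsetPrinter>;
  friend class InstVisitor<PointerOffsetPrinter>;

public:
  PointerOffsetPrinter(const DataLayout &DL)
      : PtrUseVisitor<PointerOffsetPrinter>(DL) {}

private:
  void visitLoadInst(LoadInst &LI) {
    if (!IsOffsetKnown)
      return PI.setAborted(&LI);   // give up on non-constant offsets
    DEBUG(dbgs() << "  load at byte " << Offset << ": " << LI << "\n");
  }
  // Anything unrecognized makes the pointer escape, as in the builders above.
  void visitInstruction(Instruction &I) { PI.setEscapedAndAborted(&I); }
};
// Driven the same way the partitioning constructor drives its builders:
//   PointerOffsetPrinter Printer(TD);
//   PointerOffsetPrinter::PtrInfo Info = Printer.visitPtr(AI);
//   if (Info.isEscaped() || Info.isAborted()) { /* bail out */ }
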
-
/// \brief Use adder for the alloca partitioning.
///
/// This class adds the uses of an alloca to all of the partitions which they
@@ -851,26 +819,22 @@ private:
/// partition space is pre-sorted, and do a logarithmic search for the
/// partition needed, making the total visit a classical ((N + M) * log(N))
/// complexity operation.
-class AllocaPartitioning::UseBuilder : public BuilderBase<UseBuilder> {
+class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> {
+ friend class PtrUseVisitor<UseBuilder>;
friend class InstVisitor<UseBuilder>;
+ typedef PtrUseVisitor<UseBuilder> Base;
+
+ const uint64_t AllocSize;
+ AllocaPartitioning &P;
/// \brief Set to de-duplicate dead instructions found in the use walk.
SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
public:
UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
- : BuilderBase<UseBuilder>(TD, AI, P) {}
-
- /// \brief Run the builder over the allocation.
- void operator()() {
- // Note that we have to re-evaluate size on each trip through the loop as
- // the queue grows at the tail.
- for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
- U = Queue[Idx].U;
- Offset = Queue[Idx].Offset;
- this->visit(cast<Instruction>(U->getUser()));
- }
- }
+ : PtrUseVisitor<UseBuilder>(TD),
+ AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
+ P(P) {}
private:
void markAsDead(Instruction &I) {
@@ -878,20 +842,14 @@ private:
P.DeadUsers.push_back(&I);
}
- void insertUse(Instruction &User, int64_t Offset, uint64_t Size) {
+ void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) {
// If the use has a zero size or extends outside of the allocation, record
// it as a dead use for elimination later.
- if (Size == 0 || (uint64_t)Offset >= AllocSize ||
- (Offset < 0 && (uint64_t)-Offset >= Size))
+ if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize))
return markAsDead(User);
- // Clamp the start to the beginning of the allocation.
- if (Offset < 0) {
- Size -= (uint64_t)-Offset;
- Offset = 0;
- }
-
- uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;
+ uint64_t BeginOffset = Offset.getZExtValue();
+ uint64_t EndOffset = BeginOffset + Size;
// Clamp the end offset to the end of the allocation. Note that this is
// formulated to handle even the case where "BeginOffset + Size" overflows.
@@ -900,13 +858,14 @@ private:
EndOffset = AllocSize;
// NB: This only works if we have zero overlapping partitions.
- iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset);
- if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset)
- B = llvm::prior(B);
- for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset;
- ++I) {
+ iterator I = std::lower_bound(P.begin(), P.end(), BeginOffset);
+ if (I != P.begin() && llvm::prior(I)->EndOffset > BeginOffset)
+ I = llvm::prior(I);
+ iterator E = P.end();
+ bool IsSplit = llvm::next(I) != E && llvm::next(I)->BeginOffset < EndOffset;
+ for (; I != E && I->BeginOffset < EndOffset; ++I) {
PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset),
- std::min(I->EndOffset, EndOffset), U);
+ std::min(I->EndOffset, EndOffset), U, IsSplit);
P.use_push_back(I, NewPU);
if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser()))
P.PHIOrSelectOpMap[U]
@@ -914,59 +873,63 @@ private:
}
}
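
The lookup above leans on the pre-sorted partition list: lower_bound finds the first partition whose begin offset is not below the use's begin, and one step back catches a preceding partition that still covers it. A standalone sketch of the same trick over plain [begin, end) pairs with made-up data:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

typedef std::pair<uint64_t, uint64_t> Range; // [begin, end), sorted by begin

std::size_t findFirstCoveringPartition(const std::vector<Range> &Parts,
                                       uint64_t Offset) {
  std::size_t Idx = std::lower_bound(Parts.begin(), Parts.end(),
                                     Range(Offset, 0)) - Parts.begin();
  if (Idx != 0 && Parts[Idx - 1].second > Offset)
    --Idx; // the previous partition spans Offset, so start from it instead
  return Idx;
}
// e.g. with {[0,8), [8,16)}, findFirstCoveringPartition(Parts, 4) == 0.
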
- void handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) {
- uint64_t Size = TD.getTypeStoreSize(Ty);
-
- // If this memory access can be shown to *statically* extend outside the
- // bounds of of the allocation, it's behavior is undefined, so simply
- // ignore it. Note that this is more strict than the generic clamping
- // behavior of insertUse.
- if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
- Size > (AllocSize - (uint64_t)Offset))
- return markAsDead(I);
-
- insertUse(I, Offset, Size);
- }
-
void visitBitCastInst(BitCastInst &BC) {
if (BC.use_empty())
return markAsDead(BC);
- enqueueUsers(BC, Offset);
+ return Base::visitBitCastInst(BC);
}
void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
if (GEPI.use_empty())
return markAsDead(GEPI);
- int64_t GEPOffset;
- if (!computeConstantGEPOffset(GEPI, GEPOffset))
- llvm_unreachable("Unable to compute constant offset for use");
-
- enqueueUsers(GEPI, GEPOffset);
+ return Base::visitGetElementPtrInst(GEPI);
}
void visitLoadInst(LoadInst &LI) {
- handleLoadOrStore(LI.getType(), LI, Offset);
+ assert(IsOffsetKnown);
+ uint64_t Size = DL.getTypeStoreSize(LI.getType());
+ insertUse(LI, Offset, Size);
}
void visitStoreInst(StoreInst &SI) {
- handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset);
+ assert(IsOffsetKnown);
+ uint64_t Size = DL.getTypeStoreSize(SI.getOperand(0)->getType());
+
+ // If this memory access can be shown to *statically* extend outside the
+    // bounds of the allocation, its behavior is undefined, so simply
+ // ignore it. Note that this is more strict than the generic clamping
+ // behavior of insertUse.
+ if (Offset.isNegative() || Size > AllocSize ||
+ Offset.ugt(AllocSize - Size))
+ return markAsDead(SI);
+
+ insertUse(SI, Offset, Size);
}
void visitMemSetInst(MemSetInst &II) {
ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
- uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
- insertUse(II, Offset, Size);
+ if ((Length && Length->getValue() == 0) ||
+ (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+ return markAsDead(II);
+
+ assert(IsOffsetKnown);
+ insertUse(II, Offset, Length ? Length->getLimitedValue()
+ : AllocSize - Offset.getLimitedValue());
}
void visitMemTransferInst(MemTransferInst &II) {
ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
- uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
- if (!Size)
+ if ((Length && Length->getValue() == 0) ||
+ (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
return markAsDead(II);
- MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
+ assert(IsOffsetKnown);
+ uint64_t Size = Length ? Length->getLimitedValue()
+ : AllocSize - Offset.getLimitedValue();
+
+ const MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd &&
Offsets.DestBegin == Offsets.SourceBegin)
return markAsDead(II); // Skip identity transfers without side-effects.
@@ -975,34 +938,39 @@ private:
}
void visitIntrinsicInst(IntrinsicInst &II) {
+ assert(IsOffsetKnown);
assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
II.getIntrinsicID() == Intrinsic::lifetime_end);
ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
- insertUse(II, Offset,
- std::min(AllocSize - Offset, Length->getLimitedValue()));
+ insertUse(II, Offset, std::min(Length->getLimitedValue(),
+ AllocSize - Offset.getLimitedValue()));
}
- void insertPHIOrSelect(Instruction &User, uint64_t Offset) {
+ void insertPHIOrSelect(Instruction &User, const APInt &Offset) {
uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first;
// For PHI and select operands outside the alloca, we can't nuke the entire
// phi or select -- the other side might still be relevant, so we special
// case them here and use a separate structure to track the operands
// themselves which should be replaced with undef.
- if (Offset >= AllocSize) {
+ if ((Offset.isNegative() && Offset.uge(Size)) ||
+ (!Offset.isNegative() && Offset.uge(AllocSize))) {
P.DeadOperands.push_back(U);
return;
}
insertUse(User, Offset, Size);
}
+
void visitPHINode(PHINode &PN) {
if (PN.use_empty())
return markAsDead(PN);
+ assert(IsOffsetKnown);
insertPHIOrSelect(PN, Offset);
}
+
void visitSelectInst(SelectInst &SI) {
if (SI.use_empty())
return markAsDead(SI);
@@ -1011,7 +979,7 @@ private:
if (Result == *U)
// If the result of the constant fold will be the pointer, recurse
// through the select as if we had RAUW'ed it.
- enqueueUsers(SI, Offset);
+ enqueueUsers(SI);
else
// Otherwise the operand to the select is dead, and we can replace it
// with undef.
@@ -1020,6 +988,7 @@ private:
return;
}
+ assert(IsOffsetKnown);
insertPHIOrSelect(SI, Offset);
}
@@ -1126,13 +1095,20 @@ void AllocaPartitioning::splitAndMergePartitions() {
AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
:
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
AI(AI),
#endif
PointerEscapingInstr(0) {
PartitionBuilder PB(TD, AI, *this);
- if (!PB())
+ PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI);
+ if (PtrI.isEscaped() || PtrI.isAborted()) {
+ // FIXME: We should sink the escape vs. abort info into the caller nicely,
+ // possibly by just storing the PtrInfo in the AllocaPartitioning.
+ PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
+ : PtrI.getAbortingInst();
+ assert(PointerEscapingInstr && "Did not track a bad instruction");
return;
+ }
// Sort the uses. This arranges for the offsets to be in ascending order,
// and the sizes to be in descending order.
@@ -1162,31 +1138,45 @@ AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
splitAndMergePartitions();
}
+ // Record how many partitions we end up with.
+ NumAllocaPartitions += Partitions.size();
+  MaxPartitionsPerAlloca = std::max<unsigned>(Partitions.size(),
+                                              MaxPartitionsPerAlloca);
+
// Now build up the user lists for each of these disjoint partitions by
// re-walking the recursive users of the alloca.
Uses.resize(Partitions.size());
UseBuilder UB(TD, AI, *this);
- UB();
+ PtrI = UB.visitPtr(AI);
+ assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!");
+ assert(!PtrI.isAborted() && "Early aborted the visit of the pointer.");
+
+ unsigned NumUses = 0;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS)
+ for (unsigned Idx = 0, Size = Uses.size(); Idx != Size; ++Idx)
+ NumUses += Uses[Idx].size();
+#endif
+ NumAllocaPartitionUses += NumUses;
+  MaxPartitionUsesPerAlloca = std::max<unsigned>(NumUses,
+                                                 MaxPartitionUsesPerAlloca);
}
Type *AllocaPartitioning::getCommonType(iterator I) const {
Type *Ty = 0;
for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
- if (!UI->U)
+ Use *U = UI->getUse();
+ if (!U)
continue; // Skip dead uses.
- if (isa<IntrinsicInst>(*UI->U->getUser()))
+ if (isa<IntrinsicInst>(*U->getUser()))
continue;
if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
continue;
Type *UserTy = 0;
- if (LoadInst *LI = dyn_cast<LoadInst>(UI->U->getUser())) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser()))
UserTy = LI->getType();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) {
+ else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser()))
UserTy = SI->getValueOperand()->getType();
- } else {
+ else
return 0; // Bail if we have weird uses.
- }
if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) {
// If the type is larger than the partition, skip it. We only encounter
@@ -1222,13 +1212,13 @@ void AllocaPartitioning::print(raw_ostream &OS, const_iterator I,
void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I,
StringRef Indent) const {
- for (const_use_iterator UI = use_begin(I), UE = use_end(I);
- UI != UE; ++UI) {
- if (!UI->U)
+ for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
+ if (!UI->getUse())
continue; // Skip dead uses.
OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") "
- << "used by: " << *UI->U->getUser() << "\n";
- if (MemTransferInst *II = dyn_cast<MemTransferInst>(UI->U->getUser())) {
+ << "used by: " << *UI->getUse()->getUser() << "\n";
+ if (MemTransferInst *II =
+ dyn_cast<MemTransferInst>(UI->getUse()->getUser())) {
const MemTransferOffsets &MTO = MemTransferInstData.lookup(II);
bool IsDest;
if (!MTO.IsSplittable)
@@ -1251,8 +1241,7 @@ void AllocaPartitioning::print(raw_ostream &OS) const {
}
OS << "Partitioning of alloca: " << AI << "\n";
- unsigned Num = 0;
- for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) {
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
print(OS, I);
printUsers(OS, I);
}
@@ -1323,18 +1312,18 @@ public:
for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(),
E = DVIs.end(); I != E; ++I) {
DbgValueInst *DVI = *I;
- Value *Arg = NULL;
+ Value *Arg = 0;
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
// If an argument is zero extended then use argument directly. The ZExt
// may be zapped by an optimization pass in future.
if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
Arg = dyn_cast<Argument>(ZExt->getOperand(0));
- if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+ else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
Arg = dyn_cast<Argument>(SExt->getOperand(0));
if (!Arg)
- Arg = SI->getOperand(0);
+ Arg = SI->getValueOperand();
} else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- Arg = LI->getOperand(0);
+ Arg = LI->getPointerOperand();
} else {
continue;
}
@@ -1358,7 +1347,7 @@ namespace {
/// 1) It takes allocations of aggregates and analyzes the ways in which they
/// are used to try to split them into smaller allocations, ideally of
/// a single scalar data type. It will split up memcpy and memset accesses
-/// as necessary and try to isolate invidual scalar accesses.
+/// as necessary and try to isolate individual scalar accesses.
/// 2) It will transform accesses into forms which are suitable for SSA value
/// promotion. This can be replacing a memset with a scalar store of an
/// integer value, or it can involve speculating operations on a PHI or
@@ -1460,11 +1449,11 @@ public:
// may be grown during speculation. However, we never need to re-visit the
// new uses, and so we can use the initial size bound.
for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) {
- const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx);
- if (!PU.U)
+ const PartitionUse &PU = P.getUse(PI, Idx);
+ if (!PU.getUse())
continue; // Skip dead use.
- visit(cast<Instruction>(PU.U->getUser()));
+ visit(cast<Instruction>(PU.getUse()->getUser()));
}
}
@@ -1520,8 +1509,7 @@ private:
// We can only transform this if it is safe to push the loads into the
// predecessor blocks. The only thing to watch out for is that we can't put
// a possibly trapping load in the predecessor if it is a critical edge.
- for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
- ++Idx) {
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
Value *InVal = PN.getIncomingValue(Idx);
@@ -1559,12 +1547,12 @@ private:
assert(!Loads.empty());
Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
- IRBuilder<> PHIBuilder(&PN);
+ IRBuilderTy PHIBuilder(&PN);
PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
PN.getName() + ".sroa.speculated");
// Get the TBAA tag and alignment to use from one of the loads. It doesn't
- // matter which one we get and if any differ, it doesn't matter.
+ // matter which one we get and if any differ.
LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
unsigned Align = SomeLoad->getAlignment();
@@ -1582,7 +1570,7 @@ private:
TerminatorInst *TI = Pred->getTerminator();
Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx));
Value *InVal = PN.getIncomingValue(Idx);
- IRBuilder<> PredBuilder(TI);
+ IRBuilderTy PredBuilder(TI);
LoadInst *Load
= PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." +
@@ -1609,8 +1597,8 @@ private:
// inside the load.
AllocaPartitioning::use_iterator UI
= P.findPartitionUseForPHIOrSelectOperand(InUse);
- assert(isa<PHINode>(*UI->U->getUser()));
- UI->U = &Load->getOperandUse(Load->getPointerOperandIndex());
+ assert(isa<PHINode>(*UI->getUse()->getUser()));
+ UI->setUse(&Load->getOperandUse(Load->getPointerOperandIndex()));
}
DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
}
@@ -1657,16 +1645,16 @@ private:
void visitSelectInst(SelectInst &SI) {
DEBUG(dbgs() << " original: " << SI << "\n");
- IRBuilder<> IRB(&SI);
// If the select isn't safe to speculate, just use simple logic to emit it.
SmallVector<LoadInst *, 4> Loads;
if (!isSafeSelectToSpeculate(SI, Loads))
return;
+ IRBuilderTy IRB(&SI);
Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) };
AllocaPartitioning::iterator PIs[2];
- AllocaPartitioning::PartitionUse PUs[2];
+ PartitionUse PUs[2];
for (unsigned i = 0, e = 2; i != e; ++i) {
PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]);
if (PIs[i] != P.end()) {
@@ -1677,7 +1665,7 @@ private:
PUs[i] = *UI;
// Clear out the use here so that the offsets into the use list remain
// stable but this use is ignored when rewriting.
- UI->U = 0;
+ UI->setUse(0);
}
}
@@ -1709,8 +1697,8 @@ private:
for (unsigned i = 0, e = 2; i != e; ++i) {
if (PIs[i] != P.end()) {
Use *LoadUse = &Loads[i]->getOperandUse(0);
- assert(PUs[i].U->get() == LoadUse->get());
- PUs[i].U = LoadUse;
+ assert(PUs[i].getUse()->get() == LoadUse->get());
+ PUs[i].setUse(LoadUse);
P.use_push_back(PIs[i], PUs[i]);
}
}
@@ -1723,51 +1711,12 @@ private:
};
}
-/// \brief Accumulate the constant offsets in a GEP into a single APInt offset.
-///
-/// If the provided GEP is all-constant, the total byte offset formed by the
-/// GEP is computed and Offset is set to it. If the GEP has any non-constant
-/// operands, the function returns false and the value of Offset is unmodified.
-static bool accumulateGEPOffsets(const DataLayout &TD, GEPOperator &GEP,
- APInt &Offset) {
- APInt GEPOffset(Offset.getBitWidth(), 0);
- for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
- GTI != GTE; ++GTI) {
- ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
- if (!OpC)
- return false;
- if (OpC->isZero()) continue;
-
- // Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = dyn_cast<StructType>(*GTI)) {
- unsigned ElementIdx = OpC->getZExtValue();
- const StructLayout *SL = TD.getStructLayout(STy);
- GEPOffset += APInt(Offset.getBitWidth(),
- SL->getElementOffset(ElementIdx));
- continue;
- }
-
- APInt TypeSize(Offset.getBitWidth(),
- TD.getTypeAllocSize(GTI.getIndexedType()));
- if (VectorType *VTy = dyn_cast<VectorType>(*GTI)) {
- assert((VTy->getScalarSizeInBits() % 8) == 0 &&
- "vector element size is not a multiple of 8, cannot GEP over it");
- TypeSize = VTy->getScalarSizeInBits() / 8;
- }
-
- GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize;
- }
- Offset = GEPOffset;
- return true;
-}
-
/// \brief Build a GEP out of a base pointer and indices.
///
/// This will return the BasePtr if that is valid, or build a new GEP
/// instruction using the IRBuilder if GEP-ing is needed.
-static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
- SmallVectorImpl<Value *> &Indices,
- const Twine &Prefix) {
+static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
+ SmallVectorImpl<Value *> &Indices) {
if (Indices.empty())
return BasePtr;
@@ -1776,7 +1725,7 @@ static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
return BasePtr;
- return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx");
+ return IRB.CreateInBoundsGEP(BasePtr, Indices, "idx");
}
/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
@@ -1788,12 +1737,11 @@ static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
/// TargetTy. If we can't find one with the same type, we at least try to use
/// one with the same size. If none of that works, we just produce the GEP as
/// indicated by Indices to have the correct offset.
-static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD,
+static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD,
Value *BasePtr, Type *Ty, Type *TargetTy,
- SmallVectorImpl<Value *> &Indices,
- const Twine &Prefix) {
+ SmallVectorImpl<Value *> &Indices) {
if (Ty == TargetTy)
- return buildGEP(IRB, BasePtr, Indices, Prefix);
+ return buildGEP(IRB, BasePtr, Indices);
// See if we can descend into a struct and locate a field with the correct
// type.
@@ -1820,20 +1768,19 @@ static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD,
if (ElementTy != TargetTy)
Indices.erase(Indices.end() - NumLayers, Indices.end());
- return buildGEP(IRB, BasePtr, Indices, Prefix);
+ return buildGEP(IRB, BasePtr, Indices);
}
/// \brief Recursively compute indices for a natural GEP.
///
/// This is the recursive step for getNaturalGEPWithOffset that walks down the
/// element types adding appropriate indices for the GEP.
-static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
+static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD,
Value *Ptr, Type *Ty, APInt &Offset,
Type *TargetTy,
- SmallVectorImpl<Value *> &Indices,
- const Twine &Prefix) {
+ SmallVectorImpl<Value *> &Indices) {
if (Offset == 0)
- return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix);
+ return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices);
// We can't recurse through pointer types.
if (Ty->isPointerTy())
@@ -1843,7 +1790,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
// extremely poorly defined currently. The long-term goal is to remove GEPing
// over a vector from the IR completely.
if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
- unsigned ElementSizeInBits = VecTy->getScalarSizeInBits();
+ unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType());
if (ElementSizeInBits % 8)
return 0; // GEPs over non-multiple of 8 size vector elements are invalid.
APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
@@ -1853,7 +1800,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
Offset -= NumSkippedElements * ElementSize;
Indices.push_back(IRB.getInt(NumSkippedElements));
return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(),
- Offset, TargetTy, Indices, Prefix);
+ Offset, TargetTy, Indices);
}
if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
@@ -1866,7 +1813,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
Offset -= NumSkippedElements * ElementSize;
Indices.push_back(IRB.getInt(NumSkippedElements));
return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
- Indices, Prefix);
+ Indices);
}
StructType *STy = dyn_cast<StructType>(Ty);
@@ -1885,7 +1832,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
Indices.push_back(IRB.getInt32(Index));
return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
- Indices, Prefix);
+ Indices);
}
/// \brief Get a natural GEP from a base pointer to a particular offset and
@@ -1898,10 +1845,9 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
/// Indices, and setting Ty to the result subtype.
///
/// If no natural GEP can be constructed, this function returns null.
-static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD,
+static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD,
Value *Ptr, APInt Offset, Type *TargetTy,
- SmallVectorImpl<Value *> &Indices,
- const Twine &Prefix) {
+ SmallVectorImpl<Value *> &Indices) {
PointerType *Ty = cast<PointerType>(Ptr->getType());
// Don't consider any GEPs through an i8* as natural unless the TargetTy is
@@ -1920,7 +1866,7 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD,
Offset -= NumSkippedElements * ElementSize;
Indices.push_back(IRB.getInt(NumSkippedElements));
return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
- Indices, Prefix);
+ Indices);
}
/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
@@ -1935,12 +1881,11 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD,
/// The strategy for finding the more natural GEPs is to peel off layers of the
/// pointer, walking back through bit casts and GEPs, searching for a base
/// pointer from which we can compute a natural GEP with the desired
-/// properities. The algorithm tries to fold as many constant indices into
+/// properties. The algorithm tries to fold as many constant indices into
/// a single GEP as possible, thus making each GEP more independent of the
/// surrounding code.
-static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD,
- Value *Ptr, APInt Offset, Type *PointerTy,
- const Twine &Prefix) {
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD,
+ Value *Ptr, APInt Offset, Type *PointerTy) {
// Even though we don't look through PHI nodes, we could be called on an
// instruction in an unreachable block, which may be on a cycle.
SmallPtrSet<Value *, 4> Visited;
@@ -1963,7 +1908,7 @@ static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD,
// First fold any existing GEPs into the offset.
while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
APInt GEPOffset(Offset.getBitWidth(), 0);
- if (!accumulateGEPOffsets(TD, *GEP, GEPOffset))
+ if (!GEP->accumulateConstantOffset(TD, GEPOffset))
break;
Offset += GEPOffset;
Ptr = GEP->getPointerOperand();
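
The loop above now defers constant-offset accumulation to GEPOperator::accumulateConstantOffset rather than the hand-rolled helper this patch deletes. A small hedged sketch of that call in isolation; "SomePtr" is a placeholder, and the APInt is sized to the pointer width as the API expects:

// Sketch: total constant byte offset of a GEP, or false if any index is
// non-constant (in which case the caller falls back to other strategies).
static bool constantByteOffsetOf(const DataLayout &TD, Value *SomePtr,
                                 APInt &Offset) {
  Offset = APInt(TD.getPointerSizeInBits(), 0);
  if (GEPOperator *GEP = dyn_cast<GEPOperator>(SomePtr))
    return GEP->accumulateConstantOffset(TD, Offset);
  return false; // not a GEP at all
}
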
@@ -1974,7 +1919,7 @@ static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD,
// See if we can perform a natural GEP here.
Indices.clear();
if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy,
- Indices, Prefix)) {
+ Indices)) {
if (P->getType() == PointerTy) {
// Zap any offset pointer that we ended up computing in previous rounds.
if (OffsetPtr && OffsetPtr->use_empty())
@@ -2009,19 +1954,19 @@ static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD,
if (!OffsetPtr) {
if (!Int8Ptr) {
Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
- Prefix + ".raw_cast");
+ "raw_cast");
Int8PtrOffset = Offset;
}
OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
- Prefix + ".raw_idx");
+ "raw_idx");
}
Ptr = OffsetPtr;
// On the off chance we were targeting i8*, guard the bitcast here.
if (Ptr->getType() != PointerTy)
- Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast");
+ Ptr = IRB.CreateBitCast(Ptr, PointerTy, "cast");
return Ptr;
}
@@ -2035,6 +1980,10 @@ static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD,
static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
if (OldTy == NewTy)
return true;
+ if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
+ if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
+ if (NewITy->getBitWidth() >= OldITy->getBitWidth())
+ return true;
if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
return false;
if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
@@ -2057,12 +2006,16 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
/// This will try various different casting techniques, such as bitcasts,
/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
/// two types for viability with this routine.
-static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
+static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
Type *Ty) {
assert(canConvertValue(DL, V->getType(), Ty) &&
"Value not convertable to type");
if (V->getType() == Ty)
return V;
+ if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType()))
+ if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty))
+ if (NewITy->getBitWidth() > OldITy->getBitWidth())
+ return IRB.CreateZExt(V, NewITy);
if (V->getType()->isIntegerTy() && Ty->isPointerTy())
return IRB.CreateIntToPtr(V, Ty);
if (V->getType()->isPointerTy() && Ty->isIntegerTy())
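
The new clauses above let an integer be widened to any strictly larger integer type (via zext) instead of insisting on equal bit widths. An in-file sketch of the two helpers under that rule; the insertion point is hypothetical, and V is assumed to be an integer value no wider than 64 bits:

static Value *widenToI64(const DataLayout &DL, Instruction *IP, Value *V) {
  IRBuilderTy IRB(IP);
  Type *I64Ty = Type::getInt64Ty(IP->getContext());
  assert(canConvertValue(DL, V->getType(), I64Ty)); // i32 -> i64 is fine now
  // Emits a zext when V is narrower; returns V unchanged if it is already i64.
  return convertValue(DL, IRB, V, I64Ty);
}
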
@@ -2090,19 +2043,19 @@ static bool isVectorPromotionViable(const DataLayout &TD,
if (!Ty)
return false;
- uint64_t VecSize = TD.getTypeSizeInBits(Ty);
- uint64_t ElementSize = Ty->getScalarSizeInBits();
+ uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType());
// While the definition of LLVM vectors is bitpacked, we don't support sizes
// that aren't byte sized.
if (ElementSize % 8)
return false;
- assert((VecSize % 8) == 0 && "vector size not a multiple of element size?");
- VecSize /= 8;
+ assert((TD.getTypeSizeInBits(Ty) % 8) == 0 &&
+ "vector size not a multiple of element size?");
ElementSize /= 8;
for (; I != E; ++I) {
- if (!I->U)
+ Use *U = I->getUse();
+ if (!U)
continue; // Skip dead use.
uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset;
@@ -2116,30 +2069,34 @@ static bool isVectorPromotionViable(const DataLayout &TD,
EndIndex > Ty->getNumElements())
return false;
- // FIXME: We should build shuffle vector instructions to handle
- // non-element-sized accesses.
- if ((EndOffset - BeginOffset) != ElementSize &&
- (EndOffset - BeginOffset) != VecSize)
- return false;
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ uint64_t NumElements = EndIndex - BeginIndex;
+ Type *PartitionTy
+ = (NumElements == 1) ? Ty->getElementType()
+ : VectorType::get(Ty->getElementType(), NumElements);
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
if (MI->isVolatile())
return false;
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) {
const AllocaPartitioning::MemTransferOffsets &MTO
= P.getMemTransferOffsets(*MTI);
if (!MTO.IsSplittable)
return false;
}
- } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) {
+ } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
// Disable vector promotion when there are loads or stores of an FCA.
return false;
- } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
if (LI->isVolatile())
return false;
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
+ if (!canConvertValue(TD, PartitionTy, LI->getType()))
+ return false;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
if (SI->isVolatile())
return false;
+ if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy))
+ return false;
} else {
return false;
}
@@ -2178,13 +2135,14 @@ static bool isIntegerWideningViable(const DataLayout &TD,
uint64_t Size = TD.getTypeStoreSize(AllocaTy);
- // Check the uses to ensure the uses are (likely) promoteable integer uses.
+ // Check the uses to ensure the uses are (likely) promotable integer uses.
// Also ensure that the alloca has a covering load or store. We don't want
- // to widen the integer operotains only to fail to promote due to some other
+ // to widen the integer operations only to fail to promote due to some other
// unsplittable entry (which we may make splittable later).
bool WholeAllocaOp = false;
for (; I != E; ++I) {
- if (!I->U)
+ Use *U = I->getUse();
+ if (!U)
continue; // Skip dead use.
uint64_t RelBegin = I->BeginOffset - AllocBeginOffset;
@@ -2195,7 +2153,7 @@ static bool isIntegerWideningViable(const DataLayout &TD,
if (RelEnd > Size)
return false;
- if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
if (LI->isVolatile())
return false;
if (RelBegin == 0 && RelEnd == Size)
@@ -2210,7 +2168,7 @@ static bool isIntegerWideningViable(const DataLayout &TD,
if (RelBegin != 0 || RelEnd != Size ||
!canConvertValue(TD, AllocaTy, LI->getType()))
return false;
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
Type *ValueTy = SI->getValueOperand()->getType();
if (SI->isVolatile())
return false;
@@ -2226,16 +2184,16 @@ static bool isIntegerWideningViable(const DataLayout &TD,
if (RelBegin != 0 || RelEnd != Size ||
!canConvertValue(TD, ValueTy, AllocaTy))
return false;
- } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
return false;
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) {
const AllocaPartitioning::MemTransferOffsets &MTO
= P.getMemTransferOffsets(*MTI);
if (!MTO.IsSplittable)
return false;
}
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) {
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
II->getIntrinsicID() != Intrinsic::lifetime_end)
return false;
@@ -2246,7 +2204,7 @@ static bool isIntegerWideningViable(const DataLayout &TD,
return WholeAllocaOp;
}
-static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
+static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
IntegerType *Ty, uint64_t Offset,
const Twine &Name) {
DEBUG(dbgs() << " start: " << *V << "\n");
@@ -2269,7 +2227,7 @@ static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
return V;
}
-static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old,
+static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
Value *V, uint64_t Offset, const Twine &Name) {
IntegerType *IntTy = cast<IntegerType>(Old->getType());
IntegerType *Ty = cast<IntegerType>(V->getType());
@@ -2300,6 +2258,84 @@ static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old,
return V;
}
+static Value *extractVector(IRBuilderTy &IRB, Value *V,
+ unsigned BeginIndex, unsigned EndIndex,
+ const Twine &Name) {
+ VectorType *VecTy = cast<VectorType>(V->getType());
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ if (NumElements == VecTy->getNumElements())
+ return V;
+
+ if (NumElements == 1) {
+ V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
+ Name + ".extract");
+ DEBUG(dbgs() << " extract: " << *V << "\n");
+ return V;
+ }
+
+ SmallVector<Constant*, 8> Mask;
+ Mask.reserve(NumElements);
+ for (unsigned i = BeginIndex; i != EndIndex; ++i)
+ Mask.push_back(IRB.getInt32(i));
+ V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ ConstantVector::get(Mask),
+ Name + ".extract");
+ DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ return V;
+}
+
+static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
+ unsigned BeginIndex, const Twine &Name) {
+ VectorType *VecTy = cast<VectorType>(Old->getType());
+ assert(VecTy && "Can only insert a vector into a vector");
+
+ VectorType *Ty = dyn_cast<VectorType>(V->getType());
+ if (!Ty) {
+ // Single element to insert.
+ V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
+ Name + ".insert");
+ DEBUG(dbgs() << " insert: " << *V << "\n");
+ return V;
+ }
+
+ assert(Ty->getNumElements() <= VecTy->getNumElements() &&
+ "Too many elements!");
+ if (Ty->getNumElements() == VecTy->getNumElements()) {
+ assert(V->getType() == VecTy && "Vector type mismatch");
+ return V;
+ }
+ unsigned EndIndex = BeginIndex + Ty->getNumElements();
+
+ // When inserting a smaller vector into the larger to store, we first
+ // use a shuffle vector to widen it with undef elements, and then
+ // a second shuffle vector to select between the loaded vector and the
+ // incoming vector.
+ SmallVector<Constant*, 8> Mask;
+ Mask.reserve(VecTy->getNumElements());
+ for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+ if (i >= BeginIndex && i < EndIndex)
+ Mask.push_back(IRB.getInt32(i - BeginIndex));
+ else
+ Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
+ V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ ConstantVector::get(Mask),
+ Name + ".expand");
+ DEBUG(dbgs() << " shuffle1: " << *V << "\n");
+
+ Mask.clear();
+ for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+ if (i >= BeginIndex && i < EndIndex)
+ Mask.push_back(IRB.getInt32(i));
+ else
+ Mask.push_back(IRB.getInt32(i + VecTy->getNumElements()));
+ V = IRB.CreateShuffleVector(V, Old, ConstantVector::get(Mask),
+ Name + "insert");
+ DEBUG(dbgs() << " shuffle2: " << *V << "\n");
+ return V;
+}
+
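The comment in insertVector above describes the two-shuffle strategy; as a minimal standalone sketch (plain C++, invented lane counts), the two masks it builds look like this, with -1 standing in for an undef lane as in LLVM's textual IR:

#include <cassert>
#include <vector>

// Masks for storing an M-element vector into an N-element vector at
// BeginIndex (M < N); the real code emits them as ConstantVector operands
// of two shufflevector instructions.
int main() {
  const int N = 8, M = 3, BeginIndex = 2, EndIndex = BeginIndex + M;

  // Mask 1: widen the M-element value to N lanes, placing its elements at
  // [BeginIndex, EndIndex) and leaving every other lane undef.
  std::vector<int> Widen;
  for (int i = 0; i != N; ++i)
    Widen.push_back(i >= BeginIndex && i < EndIndex ? i - BeginIndex : -1);

  // Mask 2: select lanes [BeginIndex, EndIndex) from the widened value
  // (operand 0) and every other lane from the old vector (operand 1, whose
  // lanes are numbered N..2N-1 in shufflevector masks).
  std::vector<int> Select;
  for (int i = 0; i != N; ++i)
    Select.push_back(i >= BeginIndex && i < EndIndex ? i : i + N);

  assert(Widen[2] == 0 && Widen[4] == 2 && Widen[0] == -1);
  assert(Select[2] == 2 && Select[0] == 8 && Select[7] == 15);
  return 0;
}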
namespace {
/// \brief Visitor to rewrite instructions using a partition of an alloca to
/// use a new alloca.
@@ -2321,7 +2357,7 @@ class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
// If we are rewriting an alloca partition which can be written as pure
// vector operations, we stash extra information here. When VecTy is
- // non-null, we have some strict guarantees about the rewriten alloca:
+ // non-null, we have some strict guarantees about the rewritten alloca:
// - The new alloca is exactly the size of the vector type here.
// - The accesses all either map to the entire vector or to a single
// element.
@@ -2340,11 +2376,13 @@ class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
// The offset of the partition user currently being rewritten.
uint64_t BeginOffset, EndOffset;
+ bool IsSplit;
Use *OldUse;
Instruction *OldPtr;
- // The name prefix to use when rewriting instructions for this alloca.
- std::string NamePrefix;
+ // Utility IR builder, whose name prefix is setup for each visited use, and
+ // the insertion point is set to point to the user.
+ IRBuilderTy IRB;
public:
AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P,
@@ -2357,7 +2395,8 @@ public:
NewAllocaEndOffset(NewEndOffset),
NewAllocaTy(NewAI.getAllocatedType()),
VecTy(), ElementTy(), ElementSize(), IntTy(),
- BeginOffset(), EndOffset() {
+ BeginOffset(), EndOffset(), IsSplit(), OldUse(), OldPtr(),
+ IRB(NewAI.getContext(), ConstantFolder()) {
}
/// \brief Visit the users of the alloca partition and rewrite them.
@@ -2369,9 +2408,9 @@ public:
++NumVectorized;
VecTy = cast<VectorType>(NewAI.getAllocatedType());
ElementTy = VecTy->getElementType();
- assert((VecTy->getScalarSizeInBits() % 8) == 0 &&
+ assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 &&
"Only multiple-of-8 sized vector elements are viable");
- ElementSize = VecTy->getScalarSizeInBits() / 8;
+ ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8;
} else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(),
NewAllocaBeginOffset, P, I, E)) {
IntTy = Type::getIntNTy(NewAI.getContext(),
@@ -2379,14 +2418,21 @@ public:
}
bool CanSROA = true;
for (; I != E; ++I) {
- if (!I->U)
+ if (!I->getUse())
continue; // Skip dead uses.
BeginOffset = I->BeginOffset;
EndOffset = I->EndOffset;
- OldUse = I->U;
- OldPtr = cast<Instruction>(I->U->get());
- NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str();
- CanSROA &= visit(cast<Instruction>(I->U->getUser()));
+ IsSplit = I->isSplit();
+ OldUse = I->getUse();
+ OldPtr = cast<Instruction>(OldUse->get());
+
+ Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
+ IRB.SetInsertPoint(OldUserI);
+ IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
+ IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
+ ".");
+
+ CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
}
if (VecTy) {
assert(CanSROA);
@@ -2408,14 +2454,10 @@ private:
llvm_unreachable("No rewrite rule for this instruction!");
}
- Twine getName(const Twine &Suffix) {
- return NamePrefix + Suffix;
- }
-
- Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) {
+ Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, Type *PointerTy) {
assert(BeginOffset >= NewAllocaBeginOffset);
APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset);
- return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName(""));
+ return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy);
}
/// \brief Compute suitable alignment to access an offset into the new alloca.
@@ -2450,13 +2492,13 @@ private:
return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset);
}
- ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) {
+ unsigned getIndex(uint64_t Offset) {
assert(VecTy && "Can only call getIndex when rewriting a vector");
uint64_t RelOffset = Offset - NewAllocaBeginOffset;
assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
uint32_t Index = RelOffset / ElementSize;
assert(Index * ElementSize == RelOffset);
- return IRB.getInt32(Index);
+ return Index;
}
void deleteIfTriviallyDead(Value *V) {
@@ -2465,28 +2507,27 @@ private:
Pass.DeadInsts.insert(I);
}
- Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) {
+ Value *rewriteVectorizedLoadInst() {
+ unsigned BeginIndex = getIndex(BeginOffset);
+ unsigned EndIndex = getIndex(EndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+
Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- getName(".load"));
- if (LI.getType() == VecTy->getElementType() ||
- BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
- V = IRB.CreateExtractElement(V, getIndex(IRB, BeginOffset),
- getName(".extract"));
- }
- return V;
+ "load");
+ return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
}
- Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) {
+ Value *rewriteIntegerLoad(LoadInst &LI) {
assert(IntTy && "We cannot insert an integer to the alloca");
assert(!LI.isVolatile());
Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- getName(".load"));
+ "load");
V = convertValue(TD, IRB, V, IntTy);
assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
if (Offset > 0 || EndOffset < NewAllocaEndOffset)
V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset,
- getName(".extract"));
+ "extract");
return V;
}
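rewriteIntegerLoad and rewriteIntegerStore lean on the extractInteger/insertInteger helpers, whose bodies this hunk leaves alone. Assuming a little-endian layout (the real helpers consult the DataLayout and flip the shift for big-endian targets), the byte-offset arithmetic amounts to the following standalone sketch; names and constants are invented:

#include <cassert>
#include <cstdint>

// A narrow field at a byte offset inside a wider "alloca-sized" integer is
// read with shift+mask and written with mask+or.
static uint64_t extractField(uint64_t Wide, unsigned Offset, unsigned Bytes) {
  uint64_t Mask = (Bytes == 8) ? ~0ull : ((1ull << (8 * Bytes)) - 1);
  return (Wide >> (8 * Offset)) & Mask;
}

static uint64_t insertField(uint64_t Wide, uint64_t Field, unsigned Offset,
                            unsigned Bytes) {
  uint64_t Mask = (Bytes == 8) ? ~0ull : ((1ull << (8 * Bytes)) - 1);
  Wide &= ~(Mask << (8 * Offset));                   // clear the destination bytes
  return Wide | ((Field & Mask) << (8 * Offset));
}

int main() {
  uint64_t Alloca = 0x1122334455667788ull;
  assert(extractField(Alloca, 2, 2) == 0x5566);      // LE bytes [2,4) are 66 55
  uint64_t Updated = insertField(Alloca, 0xBEEF, 2, 2);
  assert(Updated == 0x11223344BEEF7788ull);          // only those bytes changed
  return 0;
}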
@@ -2494,58 +2535,39 @@ private:
DEBUG(dbgs() << " original: " << LI << "\n");
Value *OldOp = LI.getOperand(0);
assert(OldOp == OldPtr);
- IRBuilder<> IRB(&LI);
uint64_t Size = EndOffset - BeginOffset;
- bool IsSplitIntLoad = Size < TD.getTypeStoreSize(LI.getType());
-
- // If this memory access can be shown to *statically* extend outside the
- // bounds of the original allocation it's behavior is undefined. Rather
- // than trying to transform it, just replace it with undef.
- // FIXME: We should do something more clever for functions being
- // instrumented by asan.
- // FIXME: Eventually, once ASan and friends can flush out bugs here, this
- // should be transformed to a load of null making it unreachable.
- uint64_t OldAllocSize = TD.getTypeAllocSize(OldAI.getAllocatedType());
- if (TD.getTypeStoreSize(LI.getType()) > OldAllocSize) {
- LI.replaceAllUsesWith(UndefValue::get(LI.getType()));
- Pass.DeadInsts.insert(&LI);
- deleteIfTriviallyDead(OldOp);
- DEBUG(dbgs() << " to: undef!!\n");
- return true;
- }
- Type *TargetTy = IsSplitIntLoad ? Type::getIntNTy(LI.getContext(), Size * 8)
- : LI.getType();
+ Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), Size * 8)
+ : LI.getType();
bool IsPtrAdjusted = false;
Value *V;
if (VecTy) {
- V = rewriteVectorizedLoadInst(IRB, LI, OldOp);
+ V = rewriteVectorizedLoadInst();
} else if (IntTy && LI.getType()->isIntegerTy()) {
- V = rewriteIntegerLoad(IRB, LI);
+ V = rewriteIntegerLoad(LI);
} else if (BeginOffset == NewAllocaBeginOffset &&
canConvertValue(TD, NewAllocaTy, LI.getType())) {
V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- LI.isVolatile(), getName(".load"));
+ LI.isVolatile(), "load");
} else {
Type *LTy = TargetTy->getPointerTo();
V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy),
getPartitionTypeAlign(TargetTy),
- LI.isVolatile(), getName(".load"));
+ LI.isVolatile(), "load");
IsPtrAdjusted = true;
}
V = convertValue(TD, IRB, V, TargetTy);
- if (IsSplitIntLoad) {
+ if (IsSplit) {
assert(!LI.isVolatile());
assert(LI.getType()->isIntegerTy() &&
"Only integer type loads and stores are split");
+ assert(Size < TD.getTypeStoreSize(LI.getType()) &&
+ "Split load isn't smaller than original load");
assert(LI.getType()->getIntegerBitWidth() ==
TD.getTypeStoreSizeInBits(LI.getType()) &&
"Non-byte-multiple bit width");
- assert(LI.getType()->getIntegerBitWidth() ==
- TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) &&
- "Only alloca-wide loads can be split and recomposed");
// Move the insertion point just past the load so that we can refer to it.
IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI)));
// Create a placeholder value with the same type as LI to use as the
@@ -2555,7 +2577,7 @@ private:
Value *Placeholder
= new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
V = insertInteger(TD, IRB, Placeholder, V, BeginOffset,
- getName(".insert"));
+ "insert");
LI.replaceAllUsesWith(V);
Placeholder->replaceAllUsesWith(&LI);
delete Placeholder;
@@ -2569,19 +2591,24 @@ private:
return !LI.isVolatile() && !IsPtrAdjusted;
}
- bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V,
+ bool rewriteVectorizedStoreInst(Value *V,
StoreInst &SI, Value *OldOp) {
- if (V->getType() == ElementTy ||
- BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
- if (V->getType() != ElementTy)
- V = convertValue(TD, IRB, V, ElementTy);
- LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- getName(".load"));
- V = IRB.CreateInsertElement(LI, V, getIndex(IRB, BeginOffset),
- getName(".insert"));
- } else if (V->getType() != VecTy) {
- V = convertValue(TD, IRB, V, VecTy);
- }
+ unsigned BeginIndex = getIndex(BeginOffset);
+ unsigned EndIndex = getIndex(EndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+ Type *PartitionTy
+ = (NumElements == 1) ? ElementTy
+ : VectorType::get(ElementTy, NumElements);
+ if (V->getType() != PartitionTy)
+ V = convertValue(TD, IRB, V, PartitionTy);
+
+ // Mix in the existing elements.
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ "load");
+ V = insertVector(IRB, Old, V, BeginIndex, "vec");
+
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
Pass.DeadInsts.insert(&SI);
@@ -2590,17 +2617,17 @@ private:
return true;
}
- bool rewriteIntegerStore(IRBuilder<> &IRB, Value *V, StoreInst &SI) {
+ bool rewriteIntegerStore(Value *V, StoreInst &SI) {
assert(IntTy && "We cannot extract an integer from the alloca");
assert(!SI.isVolatile());
if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- getName(".oldload"));
+ "oldload");
Old = convertValue(TD, IRB, Old, IntTy);
assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset,
- getName(".insert"));
+ "insert");
}
V = convertValue(TD, IRB, V, NewAllocaTy);
StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
@@ -2614,7 +2641,6 @@ private:
DEBUG(dbgs() << " original: " << SI << "\n");
Value *OldOp = SI.getOperand(1);
assert(OldOp == OldPtr);
- IRBuilder<> IRB(&SI);
Value *V = SI.getValueOperand();
@@ -2627,23 +2653,21 @@ private:
uint64_t Size = EndOffset - BeginOffset;
if (Size < TD.getTypeStoreSize(V->getType())) {
assert(!SI.isVolatile());
+ assert(IsSplit && "A seemingly split store isn't splittable");
assert(V->getType()->isIntegerTy() &&
"Only integer type loads and stores are split");
assert(V->getType()->getIntegerBitWidth() ==
TD.getTypeStoreSizeInBits(V->getType()) &&
"Non-byte-multiple bit width");
- assert(V->getType()->getIntegerBitWidth() ==
- TD.getTypeSizeInBits(OldAI.getAllocatedType()) &&
- "Only alloca-wide stores can be split and recomposed");
IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8);
V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset,
- getName(".extract"));
+ "extract");
}
if (VecTy)
- return rewriteVectorizedStoreInst(IRB, V, SI, OldOp);
+ return rewriteVectorizedStoreInst(V, SI, OldOp);
if (IntTy && V->getType()->isIntegerTy())
- return rewriteIntegerStore(IRB, V, SI);
+ return rewriteIntegerStore(V, SI);
StoreInst *NewSI;
if (BeginOffset == NewAllocaBeginOffset &&
@@ -2665,9 +2689,42 @@ private:
return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
}
+ /// \brief Compute an integer value from splatting an i8 across the given
+ /// number of bytes.
+ ///
+ /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
+ /// call this routine.
+ /// FIXME: Heed the advice above.
+ ///
+ /// \param V The i8 value to splat.
+ /// \param Size The number of bytes in the output (assuming i8 is one byte)
+ Value *getIntegerSplat(Value *V, unsigned Size) {
+ assert(Size > 0 && "Expected a positive number of bytes.");
+ IntegerType *VTy = cast<IntegerType>(V->getType());
+ assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
+ if (Size == 1)
+ return V;
+
+ Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8);
+ V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"),
+ ConstantExpr::getUDiv(
+ Constant::getAllOnesValue(SplatIntTy),
+ ConstantExpr::getZExt(
+ Constant::getAllOnesValue(V->getType()),
+ SplatIntTy)),
+ "isplat");
+ return V;
+ }
+
+ /// \brief Compute a vector splat for a given element value.
+ Value *getVectorSplat(Value *V, unsigned NumElements) {
+ V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
+ DEBUG(dbgs() << " splat: " << *V << "\n");
+ return V;
+ }
+
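A standalone model of the splat constant getIntegerSplat builds: all-ones divided by 0xFF yields 0x0101...01, and multiplying the zero-extended byte by that constant replicates the byte across every byte of the result. The sketch below uses uint64_t, so it only covers Size <= 8; the real code performs the same arithmetic on arbitrary-width IR integers via ConstantExpr::getUDiv.

#include <cassert>
#include <cstdint>

// Replicate one byte across Size bytes by multiplying it with 0x01...01.
static uint64_t splatByte(uint8_t Byte, unsigned Size) {
  assert(Size >= 1 && Size <= 8);
  if (Size == 1)
    return Byte;
  uint64_t AllOnes = (Size == 8) ? ~0ull : ((1ull << (8 * Size)) - 1);
  uint64_t Multiplier = AllOnes / 0xFF;        // 0x0101...01, one 1 per byte
  return static_cast<uint64_t>(Byte) * Multiplier;
}

int main() {
  assert(splatByte(0xAB, 4) == 0xABABABABull);
  assert(splatByte(0x00, 8) == 0);
  assert(splatByte(0xFF, 8) == ~0ull);
  return 0;
}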
bool visitMemSetInst(MemSetInst &II) {
DEBUG(dbgs() << " original: " << II << "\n");
- IRBuilder<> IRB(&II);
assert(II.getRawDest() == OldPtr);
// If the memset has a variable size, it cannot be split, just adjust the
@@ -2693,7 +2750,8 @@ private:
(BeginOffset != NewAllocaBeginOffset ||
EndOffset != NewAllocaEndOffset ||
!AllocaTy->isSingleValueType() ||
- !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) {
+ !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) ||
+ TD.getTypeSizeInBits(ScalarTy)%8 != 0)) {
Type *SizeTy = II.getLength()->getType();
Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
CallInst *New
@@ -2709,53 +2767,61 @@ private:
// If we can represent this as a simple value, we have to build the actual
// value to store, which requires expanding the byte present in memset to
// a sensible representation for the alloca type. This is essentially
- // splatting the byte to a sufficiently wide integer, bitcasting to the
- // desired scalar type, and splatting it across any desired vector type.
- uint64_t Size = EndOffset - BeginOffset;
- Value *V = II.getValue();
- IntegerType *VTy = cast<IntegerType>(V->getType());
- Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8);
- if (Size*8 > VTy->getBitWidth())
- V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")),
- ConstantExpr::getUDiv(
- Constant::getAllOnesValue(SplatIntTy),
- ConstantExpr::getZExt(
- Constant::getAllOnesValue(V->getType()),
- SplatIntTy)),
- getName(".isplat"));
-
- // If this is an element-wide memset of a vectorizable alloca, insert it.
- if (VecTy && (BeginOffset > NewAllocaBeginOffset ||
- EndOffset < NewAllocaEndOffset)) {
- if (V->getType() != ScalarTy)
- V = convertValue(TD, IRB, V, ScalarTy);
- StoreInst *Store = IRB.CreateAlignedStore(
- IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI,
- NewAI.getAlignment(),
- getName(".load")),
- V, getIndex(IRB, BeginOffset),
- getName(".insert")),
- &NewAI, NewAI.getAlignment());
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
- return true;
- }
+ // splatting the byte to a sufficiently wide integer, splatting it across
+ // any desired vector width, and bitcasting to the final type.
+ Value *V;
+
+ if (VecTy) {
+ // If this is a memset of a vectorized alloca, insert it.
+ assert(ElementTy == ScalarTy);
+
+ unsigned BeginIndex = getIndex(BeginOffset);
+ unsigned EndIndex = getIndex(EndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ Value *Splat =
+ getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ElementTy) / 8);
+ Splat = convertValue(TD, IRB, Splat, ElementTy);
+ if (NumElements > 1)
+ Splat = getVectorSplat(Splat, NumElements);
- // If this is a memset on an alloca where we can widen stores, insert the
- // set integer.
- if (IntTy && (BeginOffset > NewAllocaBeginOffset ||
- EndOffset < NewAllocaEndOffset)) {
- assert(!II.isVolatile());
Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- getName(".oldload"));
- Old = convertValue(TD, IRB, Old, IntTy);
- assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
- uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
- V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert"));
- }
+ "oldload");
+ V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
+ } else if (IntTy) {
+ // If this is a memset on an alloca where we can widen stores, insert the
+ // set integer.
+ assert(!II.isVolatile());
+
+ uint64_t Size = EndOffset - BeginOffset;
+ V = getIntegerSplat(II.getValue(), Size);
+
+ if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaBeginOffset)) {
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ "oldload");
+ Old = convertValue(TD, IRB, Old, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(TD, IRB, Old, V, Offset, "insert");
+ } else {
+ assert(V->getType() == IntTy &&
+ "Wrong type for an alloca wide integer!");
+ }
+ V = convertValue(TD, IRB, V, AllocaTy);
+ } else {
+ // Established these invariants above.
+ assert(BeginOffset == NewAllocaBeginOffset);
+ assert(EndOffset == NewAllocaEndOffset);
+
+ V = getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ScalarTy) / 8);
+ if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
+ V = getVectorSplat(V, AllocaVecTy->getNumElements());
- if (V->getType() != AllocaTy)
V = convertValue(TD, IRB, V, AllocaTy);
+ }
Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
II.isVolatile());
@@ -2769,7 +2835,6 @@ private:
// them into two categories: split intrinsics and unsplit intrinsics.
DEBUG(dbgs() << " original: " << II << "\n");
- IRBuilder<> IRB(&II);
assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
bool IsDest = II.getRawDest() == OldPtr;
@@ -2840,37 +2905,21 @@ private:
// Record this instruction for deletion.
Pass.DeadInsts.insert(&II);
- bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset &&
- EndOffset == NewAllocaEndOffset;
- bool IsVectorElement = VecTy && !IsWholeAlloca;
- uint64_t Size = EndOffset - BeginOffset;
- IntegerType *SubIntTy
- = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0;
-
- Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
- : II.getRawDest()->getType();
- if (!EmitMemCpy) {
- if (IsVectorElement)
- OtherPtrTy = VecTy->getElementType()->getPointerTo();
- else if (IntTy && !IsWholeAlloca)
- OtherPtrTy = SubIntTy->getPointerTo();
- else
- OtherPtrTy = NewAI.getType();
- }
-
- // Compute the other pointer, folding as much as possible to produce
- // a single, simple GEP in most cases.
- Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
- OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy,
- getName("." + OtherPtr->getName()));
-
// Strip all inbounds GEPs and pointer casts to try to dig out any root
// alloca that should be re-examined after rewriting this instruction.
+ Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
if (AllocaInst *AI
= dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets()))
Pass.Worklist.insert(AI);
if (EmitMemCpy) {
+ Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
+ : II.getRawDest()->getType();
+
+ // Compute the other pointer, folding as much as possible to produce
+ // a single, simple GEP in most cases.
+ OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy);
+
Value *OurPtr
= getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType()
: II.getRawSource()->getType());
@@ -2891,48 +2940,63 @@ private:
if (!Align)
Align = 1;
- Value *SrcPtr = OtherPtr;
+ bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset &&
+ EndOffset == NewAllocaEndOffset;
+ uint64_t Size = EndOffset - BeginOffset;
+ unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0;
+ unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0;
+ unsigned NumElements = EndIndex - BeginIndex;
+ IntegerType *SubIntTy
+ = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0;
+
+ Type *OtherPtrTy = NewAI.getType();
+ if (VecTy && !IsWholeAlloca) {
+ if (NumElements == 1)
+ OtherPtrTy = VecTy->getElementType();
+ else
+ OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements);
+
+ OtherPtrTy = OtherPtrTy->getPointerTo();
+ } else if (IntTy && !IsWholeAlloca) {
+ OtherPtrTy = SubIntTy->getPointerTo();
+ }
+
+ Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy);
Value *DstPtr = &NewAI;
if (!IsDest)
std::swap(SrcPtr, DstPtr);
Value *Src;
- if (IsVectorElement && !IsDest) {
- // We have to extract rather than load.
- Src = IRB.CreateExtractElement(
- IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")),
- getIndex(IRB, BeginOffset),
- getName(".copyextract"));
+ if (VecTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ "load");
+ Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
} else if (IntTy && !IsWholeAlloca && !IsDest) {
Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- getName(".load"));
+ "load");
Src = convertValue(TD, IRB, Src, IntTy);
assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
- Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract"));
+ Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, "extract");
} else {
Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(),
- getName(".copyload"));
+ "copyload");
}
- if (IntTy && !IsWholeAlloca && IsDest) {
+ if (VecTy && !IsWholeAlloca && IsDest) {
Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- getName(".oldload"));
+ "oldload");
+ Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
+ } else if (IntTy && !IsWholeAlloca && IsDest) {
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ "oldload");
Old = convertValue(TD, IRB, Old, IntTy);
assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
- Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert"));
+ Src = insertInteger(TD, IRB, Old, Src, Offset, "insert");
Src = convertValue(TD, IRB, Src, NewAllocaTy);
}
- if (IsVectorElement && IsDest) {
- // We have to insert into a loaded copy before storing.
- Src = IRB.CreateInsertElement(
- IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")),
- Src, getIndex(IRB, BeginOffset),
- getName(".insert"));
- }
-
StoreInst *Store = cast<StoreInst>(
IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile()));
(void)Store;
@@ -2944,7 +3008,6 @@ private:
assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
II.getIntrinsicID() == Intrinsic::lifetime_end);
DEBUG(dbgs() << " original: " << II << "\n");
- IRBuilder<> IRB(&II);
assert(II.getArgOperand(1) == OldPtr);
// Record this instruction for deletion.
@@ -2960,6 +3023,7 @@ private:
else
New = IRB.CreateLifetimeEnd(Ptr, Size);
+ (void)New;
DEBUG(dbgs() << " to: " << *New << "\n");
return true;
}
@@ -2971,7 +3035,9 @@ private:
// as local as possible to the PHI. To do that, we re-use the location of
// the old pointer, which necessarily must be in the right position to
// dominate the PHI.
- IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr));
+ IRBuilderTy PtrBuilder(cast<Instruction>(OldPtr));
+ PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
+ ".");
Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
// Replace the operands which were using the old pointer.
@@ -2984,7 +3050,6 @@ private:
bool visitSelectInst(SelectInst &SI) {
DEBUG(dbgs() << " original: " << SI << "\n");
- IRBuilder<> IRB(&SI);
// Find the operand we need to rewrite here.
bool IsTrueVal = SI.getTrueValue() == OldPtr;
@@ -3059,7 +3124,7 @@ private:
class OpSplitter {
protected:
/// The builder used to form new instructions.
- IRBuilder<> IRB;
+ IRBuilderTy IRB;
/// The indices which to be used with insert- or extractvalue to select the
/// appropriate value within the aggregate.
SmallVector<unsigned, 4> Indices;
@@ -3136,9 +3201,8 @@ private:
void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
assert(Ty->isSingleValueType());
// Load the single value and insert it using the indices.
- Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices,
- Name + ".gep"),
- Name + ".load");
+ Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep");
+ Value *Load = IRB.CreateLoad(GEP, Name + ".load");
Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
DEBUG(dbgs() << " to: " << *Load << "\n");
}
@@ -3272,12 +3336,13 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
Type *ElementTy = SeqTy->getElementType();
uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
uint64_t NumSkippedElements = Offset / ElementSize;
- if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy))
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
if (NumSkippedElements >= ArrTy->getNumElements())
return 0;
- if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy))
+ } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) {
if (NumSkippedElements >= VecTy->getNumElements())
return 0;
+ }
Offset -= NumSkippedElements * ElementSize;
// First check if we need to recurse.
@@ -3375,7 +3440,7 @@ bool SROA::rewriteAllocaPartition(AllocaInst &AI,
for (AllocaPartitioning::use_iterator UI = P.use_begin(PI),
UE = P.use_end(PI);
UI != UE && !IsLive; ++UI)
- if (UI->U)
+ if (UI->getUse())
IsLive = true;
if (!IsLive)
return false; // No live uses left of this partition.
@@ -3411,7 +3476,7 @@ bool SROA::rewriteAllocaPartition(AllocaInst &AI,
// Check for the case where we're going to rewrite to a new alloca of the
// exact same type as the original, and with the same access offsets. In that
// case, re-use the existing alloca, but still run through the rewriter to
- // performe phi and select speculation.
+ // perform phi and select speculation.
AllocaInst *NewAI;
if (AllocaTy == AI.getAllocatedType()) {
assert(PI->BeginOffset == 0 &&
@@ -3578,7 +3643,7 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
/// If there is a domtree available, we attempt to promote using the full power
/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
/// based on the SSAUpdater utilities. This function returns whether any
-/// promotion occured.
+/// promotion occurred.
bool SROA::promoteAllocas(Function &F) {
if (PromotableAllocas.empty())
return false;
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
index 39630fd..8a9c7da 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -13,14 +13,14 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm-c/Initialization.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/PassManager.h"
+#include "llvm-c/Transforms/Scalar.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/Verifier.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
using namespace llvm;
@@ -50,11 +50,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLowerAtomicPass(Registry);
initializeLowerExpectIntrinsicPass(Registry);
initializeMemCpyOptPass(Registry);
- initializeObjCARCAliasAnalysisPass(Registry);
- initializeObjCARCAPElimPass(Registry);
- initializeObjCARCExpandPass(Registry);
- initializeObjCARCContractPass(Registry);
- initializeObjCARCOptPass(Registry);
initializeReassociatePass(Registry);
initializeRegToMemPass(Registry);
initializeSCCPPass(Registry);
diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index a46d09c..e590a37 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -21,32 +21,32 @@
#define DEBUG_TYPE "scalarrepl"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/DIBuilder.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Operator.h"
-#include "llvm/Pass.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 9f24bb6..c243d34 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -23,19 +23,19 @@
#define DEBUG_TYPE "simplifycfg"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#include "llvm/Attributes.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Pass.h"
-#include "llvm/DataLayout.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/TargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
STATISTIC(NumSimpl, "Number of blocks simplified");
@@ -48,12 +48,19 @@ namespace {
}
virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfo>();
+ }
};
}
char CFGSimplifyPass::ID = 0;
-INITIALIZE_PASS(CFGSimplifyPass, "simplifycfg",
- "Simplify the CFG", false, false)
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG",
+ false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG",
+ false, false)
// Public interface to the CFGSimplification pass
FunctionPass *llvm::createCFGSimplificationPass() {
@@ -111,13 +118,11 @@ static bool markAliveBlocks(BasicBlock *BB,
SmallVector<BasicBlock*, 128> Worklist;
Worklist.push_back(BB);
+ Reachable.insert(BB);
bool Changed = false;
do {
BB = Worklist.pop_back_val();
- if (!Reachable.insert(BB))
- continue;
-
// Do a quick scan of the basic block, turning any obviously unreachable
// instructions into LLVM unreachable insts. The instruction combining pass
// canonicalizes unreachable insts into stores to null or undef.
@@ -176,7 +181,8 @@ static bool markAliveBlocks(BasicBlock *BB,
Changed |= ConstantFoldTerminator(BB, true);
for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
- Worklist.push_back(*SI);
+ if (Reachable.insert(*SI))
+ Worklist.push_back(*SI);
} while (!Worklist.empty());
return Changed;
}
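The hunk above moves the Reachable.insert call from pop time to push time, so a block can be queued at most once no matter how many edges lead to it. A minimal sketch of that worklist pattern on an invented adjacency-list graph (std::set stands in for LLVM's SmallPtrSet, whose insert also reports whether the element was new):

#include <cassert>
#include <set>
#include <vector>

int main() {
  // Tiny made-up CFG: 0 -> {1, 2}, 1 -> {2}, 2 -> {0}, 3 unreachable.
  std::vector<std::vector<int>> Succs = {{1, 2}, {2}, {0}, {0}};

  std::set<int> Reachable;
  std::vector<int> Worklist;
  Worklist.push_back(0);
  Reachable.insert(0);                       // mark the entry before the loop

  while (!Worklist.empty()) {
    int BB = Worklist.back();
    Worklist.pop_back();
    for (int S : Succs[BB])
      if (Reachable.insert(S).second)        // only push on first discovery
        Worklist.push_back(S);
  }

  assert(Reachable.count(0) && Reachable.count(1) && Reachable.count(2));
  assert(!Reachable.count(3));               // never reached from the entry
  return 0;
}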
@@ -294,8 +300,8 @@ static bool mergeEmptyReturnBlocks(Function &F) {
/// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
/// iterating until no more changes are made.
-static bool iterativelySimplifyCFG(Function &F, const DataLayout *TD,
- const TargetTransformInfo *TTI) {
+static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
+ const DataLayout *TD) {
bool Changed = false;
bool LocalChange = true;
while (LocalChange) {
@@ -304,7 +310,7 @@ static bool iterativelySimplifyCFG(Function &F, const DataLayout *TD,
// Loop over all of the basic blocks and remove them if they are unneeded...
//
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (SimplifyCFG(BBIt++, TD, TTI)) {
+ if (SimplifyCFG(BBIt++, TTI, TD)) {
LocalChange = true;
++NumSimpl;
}
@@ -318,12 +324,11 @@ static bool iterativelySimplifyCFG(Function &F, const DataLayout *TD,
// simplify the CFG.
//
bool CFGSimplifyPass::runOnFunction(Function &F) {
+ const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
- const TargetTransformInfo *TTI =
- getAnalysisIfAvailable<TargetTransformInfo>();
bool EverChanged = removeUnreachableBlocksFromFn(F);
EverChanged |= mergeEmptyReturnBlocks(F);
- EverChanged |= iterativelySimplifyCFG(F, TD, TTI);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, TD);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
@@ -337,7 +342,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
return true;
do {
- EverChanged = iterativelySimplifyCFG(F, TD, TTI);
+ EverChanged = iterativelySimplifyCFG(F, TTI, TD);
EverChanged |= removeUnreachableBlocksFromFn(F);
} while (EverChanged);
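runOnFunction drives these cleanups to a fixed point: it keeps alternating iterativelySimplifyCFG and removeUnreachableBlocksFromFn until a full round reports no change. A toy sketch of that driver shape, with invented stand-in "passes" that converge after a few rounds:

#include <cassert>

// Each Counter pretends to be a cleanup that finds something to change
// Budget more times before running dry.
struct Counter {
  int Budget;
  bool step() { return Budget > 0 ? (--Budget, true) : false; }
};

int main() {
  Counter Simplify{3}, RemoveUnreachable{1};
  bool EverChanged;
  int Rounds = 0;
  do {
    EverChanged = Simplify.step();
    EverChanged |= RemoveUnreachable.step();
    ++Rounds;
  } while (EverChanged);
  assert(Rounds == 4);   // three productive rounds plus one confirming the fixed point
  return 0;
}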
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index 17d07cd..3514e6c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -17,32 +17,24 @@
#define DEBUG_TYPE "simplify-libcalls"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host!
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host!
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
using namespace llvm;
-STATISTIC(NumSimplified, "Number of library calls simplified");
-STATISTIC(NumAnnotated, "Number of attributes added to library functions");
-static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
- cl::init(false),
- cl::desc("Enable unsafe double to float "
- "shrinking for math lib calls"));
//===----------------------------------------------------------------------===//
// Optimizer Base Class
//===----------------------------------------------------------------------===//
@@ -87,677 +79,6 @@ public:
//===----------------------------------------------------------------------===//
-// Helper Functions
-//===----------------------------------------------------------------------===//
-
-static bool CallHasFloatingPointArgument(const CallInst *CI) {
- for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end();
- it != e; ++it) {
- if ((*it)->getType()->isFloatingPointTy())
- return true;
- }
- return false;
-}
-
-namespace {
-//===----------------------------------------------------------------------===//
-// Math Library Optimizations
-//===----------------------------------------------------------------------===//
-
-//===---------------------------------------===//
-// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
-
-struct UnaryDoubleFPOpt : public LibCallOptimization {
- bool CheckRetType;
- UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {}
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
- !FT->getParamType(0)->isDoubleTy())
- return 0;
-
- if (CheckRetType) {
- // Check if all the uses for function like 'sin' are converted to float.
- for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end();
- ++UseI) {
- FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI);
- if (Cast == 0 || !Cast->getType()->isFloatTy())
- return 0;
- }
- }
-
- // If this is something like 'floor((double)floatval)', convert to floorf.
- FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
- if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
- return 0;
-
- // floor((double)floatval) -> (double)floorf(floatval)
- Value *V = Cast->getOperand(0);
- V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
- return B.CreateFPExt(V, B.getDoubleTy());
- }
-};
-
-//===---------------------------------------===//
-// 'cos*' Optimizations
-struct CosOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- Value *Ret = NULL;
- if (UnsafeFPShrink && Callee->getName() == "cos" &&
- TLI->has(LibFunc::cosf)) {
- UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
- Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
- }
-
- FunctionType *FT = Callee->getFunctionType();
- // Just make sure this has 1 argument of FP type, which matches the
- // result type.
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isFloatingPointTy())
- return Ret;
-
- // cos(-x) -> cos(x)
- Value *Op1 = CI->getArgOperand(0);
- if (BinaryOperator::isFNeg(Op1)) {
- BinaryOperator *BinExpr = cast<BinaryOperator>(Op1);
- return B.CreateCall(Callee, BinExpr->getOperand(1), "cos");
- }
- return Ret;
- }
-};
-
-//===---------------------------------------===//
-// 'pow*' Optimizations
-
-struct PowOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- Value *Ret = NULL;
- if (UnsafeFPShrink && Callee->getName() == "pow" &&
- TLI->has(LibFunc::powf)) {
- UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
- Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
- }
-
- FunctionType *FT = Callee->getFunctionType();
- // Just make sure this has 2 arguments of the same FP type, which match the
- // result type.
- if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
- FT->getParamType(0) != FT->getParamType(1) ||
- !FT->getParamType(0)->isFloatingPointTy())
- return Ret;
-
- Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
- if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
- if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0
- return Op1C;
- if (Op1C->isExactlyValue(2.0)) // pow(2.0, x) -> exp2(x)
- return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes());
- }
-
- ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
- if (Op2C == 0) return Ret;
-
- if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0
- return ConstantFP::get(CI->getType(), 1.0);
-
- if (Op2C->isExactlyValue(0.5)) {
- // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))).
- // This is faster than calling pow, and still handles negative zero
- // and negative infinity correctly.
- // TODO: In fast-math mode, this could be just sqrt(x).
- // TODO: In finite-only mode, this could be just fabs(sqrt(x)).
- Value *Inf = ConstantFP::getInfinity(CI->getType());
- Value *NegInf = ConstantFP::getInfinity(CI->getType(), true);
- Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B,
- Callee->getAttributes());
- Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B,
- Callee->getAttributes());
- Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf);
- Value *Sel = B.CreateSelect(FCmp, Inf, FAbs);
- return Sel;
- }
-
- if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x
- return Op1;
- if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x
- return B.CreateFMul(Op1, Op1, "pow2");
- if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
- return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0),
- Op1, "powrecip");
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'exp2' Optimizations
-
-struct Exp2Opt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- Value *Ret = NULL;
- if (UnsafeFPShrink && Callee->getName() == "exp2" &&
- TLI->has(LibFunc::exp2)) {
- UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
- Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
- }
-
- FunctionType *FT = Callee->getFunctionType();
- // Just make sure this has 1 argument of FP type, which matches the
- // result type.
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isFloatingPointTy())
- return Ret;
-
- Value *Op = CI->getArgOperand(0);
- // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
- // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32
- Value *LdExpArg = 0;
- if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) {
- if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32)
- LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty());
- } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
- if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
- LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty());
- }
-
- if (LdExpArg) {
- const char *Name;
- if (Op->getType()->isFloatTy())
- Name = "ldexpf";
- else if (Op->getType()->isDoubleTy())
- Name = "ldexp";
- else
- Name = "ldexpl";
-
- Constant *One = ConstantFP::get(*Context, APFloat(1.0f));
- if (!Op->getType()->isFloatTy())
- One = ConstantExpr::getFPExtend(One, Op->getType());
-
- Module *M = Caller->getParent();
- Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
- Op->getType(),
- B.getInt32Ty(), NULL);
- CallInst *CI = B.CreateCall2(Callee, One, LdExpArg);
- if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
-
- return CI;
- }
- return Ret;
- }
-};
-
-//===----------------------------------------------------------------------===//
-// Integer Optimizations
-//===----------------------------------------------------------------------===//
-
-//===---------------------------------------===//
-// 'ffs*' Optimizations
-
-struct FFSOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- // Just make sure this has 2 arguments of the same FP type, which match the
- // result type.
- if (FT->getNumParams() != 1 ||
- !FT->getReturnType()->isIntegerTy(32) ||
- !FT->getParamType(0)->isIntegerTy())
- return 0;
-
- Value *Op = CI->getArgOperand(0);
-
- // Constant fold.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
- if (CI->isZero()) // ffs(0) -> 0.
- return B.getInt32(0);
- // ffs(c) -> cttz(c)+1
- return B.getInt32(CI->getValue().countTrailingZeros() + 1);
- }
-
- // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
- Type *ArgType = Op->getType();
- Value *F = Intrinsic::getDeclaration(Callee->getParent(),
- Intrinsic::cttz, ArgType);
- Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz");
- V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
- V = B.CreateIntCast(V, B.getInt32Ty(), false);
-
- Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
- return B.CreateSelect(Cond, V, B.getInt32(0));
- }
-};
-
-//===---------------------------------------===//
-// 'isdigit' Optimizations
-
-struct IsDigitOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- // We require integer(i32)
- if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
- !FT->getParamType(0)->isIntegerTy(32))
- return 0;
-
- // isdigit(c) -> (c-'0') <u 10
- Value *Op = CI->getArgOperand(0);
- Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
- Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
- return B.CreateZExt(Op, CI->getType());
- }
-};
-
-//===---------------------------------------===//
-// 'isascii' Optimizations
-
-struct IsAsciiOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- // We require integer(i32)
- if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
- !FT->getParamType(0)->isIntegerTy(32))
- return 0;
-
- // isascii(c) -> c <u 128
- Value *Op = CI->getArgOperand(0);
- Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
- return B.CreateZExt(Op, CI->getType());
- }
-};
-
-//===---------------------------------------===//
-// 'abs', 'labs', 'llabs' Optimizations
-
-struct AbsOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- // We require integer(integer) where the types agree.
- if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
- FT->getParamType(0) != FT->getReturnType())
- return 0;
-
- // abs(x) -> x >s -1 ? x : -x
- Value *Op = CI->getArgOperand(0);
- Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()),
- "ispos");
- Value *Neg = B.CreateNeg(Op, "neg");
- return B.CreateSelect(Pos, Op, Neg);
- }
-};
-
-
-//===---------------------------------------===//
-// 'toascii' Optimizations
-
-struct ToAsciiOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- // We require i32(i32)
- if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isIntegerTy(32))
- return 0;
-
- // isascii(c) -> c & 0x7f
- return B.CreateAnd(CI->getArgOperand(0),
- ConstantInt::get(CI->getType(),0x7F));
- }
-};
-
-//===----------------------------------------------------------------------===//
-// Formatting and IO Optimizations
-//===----------------------------------------------------------------------===//
-
-//===---------------------------------------===//
-// 'printf' Optimizations
-
-struct PrintFOpt : public LibCallOptimization {
- Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI,
- IRBuilder<> &B) {
- // Check for a fixed format string.
- StringRef FormatStr;
- if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
- return 0;
-
- // Empty format string -> noop.
- if (FormatStr.empty()) // Tolerate printf's declared void.
- return CI->use_empty() ? (Value*)CI :
- ConstantInt::get(CI->getType(), 0);
-
- // Do not do any of the following transformations if the printf return value
- // is used, in general the printf return value is not compatible with either
- // putchar() or puts().
- if (!CI->use_empty())
- return 0;
-
- // printf("x") -> putchar('x'), even for '%'.
- if (FormatStr.size() == 1) {
- Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD, TLI);
- if (CI->use_empty() || !Res) return Res;
- return B.CreateIntCast(Res, CI->getType(), true);
- }
-
- // printf("foo\n") --> puts("foo")
- if (FormatStr[FormatStr.size()-1] == '\n' &&
- FormatStr.find('%') == std::string::npos) { // no format characters.
- // Create a string literal with no \n on it. We expect the constant merge
- // pass to be run after this pass, to merge duplicate strings.
- FormatStr = FormatStr.drop_back();
- Value *GV = B.CreateGlobalString(FormatStr, "str");
- Value *NewCI = EmitPutS(GV, B, TD, TLI);
- return (CI->use_empty() || !NewCI) ?
- NewCI :
- ConstantInt::get(CI->getType(), FormatStr.size()+1);
- }
-
- // Optimize specific format strings.
- // printf("%c", chr) --> putchar(chr)
- if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
- CI->getArgOperand(1)->getType()->isIntegerTy()) {
- Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD, TLI);
-
- if (CI->use_empty() || !Res) return Res;
- return B.CreateIntCast(Res, CI->getType(), true);
- }
-
- // printf("%s\n", str) --> puts(str)
- if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
- CI->getArgOperand(1)->getType()->isPointerTy()) {
- return EmitPutS(CI->getArgOperand(1), B, TD, TLI);
- }
- return 0;
- }
-
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Require one fixed pointer argument and an integer/void result.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
- !(FT->getReturnType()->isIntegerTy() ||
- FT->getReturnType()->isVoidTy()))
- return 0;
-
- if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) {
- return V;
- }
-
- // printf(format, ...) -> iprintf(format, ...) if no floating point
- // arguments.
- if (TLI->has(LibFunc::iprintf) && !CallHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- Constant *IPrintFFn =
- M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(IPrintFFn);
- B.Insert(New);
- return New;
- }
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'sprintf' Optimizations
-
-struct SPrintFOpt : public LibCallOptimization {
- Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI,
- IRBuilder<> &B) {
- // Check for a fixed format string.
- StringRef FormatStr;
- if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
- return 0;
-
- // If we just have a format string (nothing else crazy) transform it.
- if (CI->getNumArgOperands() == 2) {
- // Make sure there's no % in the constant array. We could try to handle
- // %% -> % in the future if we cared.
- for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
- if (FormatStr[i] == '%')
- return 0; // we found a format specifier, bail out.
-
- // These optimizations require DataLayout.
- if (!TD) return 0;
-
- // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- ConstantInt::get(TD->getIntPtrType(*Context), // Copy the
- FormatStr.size() + 1), 1); // nul byte.
- return ConstantInt::get(CI->getType(), FormatStr.size());
- }
-
- // The remaining optimizations require the format string to be "%s" or "%c"
- // and have an extra operand.
- if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
- CI->getNumArgOperands() < 3)
- return 0;
-
- // Decode the second character of the format string.
- if (FormatStr[1] == 'c') {
- // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
- if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
- Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
- Value *Ptr = CastToCStr(CI->getArgOperand(0), B);
- B.CreateStore(V, Ptr);
- Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul");
- B.CreateStore(B.getInt8(0), Ptr);
-
- return ConstantInt::get(CI->getType(), 1);
- }
-
- if (FormatStr[1] == 's') {
- // These optimizations require DataLayout.
- if (!TD) return 0;
-
- // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
- if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0;
-
- Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI);
- if (!Len)
- return 0;
- Value *IncLen = B.CreateAdd(Len,
- ConstantInt::get(Len->getType(), 1),
- "leninc");
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1);
-
- // The sprintf result is the unincremented number of bytes in the string.
- return B.CreateIntCast(Len, CI->getType(), false);
- }
- return 0;
- }
-
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Require two fixed pointer arguments and an integer result.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isIntegerTy())
- return 0;
-
- if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) {
- return V;
- }
-
- // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
- // point arguments.
- if (TLI->has(LibFunc::siprintf) && !CallHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- Constant *SIPrintFFn =
- M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(SIPrintFFn);
- B.Insert(New);
- return New;
- }
- return 0;
- }
-};
-
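The same rewrites sketched at the source level (illustrative names only; the "%s" case needs a computable string length):

#include <cstdio>
#include <cstring>

// The sprintf forms handled above.
void before(char *dst, int c, const char *s) {
  sprintf(dst, "%c", c);   // becomes a char store plus a NUL store
  sprintf(dst, "%s", s);   // becomes a memcpy of strlen(s)+1 bytes
}

// The straight-line code the optimizer emits instead.
void after(char *dst, int c, const char *s) {
  dst[0] = (char)c;
  dst[1] = '\0';
  memcpy(dst, s, strlen(s) + 1);
}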
-//===---------------------------------------===//
-// 'fwrite' Optimizations
-
-struct FWriteOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Require a pointer, an integer, an integer, a pointer, returning integer.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isIntegerTy() ||
- !FT->getParamType(2)->isIntegerTy() ||
- !FT->getParamType(3)->isPointerTy() ||
- !FT->getReturnType()->isIntegerTy())
- return 0;
-
- // Get the element size and count.
- ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
- if (!SizeC || !CountC) return 0;
- uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue();
-
- // If this is writing zero records, remove the call (it's a noop).
- if (Bytes == 0)
- return ConstantInt::get(CI->getType(), 0);
-
- // If this is writing one byte, turn it into fputc.
- // This optimization is only valid if the return value is unused.
- if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
- Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
- Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI);
- return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
- }
-
- return 0;
- }
-};
-
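The one-byte case above, sketched on ordinary calls (illustrative names; the rewrite requires an unused result):

#include <cstdio>

void before(const char *s, FILE *f) { fwrite(s, 1, 1, f); }
void after (const char *s, FILE *f) { fputc(s[0], f); }   // fwrite(S,1,1,F) -> fputc(S[0],F)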
-//===---------------------------------------===//
-// 'fputs' Optimizations
-
-struct FPutsOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // These optimizations require DataLayout.
- if (!TD) return 0;
-
- // Require two pointers. Also, we can't optimize if the return value is used.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !CI->use_empty())
- return 0;
-
- // fputs(s,F) --> fwrite(s,1,strlen(s),F)
- uint64_t Len = GetStringLength(CI->getArgOperand(0));
- if (!Len) return 0;
- // Known to have no uses (see above).
- return EmitFWrite(CI->getArgOperand(0),
- ConstantInt::get(TD->getIntPtrType(*Context), Len-1),
- CI->getArgOperand(1), B, TD, TLI);
- }
-};
-
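The fputs rewrite, sketched with a constant string so the length is known at compile time (illustrative names):

#include <cstdio>
#include <cstring>

void before(FILE *f) { fputs("hello", f); }
void after (FILE *f) { fwrite("hello", 1, strlen("hello"), f); }   // fputs(s,F) -> fwrite(s,1,strlen(s),F)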
-//===---------------------------------------===//
-// 'fprintf' Optimizations
-
-struct FPrintFOpt : public LibCallOptimization {
- Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI,
- IRBuilder<> &B) {
- // All the optimizations depend on the format string.
- StringRef FormatStr;
- if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
- return 0;
-
- // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
- if (CI->getNumArgOperands() == 2) {
- for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
- if (FormatStr[i] == '%') // Could handle %% -> % if we cared.
- return 0; // We found a format specifier.
-
- // These optimizations require DataLayout.
- if (!TD) return 0;
-
- Value *NewCI = EmitFWrite(CI->getArgOperand(1),
- ConstantInt::get(TD->getIntPtrType(*Context),
- FormatStr.size()),
- CI->getArgOperand(0), B, TD, TLI);
- return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0;
- }
-
- // The remaining optimizations require the format string to be "%s" or "%c"
- // and have an extra operand.
- if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
- CI->getNumArgOperands() < 3)
- return 0;
-
- // Decode the second character of the format string.
- if (FormatStr[1] == 'c') {
- // fprintf(F, "%c", chr) --> fputc(chr, F)
- if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
- Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B,
- TD, TLI);
- return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
- }
-
- if (FormatStr[1] == 's') {
- // fprintf(F, "%s", str) --> fputs(str, F)
- if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty())
- return 0;
- return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI);
- }
- return 0;
- }
-
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Require two fixed pointer parameters and an integer result.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isIntegerTy())
- return 0;
-
- if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) {
- return V;
- }
-
- // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
- // floating point arguments.
- if (TLI->has(LibFunc::fiprintf) && !CallHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- Constant *FIPrintFFn =
- M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(FIPrintFFn);
- B.Insert(New);
- return New;
- }
- return 0;
- }
-};
-
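The three fprintf forms recognized above, sketched on ordinary calls (illustrative names; the "%s" case additionally requires an unused result):

#include <cstdio>

void before(FILE *f, int c, const char *s) {
  fprintf(f, "foo");    // no '%' in the format
  fprintf(f, "%c", c);
  fprintf(f, "%s", s);
}

void after(FILE *f, int c, const char *s) {
  fwrite("foo", 3, 1, f);   // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
  fputc(c, f);              // fprintf(F, "%c", chr) --> fputc(chr, F)
  fputs(s, f);              // fprintf(F, "%s", str) --> fputs(str, F)
}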
-//===---------------------------------------===//
-// 'puts' Optimizations
-
-struct PutsOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Require one fixed pointer argument and an integer/void result.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
- !(FT->getReturnType()->isIntegerTy() ||
- FT->getReturnType()->isVoidTy()))
- return 0;
-
- // Check for a constant string.
- StringRef Str;
- if (!getConstantStringInfo(CI->getArgOperand(0), Str))
- return 0;
-
- if (Str.empty() && CI->use_empty()) {
- // puts("") -> putchar('\n')
- Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI);
- if (CI->use_empty() || !Res) return Res;
- return B.CreateIntCast(Res, CI->getType(), true);
- }
-
- return 0;
- }
-};
-
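The puts("") case, sketched (illustrative names; valid only when the result is unused):

#include <cstdio>

void before() { puts(""); }        // prints just the implicit newline
void after () { putchar('\n'); }   // puts("") -> putchar('\n')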
-} // end anonymous namespace.
-
-//===----------------------------------------------------------------------===//
// SimplifyLibCalls Pass Implementation
//===----------------------------------------------------------------------===//
@@ -768,22 +89,9 @@ namespace {
TargetLibraryInfo *TLI;
StringMap<LibCallOptimization*> Optimizations;
- // Math Library Optimizations
- CosOpt Cos; PowOpt Pow; Exp2Opt Exp2;
- UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP;
- // Integer Optimizations
- FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii;
- ToAsciiOpt ToAscii;
- // Formatting and IO Optimizations
- SPrintFOpt SPrintF; PrintFOpt PrintF;
- FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF;
- PutsOpt Puts;
-
- bool Modified; // This is only used by doInitialization.
public:
static char ID; // Pass identification
- SimplifyLibCalls() : FunctionPass(ID), UnaryDoubleFP(false),
- UnsafeUnaryDoubleFP(true) {
+ SimplifyLibCalls() : FunctionPass(ID) {
initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
}
void AddOpt(LibFunc::Func F, LibCallOptimization* Opt);
@@ -792,14 +100,6 @@ namespace {
void InitOptimizations();
bool runOnFunction(Function &F);
- void setDoesNotAccessMemory(Function &F);
- void setOnlyReadsMemory(Function &F);
- void setDoesNotThrow(Function &F);
- void setDoesNotCapture(Function &F, unsigned n);
- void setDoesNotAlias(Function &F, unsigned n);
- bool doInitialization(Module &M);
-
- void inferPrototypeAttributes(Function &F);
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetLibraryInfo>();
}
@@ -833,77 +133,6 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
/// Optimizations - Populate the Optimizations map with all the optimizations
/// we know.
void SimplifyLibCalls::InitOptimizations() {
- // Math Library Optimizations
- Optimizations["cosf"] = &Cos;
- Optimizations["cos"] = &Cos;
- Optimizations["cosl"] = &Cos;
- Optimizations["powf"] = &Pow;
- Optimizations["pow"] = &Pow;
- Optimizations["powl"] = &Pow;
- Optimizations["llvm.pow.f32"] = &Pow;
- Optimizations["llvm.pow.f64"] = &Pow;
- Optimizations["llvm.pow.f80"] = &Pow;
- Optimizations["llvm.pow.f128"] = &Pow;
- Optimizations["llvm.pow.ppcf128"] = &Pow;
- Optimizations["exp2l"] = &Exp2;
- Optimizations["exp2"] = &Exp2;
- Optimizations["exp2f"] = &Exp2;
- Optimizations["llvm.exp2.ppcf128"] = &Exp2;
- Optimizations["llvm.exp2.f128"] = &Exp2;
- Optimizations["llvm.exp2.f80"] = &Exp2;
- Optimizations["llvm.exp2.f64"] = &Exp2;
- Optimizations["llvm.exp2.f32"] = &Exp2;
-
- AddOpt(LibFunc::ceil, LibFunc::ceilf, &UnaryDoubleFP);
- AddOpt(LibFunc::fabs, LibFunc::fabsf, &UnaryDoubleFP);
- AddOpt(LibFunc::floor, LibFunc::floorf, &UnaryDoubleFP);
- AddOpt(LibFunc::rint, LibFunc::rintf, &UnaryDoubleFP);
- AddOpt(LibFunc::round, LibFunc::roundf, &UnaryDoubleFP);
- AddOpt(LibFunc::nearbyint, LibFunc::nearbyintf, &UnaryDoubleFP);
- AddOpt(LibFunc::trunc, LibFunc::truncf, &UnaryDoubleFP);
-
- if(UnsafeFPShrink) {
- AddOpt(LibFunc::acos, LibFunc::acosf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::acosh, LibFunc::acoshf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::asin, LibFunc::asinf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::asinh, LibFunc::asinhf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::atan, LibFunc::atanf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::atanh, LibFunc::atanhf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::cbrt, LibFunc::cbrtf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::cosh, LibFunc::coshf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::exp, LibFunc::expf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::exp10, LibFunc::exp10f, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::expm1, LibFunc::expm1f, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::log, LibFunc::logf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::log10, LibFunc::log10f, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::log1p, LibFunc::log1pf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::log2, LibFunc::log2f, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::logb, LibFunc::logbf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::sin, LibFunc::sinf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::sinh, LibFunc::sinhf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::sqrt, LibFunc::sqrtf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::tan, LibFunc::tanf, &UnsafeUnaryDoubleFP);
- AddOpt(LibFunc::tanh, LibFunc::tanhf, &UnsafeUnaryDoubleFP);
- }
-
- // Integer Optimizations
- Optimizations["ffs"] = &FFS;
- Optimizations["ffsl"] = &FFS;
- Optimizations["ffsll"] = &FFS;
- Optimizations["abs"] = &Abs;
- Optimizations["labs"] = &Abs;
- Optimizations["llabs"] = &Abs;
- Optimizations["isdigit"] = &IsDigit;
- Optimizations["isascii"] = &IsAscii;
- Optimizations["toascii"] = &ToAscii;
-
- // Formatting and IO Optimizations
- Optimizations["sprintf"] = &SPrintF;
- Optimizations["printf"] = &PrintF;
- AddOpt(LibFunc::fwrite, &FWrite);
- AddOpt(LibFunc::fputs, &FPuts);
- Optimizations["fprintf"] = &FPrintF;
- Optimizations["puts"] = &Puts;
}
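The removed table is plain name-to-handler dispatch; a simplified stand-in using the standard library instead of LLVM's StringMap (types and names here are illustrative only):

#include <string>
#include <unordered_map>

struct LibCallOptimization;   // handler base class, as in the pass

// Several library names can share one handler, exactly as "cos", "cosf"
// and "cosl" all mapped to &Cos above.
using OptTable = std::unordered_map<std::string, LibCallOptimization *>;

void initOptimizations(OptTable &Opts,
                       LibCallOptimization *Cos, LibCallOptimization *Pow) {
  Opts["cos"]  = Cos;
  Opts["cosf"] = Cos;
  Opts["cosl"] = Cos;
  Opts["pow"]  = Pow;
  Opts["powf"] = Pow;
  Opts["powl"] = Pow;
}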
@@ -924,7 +153,7 @@ bool SimplifyLibCalls::runOnFunction(Function &F) {
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
// Ignore non-calls.
CallInst *CI = dyn_cast<CallInst>(I++);
- if (!CI) continue;
+ if (!CI || CI->hasFnAttr(Attribute::NoBuiltin)) continue;
// Ignore indirect calls and calls to non-external functions.
Function *Callee = CI->getCalledFunction();
@@ -951,7 +180,6 @@ bool SimplifyLibCalls::runOnFunction(Function &F) {
// Something changed!
Changed = true;
- ++NumSimplified;
// Inspect the instruction after the call (which was potentially just
// added) next.
@@ -968,697 +196,6 @@ bool SimplifyLibCalls::runOnFunction(Function &F) {
return Changed;
}
-// Utility methods for doInitialization.
-
-void SimplifyLibCalls::setDoesNotAccessMemory(Function &F) {
- if (!F.doesNotAccessMemory()) {
- F.setDoesNotAccessMemory();
- ++NumAnnotated;
- Modified = true;
- }
-}
-void SimplifyLibCalls::setOnlyReadsMemory(Function &F) {
- if (!F.onlyReadsMemory()) {
- F.setOnlyReadsMemory();
- ++NumAnnotated;
- Modified = true;
- }
-}
-void SimplifyLibCalls::setDoesNotThrow(Function &F) {
- if (!F.doesNotThrow()) {
- F.setDoesNotThrow();
- ++NumAnnotated;
- Modified = true;
- }
-}
-void SimplifyLibCalls::setDoesNotCapture(Function &F, unsigned n) {
- if (!F.doesNotCapture(n)) {
- F.setDoesNotCapture(n);
- ++NumAnnotated;
- Modified = true;
- }
-}
-void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) {
- if (!F.doesNotAlias(n)) {
- F.setDoesNotAlias(n);
- ++NumAnnotated;
- Modified = true;
- }
-}
-
-
-void SimplifyLibCalls::inferPrototypeAttributes(Function &F) {
- FunctionType *FTy = F.getFunctionType();
-
- StringRef Name = F.getName();
- switch (Name[0]) {
- case 's':
- if (Name == "strlen") {
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return;
- setOnlyReadsMemory(F);
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "strchr" ||
- Name == "strrchr") {
- if (FTy->getNumParams() != 2 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isIntegerTy())
- return;
- setOnlyReadsMemory(F);
- setDoesNotThrow(F);
- } else if (Name == "strcpy" ||
- Name == "stpcpy" ||
- Name == "strcat" ||
- Name == "strtol" ||
- Name == "strtod" ||
- Name == "strtof" ||
- Name == "strtoul" ||
- Name == "strtoll" ||
- Name == "strtold" ||
- Name == "strncat" ||
- Name == "strncpy" ||
- Name == "stpncpy" ||
- Name == "strtoull") {
- if (FTy->getNumParams() < 2 ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- } else if (Name == "strxfrm") {
- if (FTy->getNumParams() != 3 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "strcmp" ||
- Name == "strspn" ||
- Name == "strncmp" ||
- Name == "strcspn" ||
- Name == "strcoll" ||
- Name == "strcasecmp" ||
- Name == "strncasecmp") {
- if (FTy->getNumParams() < 2 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setOnlyReadsMemory(F);
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "strstr" ||
- Name == "strpbrk") {
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return;
- setOnlyReadsMemory(F);
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- } else if (Name == "strtok" ||
- Name == "strtok_r") {
- if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- } else if (Name == "scanf" ||
- Name == "setbuf" ||
- Name == "setvbuf") {
- if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "strdup" ||
- Name == "strndup") {
- if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- setDoesNotCapture(F, 1);
- } else if (Name == "stat" ||
- Name == "sscanf" ||
- Name == "sprintf" ||
- Name == "statvfs") {
- if (FTy->getNumParams() < 2 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "snprintf") {
- if (FTy->getNumParams() != 3 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 3);
- } else if (Name == "setitimer") {
- if (FTy->getNumParams() != 3 ||
- !FTy->getParamType(1)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- setDoesNotCapture(F, 3);
- } else if (Name == "system") {
- if (FTy->getNumParams() != 1 ||
- !FTy->getParamType(0)->isPointerTy())
- return;
- // May throw; "system" is a valid pthread cancellation point.
- setDoesNotCapture(F, 1);
- }
- break;
- case 'm':
- if (Name == "malloc") {
- if (FTy->getNumParams() != 1 ||
- !FTy->getReturnType()->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- } else if (Name == "memcmp") {
- if (FTy->getNumParams() != 3 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setOnlyReadsMemory(F);
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "memchr" ||
- Name == "memrchr") {
- if (FTy->getNumParams() != 3)
- return;
- setOnlyReadsMemory(F);
- setDoesNotThrow(F);
- } else if (Name == "modf" ||
- Name == "modff" ||
- Name == "modfl" ||
- Name == "memcpy" ||
- Name == "memccpy" ||
- Name == "memmove") {
- if (FTy->getNumParams() < 2 ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- } else if (Name == "memalign") {
- if (!FTy->getReturnType()->isPointerTy())
- return;
- setDoesNotAlias(F, 0);
- } else if (Name == "mkdir" ||
- Name == "mktime") {
- if (FTy->getNumParams() == 0 ||
- !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- }
- break;
- case 'r':
- if (Name == "realloc") {
- if (FTy->getNumParams() != 2 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getReturnType()->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- setDoesNotCapture(F, 1);
- } else if (Name == "read") {
- if (FTy->getNumParams() != 3 ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- // May throw; "read" is a valid pthread cancellation point.
- setDoesNotCapture(F, 2);
- } else if (Name == "rmdir" ||
- Name == "rewind" ||
- Name == "remove" ||
- Name == "realpath") {
- if (FTy->getNumParams() < 1 ||
- !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "rename" ||
- Name == "readlink") {
- if (FTy->getNumParams() < 2 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- }
- break;
- case 'w':
- if (Name == "write") {
- if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
- return;
- // May throw; "write" is a valid pthread cancellation point.
- setDoesNotCapture(F, 2);
- }
- break;
- case 'b':
- if (Name == "bcopy") {
- if (FTy->getNumParams() != 3 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "bcmp") {
- if (FTy->getNumParams() != 3 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setOnlyReadsMemory(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "bzero") {
- if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- }
- break;
- case 'c':
- if (Name == "calloc") {
- if (FTy->getNumParams() != 2 ||
- !FTy->getReturnType()->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- } else if (Name == "chmod" ||
- Name == "chown" ||
- Name == "ctermid" ||
- Name == "clearerr" ||
- Name == "closedir") {
- if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- }
- break;
- case 'a':
- if (Name == "atoi" ||
- Name == "atol" ||
- Name == "atof" ||
- Name == "atoll") {
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setOnlyReadsMemory(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "access") {
- if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- }
- break;
- case 'f':
- if (Name == "fopen") {
- if (FTy->getNumParams() != 2 ||
- !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "fdopen") {
- if (FTy->getNumParams() != 2 ||
- !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- setDoesNotCapture(F, 2);
- } else if (Name == "feof" ||
- Name == "free" ||
- Name == "fseek" ||
- Name == "ftell" ||
- Name == "fgetc" ||
- Name == "fseeko" ||
- Name == "ftello" ||
- Name == "fileno" ||
- Name == "fflush" ||
- Name == "fclose" ||
- Name == "fsetpos" ||
- Name == "flockfile" ||
- Name == "funlockfile" ||
- Name == "ftrylockfile") {
- if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "ferror") {
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setOnlyReadsMemory(F);
- } else if (Name == "fputc" ||
- Name == "fstat" ||
- Name == "frexp" ||
- Name == "frexpf" ||
- Name == "frexpl" ||
- Name == "fstatvfs") {
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- } else if (Name == "fgets") {
- if (FTy->getNumParams() != 3 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 3);
- } else if (Name == "fread" ||
- Name == "fwrite") {
- if (FTy->getNumParams() != 4 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(3)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 4);
- } else if (Name == "fputs" ||
- Name == "fscanf" ||
- Name == "fprintf" ||
- Name == "fgetpos") {
- if (FTy->getNumParams() < 2 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- }
- break;
- case 'g':
- if (Name == "getc" ||
- Name == "getlogin_r" ||
- Name == "getc_unlocked") {
- if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "getenv") {
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setOnlyReadsMemory(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "gets" ||
- Name == "getchar") {
- setDoesNotThrow(F);
- } else if (Name == "getitimer") {
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- } else if (Name == "getpwnam") {
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- }
- break;
- case 'u':
- if (Name == "ungetc") {
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- } else if (Name == "uname" ||
- Name == "unlink" ||
- Name == "unsetenv") {
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "utime" ||
- Name == "utimes") {
- if (FTy->getNumParams() != 2 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- }
- break;
- case 'p':
- if (Name == "putc") {
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- } else if (Name == "puts" ||
- Name == "printf" ||
- Name == "perror") {
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "pread" ||
- Name == "pwrite") {
- if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy())
- return;
- // May throw; these are valid pthread cancellation points.
- setDoesNotCapture(F, 2);
- } else if (Name == "putchar") {
- setDoesNotThrow(F);
- } else if (Name == "popen") {
- if (FTy->getNumParams() != 2 ||
- !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "pclose") {
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- }
- break;
- case 'v':
- if (Name == "vscanf") {
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "vsscanf" ||
- Name == "vfscanf") {
- if (FTy->getNumParams() != 3 ||
- !FTy->getParamType(1)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "valloc") {
- if (!FTy->getReturnType()->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- } else if (Name == "vprintf") {
- if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "vfprintf" ||
- Name == "vsprintf") {
- if (FTy->getNumParams() != 3 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "vsnprintf") {
- if (FTy->getNumParams() != 4 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(2)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 3);
- }
- break;
- case 'o':
- if (Name == "open") {
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
- return;
- // May throw; "open" is a valid pthread cancellation point.
- setDoesNotCapture(F, 1);
- } else if (Name == "opendir") {
- if (FTy->getNumParams() != 1 ||
- !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- setDoesNotCapture(F, 1);
- }
- break;
- case 't':
- if (Name == "tmpfile") {
- if (!FTy->getReturnType()->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- } else if (Name == "times") {
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- }
- break;
- case 'h':
- if (Name == "htonl" ||
- Name == "htons") {
- setDoesNotThrow(F);
- setDoesNotAccessMemory(F);
- }
- break;
- case 'n':
- if (Name == "ntohl" ||
- Name == "ntohs") {
- setDoesNotThrow(F);
- setDoesNotAccessMemory(F);
- }
- break;
- case 'l':
- if (Name == "lstat") {
- if (FTy->getNumParams() != 2 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "lchown") {
- if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- }
- break;
- case 'q':
- if (Name == "qsort") {
- if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy())
- return;
- // May throw; places call through function pointer.
- setDoesNotCapture(F, 4);
- }
- break;
- case '_':
- if (Name == "__strdup" ||
- Name == "__strndup") {
- if (FTy->getNumParams() < 1 ||
- !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- setDoesNotCapture(F, 1);
- } else if (Name == "__strtok_r") {
- if (FTy->getNumParams() != 3 ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- } else if (Name == "_IO_getc") {
- if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "_IO_putc") {
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- }
- break;
- case 1:
- if (Name == "\1__isoc99_scanf") {
- if (FTy->getNumParams() < 1 ||
- !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "\1stat64" ||
- Name == "\1lstat64" ||
- Name == "\1statvfs64" ||
- Name == "\1__isoc99_sscanf") {
- if (FTy->getNumParams() < 1 ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "\1fopen64") {
- if (FTy->getNumParams() != 2 ||
- !FTy->getReturnType()->isPointerTy() ||
- !FTy->getParamType(0)->isPointerTy() ||
- !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- setDoesNotCapture(F, 1);
- setDoesNotCapture(F, 2);
- } else if (Name == "\1fseeko64" ||
- Name == "\1ftello64") {
- if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 1);
- } else if (Name == "\1tmpfile64") {
- if (!FTy->getReturnType()->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotAlias(F, 0);
- } else if (Name == "\1fstat64" ||
- Name == "\1fstatvfs64") {
- if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
- return;
- setDoesNotThrow(F);
- setDoesNotCapture(F, 2);
- } else if (Name == "\1open64") {
- if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
- return;
- // May throw; "open" is a valid pthread cancellation point.
- setDoesNotCapture(F, 1);
- }
- break;
- }
-}
-
-/// doInitialization - Add attributes to well-known functions.
-///
-bool SimplifyLibCalls::doInitialization(Module &M) {
- Modified = false;
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
- Function &F = *I;
- if (F.isDeclaration() && F.hasName())
- inferPrototypeAttributes(F);
- }
- return Modified;
-}
-
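A condensed sketch of what the removed inference did, using a standalone bitmask in place of LLVM's attribute machinery (all names here are illustrative, not LLVM API):

#include <cstdint>
#include <string>

enum : uint32_t {
  NoUnwind   = 1u << 0,   // setDoesNotThrow
  ReadOnly   = 1u << 1,   // setOnlyReadsMemory
  NoCapture1 = 1u << 2,   // setDoesNotCapture(F, 1)
  NoAliasRet = 1u << 3,   // setDoesNotAlias(F, 0)
};

// Mirrors two of the cases above: attributes are inferred purely from the
// declaration's name and shape, as inferPrototypeAttributes did.
uint32_t inferPrototypeAttrs(const std::string &Name, unsigned NumParams,
                             bool FirstParamIsPointer) {
  if (Name == "strlen" && NumParams == 1 && FirstParamIsPointer)
    return ReadOnly | NoUnwind | NoCapture1;
  if (Name == "malloc" && NumParams == 1)
    return NoUnwind | NoAliasRet;
  return 0;
}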
// TODO:
// Additional cases that we need to add to this file:
//
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
index 34f1d6c..d4595bb 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -14,13 +14,13 @@
#define DEBUG_TYPE "sink"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Assembly/Writer.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 6557d63..2002e68 100644
--- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -52,25 +52,26 @@
#define DEBUG_TYPE "tailcallelim"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Support/CallSite.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CFG.h"
+#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
STATISTIC(NumEliminated, "Number of tail calls removed");
@@ -79,11 +80,15 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
namespace {
struct TailCallElim : public FunctionPass {
+ const TargetTransformInfo *TTI;
+
static char ID; // Pass identification, replacement for typeid
TailCallElim() : FunctionPass(ID) {
initializeTailCallElimPass(*PassRegistry::getPassRegistry());
}
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
virtual bool runOnFunction(Function &F);
private:
@@ -109,14 +114,21 @@ namespace {
}
char TailCallElim::ID = 0;
-INITIALIZE_PASS(TailCallElim, "tailcallelim",
- "Tail Call Elimination", false, false)
+INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim",
+ "Tail Call Elimination", false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_END(TailCallElim, "tailcallelim",
+ "Tail Call Elimination", false, false)
// Public interface to the TailCallElimination pass
FunctionPass *llvm::createTailCallEliminationPass() {
return new TailCallElim();
}
+void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfo>();
+}
+
/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by
/// callees of this function. We only do very simple analysis right now; this
/// could be expanded in the future to use mod/ref information for particular
@@ -151,6 +163,7 @@ bool TailCallElim::runOnFunction(Function &F) {
// right, so don't even try to convert it...
if (F.getFunctionType()->isVarArg()) return false;
+ TTI = &getAnalysis<TargetTransformInfo>();
BasicBlock *OldEntry = 0;
bool TailCallsAreMarkedTail = false;
SmallVector<PHINode*, 8> ArgumentPHIs;
@@ -391,7 +404,8 @@ TailCallElim::FindTRECandidate(Instruction *TI,
if (BB == &F->getEntryBlock() &&
FirstNonDbg(BB->front()) == CI &&
FirstNonDbg(llvm::next(BB->begin())) == TI &&
- callIsSmall(CI)) {
+ CI->getCalledFunction() &&
+ !TTI->isLoweredToCall(CI->getCalledFunction())) {
// A single-block function with just a call and a return. Check that
// the arguments match.
CallSite::arg_iterator I = CallSite(CI).arg_begin(),
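The transformation itself, sketched at the C++ source level (the pass operates on IR; this only shows the shape of the rewrite, including the accumulator counted by NumAccumAdded):

// Recursive form: the multiply keeps this from being a plain tail call.
long fact(long n) {
  if (n <= 1) return 1;
  return n * fact(n - 1);
}

// What tailcallelim effectively produces: an accumulator plus a loop.
long fact_loop(long n) {
  long acc = 1;                      // the introduced accumulator
  for (; n > 1; --n) acc *= n;
  return acc;
}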
diff --git a/contrib/llvm/lib/Transforms/Utils/AddrModeMatcher.cpp b/contrib/llvm/lib/Transforms/Utils/AddrModeMatcher.cpp
deleted file mode 100644
index 6815e41..0000000
--- a/contrib/llvm/lib/Transforms/Utils/AddrModeMatcher.cpp
+++ /dev/null
@@ -1,577 +0,0 @@
-//===- AddrModeMatcher.cpp - Addressing mode matching facility --*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements target addressing mode matcher class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/AddrModeMatcher.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/GlobalValue.h"
-#include "llvm/Instruction.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/PatternMatch.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/CallSite.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-void ExtAddrMode::print(raw_ostream &OS) const {
- bool NeedPlus = false;
- OS << "[";
- if (BaseGV) {
- OS << (NeedPlus ? " + " : "")
- << "GV:";
- WriteAsOperand(OS, BaseGV, /*PrintType=*/false);
- NeedPlus = true;
- }
-
- if (BaseOffs)
- OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true;
-
- if (BaseReg) {
- OS << (NeedPlus ? " + " : "")
- << "Base:";
- WriteAsOperand(OS, BaseReg, /*PrintType=*/false);
- NeedPlus = true;
- }
- if (Scale) {
- OS << (NeedPlus ? " + " : "")
- << Scale << "*";
- WriteAsOperand(OS, ScaledReg, /*PrintType=*/false);
- NeedPlus = true;
- }
-
- OS << ']';
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void ExtAddrMode::dump() const {
- print(dbgs());
- dbgs() << '\n';
-}
-#endif
-
-
-/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode.
-/// Return true and update AddrMode if this addr mode is legal for the target,
-/// false if not.
-bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
- unsigned Depth) {
- // If Scale is 1, then this is the same as adding ScaleReg to the addressing
- // mode. Just process that directly.
- if (Scale == 1)
- return MatchAddr(ScaleReg, Depth);
-
- // If the scale is 0, it takes nothing to add this.
- if (Scale == 0)
- return true;
-
- // If we already have a scale of this value, we can add to it, otherwise, we
- // need an available scale field.
- if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
- return false;
-
- ExtAddrMode TestAddrMode = AddrMode;
-
- // Add scale to turn X*4+X*3 -> X*7. This could also do things like
- // [A+B + A*7] -> [B+A*8].
- TestAddrMode.Scale += Scale;
- TestAddrMode.ScaledReg = ScaleReg;
-
- // If the new address isn't legal, bail out.
- if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy))
- return false;
-
- // It was legal, so commit it.
- AddrMode = TestAddrMode;
-
- // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now
- // to see if ScaleReg is actually X+C. If so, we can turn this into adding
- // X*Scale + C*Scale to addr mode.
- ConstantInt *CI = 0; Value *AddLHS = 0;
- if (isa<Instruction>(ScaleReg) && // not a constant expr.
- match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
- TestAddrMode.ScaledReg = AddLHS;
- TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
-
- // If this addressing mode is legal, commit it and remember that we folded
- // this instruction.
- if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) {
- AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
- AddrMode = TestAddrMode;
- return true;
- }
- }
-
- // Otherwise, not (x+c)*scale, just return what we have.
- return true;
-}
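The mode being built up is essentially BaseGV + BaseReg + Scale*ScaledReg + BaseOffs; a small stand-alone sketch of the scale-merging step above, with a made-up target legality rule standing in for TLI.isLegalAddressingMode:

#include <cstdint>

// Reduced stand-in for ExtAddrMode.
struct AddrMode {
  const void *ScaledReg = nullptr;
  int64_t Scale = 0;
};

// Hypothetical target rule: any scale from 1 to 8 is representable.
bool isLegalScale(int64_t S) { return S >= 1 && S <= 8; }

// Mirrors the core of MatchScaledValue: a new Scale*Reg term can be merged
// only if Reg is (or becomes) the single scaled register and the summed
// scale stays legal, e.g. folding X*4 where X*3 is present yields X*7.
bool matchScaledValue(AddrMode &AM, const void *Reg, int64_t Scale) {
  if (Scale == 0) return true;                     // adds nothing
  if (AM.Scale != 0 && AM.ScaledReg != Reg) return false;
  AddrMode Test = AM;
  Test.Scale += Scale;
  Test.ScaledReg = Reg;
  if (!isLegalScale(Test.Scale)) return false;
  AM = Test;                                       // commit
  return true;
}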
-
-/// MightBeFoldableInst - This is a little filter, which returns true if an
-/// addressing computation involving I might be folded into a load/store
-/// accessing it. This doesn't need to be perfect, but needs to accept at least
-/// the set of instructions that MatchOperationAddr can.
-static bool MightBeFoldableInst(Instruction *I) {
- switch (I->getOpcode()) {
- case Instruction::BitCast:
- // Don't touch identity bitcasts.
- if (I->getType() == I->getOperand(0)->getType())
- return false;
- return I->getType()->isPointerTy() || I->getType()->isIntegerTy();
- case Instruction::PtrToInt:
- // PtrToInt is always a noop, as we know that the int type is pointer sized.
- return true;
- case Instruction::IntToPtr:
- // We know the input is intptr_t, so this is foldable.
- return true;
- case Instruction::Add:
- return true;
- case Instruction::Mul:
- case Instruction::Shl:
- // Can only handle X*C and X << C.
- return isa<ConstantInt>(I->getOperand(1));
- case Instruction::GetElementPtr:
- return true;
- default:
- return false;
- }
-}
-
-
-/// MatchOperationAddr - Given an instruction or constant expr, see if we can
-/// fold the operation into the addressing mode. If so, update the addressing
-/// mode and return true, otherwise return false without modifying AddrMode.
-bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
- unsigned Depth) {
- // Avoid exponential behavior on extremely deep expression trees.
- if (Depth >= 5) return false;
-
- switch (Opcode) {
- case Instruction::PtrToInt:
- // PtrToInt is always a noop, as we know that the int type is pointer sized.
- return MatchAddr(AddrInst->getOperand(0), Depth);
- case Instruction::IntToPtr:
- // This inttoptr is a no-op if the integer type is pointer sized.
- if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
- TLI.getPointerTy())
- return MatchAddr(AddrInst->getOperand(0), Depth);
- return false;
- case Instruction::BitCast:
- // BitCast is always a noop, and we can handle it as long as it is
- // int->int or pointer->pointer (we don't want int<->fp or something).
- if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
- AddrInst->getOperand(0)->getType()->isIntegerTy()) &&
- // Don't touch identity bitcasts. These were probably put here by LSR,
- // and we don't want to mess around with them. Assume it knows what it
- // is doing.
- AddrInst->getOperand(0)->getType() != AddrInst->getType())
- return MatchAddr(AddrInst->getOperand(0), Depth);
- return false;
- case Instruction::Add: {
- // Check to see if we can merge in the RHS then the LHS. If so, we win.
- ExtAddrMode BackupAddrMode = AddrMode;
- unsigned OldSize = AddrModeInsts.size();
- if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
- MatchAddr(AddrInst->getOperand(0), Depth+1))
- return true;
-
- // Restore the old addr mode info.
- AddrMode = BackupAddrMode;
- AddrModeInsts.resize(OldSize);
-
- // Otherwise this was over-aggressive. Try merging in the LHS then the RHS.
- if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
- MatchAddr(AddrInst->getOperand(1), Depth+1))
- return true;
-
- // Otherwise we definitely can't merge the ADD in.
- AddrMode = BackupAddrMode;
- AddrModeInsts.resize(OldSize);
- break;
- }
- //case Instruction::Or:
- // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
- //break;
- case Instruction::Mul:
- case Instruction::Shl: {
- // Can only handle X*C and X << C.
- ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
- if (!RHS) return false;
- int64_t Scale = RHS->getSExtValue();
- if (Opcode == Instruction::Shl)
- Scale = 1LL << Scale;
-
- return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
- }
- case Instruction::GetElementPtr: {
- // Scan the GEP. We check whether it contains constant offsets and at most
- // one variable offset.
- int VariableOperand = -1;
- unsigned VariableScale = 0;
-
- int64_t ConstantOffset = 0;
- const DataLayout *TD = TLI.getDataLayout();
- gep_type_iterator GTI = gep_type_begin(AddrInst);
- for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
- if (StructType *STy = dyn_cast<StructType>(*GTI)) {
- const StructLayout *SL = TD->getStructLayout(STy);
- unsigned Idx =
- cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
- ConstantOffset += SL->getElementOffset(Idx);
- } else {
- uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType());
- if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
- ConstantOffset += CI->getSExtValue()*TypeSize;
- } else if (TypeSize) { // Scales of zero don't do anything.
- // We only allow one variable index at the moment.
- if (VariableOperand != -1)
- return false;
-
- // Remember the variable index.
- VariableOperand = i;
- VariableScale = TypeSize;
- }
- }
- }
-
- // A common case is for the GEP to only do a constant offset. In this case,
- // just add it to the disp field and check validity.
- if (VariableOperand == -1) {
- AddrMode.BaseOffs += ConstantOffset;
- if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){
- // Check to see if we can fold the base pointer in too.
- if (MatchAddr(AddrInst->getOperand(0), Depth+1))
- return true;
- }
- AddrMode.BaseOffs -= ConstantOffset;
- return false;
- }
-
- // Save the valid addressing mode in case we can't match.
- ExtAddrMode BackupAddrMode = AddrMode;
- unsigned OldSize = AddrModeInsts.size();
-
- // See if the scale and offset amount is valid for this target.
- AddrMode.BaseOffs += ConstantOffset;
-
- // Match the base operand of the GEP.
- if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) {
- // If it couldn't be matched, just stuff the value in a register.
- if (AddrMode.HasBaseReg) {
- AddrMode = BackupAddrMode;
- AddrModeInsts.resize(OldSize);
- return false;
- }
- AddrMode.HasBaseReg = true;
- AddrMode.BaseReg = AddrInst->getOperand(0);
- }
-
- // Match the remaining variable portion of the GEP.
- if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
- Depth)) {
- // If it couldn't be matched, try stuffing the base into a register
- // instead of matching it, and retrying the match of the scale.
- AddrMode = BackupAddrMode;
- AddrModeInsts.resize(OldSize);
- if (AddrMode.HasBaseReg)
- return false;
- AddrMode.HasBaseReg = true;
- AddrMode.BaseReg = AddrInst->getOperand(0);
- AddrMode.BaseOffs += ConstantOffset;
- if (!MatchScaledValue(AddrInst->getOperand(VariableOperand),
- VariableScale, Depth)) {
- // If even that didn't work, bail.
- AddrMode = BackupAddrMode;
- AddrModeInsts.resize(OldSize);
- return false;
- }
- }
-
- return true;
- }
- }
- return false;
-}
-
-/// MatchAddr - If we can, try to add the value of 'Addr' into the current
-/// addressing mode. If Addr can't be added to AddrMode this returns false and
-/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type
-/// or intptr_t for the target.
-///
-bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
- // Fold in immediates if legal for the target.
- AddrMode.BaseOffs += CI->getSExtValue();
- if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
- return true;
- AddrMode.BaseOffs -= CI->getSExtValue();
- } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
- // If this is a global variable, try to fold it into the addressing mode.
- if (AddrMode.BaseGV == 0) {
- AddrMode.BaseGV = GV;
- if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
- return true;
- AddrMode.BaseGV = 0;
- }
- } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
- ExtAddrMode BackupAddrMode = AddrMode;
- unsigned OldSize = AddrModeInsts.size();
-
- // Check to see if it is possible to fold this operation.
- if (MatchOperationAddr(I, I->getOpcode(), Depth)) {
- // Okay, it's possible to fold this. Check to see if it is actually
- // *profitable* to do so. We use a simple cost model to avoid increasing
- // register pressure too much.
- if (I->hasOneUse() ||
- IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
- AddrModeInsts.push_back(I);
- return true;
- }
-
- // It isn't profitable to do this, roll back.
- //cerr << "NOT FOLDING: " << *I;
- AddrMode = BackupAddrMode;
- AddrModeInsts.resize(OldSize);
- }
- } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
- if (MatchOperationAddr(CE, CE->getOpcode(), Depth))
- return true;
- } else if (isa<ConstantPointerNull>(Addr)) {
- // Null pointer gets folded without affecting the addressing mode.
- return true;
- }
-
- // Worst case, the target should support [reg] addressing modes. :)
- if (!AddrMode.HasBaseReg) {
- AddrMode.HasBaseReg = true;
- AddrMode.BaseReg = Addr;
- // Still check for legality in case the target supports [imm] but not [i+r].
- if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
- return true;
- AddrMode.HasBaseReg = false;
- AddrMode.BaseReg = 0;
- }
-
- // If the base register is already taken, see if we can do [r+r].
- if (AddrMode.Scale == 0) {
- AddrMode.Scale = 1;
- AddrMode.ScaledReg = Addr;
- if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
- return true;
- AddrMode.Scale = 0;
- AddrMode.ScaledReg = 0;
- }
- // Couldn't match.
- return false;
-}
-
-
-/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified
-/// inline asm call are due to memory operands. If so, return true, otherwise
-/// return false.
-static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
- const TargetLowering &TLI) {
- TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI));
- for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
- TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
-
- // Compute the constraint code and ConstraintType to use.
- TLI.ComputeConstraintToUse(OpInfo, SDValue());
-
- // If this asm operand is our Value*, and if it isn't an indirect memory
- // operand, we can't fold it!
- if (OpInfo.CallOperandVal == OpVal &&
- (OpInfo.ConstraintType != TargetLowering::C_Memory ||
- !OpInfo.isIndirect))
- return false;
- }
-
- return true;
-}
-
-
-/// FindAllMemoryUses - Recursively walk all the uses of I until we find a
-/// memory use. If we find an obviously non-foldable instruction, return true.
-/// Add the ultimately found memory instructions to MemoryUses.
-static bool FindAllMemoryUses(Instruction *I,
- SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
- SmallPtrSet<Instruction*, 16> &ConsideredInsts,
- const TargetLowering &TLI) {
- // If we already considered this instruction, we're done.
- if (!ConsideredInsts.insert(I))
- return false;
-
- // If this is an obviously unfoldable instruction, bail out.
- if (!MightBeFoldableInst(I))
- return true;
-
- // Loop over all the uses, recursively processing them.
- for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
- UI != E; ++UI) {
- User *U = *UI;
-
- if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo()));
- continue;
- }
-
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- unsigned opNo = UI.getOperandNo();
- if (opNo == 0) return true; // Storing addr, not into addr.
- MemoryUses.push_back(std::make_pair(SI, opNo));
- continue;
- }
-
- if (CallInst *CI = dyn_cast<CallInst>(U)) {
- InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
- if (!IA) return true;
-
- // If this is a memory operand, we're cool, otherwise bail out.
- if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
- return true;
- continue;
- }
-
- if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts,
- TLI))
- return true;
- }
-
- return false;
-}
-
-
-/// ValueAlreadyLiveAtInst - Return true if Val is already known to be live at
-/// the use site that we're folding it into. If so, there is no cost to
-/// include it in the addressing mode. KnownLive1 and KnownLive2 are two values
-/// that we know are live at the instruction already.
-bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
- Value *KnownLive2) {
- // If Val is either of the known-live values, we know it is live!
- if (Val == 0 || Val == KnownLive1 || Val == KnownLive2)
- return true;
-
- // All values other than instructions and arguments (e.g. constants) are live.
- if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
-
- // If Val is a constant-sized alloca in the entry block, it is live; this is
- // true because it is just a reference to the stack/frame pointer, which is
- // live for the whole function.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
- if (AI->isStaticAlloca())
- return true;
-
- // Check to see if this value is already used in the memory instruction's
- // block. If so, it's already live into the block at the very least, so we
- // can reasonably fold it.
- return Val->isUsedInBasicBlock(MemoryInst->getParent());
-}
-
-
-
-/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing
-/// mode of the machine to fold the specified instruction into a load or store
-/// that ultimately uses it. However, the specified instruction has multiple
-/// uses. Given this, it may actually increase register pressure to fold it
-/// into the load. For example, consider this code:
-///
-/// X = ...
-/// Y = X+1
-/// use(Y) -> nonload/store
-/// Z = Y+1
-/// load Z
-///
-/// In this case, Y has multiple uses, and can be folded into the load of Z
-/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to
-/// be live at the use(Y) line. If we don't fold Y into load Z, we use one
-/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the
-/// number of computations either.
-///
-/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If
-/// X was live across 'load Z' for other reasons, we actually *would* want to
-/// fold the addressing mode in the Z case. This would make Y die earlier.
-bool AddressingModeMatcher::
-IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
- ExtAddrMode &AMAfter) {
- if (IgnoreProfitability) return true;
-
- // AMBefore is the addressing mode before this instruction was folded into it,
- // and AMAfter is the addressing mode after the instruction was folded. Get
- // the set of registers referenced by AMAfter and subtract out those
- // referenced by AMBefore: this is the set of values which folding in this
- // address extends the lifetime of.
- //
- // Note that there are only two potential values being referenced here,
- // BaseReg and ScaleReg (global addresses are always available, as are any
- // folded immediates).
- Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
-
- // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
- // lifetime wasn't extended by adding this instruction.
- if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
- BaseReg = 0;
- if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
- ScaledReg = 0;
-
- // If folding this instruction (and its subexprs) didn't extend any live
- // ranges, we're ok with it.
- if (BaseReg == 0 && ScaledReg == 0)
- return true;
-
- // If all uses of this instruction are ultimately load/store/inlineasm's,
- // check to see if their addressing modes will include this instruction. If
- // so, we can fold it into all uses, so it doesn't matter if it has multiple
- // uses.
- SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
- SmallPtrSet<Instruction*, 16> ConsideredInsts;
- if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
- return false; // Has a non-memory, non-foldable use!
-
- // Now that we know that all uses of this instruction are part of a chain of
- // computation involving only operations that could theoretically be folded
- // into a memory use, loop over each of these uses and see if they could
- // *actually* fold the instruction.
- SmallVector<Instruction*, 32> MatchedAddrModeInsts;
- for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
- Instruction *User = MemoryUses[i].first;
- unsigned OpNo = MemoryUses[i].second;
-
- // Get the access type of this use. If the use isn't a pointer, we don't
- // know what it accesses.
- Value *Address = User->getOperand(OpNo);
- if (!Address->getType()->isPointerTy())
- return false;
- Type *AddressAccessTy =
- cast<PointerType>(Address->getType())->getElementType();
-
- // Do a match against the root of this address, ignoring profitability. This
- // will tell us if the addressing mode for the memory operation will
- // *actually* cover the shared instruction.
- ExtAddrMode Result;
- AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy,
- MemoryInst, Result);
- Matcher.IgnoreProfitability = true;
- bool Success = Matcher.MatchAddr(Address, 0);
- (void)Success; assert(Success && "Couldn't select *anything*?");
-
- // If the match didn't cover I, then it won't be shared by it.
- if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(),
- I) == MatchedAddrModeInsts.end())
- return false;
-
- MatchedAddrModeInsts.clear();
- }
-
- return true;
-}
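
The doc comment above makes the register-pressure argument with the X/Y/Z example. A minimal source-level sketch of that pattern, purely illustrative (base() and use() are hypothetical stand-ins, not part of this patch):

    int *base();
    void use(int *);

    int example() {
      int *x = base();  // X = ...
      int *y = x + 1;   // Y = X+1
      use(y);           // use(Y) -> neither a load nor a store, so Y cannot fold here
      int *z = y + 1;   // Z = Y+1
      return *z;        // load Z: foldable as load [X+2], but folding keeps both X
                        // and X+1 live across use(Y) -- one more live register than
                        // leaving the Z computation unfolded.
    }
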
diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 9fea113..ba99d2e 100644
--- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -13,20 +13,20 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Constant.h"
-#include "llvm/Type.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ValueHandle.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
@@ -37,12 +37,12 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {
// Can delete self loop.
BB->getSinglePredecessor() == BB) && "Block is not dead!");
TerminatorInst *BBTerm = BB->getTerminator();
-
+
// Loop through all of our successors and make sure they know that one
// of their predecessors is going away.
for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i)
BBTerm->getSuccessor(i)->removePredecessor(BB);
-
+
// Zap all the instructions in the block.
while (!BB->empty()) {
Instruction &I = BB->back();
@@ -55,7 +55,7 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {
I.replaceAllUsesWith(UndefValue::get(I.getType()));
BB->getInstList().pop_back();
}
-
+
// Zap the block!
BB->eraseFromParent();
}
@@ -66,25 +66,25 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {
/// when the block has exactly one predecessor.
void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) {
if (!isa<PHINode>(BB->begin())) return;
-
+
AliasAnalysis *AA = 0;
MemoryDependenceAnalysis *MemDep = 0;
if (P) {
AA = P->getAnalysisIfAvailable<AliasAnalysis>();
MemDep = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>();
}
-
+
while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
if (PN->getIncomingValue(0) != PN)
PN->replaceAllUsesWith(PN->getIncomingValue(0));
else
PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
-
+
if (MemDep)
MemDep->removeInstruction(PN); // Memdep updates AA itself.
else if (AA && isa<PointerType>(PN->getType()))
AA->deleteValue(PN);
-
+
PN->eraseFromParent();
}
}
@@ -115,7 +115,7 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
   // Don't merge away blocks that have their address taken.
if (BB->hasAddressTaken()) return false;
-
+
// Can't merge if there are multiple predecessors, or no predecessors.
BasicBlock *PredBB = BB->getUniquePredecessor();
if (!PredBB) return false;
@@ -124,7 +124,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
if (PredBB == BB) return false;
// Don't break invokes.
if (isa<InvokeInst>(PredBB->getTerminator())) return false;
-
+
succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB));
BasicBlock *OnlySucc = BB;
for (; SI != SE; ++SI)
@@ -132,7 +132,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
OnlySucc = 0; // There are multiple distinct successors!
break;
}
-
+
// Can't merge if there are multiple successors.
if (!OnlySucc) return false;
@@ -149,21 +149,21 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
// Begin by getting rid of unneeded PHIs.
if (isa<PHINode>(BB->front()))
FoldSingleEntryPHINodes(BB, P);
-
+
// Delete the unconditional branch from the predecessor...
PredBB->getInstList().pop_back();
-
+
// Make all PHI nodes that referred to BB now refer to Pred as their
// source...
BB->replaceAllUsesWith(PredBB);
-
+
// Move all definitions in the successor to the predecessor...
PredBB->getInstList().splice(PredBB->end(), BB->getInstList());
-
+
// Inherit predecessors name if it exists.
if (!PredBB->hasName())
PredBB->takeName(BB);
-
+
// Finally, erase the old block and update dominator info.
if (P) {
if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) {
@@ -176,16 +176,16 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
DT->eraseNode(BB);
}
-
+
if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>())
LI->removeBlock(BB);
-
+
if (MemoryDependenceAnalysis *MD =
P->getAnalysisIfAvailable<MemoryDependenceAnalysis>())
MD->invalidateCachedPredecessors();
}
}
-
+
BB->eraseFromParent();
return true;
}
@@ -251,11 +251,11 @@ unsigned llvm::GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ) {
}
}
-/// SplitEdge - Split the edge connecting specified block. Pass P must
-/// not be NULL.
+/// SplitEdge - Split the edge connecting specified block. Pass P must
+/// not be NULL.
BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
unsigned SuccNum = GetSuccessorNumber(BB, Succ);
-
+
// If this is a critical edge, let SplitCriticalEdge do it.
TerminatorInst *LatchTerm = BB->getTerminator();
if (SplitCriticalEdge(LatchTerm, SuccNum, P))
@@ -271,11 +271,11 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
SP = NULL;
return SplitBlock(Succ, Succ->begin(), P);
}
-
+
// Otherwise, if BB has a single successor, split it at the bottom of the
// block.
assert(BB->getTerminator()->getNumSuccessors() == 1 &&
- "Should have a single succ!");
+ "Should have a single succ!");
return SplitBlock(BB, BB->getTerminator(), P);
}
@@ -301,12 +301,12 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) {
if (DomTreeNode *OldNode = DT->getNode(Old)) {
std::vector<DomTreeNode *> Children;
for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end();
- I != E; ++I)
+ I != E; ++I)
Children.push_back(*I);
DomTreeNode *NewNode = DT->addNewBlock(New,Old);
for (std::vector<DomTreeNode *>::iterator I = Children.begin(),
- E = Children.end(); I != E; ++I)
+ E = Children.end(); I != E; ++I)
DT->changeImmediateDominator(*I, NewNode);
}
}
@@ -424,7 +424,7 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
PHINode *NewPHI =
PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
if (AA) AA->copyValue(PN, NewPHI);
-
+
// Move all of the PHI values for 'Preds' to the new PHI.
for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
Value *V = PN->removeIncomingValue(Preds[i], false);
@@ -451,16 +451,16 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
/// preserve LoopSimplify (because it's complicated to handle the case where one
/// of the edges being split is an exit of a loop with other exits).
///
-BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
+BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
ArrayRef<BasicBlock*> Preds,
const char *Suffix, Pass *P) {
// Create new basic block, insert right before the original block.
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix,
BB->getParent(), BB);
-
+
// The new block unconditionally branches to the old block.
BranchInst *BI = BranchInst::Create(BB, NewBB);
-
+
// Move the edges from Preds to point to NewBB instead of BB.
for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
// This is slightly more strict than necessary; the minimum requirement
@@ -497,13 +497,13 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
/// block gets the remaining predecessors of OrigBB. The landingpad instruction
 /// in OrigBB is cloned into both of the new basic blocks. The new blocks are given
/// the suffixes 'Suffix1' and 'Suffix2', and are returned in the NewBBs vector.
-///
+///
/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
/// DominanceFrontier, LoopInfo, and LCCSA but no other analyses. In particular,
/// it does not preserve LoopSimplify (because it's complicated to handle the
/// case where one of the edges being split is an exit of a loop with other
/// exits).
-///
+///
void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
ArrayRef<BasicBlock*> Preds,
const char *Suffix1, const char *Suffix2,
@@ -608,11 +608,11 @@ void llvm::FindFunctionBackedges(const Function &F,
const BasicBlock *BB = &F.getEntryBlock();
if (succ_begin(BB) == succ_end(BB))
return;
-
+
SmallPtrSet<const BasicBlock*, 8> Visited;
SmallVector<std::pair<const BasicBlock*, succ_const_iterator>, 8> VisitStack;
SmallPtrSet<const BasicBlock*, 8> InStack;
-
+
Visited.insert(BB);
VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
InStack.insert(BB);
@@ -620,7 +620,7 @@ void llvm::FindFunctionBackedges(const Function &F,
std::pair<const BasicBlock*, succ_const_iterator> &Top = VisitStack.back();
const BasicBlock *ParentBB = Top.first;
succ_const_iterator &I = Top.second;
-
+
bool FoundNew = false;
while (I != succ_end(ParentBB)) {
BB = *I++;
@@ -632,7 +632,7 @@ void llvm::FindFunctionBackedges(const Function &F,
if (InStack.count(BB))
Result.push_back(std::make_pair(ParentBB, BB));
}
-
+
if (FoundNew) {
// Go down one level if there is a unvisited successor.
InStack.insert(BB);
@@ -641,7 +641,7 @@ void llvm::FindFunctionBackedges(const Function &F,
// Go up one level.
InStack.erase(VisitStack.pop_back_val().first);
}
- } while (!VisitStack.empty());
+ } while (!VisitStack.empty());
}
/// FoldReturnIntoUncondBranch - This method duplicates the specified return
@@ -655,7 +655,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
// Clone the return and add it to the end of the predecessor.
Instruction *NewRet = RI->clone();
Pred->getInstList().push_back(NewRet);
-
+
// If the return instruction returns a value, and if the value was a
// PHI node in "BB", propagate the right value into the return.
for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
@@ -679,7 +679,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
}
}
}
-
+
// Update any PHI nodes in the returning block to realize that we no
// longer branch to them.
BB->removePredecessor(Pred);
diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 6b04e3d..8513772 100644
--- a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -17,17 +17,17 @@
#define DEBUG_TYPE "break-crit-edges"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Type.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
STATISTIC(NumBroken, "Number of blocks inserted");
diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 74b2ee1..6d13217 100644
--- a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -12,17 +12,15 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Type.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/DataLayout.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
#include "llvm/Target/TargetLibraryInfo.h"
using namespace llvm;
@@ -40,16 +38,16 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
- Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
- AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- ArrayRef<Attributes::AttrVal>(AVs, 2));
+ AttributeSet AS[2];
+ AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+ Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+ AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ ArrayRef<Attribute::AttrKind>(AVs, 2));
LLVMContext &Context = B.GetInsertBlock()->getContext();
Constant *StrLen = M->getOrInsertFunction("strlen",
- AttrListPtr::get(M->getContext(),
- AWI),
+ AttributeSet::get(M->getContext(),
+ AS),
TD->getIntPtrType(Context),
B.getInt8PtrTy(),
NULL);
@@ -69,16 +67,16 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
- Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
- AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- ArrayRef<Attributes::AttrVal>(AVs, 2));
+ AttributeSet AS[2];
+ AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+ Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+ AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ ArrayRef<Attribute::AttrKind>(AVs, 2));
LLVMContext &Context = B.GetInsertBlock()->getContext();
Constant *StrNLen = M->getOrInsertFunction("strnlen",
- AttrListPtr::get(M->getContext(),
- AWI),
+ AttributeSet::get(M->getContext(),
+ AS),
TD->getIntPtrType(Context),
B.getInt8PtrTy(),
TD->getIntPtrType(Context),
@@ -99,16 +97,16 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
- AttributeWithIndex AWI =
- AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- ArrayRef<Attributes::AttrVal>(AVs, 2));
+ Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+ AttributeSet AS =
+ AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ ArrayRef<Attribute::AttrKind>(AVs, 2));
Type *I8Ptr = B.getInt8PtrTy();
Type *I32Ty = B.getInt32Ty();
Constant *StrChr = M->getOrInsertFunction("strchr",
- AttrListPtr::get(M->getContext(),
- AWI),
+ AttributeSet::get(M->getContext(),
+ AS),
I8Ptr, I8Ptr, I32Ty, NULL);
CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B),
ConstantInt::get(I32Ty, C), "strchr");
@@ -125,17 +123,17 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI[3];
- AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
- AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
- Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
- AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- ArrayRef<Attributes::AttrVal>(AVs, 2));
+ AttributeSet AS[3];
+ AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+ AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+ Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+ AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ ArrayRef<Attribute::AttrKind>(AVs, 2));
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *StrNCmp = M->getOrInsertFunction("strncmp",
- AttrListPtr::get(M->getContext(),
- AWI),
+ AttributeSet::get(M->getContext(),
+ AS),
B.getInt32Ty(),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
@@ -158,13 +156,13 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
- AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::NoUnwind);
+ AttributeSet AS[2];
+ AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+ AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
Type *I8Ptr = B.getInt8PtrTy();
Value *StrCpy = M->getOrInsertFunction(Name,
- AttrListPtr::get(M->getContext(), AWI),
+ AttributeSet::get(M->getContext(), AS),
I8Ptr, I8Ptr, I8Ptr, NULL);
CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
Name);
@@ -182,14 +180,14 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
- AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::NoUnwind);
+ AttributeSet AS[2];
+ AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+ AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
Type *I8Ptr = B.getInt8PtrTy();
Value *StrNCpy = M->getOrInsertFunction(Name,
- AttrListPtr::get(M->getContext(),
- AWI),
+ AttributeSet::get(M->getContext(),
+ AS),
I8Ptr, I8Ptr, I8Ptr,
Len->getType(), NULL);
CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
@@ -209,12 +207,12 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI;
- AWI = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::NoUnwind);
+ AttributeSet AS;
+ AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *MemCpy = M->getOrInsertFunction("__memcpy_chk",
- AttrListPtr::get(M->getContext(), AWI),
+ AttributeSet::get(M->getContext(), AS),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
@@ -237,13 +235,13 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI;
- Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
- AWI = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- ArrayRef<Attributes::AttrVal>(AVs, 2));
+ AttributeSet AS;
+ Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+ AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ ArrayRef<Attribute::AttrKind>(AVs, 2));
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *MemChr = M->getOrInsertFunction("memchr",
- AttrListPtr::get(M->getContext(), AWI),
+ AttributeSet::get(M->getContext(), AS),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
B.getInt32Ty(),
@@ -265,16 +263,16 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI[3];
- AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
- AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
- Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
- AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- ArrayRef<Attributes::AttrVal>(AVs, 2));
+ AttributeSet AS[3];
+ AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+ AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+ Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+ AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ ArrayRef<Attribute::AttrKind>(AVs, 2));
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *MemCmp = M->getOrInsertFunction("memcmp",
- AttrListPtr::get(M->getContext(), AWI),
+ AttributeSet::get(M->getContext(), AS),
B.getInt32Ty(),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
@@ -293,7 +291,7 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
/// returns one value with the same type. If 'Op' is a long double, 'l' is
/// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix.
Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
- const AttrListPtr &Attrs) {
+ const AttributeSet &Attrs) {
SmallString<20> NameBuffer;
if (!Op->getType()->isDoubleTy()) {
// If we need to add a suffix, copy into NameBuffer.
@@ -346,13 +344,13 @@ Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
- AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::NoUnwind);
+ AttributeSet AS[2];
+ AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+ AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
Value *PutS = M->getOrInsertFunction("puts",
- AttrListPtr::get(M->getContext(), AWI),
+ AttributeSet::get(M->getContext(), AS),
B.getInt32Ty(),
B.getInt8PtrTy(),
NULL);
@@ -370,14 +368,14 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
- AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::NoUnwind);
+ AttributeSet AS[2];
+ AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+ AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
Constant *F;
if (File->getType()->isPointerTy())
F = M->getOrInsertFunction("fputc",
- AttrListPtr::get(M->getContext(), AWI),
+ AttributeSet::get(M->getContext(), AS),
B.getInt32Ty(),
B.getInt32Ty(), File->getType(),
NULL);
@@ -403,16 +401,16 @@ Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI[3];
- AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
- AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
- AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::NoUnwind);
+ AttributeSet AS[3];
+ AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+ AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+ AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
StringRef FPutsName = TLI->getName(LibFunc::fputs);
Constant *F;
if (File->getType()->isPointerTy())
F = M->getOrInsertFunction(FPutsName,
- AttrListPtr::get(M->getContext(), AWI),
+ AttributeSet::get(M->getContext(), AS),
B.getInt32Ty(),
B.getInt8PtrTy(),
File->getType(), NULL);
@@ -436,17 +434,17 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
- AttributeWithIndex AWI[3];
- AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
- AWI[1] = AttributeWithIndex::get(M->getContext(), 4, Attributes::NoCapture);
- AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
- Attributes::NoUnwind);
+ AttributeSet AS[3];
+ AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+ AS[1] = AttributeSet::get(M->getContext(), 4, Attribute::NoCapture);
+ AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+ Attribute::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
StringRef FWriteName = TLI->getName(LibFunc::fwrite);
Constant *F;
if (File->getType()->isPointerTy())
F = M->getOrInsertFunction(FWriteName,
- AttrListPtr::get(M->getContext(), AWI),
+ AttributeSet::get(M->getContext(), AS),
TD->getIntPtrType(Context),
B.getInt8PtrTy(),
TD->getIntPtrType(Context),
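
Most hunks in this file are the same API migration: AttributeWithIndex/AttrListPtr become AttributeSet, and Attributes::AttrVal becomes Attribute::AttrKind. For reference, here is the new idiom from EmitStrLen condensed into one place; this is a sketch of the pattern, assuming M, B, TD and Context are in scope exactly as in that function:

    AttributeSet AS[2];
    // Parameter 1 is nocapture; the function itself is readonly + nounwind.
    AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
    Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
    AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
                              ArrayRef<Attribute::AttrKind>(AVs, 2));
    Constant *StrLen = M->getOrInsertFunction("strlen",
                                              AttributeSet::get(M->getContext(), AS),
                                              TD->getIntPtrType(Context),
                                              B.getInt8PtrTy(),
                                              NULL);
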
diff --git a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
index bee2f7b..1f517d0 100644
--- a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -16,11 +16,11 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "bypass-slow-division"
-#include "llvm/Instructions.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/ADT/DenseMap.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
using namespace llvm;
@@ -163,7 +163,7 @@ static bool insertFastDiv(Function &F,
Value *AndV = MainBuilder.CreateAnd(OrV, BitMask);
// Compare operand values and branch
- Value *ZeroV = MainBuilder.getInt32(0);
+ Value *ZeroV = ConstantInt::getSigned(Dividend->getType(), 0);
Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);
@@ -244,7 +244,7 @@ bool llvm::bypassSlowDivision(Function &F,
// Get bitwidth of div/rem instruction
IntegerType *T = cast<IntegerType>(J->getType());
- int bitwidth = T->getBitWidth();
+ unsigned int bitwidth = T->getBitWidth();
// Continue if bitwidth is not bypassed
DenseMap<unsigned int, unsigned int>::const_iterator BI = BypassWidths.find(bitwidth);
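
The key functional change in this file is the zero constant used in the divisor mask test: getInt32(0) only works when the operands are 32 bits wide, so the comparison is now built against a zero of the dividend's own type. Sketch of the fixed pattern, assuming MainBuilder, AndV, Dividend, FastBB and SlowBB as in insertFastDiv:

    // The zero must have the same width as AndV, otherwise CreateICmpEQ's operand
    // types disagree (e.g. when bypassing a 64-bit division).
    Value *ZeroV = ConstantInt::getSigned(Dividend->getType(), 0);
    Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
    MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);
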
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 7ba9f6d..63d7a1d 100644
--- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -14,22 +14,22 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Constants.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Function.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Metadata.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/Support/CFG.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/ADT/SmallVector.h"
#include <map>
using namespace llvm;
@@ -94,19 +94,20 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
//Some arguments were deleted with the VMap. Copy arguments one by one
for (Function::const_arg_iterator I = OldFunc->arg_begin(),
E = OldFunc->arg_end(); I != E; ++I)
- if (Argument* Anew = dyn_cast<Argument>(VMap[I]))
- Anew->addAttr( OldFunc->getAttributes()
- .getParamAttributes(I->getArgNo() + 1));
+ if (Argument* Anew = dyn_cast<Argument>(VMap[I])) {
+ AttributeSet attrs = OldFunc->getAttributes()
+ .getParamAttributes(I->getArgNo() + 1);
+ if (attrs.getNumSlots() > 0)
+ Anew->addAttr(attrs);
+ }
NewFunc->setAttributes(NewFunc->getAttributes()
- .addAttr(NewFunc->getContext(),
- AttrListPtr::ReturnIndex,
- OldFunc->getAttributes()
- .getRetAttributes()));
+ .addAttributes(NewFunc->getContext(),
+ AttributeSet::ReturnIndex,
+ OldFunc->getAttributes()));
NewFunc->setAttributes(NewFunc->getAttributes()
- .addAttr(NewFunc->getContext(),
- AttrListPtr::FunctionIndex,
- OldFunc->getAttributes()
- .getFnAttributes()));
+ .addAttributes(NewFunc->getContext(),
+ AttributeSet::FunctionIndex,
+ OldFunc->getAttributes()));
}
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
index 1dac6b5..64df089 100644
--- a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
@@ -13,9 +13,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Module.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Constant.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
@@ -38,10 +38,6 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
New->setTargetTriple(M->getTargetTriple());
New->setModuleInlineAsm(M->getModuleInlineAsm());
- // Copy all of the dependent libraries over.
- for (Module::lib_iterator I = M->lib_begin(), E = M->lib_end(); I != E; ++I)
- New->addLibrary(*I);
-
// Loop over all of the global variables, making corresponding globals in the
// new module. Here we add them to the VMap and to the new Module. We
// don't worry about attributes or initializers, they will come later.
diff --git a/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
index 9b09915..8fa412a 100644
--- a/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
@@ -13,8 +13,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CmpInstAnalysis.h"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 281714f..f7c659f 100644
--- a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -14,25 +14,26 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/Verifier.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
#include <set>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
index 99b5830..db525cd 100644
--- a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -7,11 +7,12 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Type.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
using namespace llvm;
/// DemoteRegToStack - This function takes a virtual register computed by an
@@ -78,12 +79,21 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
InsertPt = &I;
++InsertPt;
} else {
- // We cannot demote invoke instructions to the stack if their normal edge
- // is critical.
InvokeInst &II = cast<InvokeInst>(I);
- assert(II.getNormalDest()->getSinglePredecessor() &&
- "Cannot demote invoke with a critical successor!");
- InsertPt = II.getNormalDest()->begin();
+ if (II.getNormalDest()->getSinglePredecessor())
+ InsertPt = II.getNormalDest()->getFirstInsertionPt();
+ else {
+ // We cannot demote invoke instructions to the stack if their normal edge
+ // is critical. Therefore, split the critical edge and insert the store
+ // in the newly created basic block.
+ unsigned SuccNum = GetSuccessorNumber(I.getParent(), II.getNormalDest());
+ TerminatorInst *TI = &cast<TerminatorInst>(I);
+ assert (isCriticalEdge(TI, SuccNum) &&
+ "Expected a critical edge!");
+ BasicBlock *BB = SplitCriticalEdge(TI, SuccNum);
+ assert (BB && "Unable to split critical edge.");
+ InsertPt = BB->getFirstInsertionPt();
+ }
}
for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt)
@@ -124,7 +134,12 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
}
// Insert a load in place of the PHI and replace all uses.
- Value *V = new LoadInst(Slot, P->getName()+".reload", P);
+ BasicBlock::iterator InsertPt = P;
+
+ for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt)
+ /* empty */; // Don't insert before PHI nodes or landingpad instrs.
+
+ Value *V = new LoadInst(Slot, P->getName()+".reload", InsertPt);
P->replaceAllUsesWith(V);
// Delete PHI.
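
The new invoke path above exists because the store that spills an invoke's result must run only when the invoke completes normally; if the normal destination has other predecessors (i.e. the normal edge is critical), a store placed there would not be dominated by the invoke. Condensed form of the added logic, assuming I is the instruction being demoted as in DemoteRegToStack:

    InvokeInst &II = cast<InvokeInst>(I);
    if (II.getNormalDest()->getSinglePredecessor()) {
      InsertPt = II.getNormalDest()->getFirstInsertionPt();
    } else {
      // The normal edge is critical: split it and put the spill store in the new
      // block, which is reached only from this invoke.
      unsigned SuccNum = GetSuccessorNumber(I.getParent(), II.getNormalDest());
      BasicBlock *BB = SplitCriticalEdge(&cast<TerminatorInst>(I), SuccNum);
      InsertPt = BB->getFirstInsertionPt();
    }
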
diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 009847f..e9828d6 100644
--- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -13,21 +13,21 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Attributes.h"
-#include "llvm/Constants.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Module.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/CallSite.h"
-#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -82,7 +82,8 @@ namespace {
/// a simple branch. When there is more than one predecessor, we need to
/// split the landing pad block after the landingpad instruction and jump
/// to there.
- void forwardResume(ResumeInst *RI);
+ void forwardResume(ResumeInst *RI,
+ SmallPtrSet<LandingPadInst*, 16> &InlinedLPads);
/// addIncomingPHIValuesFor - Add incoming-PHI values to the unwind
/// destination block for the given basic block, using the values for the
@@ -140,8 +141,10 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() {
/// block. When the landing pad block has only one predecessor, this is a simple
/// branch. When there is more than one predecessor, we need to split the
/// landing pad block after the landingpad instruction and jump to there.
-void InvokeInliningInfo::forwardResume(ResumeInst *RI) {
+void InvokeInliningInfo::forwardResume(ResumeInst *RI,
+ SmallPtrSet<LandingPadInst*, 16> &InlinedLPads) {
BasicBlock *Dest = getInnerResumeDest();
+ LandingPadInst *OuterLPad = getLandingPadInst();
BasicBlock *Src = RI->getParent();
BranchInst::Create(Dest, Src);
@@ -152,6 +155,16 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI) {
InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src);
RI->eraseFromParent();
+
+ // Append the clauses from the outer landing pad instruction into the inlined
+ // landing pad instructions.
+ for (SmallPtrSet<LandingPadInst*, 16>::iterator I = InlinedLPads.begin(),
+ E = InlinedLPads.end(); I != E; ++I) {
+ LandingPadInst *InlinedLPad = *I;
+ for (unsigned OuterIdx = 0, OuterNum = OuterLPad->getNumClauses();
+ OuterIdx != OuterNum; ++OuterIdx)
+ InlinedLPad->addClause(OuterLPad->getClause(OuterIdx));
+ }
}
/// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into
@@ -229,19 +242,15 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
// The inlined code is currently at the end of the function, scan from the
// start of the inlined code to its end, checking for stuff we need to
- // rewrite. If the code doesn't have calls or unwinds, we know there is
- // nothing to rewrite.
- if (!InlinedCodeInfo.ContainsCalls) {
- // Now that everything is happy, we have one final detail. The PHI nodes in
- // the exception destination block still have entries due to the original
- // invoke instruction. Eliminate these entries (which might even delete the
- // PHI node) now.
- InvokeDest->removePredecessor(II->getParent());
- return;
- }
-
+ // rewrite.
InvokeInliningInfo Invoke(II);
-
+
+ // Get all of the inlined landing pad instructions.
+ SmallPtrSet<LandingPadInst*, 16> InlinedLPads;
+ for (Function::iterator I = FirstNewBlock, E = Caller->end(); I != E; ++I)
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator()))
+ InlinedLPads.insert(II->getLandingPadInst());
+
for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){
if (InlinedCodeInfo.ContainsCalls)
if (HandleCallsInBlockInlinedThroughInvoke(BB, Invoke)) {
@@ -250,13 +259,14 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
continue;
}
+ // Forward any resumes that are remaining here.
if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator()))
- Invoke.forwardResume(RI);
+ Invoke.forwardResume(RI, InlinedLPads);
}
// Now that everything is happy, we have one final detail. The PHI nodes in
// the exception destination block still have entries due to the original
- // invoke instruction. Eliminate these entries (which might even delete the
+ // invoke instruction. Eliminate these entries (which might even delete the
// PHI node) now.
InvokeDest->removePredecessor(II->getParent());
}
@@ -668,10 +678,29 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
if (hasLifetimeMarkers(AI))
continue;
- builder.CreateLifetimeStart(AI);
+ // Try to determine the size of the allocation.
+ ConstantInt *AllocaSize = 0;
+ if (ConstantInt *AIArraySize =
+ dyn_cast<ConstantInt>(AI->getArraySize())) {
+ if (IFI.TD) {
+ Type *AllocaType = AI->getAllocatedType();
+ uint64_t AllocaTypeSize = IFI.TD->getTypeAllocSize(AllocaType);
+ uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
+ assert(AllocaArraySize > 0 && "array size of AllocaInst is zero");
+ // Check that array size doesn't saturate uint64_t and doesn't
+ // overflow when it's multiplied by type size.
+ if (AllocaArraySize != ~0ULL &&
+ UINT64_MAX / AllocaArraySize >= AllocaTypeSize) {
+ AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
+ AllocaArraySize * AllocaTypeSize);
+ }
+ }
+ }
+
+ builder.CreateLifetimeStart(AI, AllocaSize);
for (unsigned ri = 0, re = Returns.size(); ri != re; ++ri) {
IRBuilder<> builder(Returns[ri]);
- builder.CreateLifetimeEnd(AI);
+ builder.CreateLifetimeEnd(AI, AllocaSize);
}
}
}
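
The lifetime-marker hunk above now passes an explicit byte size to CreateLifetimeStart/CreateLifetimeEnd. The guard it adds is plain unsigned arithmetic: reject a saturated getLimitedValue() result (~0ULL) and reject a count*size product that would overflow uint64_t, which is what the UINT64_MAX / count >= size test expresses. A standalone restatement of just that check (hypothetical helper, not part of the patch):

    #include <cstdint>

    // Returns the allocation size in bytes, or 0 if it is unknown or would overflow.
    static uint64_t allocaSizeInBytes(uint64_t ArrayCount, uint64_t TypeSize) {
      if (ArrayCount == 0 || ArrayCount == ~0ULL)  // unknown or saturated count
        return 0;
      if (UINT64_MAX / ArrayCount < TypeSize)      // ArrayCount * TypeSize overflows
        return 0;
      return ArrayCount * TypeSize;
    }
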
diff --git a/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
index 45c15de..a020bc7 100644
--- a/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
@@ -15,9 +15,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Function.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/Type.h"
using namespace llvm;
namespace {
diff --git a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp
index 55227e2..3cb8ded 100644
--- a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp
@@ -15,11 +15,11 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "integer-division"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/IRBuilder.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
using namespace llvm;
@@ -418,3 +418,107 @@ bool llvm::expandDivision(BinaryOperator *Div) {
return true;
}
+
+/// Generate code to compute the remainder of two integers of bitwidth up to
+/// 32 bits. Uses the above routines and extends the inputs/truncates the
+/// outputs to operate in 32 bits; that is, these routines are good for targets
+/// that have no or very little support for smaller than 32 bit integer
+/// arithmetic.
+///
+/// @brief Replace Rem with emulation code.
+bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {
+ assert((Rem->getOpcode() == Instruction::SRem ||
+ Rem->getOpcode() == Instruction::URem) &&
+ "Trying to expand remainder from a non-remainder function");
+
+ Type *RemTy = Rem->getType();
+ if (RemTy->isVectorTy())
+ llvm_unreachable("Div over vectors not supported");
+
+ unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
+
+ if (RemTyBitWidth > 32)
+ llvm_unreachable("Div of bitwidth greater than 32 not supported");
+
+ if (RemTyBitWidth == 32)
+ return expandRemainder(Rem);
+
+ // If bitwidth smaller than 32 extend inputs, truncate output and proceed
+ // with 32 bit division.
+ IRBuilder<> Builder(Rem);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtRem;
+ Value *Trunc;
+ Type *Int32Ty = Builder.getInt32Ty();
+
+ if (Rem->getOpcode() == Instruction::SRem) {
+ ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int32Ty);
+ ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int32Ty);
+ ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtRem, RemTy);
+
+ Rem->replaceAllUsesWith(Trunc);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ return expandRemainder(cast<BinaryOperator>(ExtRem));
+}
+
+
+/// Generate code to divide two integers of bitwidth up to 32 bits. Uses the
+/// above routines and extends the inputs/truncates the outputs to operate
+/// in 32 bits; that is, these routines are good for targets that have no
+/// or very little support for smaller than 32 bit integer arithmetic.
+///
+/// @brief Replace Div with emulation code.
+bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ Type *DivTy = Div->getType();
+ if (DivTy->isVectorTy())
+ llvm_unreachable("Div over vectors not supported");
+
+ unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
+
+ if (DivTyBitWidth > 32)
+ llvm_unreachable("Div of bitwidth greater than 32 not supported");
+
+ if (DivTyBitWidth == 32)
+ return expandDivision(Div);
+
+ // If bitwidth smaller than 32 extend inputs, truncate output and proceed
+ // with 32 bit division.
+ IRBuilder<> Builder(Div);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtDiv;
+ Value *Trunc;
+ Type *Int32Ty = Builder.getInt32Ty();
+
+ if (Div->getOpcode() == Instruction::SDiv) {
+ ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int32Ty);
+ ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int32Ty);
+ ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
+
+ Div->replaceAllUsesWith(Trunc);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return expandDivision(cast<BinaryOperator>(ExtDiv));
+}
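
expandRemainderUpTo32Bits and expandDivisionUpTo32Bits are meant to be driven by a target pass that finds the narrow div/rem instructions and rewrites them one by one. A minimal caller sketch under that assumption (F is the function being processed; the iteration scaffolding is illustrative, not part of the patch):

    // Collect first, then expand: expansion replaces and erases the original
    // instruction, which would invalidate a plain iterator walk.
    SmallVector<BinaryOperator*, 8> Worklist;
    for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB)
      for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I)
        if ((I->getOpcode() == Instruction::SDiv ||
             I->getOpcode() == Instruction::UDiv) &&
            I->getType()->isIntegerTy() &&
            I->getType()->getIntegerBitWidth() <= 32)
          Worklist.push_back(cast<BinaryOperator>(&*I));

    for (unsigned i = 0, e = Worklist.size(); i != e; ++i)
      expandDivisionUpTo32Bits(Worklist[i]);
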
diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
index 5e05c83..2d1b166 100644
--- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -29,17 +29,17 @@
#define DEBUG_TYPE "lcssa"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/Pass.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
#include "llvm/Support/PredIteratorCache.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
STATISTIC(NumLCSSA, "Number of live out of a loop variables");
diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp
index a954d82..be80d34 100644
--- a/contrib/llvm/lib/Transforms/Utils/Local.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp
@@ -13,33 +13,34 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Constants.h"
-#include "llvm/DIBuilder.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/GlobalAlias.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/MDBuilder.h"
-#include "llvm/Metadata.h"
-#include "llvm/Operator.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -604,7 +605,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
// possible to handle such cases, but difficult: it requires checking whether
// BB dominates Succ, which is non-trivial to calculate in the case where
// Succ has multiple predecessors. Also, it requires checking whether
- // constructing the necessary self-referential PHI node doesn't intoduce any
+ // constructing the necessary self-referential PHI node doesn't introduce any
// conflicts; this isn't too difficult, but the previous code for doing this
// was incorrect.
//
@@ -928,3 +929,73 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
return 0;
}
+
+bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
+ DIBuilder &Builder) {
+ DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI);
+ if (!DDI)
+ return false;
+ DIVariable DIVar(DDI->getVariable());
+ if (!DIVar.Verify())
+ return false;
+
+ // Create a copy of the original DIDescriptor for user variable, appending
+ // "deref" operation to a list of address elements, as new llvm.dbg.declare
+ // will take a value storing address of the memory for variable, not
+ // alloca itself.
+ Type *Int64Ty = Type::getInt64Ty(AI->getContext());
+ SmallVector<Value*, 4> NewDIVarAddress;
+ if (DIVar.hasComplexAddress()) {
+ for (unsigned i = 0, n = DIVar.getNumAddrElements(); i < n; ++i) {
+ NewDIVarAddress.push_back(
+ ConstantInt::get(Int64Ty, DIVar.getAddrElement(i)));
+ }
+ }
+ NewDIVarAddress.push_back(ConstantInt::get(Int64Ty, DIBuilder::OpDeref));
+ DIVariable NewDIVar = Builder.createComplexVariable(
+ DIVar.getTag(), DIVar.getContext(), DIVar.getName(),
+ DIVar.getFile(), DIVar.getLineNumber(), DIVar.getType(),
+ NewDIVarAddress, DIVar.getArgNumber());
+
+ // Insert llvm.dbg.declare in the same basic block as the original alloca,
+ // and remove old llvm.dbg.declare.
+ BasicBlock *BB = AI->getParent();
+ Builder.insertDeclare(NewAllocaAddress, NewDIVar, BB);
+ DDI->eraseFromParent();
+ return true;
+}
+
+bool llvm::removeUnreachableBlocks(Function &F) {
+ SmallPtrSet<BasicBlock*, 16> Reachable;
+ SmallVector<BasicBlock*, 128> Worklist;
+ Worklist.push_back(&F.getEntryBlock());
+ Reachable.insert(&F.getEntryBlock());
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+ if (Reachable.insert(*SI))
+ Worklist.push_back(*SI);
+ } while (!Worklist.empty());
+
+ if (Reachable.size() == F.size())
+ return false;
+
+ assert(Reachable.size() < F.size());
+ for (Function::iterator I = llvm::next(F.begin()), E = F.end(); I != E; ++I) {
+ if (Reachable.count(I))
+ continue;
+
+ for (succ_iterator SI = succ_begin(I), SE = succ_end(I); SI != SE; ++SI)
+ if (Reachable.count(*SI))
+ (*SI)->removePredecessor(I);
+ I->dropAllReferences();
+ }
+
+ for (Function::iterator I = llvm::next(F.begin()), E=F.end(); I != E;)
+ if (!Reachable.count(I))
+ I = F.getBasicBlockList().erase(I);
+ else
+ ++I;
+
+ return true;
+}
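
removeUnreachableBlocks above is a forward-reachability sweep: mark every block reachable from the entry, then fix up PHIs in reachable successors and drop the dead blocks' references before erasing them, so dead blocks that reference each other can be deleted safely. A typical call site would be a pass body like the following; MyCleanupPass is a hypothetical name, not something this patch adds:

    // Hypothetical pass method; removeUnreachableBlocks reports whether it changed F.
    bool MyCleanupPass::runOnFunction(Function &F) {
      bool Changed = removeUnreachableBlocks(F);
      // ... further local cleanups would follow here ...
      return Changed;
    }
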
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 9d9e201..37819cc 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -39,26 +39,26 @@
#define DEBUG_TYPE "loop-simplify"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Function.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Type.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted");
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 2023750..cb581b3 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -18,12 +18,12 @@
#define DEBUG_TYPE "loop-unroll"
#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/BasicBlock.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 67e17f4..d801d5f 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -23,12 +23,12 @@
#define DEBUG_TYPE "loop-unroll"
#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/BasicBlock.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
index 02bdcda..4aee8ff 100644
--- a/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
@@ -12,17 +12,17 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "lower-expect-intrinsic"
-#include "llvm/BasicBlock.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/MDBuilder.h"
-#include "llvm/Metadata.h"
-#include "llvm/Pass.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <vector>
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
index 9305554..9ec84d7 100644
--- a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
@@ -36,19 +36,19 @@
#define DEBUG_TYPE "lowerinvoke"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <csetjmp>
#include <set>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
index 1547439..955b853 100644
--- a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -14,16 +14,16 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Pass.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
#include <algorithm>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
index f4ca81a..61b3965 100644
--- a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
@@ -14,12 +14,12 @@
#define DEBUG_TYPE "mem2reg"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Instructions.h"
-#include "llvm/Function.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
STATISTIC(NumPromoted, "Number of alloca's promoted");
diff --git a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
index 233bc12..3716f58 100644
--- a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
@@ -13,16 +13,15 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/Module.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/TypeFinder.h"
#include "llvm/Pass.h"
-#include "llvm/Type.h"
-#include "llvm/TypeFinder.h"
-
using namespace llvm;
namespace {
@@ -37,7 +36,7 @@ namespace {
next = seed;
}
- int rand(void) {
+ int rand() {
next = next * 1103515245 + 12345;
return (unsigned int)(next / 65536) % 32768;
}
@@ -73,13 +72,23 @@ namespace {
// Rename all aliases
for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end();
- AI != AE; ++AI)
- AI->setName("alias");
+ AI != AE; ++AI) {
+ StringRef Name = AI->getName();
+ if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+ continue;
+ AI->setName("alias");
+ }
+
// Rename all global variables
for (Module::global_iterator GI = M.global_begin(), GE = M.global_end();
- GI != GE; ++GI)
+ GI != GE; ++GI) {
+ StringRef Name = GI->getName();
+ if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+ continue;
+
GI->setName("global");
+ }
// Rename all struct types
TypeFinder StructTypes;
@@ -96,6 +105,10 @@ namespace {
// Rename all functions
for (Module::iterator FI = M.begin(), FE = M.end();
FI != FE; ++FI) {
+ StringRef Name = FI->getName();
+ if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+ continue;
+
FI->setName(metaNames[prng.rand() % array_lengthof(metaNames)]);
runOnFunction(*FI);
}
diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
index dbcf3b2..d090b48 100644
--- a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -12,10 +12,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Module.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 558de9d..de335ec 100644
--- a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -27,26 +27,26 @@
#define DEBUG_TYPE "mem2reg"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include "llvm/Constants.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/DIBuilder.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Metadata.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <queue>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 72d4199..9d90fbe 100644
--- a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -12,12 +12,13 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "ssaupdater"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/AlignOf.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CFG.h"
@@ -25,7 +26,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index c767da6..681bf9c 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -13,19 +13,6 @@
#define DEBUG_TYPE "simplifycfg"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Constants.h"
-#include "llvm/DataLayout.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/MDBuilder.h"
-#include "llvm/Metadata.h"
-#include "llvm/Module.h"
-#include "llvm/Operator.h"
-#include "llvm/Type.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
@@ -33,18 +20,31 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ConstantRange.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/NoFolder.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
-#include <set>
#include <map>
+#include <set>
using namespace llvm;
static cl::opt<unsigned>
@@ -82,8 +82,8 @@ namespace {
};
class SimplifyCFGOpt {
+ const TargetTransformInfo &TTI;
const DataLayout *const TD;
- const TargetTransformInfo *const TTI;
Value *isValueEqualityComparison(TerminatorInst *TI);
BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
@@ -103,8 +103,8 @@ class SimplifyCFGOpt {
bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder);
public:
- SimplifyCFGOpt(const DataLayout *td, const TargetTransformInfo *tti)
- : TD(td), TTI(tti) {}
+ SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout *TD)
+ : TTI(TTI), TD(TD) {}
bool run(BasicBlock *BB);
};
}
@@ -858,7 +858,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
if (PredHasWeights) {
GetBranchWeights(PTI, Weights);
- // branch-weight metadata is inconsistant here.
+ // branch-weight metadata is inconsistent here.
if (Weights.size() != 1 + PredCases.size())
PredHasWeights = SuccHasWeights = false;
} else if (SuccHasWeights)
@@ -870,7 +870,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
SmallVector<uint64_t, 8> SuccWeights;
if (SuccHasWeights) {
GetBranchWeights(TI, SuccWeights);
- // branch-weight metadata is inconsistant here.
+ // branch-weight metadata is inconsistent here.
if (SuccWeights.size() != 1 + BBCases.size())
PredHasWeights = SuccHasWeights = false;
} else if (PredHasWeights)
@@ -967,8 +967,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
PTIHandled.begin(),
E = PTIHandled.end(); I != E; ++I) {
- if (PredHasWeights || SuccHasWeights)
- Weights.push_back(WeightsForHandled[*I]);
+ if (PredHasWeights || SuccHasWeights)
+ Weights.push_back(WeightsForHandled[*I]);
PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault));
NewSuccessors.push_back(BBDefault);
}
@@ -1193,7 +1193,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
I != E; ++I) {
if (PHINode *PN = dyn_cast<PHINode>(I)) {
Value *BB1V = PN->getIncomingValueForBlock(BB1);
- Value *BB2V = PN->getIncomingValueForBlock(BB2);
+ Value *BB2V = PN->getIncomingValueForBlock(BB2);
MapValueFromBB1ToBB2[BB1V] = std::make_pair(BB2V, PN);
} else {
FirstNonPhiInBBEnd = &*I;
@@ -1202,7 +1202,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
}
if (!FirstNonPhiInBBEnd)
return false;
-
+
// This does very trivial matching, with limited scanning, to find identical
// instructions in the two blocks. We scan backward for obviously identical
@@ -1332,149 +1332,180 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
return Changed;
}
-/// SpeculativelyExecuteBB - Given a conditional branch that goes to BB1
-/// and an BB2 and the only successor of BB1 is BB2, hoist simple code
-/// (for now, restricted to a single instruction that's side effect free) from
-/// the BB1 into the branch block to speculatively execute it.
+/// \brief Speculate a conditional basic block flattening the CFG.
+///
+/// Note that this is a very risky transform currently. Speculating
+/// instructions like this is most often not desirable. Instead, there is an MI
+/// pass which can do it with full awareness of the resource constraints.
+/// However, some cases are "obvious" and we should do them directly. An example of
+/// this is speculating a single, reasonably cheap instruction.
+///
+/// There is only one distinct advantage to flattening the CFG at the IR level:
+/// it makes very common but simplistic optimizations such as are common in
+/// instcombine and the DAG combiner more powerful by removing CFG edges and
+/// modeling their effects with easier to reason about SSA value graphs.
///
-/// Turn
-/// BB:
-/// %t1 = icmp
-/// br i1 %t1, label %BB1, label %BB2
-/// BB1:
-/// %t3 = add %t2, c
+///
+/// An illustration of this transform is turning this IR:
+/// \code
+/// BB:
+/// %cmp = icmp ult %x, %y
+/// br i1 %cmp, label %EndBB, label %ThenBB
+/// ThenBB:
+/// %sub = sub %x, %y
/// br label %EndBB
-/// BB2:
-/// =>
-/// BB:
-/// %t1 = icmp
-/// %t4 = add %t2, c
-/// %t3 = select i1 %t1, %t2, %t3
-static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) {
- // Only speculatively execution a single instruction (not counting the
- // terminator) for now.
- Instruction *HInst = NULL;
- Instruction *Term = BB1->getTerminator();
- for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end();
+/// EndBB:
+/// %phi = phi [ %sub, %ThenBB ], [ 0, %BB ]
+/// ...
+/// \endcode
+///
+/// Into this IR:
+/// \code
+/// BB:
+/// %cmp = icmp ult %x, %y
+/// %sub = sub %x, %y
+/// %cond = select i1 %cmp, 0, %sub
+/// ...
+/// \endcode
+///
+/// \returns true if the conditional block is removed.
+static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
+ // Be conservative for now. FP select instruction can often be expensive.
+ Value *BrCond = BI->getCondition();
+ if (isa<FCmpInst>(BrCond))
+ return false;
+
+ BasicBlock *BB = BI->getParent();
+ BasicBlock *EndBB = ThenBB->getTerminator()->getSuccessor(0);
+
+ // If ThenBB is actually on the false edge of the conditional branch, remember
+ // to swap the select operands later.
+ bool Invert = false;
+ if (ThenBB != BI->getSuccessor(0)) {
+ assert(ThenBB == BI->getSuccessor(1) && "No edge from 'if' block?");
+ Invert = true;
+ }
+  assert(EndBB == BI->getSuccessor(!Invert) && "No edge to end block");
+
+  // Keep a count of how many times instructions are used within ThenBB when
+  // they are candidates for sinking into ThenBB. Specifically:
+  // - They are defined in BB, and
+  // - They have no side effects, and
+  // - All of their uses are in ThenBB.
+ SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts;
+
+ unsigned SpeculationCost = 0;
+ for (BasicBlock::iterator BBI = ThenBB->begin(),
+ BBE = llvm::prior(ThenBB->end());
BBI != BBE; ++BBI) {
Instruction *I = BBI;
// Skip debug info.
- if (isa<DbgInfoIntrinsic>(I)) continue;
- if (I == Term) break;
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
- if (HInst)
+    // Only speculatively execute a single instruction (not counting the
+ // terminator) for now.
+ ++SpeculationCost;
+ if (SpeculationCost > 1)
return false;
- HInst = I;
- }
-
- BasicBlock *BIParent = BI->getParent();
- // Check the instruction to be hoisted, if there is one.
- if (HInst) {
// Don't hoist the instruction if it's unsafe or expensive.
- if (!isSafeToSpeculativelyExecute(HInst))
+ if (!isSafeToSpeculativelyExecute(I))
return false;
- if (ComputeSpeculationCost(HInst) > PHINodeFoldingThreshold)
+ if (ComputeSpeculationCost(I) > PHINodeFoldingThreshold)
return false;
// Do not hoist the instruction if any of its operands are defined but not
// used in this BB. The transformation will prevent the operand from
// being sunk into the use block.
- for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end();
+ for (User::op_iterator i = I->op_begin(), e = I->op_end();
i != e; ++i) {
Instruction *OpI = dyn_cast<Instruction>(*i);
- if (OpI && OpI->getParent() == BIParent &&
- !OpI->mayHaveSideEffects() &&
- !OpI->isUsedInBasicBlock(BIParent))
- return false;
+ if (!OpI || OpI->getParent() != BB ||
+ OpI->mayHaveSideEffects())
+ continue; // Not a candidate for sinking.
+
+ ++SinkCandidateUseCounts[OpI];
}
}
- // Be conservative for now. FP select instruction can often be expensive.
- Value *BrCond = BI->getCondition();
- if (isa<FCmpInst>(BrCond))
- return false;
-
- // If BB1 is actually on the false edge of the conditional branch, remember
- // to swap the select operands later.
- bool Invert = false;
- if (BB1 != BI->getSuccessor(0)) {
- assert(BB1 == BI->getSuccessor(1) && "No edge from 'if' block?");
- Invert = true;
- }
+  // Consider any sink candidates which are only used in ThenBB as costs for
+ // speculation. Note, while we iterate over a DenseMap here, we are summing
+ // and so iteration order isn't significant.
+ for (SmallDenseMap<Instruction *, unsigned, 4>::iterator I =
+ SinkCandidateUseCounts.begin(), E = SinkCandidateUseCounts.end();
+ I != E; ++I)
+ if (I->first->getNumUses() == I->second) {
+ ++SpeculationCost;
+ if (SpeculationCost > 1)
+ return false;
+ }
- // Collect interesting PHIs, and scan for hazards.
- SmallSetVector<std::pair<Value *, Value *>, 4> PHIs;
- BasicBlock *BB2 = BB1->getTerminator()->getSuccessor(0);
- for (BasicBlock::iterator I = BB2->begin();
+ // Check that the PHI nodes can be converted to selects.
+ bool HaveRewritablePHIs = false;
+ for (BasicBlock::iterator I = EndBB->begin();
PHINode *PN = dyn_cast<PHINode>(I); ++I) {
- Value *BB1V = PN->getIncomingValueForBlock(BB1);
- Value *BIParentV = PN->getIncomingValueForBlock(BIParent);
+ Value *OrigV = PN->getIncomingValueForBlock(BB);
+ Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
// Skip PHIs which are trivial.
- if (BB1V == BIParentV)
+ if (ThenV == OrigV)
continue;
- // Check for saftey.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BB1V)) {
- // An unfolded ConstantExpr could end up getting expanded into
- // Instructions. Don't speculate this and another instruction at
- // the same time.
- if (HInst)
- return false;
- if (!isSafeToSpeculativelyExecute(CE))
- return false;
- if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold)
- return false;
- }
+ HaveRewritablePHIs = true;
+ ConstantExpr *CE = dyn_cast<ConstantExpr>(ThenV);
+ if (!CE)
+ continue; // Known safe and cheap.
- // Ok, we may insert a select for this PHI.
- PHIs.insert(std::make_pair(BB1V, BIParentV));
+ if (!isSafeToSpeculativelyExecute(CE))
+ return false;
+ if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold)
+ return false;
+
+ // Account for the cost of an unfolded ConstantExpr which could end up
+ // getting expanded into Instructions.
+ // FIXME: This doesn't account for how many operations are combined in the
+ // constant expression.
+ ++SpeculationCost;
+ if (SpeculationCost > 1)
+ return false;
}
// If there are no PHIs to process, bail early. This helps ensure idempotence
// as well.
- if (PHIs.empty())
+ if (!HaveRewritablePHIs)
return false;
// If we get here, we can hoist the instruction and if-convert.
- DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *BB1 << "\n";);
+ DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
- // Hoist the instruction.
- if (HInst)
- BIParent->getInstList().splice(BI, BB1->getInstList(), HInst);
+ // Hoist the instructions.
+ BB->getInstList().splice(BI, ThenBB->getInstList(), ThenBB->begin(),
+ llvm::prior(ThenBB->end()));
// Insert selects and rewrite the PHI operands.
IRBuilder<true, NoFolder> Builder(BI);
- for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
- Value *TrueV = PHIs[i].first;
- Value *FalseV = PHIs[i].second;
+ for (BasicBlock::iterator I = EndBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ unsigned OrigI = PN->getBasicBlockIndex(BB);
+ unsigned ThenI = PN->getBasicBlockIndex(ThenBB);
+ Value *OrigV = PN->getIncomingValue(OrigI);
+ Value *ThenV = PN->getIncomingValue(ThenI);
+
+ // Skip PHIs which are trivial.
+ if (OrigV == ThenV)
+ continue;
// Create a select whose true value is the speculatively executed value and
- // false value is the previously determined FalseV.
- SelectInst *SI;
+ // false value is the preexisting value. Swap them if the branch
+ // destinations were inverted.
+ Value *TrueV = ThenV, *FalseV = OrigV;
if (Invert)
- SI = cast<SelectInst>
- (Builder.CreateSelect(BrCond, FalseV, TrueV,
- FalseV->getName() + "." + TrueV->getName()));
- else
- SI = cast<SelectInst>
- (Builder.CreateSelect(BrCond, TrueV, FalseV,
- TrueV->getName() + "." + FalseV->getName()));
-
- // Make the PHI node use the select for all incoming values for "then" and
- // "if" blocks.
- for (BasicBlock::iterator I = BB2->begin();
- PHINode *PN = dyn_cast<PHINode>(I); ++I) {
- unsigned BB1I = PN->getBasicBlockIndex(BB1);
- unsigned BIParentI = PN->getBasicBlockIndex(BIParent);
- Value *BB1V = PN->getIncomingValue(BB1I);
- Value *BIParentV = PN->getIncomingValue(BIParentI);
- if (TrueV == BB1V && FalseV == BIParentV) {
- PN->setIncomingValue(BB1I, SI);
- PN->setIncomingValue(BIParentI, SI);
- }
- }
+ std::swap(TrueV, FalseV);
+ Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV,
+ TrueV->getName() + "." + FalseV->getName());
+ PN->setIncomingValue(OrigI, V);
+ PN->setIncomingValue(ThenI, V);
}
++NumSpeculations;
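
A minimal source-level sketch of the if-conversion performed by the rewrite above (the doc comment shows the IR form): a cheap, side-effect-free 'then' block is executed unconditionally and the PHI becomes a select. The function names and test values here are made up for illustration.

    #include <cstdio>

    // before(): the shape the pass looks for -- a conditional block containing
    // one cheap instruction whose result feeds a PHI in the join block.
    unsigned before(unsigned x, unsigned y) {
      unsigned r = 0;
      if (!(x < y))    // br i1 %cmp, label %EndBB, label %ThenBB
        r = x - y;     // ThenBB: %sub = sub %x, %y
      return r;        // EndBB:  %phi = phi [ %sub, %ThenBB ], [ 0, %BB ]
    }

    // after(): the flattened form -- the sub is speculated, the PHI is a select.
    unsigned after(unsigned x, unsigned y) {
      unsigned sub = x - y;        // hoisted into BB
      return (x < y) ? 0u : sub;   // select i1 %cmp, 0, %sub
    }

    int main() {
      std::printf("%u %u\n", before(7, 3), after(7, 3)); // both print 4
    }
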
@@ -2522,9 +2553,9 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
///
/// We prefer to split the edge to 'end' so that there is a true/false entry to
/// the PHI, merging the third icmp into the switch.
-static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
- const DataLayout *TD,
- IRBuilder<> &Builder) {
+static bool TryToSimplifyUncondBranchWithICmpInIt(
+ ICmpInst *ICI, IRBuilder<> &Builder, const TargetTransformInfo &TTI,
+ const DataLayout *TD) {
BasicBlock *BB = ICI->getParent();
// If the block has any PHIs in it or the icmp has multiple uses, it is too
@@ -2557,7 +2588,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
ICI->eraseFromParent();
}
// BB is now empty, so it is likely to simplify away.
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
}
// Ok, the block is reachable from the default dest. If the constant we're
@@ -2573,7 +2604,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
ICI->replaceAllUsesWith(V);
ICI->eraseFromParent();
// BB is now empty, so it is likely to simplify away.
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
}
// The use of the icmp has to be in the 'end' block, by the only PHI node in
@@ -2758,9 +2789,20 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
return false;
// Turn all invokes that unwind here into calls and delete the basic block.
+ bool InvokeRequiresTableEntry = false;
+ bool Changed = false;
for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator());
+
+ if (II->hasFnAttr(Attribute::UWTable)) {
+ // Don't remove an `invoke' instruction if the ABI requires an entry into
+ // the table.
+ InvokeRequiresTableEntry = true;
+ continue;
+ }
+
SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
+
// Insert a call instruction before the invoke.
CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II);
Call->takeName(II);
@@ -2780,11 +2822,14 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
// Finally, delete the invoke instruction!
II->eraseFromParent();
+ Changed = true;
}
- // The landingpad is now unreachable. Zap it.
- BB->eraseFromParent();
- return true;
+ if (!InvokeRequiresTableEntry)
+ // The landingpad is now unreachable. Zap it.
+ BB->eraseFromParent();
+
+ return Changed;
}
bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
@@ -3382,7 +3427,8 @@ SwitchLookupTable::SwitchLookupTable(Module &M,
ConstantInt *Offset,
const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values,
Constant *DefaultValue,
- const DataLayout *TD) {
+ const DataLayout *TD)
+ : SingleValue(0), BitMap(0), BitMapElementTy(0), Array(0) {
assert(Values.size() && "Can't build lookup table without values!");
assert(TableSize >= Values.size() && "Can't fit values in table!");
@@ -3510,23 +3556,44 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD,
/// types of the results.
static bool ShouldBuildLookupTable(SwitchInst *SI,
uint64_t TableSize,
+ const TargetTransformInfo &TTI,
const DataLayout *TD,
const SmallDenseMap<PHINode*, Type*>& ResultTypes) {
- // The table density should be at least 40%. This is the same criterion as for
- // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
- // FIXME: Find the best cut-off.
if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10)
return false; // TableSize overflowed, or mul below might overflow.
- if (SI->getNumCases() * 10 >= TableSize * 4)
- return true;
- // If each table would fit in a register, we should build it anyway.
+ bool AllTablesFitInRegister = true;
+ bool HasIllegalType = false;
for (SmallDenseMap<PHINode*, Type*>::const_iterator I = ResultTypes.begin(),
E = ResultTypes.end(); I != E; ++I) {
- if (!SwitchLookupTable::WouldFitInRegister(TD, TableSize, I->second))
- return false;
+ Type *Ty = I->second;
+
+ // Saturate this flag to true.
+ HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty);
+
+ // Saturate this flag to false.
+ AllTablesFitInRegister = AllTablesFitInRegister &&
+ SwitchLookupTable::WouldFitInRegister(TD, TableSize, Ty);
+
+ // If both flags saturate, we're done. NOTE: This *only* works with
+ // saturating flags, and all flags have to saturate first due to the
+ // non-deterministic behavior of iterating over a dense map.
+ if (HasIllegalType && !AllTablesFitInRegister)
+ break;
}
- return true;
+
+ // If each table would fit in a register, we should build it anyway.
+ if (AllTablesFitInRegister)
+ return true;
+
+ // Don't build a table that doesn't fit in-register if it has illegal types.
+ if (HasIllegalType)
+ return false;
+
+ // The table density should be at least 40%. This is the same criterion as for
+ // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
+ // FIXME: Find the best cut-off.
+ return SI->getNumCases() * 10 >= TableSize * 4;
}
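
A worked example of the density cut-off used above, as a standalone sketch; the case counts are arbitrary. The comparison cases*10 >= size*4 is just the integer form of cases/size >= 40%.

    #include <cstdint>
    #include <cstdio>

    // Same comparison as ShouldBuildLookupTable's final return.
    static bool denseEnough(uint64_t NumCases, uint64_t TableSize) {
      return NumCases * 10 >= TableSize * 4;
    }

    int main() {
      std::printf("%d\n", denseEnough(40, 100)); // 400 >= 400 -> 1: build the table
      std::printf("%d\n", denseEnough(3, 10));   //  30 <  40  -> 0: too sparse
    }
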
/// SwitchToLookupTable - If the switch is only used to initialize one or more
@@ -3534,13 +3601,12 @@ static bool ShouldBuildLookupTable(SwitchInst *SI,
/// replace the switch with lookup tables.
static bool SwitchToLookupTable(SwitchInst *SI,
IRBuilder<> &Builder,
- const DataLayout* TD,
- const TargetTransformInfo *TTI) {
+ const TargetTransformInfo &TTI,
+ const DataLayout* TD) {
assert(SI->getNumCases() > 1 && "Degenerate switch?");
// Only build lookup table when we have a target that supports it.
- if (!TTI || !TTI->getScalarTargetTransformInfo() ||
- !TTI->getScalarTargetTransformInfo()->shouldBuildLookupTables())
+ if (!TTI.shouldBuildLookupTables())
return false;
// FIXME: If the switch is too sparse for a lookup table, perhaps we could
@@ -3607,7 +3673,7 @@ static bool SwitchToLookupTable(SwitchInst *SI,
APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue();
uint64_t TableSize = RangeSpread.getLimitedValue() + 1;
- if (!ShouldBuildLookupTable(SI, TableSize, TD, ResultTypes))
+ if (!ShouldBuildLookupTable(SI, TableSize, TTI, TD, ResultTypes))
return false;
// Create the BB that does the lookups.
@@ -3672,12 +3738,12 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
// see if that predecessor totally determines the outcome of this switch.
if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
Value *Cond = SI->getCondition();
if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
if (SimplifySwitchOnSelect(SI, Select))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
// If the block only contains the switch, see if we can fold the block
// away into any preds.
@@ -3687,22 +3753,22 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
++BBI;
if (SI == &*BBI)
if (FoldValueComparisonIntoPredecessors(SI, Builder))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
}
// Try to transform the switch into an icmp and a branch.
if (TurnSwitchRangeIntoICmp(SI, Builder))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
// Remove unreachable cases.
if (EliminateDeadSwitchCases(SI))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
if (ForwardSwitchConditionToPHI(SI))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
- if (SwitchToLookupTable(SI, Builder, TD, TTI))
- return SimplifyCFG(BB) | true;
+ if (SwitchToLookupTable(SI, Builder, TTI, TD))
+ return SimplifyCFG(BB, TTI, TD) | true;
return false;
}
@@ -3739,7 +3805,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
if (SimplifyIndirectBrOnSelect(IBI, SI))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
}
return Changed;
}
@@ -3763,7 +3829,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
for (++I; isa<DbgInfoIntrinsic>(I); ++I)
;
if (I->isTerminator() &&
- TryToSimplifyUncondBranchWithICmpInIt(ICI, TD, Builder))
+ TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI, TD))
return true;
}
@@ -3772,7 +3838,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
// predecessor and use logical operations to update the incoming value
// for PHI nodes in common successor.
if (FoldBranchToCommonDest(BI))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
return false;
}
@@ -3787,7 +3853,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// switch.
if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
// This block must be empty, except for the setcond inst, if it exists.
// Ignore dbg intrinsics.
@@ -3797,14 +3863,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
++I;
if (&*I == BI) {
if (FoldValueComparisonIntoPredecessors(BI, Builder))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
} else if (&*I == cast<Instruction>(BI->getCondition())){
++I;
// Ignore dbg intrinsics.
while (isa<DbgInfoIntrinsic>(I))
++I;
if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
}
}
@@ -3816,7 +3882,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// branches to us and one of our successors, fold the comparison into the
// predecessor and use logical operations to pick the right destination.
if (FoldBranchToCommonDest(BI))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
// We have a conditional branch to two blocks that are only reachable
// from BI. We know that the condbr dominates the two blocks, so see if
@@ -3825,7 +3891,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (BI->getSuccessor(0)->getSinglePredecessor() != 0) {
if (BI->getSuccessor(1)->getSinglePredecessor() != 0) {
if (HoistThenElseCodeToIf(BI))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
// execute Successor #0 if it branches to successor #1.
@@ -3833,7 +3899,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (Succ0TI->getNumSuccessors() == 1 &&
Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0)))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
}
} else if (BI->getSuccessor(1)->getSinglePredecessor() != 0) {
// If Successor #0 has multiple preds, we may be able to conditionally
@@ -3842,7 +3908,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (Succ1TI->getNumSuccessors() == 1 &&
Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
}
// If this is a branch on a phi node in the current block, thread control
@@ -3850,14 +3916,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
if (PN->getParent() == BI->getParent())
if (FoldCondBranchOnPHI(BI, TD))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
// Scan predecessor blocks for conditional branches.
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
if (PBI != BI && PBI->isConditional())
if (SimplifyCondBranchToCondBranch(PBI, BI))
- return SimplifyCFG(BB) | true;
+ return SimplifyCFG(BB, TTI, TD) | true;
return false;
}
@@ -3892,11 +3958,13 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
// Load from null is undefined.
if (LoadInst *LI = dyn_cast<LoadInst>(Use))
- return LI->getPointerAddressSpace() == 0;
+ if (!LI->isVolatile())
+ return LI->getPointerAddressSpace() == 0;
// Store to null is undefined.
if (StoreInst *SI = dyn_cast<StoreInst>(Use))
- return SI->getPointerAddressSpace() == 0 && SI->getPointerOperand() == I;
+ if (!SI->isVolatile())
+ return SI->getPointerAddressSpace() == 0 && SI->getPointerOperand() == I;
}
return false;
}
@@ -3998,7 +4066,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
/// eliminates unreachable basic blocks, and does other "peephole" optimization
/// of the CFG. It returns true if a modification was made.
///
-bool llvm::SimplifyCFG(BasicBlock *BB, const DataLayout *TD,
- const TargetTransformInfo *TTI) {
- return SimplifyCFGOpt(TD, TTI).run(BB);
+bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
+ const DataLayout *TD) {
+ return SimplifyCFGOpt(TTI, TD).run(BB);
}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index 110f3808..41c207c 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -15,18 +15,18 @@
#define DEBUG_TYPE "indvars"
-#include "llvm/Instructions.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/DataLayout.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
index 65353dc..f9687e4d 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -15,17 +15,17 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "instsimplify"
-#include "llvm/Function.h"
-#include "llvm/Pass.h"
-#include "llvm/Type.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/DataLayout.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index c3ea638..c231704 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -15,12 +15,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
-#include "llvm/DataLayout.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/LLVMContext.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Allocator.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
@@ -48,6 +53,10 @@ public:
virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B)
=0;
+ /// ignoreCallingConv - Returns false if this transformation could possibly
+ /// change the calling convention.
+ virtual bool ignoreCallingConv() { return false; }
+
Value *optimizeCall(CallInst *CI, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const LibCallSimplifier *LCS, IRBuilder<> &B) {
@@ -59,7 +68,7 @@ public:
Context = &CI->getCalledFunction()->getContext();
// We never change the calling convention.
- if (CI->getCallingConv() != llvm::CallingConv::C)
+ if (!ignoreCallingConv() && CI->getCallingConv() != llvm::CallingConv::C)
return NULL;
return callOptimizer(CI->getCalledFunction(), CI, B);
@@ -100,6 +109,15 @@ static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
return true;
}
+static bool callHasFloatingPointArgument(const CallInst *CI) {
+ for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end();
+ it != e; ++it) {
+ if ((*it)->getType()->isFloatingPointTy())
+ return true;
+ }
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// Fortified Library Call Optimizations
//===----------------------------------------------------------------------===//
@@ -713,6 +731,7 @@ struct StrNCpyOpt : public LibCallOptimization {
};
struct StrLenOpt : public LibCallOptimization {
+ virtual bool ignoreCallingConv() { return true; }
virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 1 ||
@@ -781,8 +800,7 @@ struct StrToOpt : public LibCallOptimization {
if (isa<ConstantPointerNull>(EndPtr)) {
// With a null EndPtr, this function won't capture the main argument.
// It would be readonly too, except that it still may write to errno.
- CI->addAttribute(1, Attributes::get(Callee->getContext(),
- Attributes::NoCapture));
+ CI->addAttribute(1, Attribute::NoCapture);
}
return 0;
@@ -951,7 +969,14 @@ struct MemCmpOpt : public LibCallOptimization {
// Make sure we're not reading out-of-bounds memory.
if (Len > LHSStr.size() || Len > RHSStr.size())
return 0;
- uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len);
+ // Fold the memcmp and normalize the result. This way we get consistent
+ // results across multiple platforms.
+ uint64_t Ret = 0;
+ int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len);
+ if (Cmp < 0)
+ Ret = -1;
+ else if (Cmp > 0)
+ Ret = 1;
return ConstantInt::get(CI->getType(), Ret);
}
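
A small sketch of the normalization added above, assuming nothing beyond the C library: the standard only guarantees the sign of memcmp's result, so folding with the host memcmp and clamping to -1/0/1 keeps the folded constant identical across build hosts. The inputs are examples only.

    #include <cstdio>
    #include <cstring>

    static int foldedMemCmp(const char *L, const char *R, std::size_t Len) {
      int Cmp = std::memcmp(L, R, Len);
      return Cmp < 0 ? -1 : (Cmp > 0 ? 1 : 0); // normalized to -1/0/1
    }

    int main() {
      std::printf("%d\n", foldedMemCmp("abc", "abd", 3)); // -1
      std::printf("%d\n", foldedMemCmp("abc", "abc", 3)); //  0
    }
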
@@ -1016,6 +1041,630 @@ struct MemSetOpt : public LibCallOptimization {
}
};
+//===----------------------------------------------------------------------===//
+// Math Library Optimizations
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
+
+struct UnaryDoubleFPOpt : public LibCallOptimization {
+ bool CheckRetType;
+ UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {}
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
+ !FT->getParamType(0)->isDoubleTy())
+ return 0;
+
+ if (CheckRetType) {
+      // Check if all the uses of a function like 'sin' are converted to float.
+ for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end();
+ ++UseI) {
+ FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI);
+ if (Cast == 0 || !Cast->getType()->isFloatTy())
+ return 0;
+ }
+ }
+
+ // If this is something like 'floor((double)floatval)', convert to floorf.
+ FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
+ if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
+ return 0;
+
+ // floor((double)floatval) -> (double)floorf(floatval)
+ Value *V = Cast->getOperand(0);
+ V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
+ return B.CreateFPExt(V, B.getDoubleTy());
+ }
+};
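
A source-level view of the shrinking above, as a rough sketch: when the only input is a float that was widened to double, the double-precision call can be done in float and widened afterwards. Function names and the test value are illustrative.

    #include <cmath>
    #include <cstdio>

    double beforeShrink(float f) { return std::floor(static_cast<double>(f)); }
    // The float overload of std::floor corresponds to floorf.
    double afterShrink(float f)  { return static_cast<double>(std::floor(f)); }

    int main() {
      // The two agree for every float input; here both print 2.000000.
      std::printf("%f %f\n", beforeShrink(2.75f), afterShrink(2.75f));
    }
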
+
+struct UnsafeFPLibCallOptimization : public LibCallOptimization {
+ bool UnsafeFPShrink;
+ UnsafeFPLibCallOptimization(bool UnsafeFPShrink) {
+ this->UnsafeFPShrink = UnsafeFPShrink;
+ }
+};
+
+struct CosOpt : public UnsafeFPLibCallOptimization {
+ CosOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ Value *Ret = NULL;
+ if (UnsafeFPShrink && Callee->getName() == "cos" &&
+ TLI->has(LibFunc::cosf)) {
+ UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+ Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
+ }
+
+ FunctionType *FT = Callee->getFunctionType();
+ // Just make sure this has 1 argument of FP type, which matches the
+ // result type.
+ if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isFloatingPointTy())
+ return Ret;
+
+ // cos(-x) -> cos(x)
+ Value *Op1 = CI->getArgOperand(0);
+ if (BinaryOperator::isFNeg(Op1)) {
+ BinaryOperator *BinExpr = cast<BinaryOperator>(Op1);
+ return B.CreateCall(Callee, BinExpr->getOperand(1), "cos");
+ }
+ return Ret;
+ }
+};
+
+struct PowOpt : public UnsafeFPLibCallOptimization {
+ PowOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ Value *Ret = NULL;
+ if (UnsafeFPShrink && Callee->getName() == "pow" &&
+ TLI->has(LibFunc::powf)) {
+ UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+ Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
+ }
+
+ FunctionType *FT = Callee->getFunctionType();
+ // Just make sure this has 2 arguments of the same FP type, which match the
+ // result type.
+ if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ !FT->getParamType(0)->isFloatingPointTy())
+ return Ret;
+
+ Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
+ if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
+ if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0
+ return Op1C;
+ if (Op1C->isExactlyValue(2.0)) // pow(2.0, x) -> exp2(x)
+ return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes());
+ }
+
+ ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
+ if (Op2C == 0) return Ret;
+
+ if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0
+ return ConstantFP::get(CI->getType(), 1.0);
+
+ if (Op2C->isExactlyValue(0.5)) {
+ // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))).
+ // This is faster than calling pow, and still handles negative zero
+ // and negative infinity correctly.
+ // TODO: In fast-math mode, this could be just sqrt(x).
+ // TODO: In finite-only mode, this could be just fabs(sqrt(x)).
+ Value *Inf = ConstantFP::getInfinity(CI->getType());
+ Value *NegInf = ConstantFP::getInfinity(CI->getType(), true);
+ Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B,
+ Callee->getAttributes());
+ Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B,
+ Callee->getAttributes());
+ Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf);
+ Value *Sel = B.CreateSelect(FCmp, Inf, FAbs);
+ return Sel;
+ }
+
+ if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x
+ return Op1;
+ if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x
+ return B.CreateFMul(Op1, Op1, "pow2");
+ if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
+ return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0),
+ Op1, "powrecip");
+ return 0;
+ }
+};
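
An illustration of the pow(x, 0.5) expansion above, assuming only the C math library: fabs(sqrt(x)) fixes up sqrt(-0.0) == -0.0 so the result matches pow, and the explicit check preserves pow(-inf, 0.5) == +inf, which sqrt alone would turn into NaN. The values below are examples.

    #include <cmath>
    #include <cstdio>
    #include <limits>

    static double powHalf(double x) {
      double Fabs = std::fabs(std::sqrt(x));            // fabs(sqrt(x))
      double Inf  = std::numeric_limits<double>::infinity();
      return x == -Inf ? Inf : Fabs;                    // select on (x == -inf)
    }

    int main() {
      std::printf("%g %g\n", std::pow(4.0, 0.5),  powHalf(4.0));  // 2 2
      std::printf("%g %g\n", std::pow(-0.0, 0.5), powHalf(-0.0)); // 0 0
    }
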
+
+struct Exp2Opt : public UnsafeFPLibCallOptimization {
+ Exp2Opt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ Value *Ret = NULL;
+ if (UnsafeFPShrink && Callee->getName() == "exp2" &&
+ TLI->has(LibFunc::exp2)) {
+ UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+ Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
+ }
+
+ FunctionType *FT = Callee->getFunctionType();
+ // Just make sure this has 1 argument of FP type, which matches the
+ // result type.
+ if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isFloatingPointTy())
+ return Ret;
+
+ Value *Op = CI->getArgOperand(0);
+ // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
+ // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32
+ Value *LdExpArg = 0;
+ if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) {
+ if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32)
+ LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty());
+ } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
+ if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
+ LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty());
+ }
+
+ if (LdExpArg) {
+ const char *Name;
+ if (Op->getType()->isFloatTy())
+ Name = "ldexpf";
+ else if (Op->getType()->isDoubleTy())
+ Name = "ldexp";
+ else
+ Name = "ldexpl";
+
+ Constant *One = ConstantFP::get(*Context, APFloat(1.0f));
+ if (!Op->getType()->isFloatTy())
+ One = ConstantExpr::getFPExtend(One, Op->getType());
+
+ Module *M = Caller->getParent();
+ Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
+ Op->getType(),
+ B.getInt32Ty(), NULL);
+ CallInst *CI = B.CreateCall2(Callee, One, LdExpArg);
+ if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+ }
+ return Ret;
+ }
+};
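
A sketch of the exp2 -> ldexp rewrite above: when the argument is an integer widened to floating point, exp2((double)n) is exactly ldexp(1.0, n), i.e. 1.0 * 2^n, so the libm exp2 call can be avoided. Test values are arbitrary.

    #include <cmath>
    #include <cstdio>

    double beforeRewrite(int n) { return std::exp2(static_cast<double>(n)); }
    double afterRewrite(int n)  { return std::ldexp(1.0, n); } // 1.0 * 2^n

    int main() {
      std::printf("%g %g\n", beforeRewrite(10), afterRewrite(10)); // 1024 1024
      std::printf("%g %g\n", beforeRewrite(-3), afterRewrite(-3)); // 0.125 0.125
    }
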
+
+//===----------------------------------------------------------------------===//
+// Integer Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct FFSOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this takes one integer argument and returns i32.
+ if (FT->getNumParams() != 1 ||
+ !FT->getReturnType()->isIntegerTy(32) ||
+ !FT->getParamType(0)->isIntegerTy())
+ return 0;
+
+ Value *Op = CI->getArgOperand(0);
+
+ // Constant fold.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ if (CI->isZero()) // ffs(0) -> 0.
+ return B.getInt32(0);
+ // ffs(c) -> cttz(c)+1
+ return B.getInt32(CI->getValue().countTrailingZeros() + 1);
+ }
+
+ // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+ Type *ArgType = Op->getType();
+ Value *F = Intrinsic::getDeclaration(Callee->getParent(),
+ Intrinsic::cttz, ArgType);
+ Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz");
+ V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
+ V = B.CreateIntCast(V, B.getInt32Ty(), false);
+
+ Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
+ return B.CreateSelect(Cond, V, B.getInt32(0));
+ }
+};
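
The ffs lowering above in source form, using the GCC/Clang cttz builtin purely for illustration: ffs(0) is 0, otherwise the result is the one-based index of the lowest set bit, i.e. cttz(x) + 1.

    #include <cstdio>

    static int ffsLowered(unsigned x) {
      // cttz(x) + 1, guarded so ffs(0) stays 0 (__builtin_ctz(0) is undefined).
      return x ? __builtin_ctz(x) + 1 : 0;
    }

    int main() {
      std::printf("%d %d %d\n", ffsLowered(0), ffsLowered(1), ffsLowered(40)); // 0 1 4
    }
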
+
+struct AbsOpt : public LibCallOptimization {
+ virtual bool ignoreCallingConv() { return true; }
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ // We require integer(integer) where the types agree.
+ if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+ FT->getParamType(0) != FT->getReturnType())
+ return 0;
+
+ // abs(x) -> x >s -1 ? x : -x
+ Value *Op = CI->getArgOperand(0);
+ Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()),
+ "ispos");
+ Value *Neg = B.CreateNeg(Op, "neg");
+ return B.CreateSelect(Pos, Op, Neg);
+ }
+};
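
The abs rewrite above as a branch-free source sketch; note the IR 'neg' is a wrapping sub, whereas negating INT_MIN in C is undefined, so this is only an approximation for that one value. The wrapper name is made up.

    #include <cstdio>

    static int absLowered(int x) {
      int Neg = -x;               // "neg"
      return x > -1 ? x : Neg;    // "ispos" selects x or -x
    }

    int main() {
      std::printf("%d %d\n", absLowered(5), absLowered(-5)); // 5 5
    }
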
+
+struct IsDigitOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ // We require integer(i32)
+ if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+ !FT->getParamType(0)->isIntegerTy(32))
+ return 0;
+
+ // isdigit(c) -> (c-'0') <u 10
+ Value *Op = CI->getArgOperand(0);
+ Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
+ Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
+ return B.CreateZExt(Op, CI->getType());
+ }
+};
+
+struct IsAsciiOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ // We require integer(i32)
+ if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+ !FT->getParamType(0)->isIntegerTy(32))
+ return 0;
+
+ // isascii(c) -> c <u 128
+ Value *Op = CI->getArgOperand(0);
+ Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
+ return B.CreateZExt(Op, CI->getType());
+ }
+};
+
+struct ToAsciiOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ // We require i32(i32)
+ if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isIntegerTy(32))
+ return 0;
+
+ // toascii(c) -> c & 0x7f
+ return B.CreateAnd(CI->getArgOperand(0),
+ ConstantInt::get(CI->getType(),0x7F));
+ }
+};
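
Branch-free source equivalents of the three rewrites above (isdigit, isascii, toascii); the wrapper names are invented for the example.

    #include <cstdio>

    static int isdigitLowered(int c) { return (unsigned)(c - '0') < 10u; } // (c-'0') <u 10
    static int isasciiLowered(int c) { return (unsigned)c < 128u; }        // c <u 128
    static int toasciiLowered(int c) { return c & 0x7f; }                  // c & 0x7f

    int main() {
      std::printf("%d %d %d\n", isdigitLowered('7'),   // 1
                                isasciiLowered(200),   // 0
                                toasciiLowered(0xC1)); // 65 ('A')
    }
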
+
+//===----------------------------------------------------------------------===//
+// Formatting and IO Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct PrintFOpt : public LibCallOptimization {
+ Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
+ IRBuilder<> &B) {
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
+ return 0;
+
+ // Empty format string -> noop.
+ if (FormatStr.empty()) // Tolerate printf's declared void.
+ return CI->use_empty() ? (Value*)CI :
+ ConstantInt::get(CI->getType(), 0);
+
+ // Do not do any of the following transformations if the printf return value
+ // is used, in general the printf return value is not compatible with either
+ // putchar() or puts().
+ if (!CI->use_empty())
+ return 0;
+
+ // printf("x") -> putchar('x'), even for '%'.
+ if (FormatStr.size() == 1) {
+ Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD, TLI);
+ if (CI->use_empty() || !Res) return Res;
+ return B.CreateIntCast(Res, CI->getType(), true);
+ }
+
+ // printf("foo\n") --> puts("foo")
+ if (FormatStr[FormatStr.size()-1] == '\n' &&
+ FormatStr.find('%') == std::string::npos) { // no format characters.
+ // Create a string literal with no \n on it. We expect the constant merge
+ // pass to be run after this pass, to merge duplicate strings.
+ FormatStr = FormatStr.drop_back();
+ Value *GV = B.CreateGlobalString(FormatStr, "str");
+ Value *NewCI = EmitPutS(GV, B, TD, TLI);
+ return (CI->use_empty() || !NewCI) ?
+ NewCI :
+ ConstantInt::get(CI->getType(), FormatStr.size()+1);
+ }
+
+ // Optimize specific format strings.
+ // printf("%c", chr) --> putchar(chr)
+ if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
+ CI->getArgOperand(1)->getType()->isIntegerTy()) {
+ Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD, TLI);
+
+ if (CI->use_empty() || !Res) return Res;
+ return B.CreateIntCast(Res, CI->getType(), true);
+ }
+
+ // printf("%s\n", str) --> puts(str)
+ if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
+ CI->getArgOperand(1)->getType()->isPointerTy()) {
+ return EmitPutS(CI->getArgOperand(1), B, TD, TLI);
+ }
+ return 0;
+ }
+
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Require one fixed pointer argument and an integer/void result.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
+ !(FT->getReturnType()->isIntegerTy() ||
+ FT->getReturnType()->isVoidTy()))
+ return 0;
+
+ if (Value *V = optimizeFixedFormatString(Callee, CI, B)) {
+ return V;
+ }
+
+ // printf(format, ...) -> iprintf(format, ...) if no floating point
+ // arguments.
+ if (TLI->has(LibFunc::iprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ Constant *IPrintFFn =
+ M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(IPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+ return 0;
+ }
+};
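
Source-level equivalents of the printf rewrites above, valid only when the printf return value is unused (as the code checks); the strings and characters are arbitrary.

    #include <cstdio>

    void demo(const char *s, char c) {
      std::puts("hello");   // printf("hello\n") -> puts("hello"); puts re-adds the '\n'
      std::putchar(c);      // printf("%c", c)   -> putchar(c)
      std::puts(s);         // printf("%s\n", s) -> puts(s)
    }

    int main() { demo("world", '!'); }
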
+
+struct SPrintFOpt : public LibCallOptimization {
+ Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI,
+ IRBuilder<> &B) {
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+ return 0;
+
+ // If we just have a format string (nothing else crazy) transform it.
+ if (CI->getNumArgOperands() == 2) {
+ // Make sure there's no % in the constant array. We could try to handle
+ // %% -> % in the future if we cared.
+ for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+ if (FormatStr[i] == '%')
+ return 0; // we found a format specifier, bail out.
+
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
+ B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ ConstantInt::get(TD->getIntPtrType(*Context), // Copy the
+ FormatStr.size() + 1), 1); // nul byte.
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+ CI->getNumArgOperands() < 3)
+ return 0;
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+ if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
+ Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
+ Value *Ptr = CastToCStr(CI->getArgOperand(0), B);
+ B.CreateStore(V, Ptr);
+ Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul");
+ B.CreateStore(B.getInt8(0), Ptr);
+
+ return ConstantInt::get(CI->getType(), 1);
+ }
+
+ if (FormatStr[1] == 's') {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
+ if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0;
+
+ Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI);
+ if (!Len)
+ return 0;
+ Value *IncLen = B.CreateAdd(Len,
+ ConstantInt::get(Len->getType(), 1),
+ "leninc");
+ B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1);
+
+ // The sprintf result is the unincremented number of bytes in the string.
+ return B.CreateIntCast(Len, CI->getType(), false);
+ }
+ return 0;
+ }
+
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Require two fixed pointer arguments and an integer result.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ !FT->getReturnType()->isIntegerTy())
+ return 0;
+
+ if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) {
+ return V;
+ }
+
+ // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
+ // point arguments.
+ if (TLI->has(LibFunc::siprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ Constant *SIPrintFFn =
+ M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(SIPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+ return 0;
+ }
+};
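// A minimal standalone sketch (plain C++, no LLVM types) of the decision
// tree OptimizeFixedFormatString implements above, given the constant format
// string and the number of arguments after it. The DataLayout requirement of
// the memcpy-based cases is ignored here.
#include <cassert>
#include <cstddef>
#include <string>

enum class SPrintFRewrite {
  None,        // keep the sprintf call
  CopyFormat,  // no '%', no extra args: memcpy(dst, fmt, fmt.size() + 1)
  StoreChar,   // "%c": *dst = (char)arg; dst[1] = '\0'; result is 1
  CopyString   // "%s": memcpy(dst, arg, strlen(arg) + 1); result is strlen(arg)
};

static SPrintFRewrite classifySPrintF(const std::string &Fmt,
                                      std::size_t NumExtraArgs) {
  if (NumExtraArgs == 0)
    return Fmt.find('%') == std::string::npos ? SPrintFRewrite::CopyFormat
                                              : SPrintFRewrite::None;
  if (Fmt.size() != 2 || Fmt[0] != '%')
    return SPrintFRewrite::None;
  if (Fmt[1] == 'c')
    return SPrintFRewrite::StoreChar;
  if (Fmt[1] == 's')
    return SPrintFRewrite::CopyString;
  return SPrintFRewrite::None;
}

int main() {
  assert(classifySPrintF("hello", 0) == SPrintFRewrite::CopyFormat);
  assert(classifySPrintF("%d", 1) == SPrintFRewrite::None);  // unhandled spec
  assert(classifySPrintF("%c", 1) == SPrintFRewrite::StoreChar);
  assert(classifySPrintF("%s", 1) == SPrintFRewrite::CopyString);
  return 0;
}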
+
+struct FPrintFOpt : public LibCallOptimization {
+ Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
+ IRBuilder<> &B) {
+ // All the optimizations depend on the format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+ return 0;
+
+ // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
+ if (CI->getNumArgOperands() == 2) {
+ for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+ if (FormatStr[i] == '%') // Could handle %% -> % if we cared.
+ return 0; // We found a format specifier.
+
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ Value *NewCI = EmitFWrite(CI->getArgOperand(1),
+ ConstantInt::get(TD->getIntPtrType(*Context),
+ FormatStr.size()),
+ CI->getArgOperand(0), B, TD, TLI);
+ return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0;
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+ CI->getNumArgOperands() < 3)
+ return 0;
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ // fprintf(F, "%c", chr) --> fputc(chr, F)
+ if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
+ Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B,
+ TD, TLI);
+ return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
+ }
+
+ if (FormatStr[1] == 's') {
+ // fprintf(F, "%s", str) --> fputs(str, F)
+ if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty())
+ return 0;
+ return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI);
+ }
+ return 0;
+ }
+
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two fixed pointer parameters and an integer result.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ !FT->getReturnType()->isIntegerTy())
+ return 0;
+
+ if (Value *V = optimizeFixedFormatString(Callee, CI, B)) {
+ return V;
+ }
+
+ // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
+ // floating point arguments.
+ if (TLI->has(LibFunc::fiprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ Constant *FIPrintFFn =
+ M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(FIPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+ return 0;
+ }
+};
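// A loose compile-time analog (plain C++17, not the IR-level
// callHasFloatingPointArgument helper used above) of the guard shared by the
// printf, sprintf and fprintf cases: the integer-only i*printf variants may
// only be substituted when no argument is floating point.
#include <type_traits>

template <typename... ArgTs>
constexpr bool anyFloatingPointArgument() {
  return (std::is_floating_point<ArgTs>::value || ...);
}

static_assert(!anyFloatingPointArgument<const char *, int, long>(),
              "no FP args: the call may become iprintf/siprintf/fiprintf");
static_assert(anyFloatingPointArgument<const char *, double>(),
              "a double argument blocks the i*printf rewrite");

int main() { return 0; }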
+
+struct FWriteOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Require a pointer, an integer, an integer, a pointer, returning integer.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isIntegerTy() ||
+ !FT->getParamType(2)->isIntegerTy() ||
+ !FT->getParamType(3)->isPointerTy() ||
+ !FT->getReturnType()->isIntegerTy())
+ return 0;
+
+ // Get the element size and count.
+ ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (!SizeC || !CountC) return 0;
+ uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue();
+
+ // If this is writing zero records, remove the call (it's a noop).
+ if (Bytes == 0)
+ return ConstantInt::get(CI->getType(), 0);
+
+ // If this is writing one byte, turn it into fputc.
+    // This optimization is only valid if the return value is unused.
+ if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
+ Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
+ Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI);
+ return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
+ }
+
+ return 0;
+ }
+};
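// A standalone sketch (plain C++, hypothetical names) of the two decisions
// FWriteOpt makes once both the element size and the count are compile-time
// constants.
#include <cassert>
#include <cstdint>

enum class FWriteRewrite {
  Keep,      // not a constant-foldable case
  DropCall,  // size*count == 0: the call writes nothing, result is 0
  UseFPutC   // size*count == 1 and the result is unused: fputc(*ptr, stream)
};

static FWriteRewrite classifyFWrite(uint64_t Size, uint64_t Count,
                                    bool ResultUnused) {
  uint64_t Bytes = Size * Count;
  if (Bytes == 0)
    return FWriteRewrite::DropCall;
  if (Bytes == 1 && ResultUnused)
    return FWriteRewrite::UseFPutC;
  return FWriteRewrite::Keep;
}

int main() {
  assert(classifyFWrite(1, 0, false) == FWriteRewrite::DropCall);
  assert(classifyFWrite(1, 1, true) == FWriteRewrite::UseFPutC);
  assert(classifyFWrite(1, 1, false) == FWriteRewrite::Keep);  // result used
  assert(classifyFWrite(4, 16, true) == FWriteRewrite::Keep);
  return 0;
}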
+
+struct FPutsOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+    // Require two pointers. Also, we can't optimize if the result is used.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ !CI->use_empty())
+ return 0;
+
+ // fputs(s,F) --> fwrite(s,1,strlen(s),F)
+ uint64_t Len = GetStringLength(CI->getArgOperand(0));
+ if (!Len) return 0;
+ // Known to have no uses (see above).
+ return EmitFWrite(CI->getArgOperand(0),
+ ConstantInt::get(TD->getIntPtrType(*Context), Len-1),
+ CI->getArgOperand(1), B, TD, TLI);
+ }
+};
+
+struct PutsOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Require one fixed pointer argument and an integer/void result.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
+ !(FT->getReturnType()->isIntegerTy() ||
+ FT->getReturnType()->isVoidTy()))
+ return 0;
+
+ // Check for a constant string.
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+ return 0;
+
+ if (Str.empty() && CI->use_empty()) {
+ // puts("") -> putchar('\n')
+ Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI);
+ if (CI->use_empty() || !Res) return Res;
+ return B.CreateIntCast(Res, CI->getType(), true);
+ }
+
+ return 0;
+ }
+};
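// A quick source-level demonstration (plain C++, not IR) of the two stream
// rewrites above; both assume the call's result is unused.
#include <cstdio>
#include <cstring>

int main() {
  const char *S = "hello";

  // fputs(s, F) --> fwrite(s, 1, strlen(s), F); the pass emits the length as
  // a constant (GetStringLength() - 1), strlen here just stands in for it.
  fputs(S, stdout);
  fwrite(S, 1, strlen(S), stdout);

  // puts("") --> putchar('\n'): puts appends a newline, so an empty string
  // prints exactly one '\n'.
  puts("");
  putchar('\n');
  return 0;
}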
+
} // End anonymous namespace.
namespace llvm {
@@ -1024,98 +1673,252 @@ class LibCallSimplifierImpl {
const DataLayout *TD;
const TargetLibraryInfo *TLI;
const LibCallSimplifier *LCS;
- StringMap<LibCallOptimization*> Optimizations;
-
- // Fortified library call optimizations.
- MemCpyChkOpt MemCpyChk;
- MemMoveChkOpt MemMoveChk;
- MemSetChkOpt MemSetChk;
- StrCpyChkOpt StrCpyChk;
- StpCpyChkOpt StpCpyChk;
- StrNCpyChkOpt StrNCpyChk;
-
- // String library call optimizations.
- StrCatOpt StrCat;
- StrNCatOpt StrNCat;
- StrChrOpt StrChr;
- StrRChrOpt StrRChr;
- StrCmpOpt StrCmp;
- StrNCmpOpt StrNCmp;
- StrCpyOpt StrCpy;
- StpCpyOpt StpCpy;
- StrNCpyOpt StrNCpy;
- StrLenOpt StrLen;
- StrPBrkOpt StrPBrk;
- StrToOpt StrTo;
- StrSpnOpt StrSpn;
- StrCSpnOpt StrCSpn;
- StrStrOpt StrStr;
-
- // Memory library call optimizations.
- MemCmpOpt MemCmp;
- MemCpyOpt MemCpy;
- MemMoveOpt MemMove;
- MemSetOpt MemSet;
-
- void initOptimizations();
- void addOpt(LibFunc::Func F, LibCallOptimization* Opt);
+ bool UnsafeFPShrink;
+
+ // Math library call optimizations.
+ CosOpt Cos;
+ PowOpt Pow;
+ Exp2Opt Exp2;
public:
LibCallSimplifierImpl(const DataLayout *TD, const TargetLibraryInfo *TLI,
- const LibCallSimplifier *LCS) {
+ const LibCallSimplifier *LCS,
+ bool UnsafeFPShrink = false)
+ : Cos(UnsafeFPShrink), Pow(UnsafeFPShrink), Exp2(UnsafeFPShrink) {
this->TD = TD;
this->TLI = TLI;
this->LCS = LCS;
+ this->UnsafeFPShrink = UnsafeFPShrink;
}
Value *optimizeCall(CallInst *CI);
+ LibCallOptimization *lookupOptimization(CallInst *CI);
+ bool hasFloatVersion(StringRef FuncName);
};
-void LibCallSimplifierImpl::initOptimizations() {
- // Fortified library call optimizations.
- Optimizations["__memcpy_chk"] = &MemCpyChk;
- Optimizations["__memmove_chk"] = &MemMoveChk;
- Optimizations["__memset_chk"] = &MemSetChk;
- Optimizations["__strcpy_chk"] = &StrCpyChk;
- Optimizations["__stpcpy_chk"] = &StpCpyChk;
- Optimizations["__strncpy_chk"] = &StrNCpyChk;
- Optimizations["__stpncpy_chk"] = &StrNCpyChk;
-
- // String library call optimizations.
- addOpt(LibFunc::strcat, &StrCat);
- addOpt(LibFunc::strncat, &StrNCat);
- addOpt(LibFunc::strchr, &StrChr);
- addOpt(LibFunc::strrchr, &StrRChr);
- addOpt(LibFunc::strcmp, &StrCmp);
- addOpt(LibFunc::strncmp, &StrNCmp);
- addOpt(LibFunc::strcpy, &StrCpy);
- addOpt(LibFunc::stpcpy, &StpCpy);
- addOpt(LibFunc::strncpy, &StrNCpy);
- addOpt(LibFunc::strlen, &StrLen);
- addOpt(LibFunc::strpbrk, &StrPBrk);
- addOpt(LibFunc::strtol, &StrTo);
- addOpt(LibFunc::strtod, &StrTo);
- addOpt(LibFunc::strtof, &StrTo);
- addOpt(LibFunc::strtoul, &StrTo);
- addOpt(LibFunc::strtoll, &StrTo);
- addOpt(LibFunc::strtold, &StrTo);
- addOpt(LibFunc::strtoull, &StrTo);
- addOpt(LibFunc::strspn, &StrSpn);
- addOpt(LibFunc::strcspn, &StrCSpn);
- addOpt(LibFunc::strstr, &StrStr);
-
- // Memory library call optimizations.
- addOpt(LibFunc::memcmp, &MemCmp);
- addOpt(LibFunc::memcpy, &MemCpy);
- addOpt(LibFunc::memmove, &MemMove);
- addOpt(LibFunc::memset, &MemSet);
+bool LibCallSimplifierImpl::hasFloatVersion(StringRef FuncName) {
+ LibFunc::Func Func;
+ SmallString<20> FloatFuncName = FuncName;
+ FloatFuncName += 'f';
+ if (TLI->getLibFunc(FloatFuncName, Func))
+ return TLI->has(Func);
+ return false;
}
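// A standalone sketch (std::set stands in for the TargetLibraryInfo query)
// of the naming convention hasFloatVersion relies on: the float variant of a
// libm routine is the same name with an 'f' suffix, e.g. floor -> floorf.
#include <cassert>
#include <set>
#include <string>

static bool hasFloatVersionOf(const std::set<std::string> &Available,
                              const std::string &Name) {
  return Available.count(Name + "f") != 0;
}

int main() {
  std::set<std::string> Libm = {"floor", "floorf", "exp2", "exp2f", "fabs"};
  assert(hasFloatVersionOf(Libm, "floor"));   // floorf is available
  assert(!hasFloatVersionOf(Libm, "fabs"));   // fabsf is not in this set
  return 0;
}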
-Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
- if (Optimizations.empty())
- initOptimizations();
-
+// Fortified library call optimizations.
+static MemCpyChkOpt MemCpyChk;
+static MemMoveChkOpt MemMoveChk;
+static MemSetChkOpt MemSetChk;
+static StrCpyChkOpt StrCpyChk;
+static StpCpyChkOpt StpCpyChk;
+static StrNCpyChkOpt StrNCpyChk;
+
+// String library call optimizations.
+static StrCatOpt StrCat;
+static StrNCatOpt StrNCat;
+static StrChrOpt StrChr;
+static StrRChrOpt StrRChr;
+static StrCmpOpt StrCmp;
+static StrNCmpOpt StrNCmp;
+static StrCpyOpt StrCpy;
+static StpCpyOpt StpCpy;
+static StrNCpyOpt StrNCpy;
+static StrLenOpt StrLen;
+static StrPBrkOpt StrPBrk;
+static StrToOpt StrTo;
+static StrSpnOpt StrSpn;
+static StrCSpnOpt StrCSpn;
+static StrStrOpt StrStr;
+
+// Memory library call optimizations.
+static MemCmpOpt MemCmp;
+static MemCpyOpt MemCpy;
+static MemMoveOpt MemMove;
+static MemSetOpt MemSet;
+
+// Math library call optimizations.
+static UnaryDoubleFPOpt UnaryDoubleFP(false);
+static UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+
+// Integer library call optimizations.
+static FFSOpt FFS;
+static AbsOpt Abs;
+static IsDigitOpt IsDigit;
+static IsAsciiOpt IsAscii;
+static ToAsciiOpt ToAscii;
+
+// Formatting and IO library call optimizations.
+static PrintFOpt PrintF;
+static SPrintFOpt SPrintF;
+static FPrintFOpt FPrintF;
+static FWriteOpt FWrite;
+static FPutsOpt FPuts;
+static PutsOpt Puts;
+
+LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) {
+ LibFunc::Func Func;
Function *Callee = CI->getCalledFunction();
- LibCallOptimization *LCO = Optimizations.lookup(Callee->getName());
+ StringRef FuncName = Callee->getName();
+
+  // First check for intrinsics.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::pow:
+ return &Pow;
+ case Intrinsic::exp2:
+ return &Exp2;
+ default:
+ return 0;
+ }
+ }
+
+ // Then check for known library functions.
+ if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) {
+ switch (Func) {
+ case LibFunc::strcat:
+ return &StrCat;
+ case LibFunc::strncat:
+ return &StrNCat;
+ case LibFunc::strchr:
+ return &StrChr;
+ case LibFunc::strrchr:
+ return &StrRChr;
+ case LibFunc::strcmp:
+ return &StrCmp;
+ case LibFunc::strncmp:
+ return &StrNCmp;
+ case LibFunc::strcpy:
+ return &StrCpy;
+ case LibFunc::stpcpy:
+ return &StpCpy;
+ case LibFunc::strncpy:
+ return &StrNCpy;
+ case LibFunc::strlen:
+ return &StrLen;
+ case LibFunc::strpbrk:
+ return &StrPBrk;
+ case LibFunc::strtol:
+ case LibFunc::strtod:
+ case LibFunc::strtof:
+ case LibFunc::strtoul:
+ case LibFunc::strtoll:
+ case LibFunc::strtold:
+ case LibFunc::strtoull:
+ return &StrTo;
+ case LibFunc::strspn:
+ return &StrSpn;
+ case LibFunc::strcspn:
+ return &StrCSpn;
+ case LibFunc::strstr:
+ return &StrStr;
+ case LibFunc::memcmp:
+ return &MemCmp;
+ case LibFunc::memcpy:
+ return &MemCpy;
+ case LibFunc::memmove:
+ return &MemMove;
+ case LibFunc::memset:
+ return &MemSet;
+ case LibFunc::cosf:
+ case LibFunc::cos:
+ case LibFunc::cosl:
+ return &Cos;
+ case LibFunc::powf:
+ case LibFunc::pow:
+ case LibFunc::powl:
+ return &Pow;
+ case LibFunc::exp2l:
+ case LibFunc::exp2:
+ case LibFunc::exp2f:
+ return &Exp2;
+ case LibFunc::ffs:
+ case LibFunc::ffsl:
+ case LibFunc::ffsll:
+ return &FFS;
+ case LibFunc::abs:
+ case LibFunc::labs:
+ case LibFunc::llabs:
+ return &Abs;
+ case LibFunc::isdigit:
+ return &IsDigit;
+ case LibFunc::isascii:
+ return &IsAscii;
+ case LibFunc::toascii:
+ return &ToAscii;
+ case LibFunc::printf:
+ return &PrintF;
+ case LibFunc::sprintf:
+ return &SPrintF;
+ case LibFunc::fprintf:
+ return &FPrintF;
+ case LibFunc::fwrite:
+ return &FWrite;
+ case LibFunc::fputs:
+ return &FPuts;
+ case LibFunc::puts:
+ return &Puts;
+ case LibFunc::ceil:
+ case LibFunc::fabs:
+ case LibFunc::floor:
+ case LibFunc::rint:
+ case LibFunc::round:
+ case LibFunc::nearbyint:
+ case LibFunc::trunc:
+ if (hasFloatVersion(FuncName))
+ return &UnaryDoubleFP;
+ return 0;
+ case LibFunc::acos:
+ case LibFunc::acosh:
+ case LibFunc::asin:
+ case LibFunc::asinh:
+ case LibFunc::atan:
+ case LibFunc::atanh:
+ case LibFunc::cbrt:
+ case LibFunc::cosh:
+ case LibFunc::exp:
+ case LibFunc::exp10:
+ case LibFunc::expm1:
+ case LibFunc::log:
+ case LibFunc::log10:
+ case LibFunc::log1p:
+ case LibFunc::log2:
+ case LibFunc::logb:
+ case LibFunc::sin:
+ case LibFunc::sinh:
+ case LibFunc::sqrt:
+ case LibFunc::tan:
+ case LibFunc::tanh:
+ if (UnsafeFPShrink && hasFloatVersion(FuncName))
+ return &UnsafeUnaryDoubleFP;
+ return 0;
+ case LibFunc::memcpy_chk:
+ return &MemCpyChk;
+ default:
+ return 0;
+ }
+ }
+
+ // Finally check for fortified library calls.
+ if (FuncName.endswith("_chk")) {
+ if (FuncName == "__memmove_chk")
+ return &MemMoveChk;
+ else if (FuncName == "__memset_chk")
+ return &MemSetChk;
+ else if (FuncName == "__strcpy_chk")
+ return &StrCpyChk;
+ else if (FuncName == "__stpcpy_chk")
+ return &StpCpyChk;
+ else if (FuncName == "__strncpy_chk")
+ return &StrNCpyChk;
+ else if (FuncName == "__stpncpy_chk")
+ return &StrNCpyChk;
+ }
+
+ return 0;
+
+}
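// A toy sketch (hypothetical names, no LLVM types) of the dispatch scheme
// adopted above: stateless optimizer objects become file-scope statics and a
// switch selects one per recognized callee, replacing the lazily populated
// StringMap of the old initOptimizations()/addOpt() scheme. Optimizers that
// carry state (Cos, Pow, Exp2 with UnsafeFPShrink) stay members of the impl.
#include <cstdio>

struct Optimization {
  virtual const char *describe() const = 0;
  virtual ~Optimization() {}
};
struct StrLenOptimization : Optimization {
  const char *describe() const override { return "fold strlen of constants"; }
};
struct MemCpyOptimization : Optimization {
  const char *describe() const override { return "lower memcpy"; }
};

static StrLenOptimization TheStrLen;  // shared, stateless singletons
static MemCpyOptimization TheMemCpy;

enum class LibFn { StrLen, MemCpy, Unknown };

static Optimization *lookupOptimization(LibFn F) {
  switch (F) {
  case LibFn::StrLen: return &TheStrLen;
  case LibFn::MemCpy: return &TheMemCpy;
  default:            return nullptr;  // unhandled or unknown callee
  }
}

int main() {
  if (Optimization *O = lookupOptimization(LibFn::StrLen))
    std::printf("%s\n", O->describe());
  return 0;
}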
+
+Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
+ LibCallOptimization *LCO = lookupOptimization(CI);
if (LCO) {
IRBuilder<> Builder(CI);
return LCO->optimizeCall(CI, TD, TLI, LCS, Builder);
@@ -1123,14 +1926,10 @@ Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
return 0;
}
-void LibCallSimplifierImpl::addOpt(LibFunc::Func F, LibCallOptimization* Opt) {
- if (TLI->has(F))
- Optimizations[TLI->getName(F)] = Opt;
-}
-
LibCallSimplifier::LibCallSimplifier(const DataLayout *TD,
- const TargetLibraryInfo *TLI) {
- Impl = new LibCallSimplifierImpl(TD, TLI, this);
+ const TargetLibraryInfo *TLI,
+ bool UnsafeFPShrink) {
+ Impl = new LibCallSimplifierImpl(TD, TLI, this, UnsafeFPShrink);
}
LibCallSimplifier::~LibCallSimplifier() {
@@ -1138,6 +1937,7 @@ LibCallSimplifier::~LibCallSimplifier() {
}
Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
+ if (CI->hasFnAttr(Attribute::NoBuiltin)) return 0;
return Impl->optimizeCall(CI);
}
diff --git a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index b1cad06..560f581 100644
--- a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -15,12 +15,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/BasicBlock.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Type.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
char UnifyFunctionExitNodes::ID = 0;
diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
index a30b093..b5941bd 100644
--- a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -13,11 +13,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/InlineAsm.h"
-#include "llvm/Instructions.h"
-#include "llvm/Metadata.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
using namespace llvm;
// Out of line method to get vtable etc for class.
@@ -63,14 +63,29 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
// Check all operands to see if any need to be remapped.
for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) {
Value *OP = MD->getOperand(i);
- if (OP == 0 || MapValue(OP, VM, Flags, TypeMapper) == OP) continue;
+ if (OP == 0) continue;
+ Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper);
+ // Use identity map if Mapped_Op is null and we can ignore missing
+ // entries.
+ if (Mapped_OP == OP ||
+ (Mapped_OP == 0 && (Flags & RF_IgnoreMissingEntries)))
+ continue;
// Ok, at least one operand needs remapping.
SmallVector<Value*, 4> Elts;
Elts.reserve(MD->getNumOperands());
for (i = 0; i != e; ++i) {
Value *Op = MD->getOperand(i);
- Elts.push_back(Op ? MapValue(Op, VM, Flags, TypeMapper) : 0);
+ if (Op == 0)
+ Elts.push_back(0);
+ else {
+ Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper);
+ // Use identity map if Mapped_Op is null and we can ignore missing
+ // entries.
+ if (Mapped_Op == 0 && (Flags & RF_IgnoreMissingEntries))
+ Mapped_Op = Op;
+ Elts.push_back(Mapped_Op);
+ }
}
MDNode *NewMD = MDNode::get(V->getContext(), Elts);
Dummy->replaceAllUsesWith(NewMD);
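// A standalone sketch (std::unordered_map stands in for ValueToValueMapTy)
// of the behavior the hunk above adds: when the "ignore missing entries"
// flag is set, an operand with no mapping is kept as-is (identity map)
// instead of being treated as a change or dropped.
#include <cassert>
#include <unordered_map>
#include <vector>

using Value = const void *;

static Value mapValue(const std::unordered_map<Value, Value> &VM, Value V,
                      bool IgnoreMissingEntries) {
  auto It = VM.find(V);
  if (It != VM.end())
    return It->second;                        // remapped operand
  return IgnoreMissingEntries ? V : nullptr;  // identity map vs. "unmapped"
}

int main() {
  int A = 0, B = 1, C = 2;
  std::unordered_map<Value, Value> VM = {{&A, &B}};

  std::vector<Value> Operands = {&A, &C, nullptr};
  std::vector<Value> Remapped;
  for (Value Op : Operands)
    Remapped.push_back(Op ? mapValue(VM, Op, /*IgnoreMissingEntries=*/true)
                          : nullptr);         // null operands stay null

  assert(Remapped[0] == &B);       // had an entry, remapped
  assert(Remapped[1] == &C);       // no entry, kept as-is
  assert(Remapped[2] == nullptr);
  return 0;
}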
diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
index f7be3e3..17900da 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -16,39 +16,38 @@
#define BBV_NAME "bb-vectorize"
#define DEBUG_TYPE BBV_NAME
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Metadata.h"
-#include "llvm/Pass.h"
-#include "llvm/Type.h"
+#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ValueHandle.h"
-#include "llvm/DataLayout.h"
-#include "llvm/TargetTransformInfo.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
-#include <map>
using namespace llvm;
static cl::opt<bool>
@@ -89,6 +88,10 @@ MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
cl::desc("The maximum number of pairable instructions per group"));
static cl::opt<unsigned>
+MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
+ cl::desc("The maximum number of candidate instruction pairs per group"));
+
+static cl::opt<unsigned>
MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
" a full cycle check"));
@@ -199,9 +202,7 @@ namespace {
DT = &P->getAnalysis<DominatorTree>();
SE = &P->getAnalysis<ScalarEvolution>();
TD = P->getAnalysisIfAvailable<DataLayout>();
- TTI = IgnoreTargetInfo ? 0 :
- P->getAnalysisIfAvailable<TargetTransformInfo>();
- VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
+ TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis<TargetTransformInfo>();
}
typedef std::pair<Value *, Value *> ValuePair;
@@ -209,18 +210,12 @@ namespace {
typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
typedef std::pair<VPPair, unsigned> VPPairWithType;
- typedef std::pair<std::multimap<Value *, Value *>::iterator,
- std::multimap<Value *, Value *>::iterator> VPIteratorPair;
- typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
- std::multimap<ValuePair, ValuePair>::iterator>
- VPPIteratorPair;
AliasAnalysis *AA;
DominatorTree *DT;
ScalarEvolution *SE;
DataLayout *TD;
- TargetTransformInfo *TTI;
- const VectorTargetTransformInfo *VTTI;
+ const TargetTransformInfo *TTI;
// FIXME: const correct?
@@ -228,7 +223,7 @@ namespace {
bool getCandidatePairs(BasicBlock &BB,
BasicBlock::iterator &Start,
- std::multimap<Value *, Value *> &CandidatePairs,
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
DenseSet<ValuePair> &FixedOrderPairs,
DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts, bool NonPow2Len);
@@ -242,33 +237,36 @@ namespace {
PairConnectionSplat
};
- void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes);
+ void computeConnectedPairs(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes);
void buildDepMap(BasicBlock &BB,
- std::multimap<Value *, Value *> &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &PairableInstUsers);
-
- void choosePairs(std::multimap<Value *, Value *> &CandidatePairs,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *>& ChosenPairs);
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &PairableInstUsers);
+
+ void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *>& ChosenPairs);
void fuseChosenPairs(BasicBlock &BB,
- std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *>& ChosenPairs,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- std::multimap<ValuePair, ValuePair> &ConnectedPairDeps);
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *>& ChosenPairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
@@ -280,56 +278,63 @@ namespace {
bool trackUsesOfI(DenseSet<Value *> &Users,
AliasSetTracker &WriteSet, Instruction *I,
Instruction *J, bool UpdateUsers = true,
- std::multimap<Value *, Value *> *LoadMoveSet = 0);
+ DenseSet<ValuePair> *LoadMoveSetPairs = 0);
- void computePairsConnectedTo(
- std::multimap<Value *, Value *> &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- ValuePair P);
+ void computePairsConnectedTo(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ ValuePair P);
bool pairsConflict(ValuePair P, ValuePair Q,
- DenseSet<ValuePair> &PairableInstUsers,
- std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0);
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> >
+ *PairableInstUserMap = 0,
+ DenseSet<VPPair> *PairableInstUserPairSet = 0);
bool pairWillFormCycle(ValuePair P,
- std::multimap<ValuePair, ValuePair> &PairableInstUsers,
- DenseSet<ValuePair> &CurrentPairs);
-
- void pruneTreeFor(
- std::multimap<Value *, Value *> &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &Tree,
- DenseSet<ValuePair> &PrunedTree, ValuePair J,
- bool UseCycleCheck);
-
- void buildInitialTreeFor(
- std::multimap<Value *, Value *> &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &Tree, ValuePair J);
-
- void findBestTreeFor(
- std::multimap<Value *, Value *> &CandidatePairs,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
- int &BestEffSize, VPIteratorPair ChoiceRange,
- bool UseCycleCheck);
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
+ DenseSet<ValuePair> &CurrentPairs);
+
+ void pruneDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+ DenseSet<VPPair> &PairableInstUserPairSet,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &DAG,
+ DenseSet<ValuePair> &PrunedDAG, ValuePair J,
+ bool UseCycleCheck);
+
+ void buildInitialDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &DAG, ValuePair J);
+
+ void findBestDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+ DenseSet<VPPair> &PairableInstUserPairSet,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
+ int &BestEffSize, Value *II, std::vector<Value *>&JJ,
+ bool UseCycleCheck);
Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
Instruction *J, unsigned o);
@@ -361,20 +366,22 @@ namespace {
void collectPairLoadMoveSet(BasicBlock &BB,
DenseMap<Value *, Value *> &ChosenPairs,
- std::multimap<Value *, Value *> &LoadMoveSet,
+ DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
Instruction *I);
void collectLoadMoveSet(BasicBlock &BB,
std::vector<Value *> &PairableInsts,
DenseMap<Value *, Value *> &ChosenPairs,
- std::multimap<Value *, Value *> &LoadMoveSet);
+ DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs);
bool canMoveUsesOfIAfterJ(BasicBlock &BB,
- std::multimap<Value *, Value *> &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
Instruction *I, Instruction *J);
void moveUsesOfIAfterJ(BasicBlock &BB,
- std::multimap<Value *, Value *> &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
Instruction *&InsertionPt,
Instruction *I, Instruction *J);
@@ -387,7 +394,7 @@ namespace {
return false;
}
- DEBUG(if (VTTI) dbgs() << "BBV: using target information\n");
+ DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
bool changed = false;
// Iterate a sufficient number of times to merge types of size 1 bit,
@@ -395,7 +402,7 @@ namespace {
// target vector register.
unsigned n = 1;
for (unsigned v = 2;
- (VTTI || v <= Config.VectorBits) &&
+ (TTI || v <= Config.VectorBits) &&
(!Config.MaxIter || n <= Config.MaxIter);
v *= 2, ++n) {
DEBUG(dbgs() << "BBV: fusing loop #" << n <<
@@ -426,9 +433,7 @@ namespace {
DT = &getAnalysis<DominatorTree>();
SE = &getAnalysis<ScalarEvolution>();
TD = getAnalysisIfAvailable<DataLayout>();
- TTI = IgnoreTargetInfo ? 0 :
- getAnalysisIfAvailable<TargetTransformInfo>();
- VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
+ TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>();
return vectorizeBB(BB);
}
@@ -438,6 +443,7 @@ namespace {
AU.addRequired<AliasAnalysis>();
AU.addRequired<DominatorTree>();
AU.addRequired<ScalarEvolution>();
+ AU.addRequired<TargetTransformInfo>();
AU.addPreserved<AliasAnalysis>();
AU.addPreserved<DominatorTree>();
AU.addPreserved<ScalarEvolution>();
@@ -467,18 +473,18 @@ namespace {
static inline void getInstructionTypes(Instruction *I,
Type *&T1, Type *&T2) {
- if (isa<StoreInst>(I)) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
// For stores, it is the value type, not the pointer type that matters
// because the value is what will come from a vector register.
- Value *IVal = cast<StoreInst>(I)->getValueOperand();
+ Value *IVal = SI->getValueOperand();
T1 = IVal->getType();
} else {
T1 = I->getType();
}
- if (I->isCast())
- T2 = cast<CastInst>(I)->getSrcTy();
+ if (CastInst *CI = dyn_cast<CastInst>(I))
+ T2 = CI->getSrcTy();
else
T2 = T1;
@@ -504,7 +510,7 @@ namespace {
// InsertElement and ExtractElement have a depth factor of zero. This is
// for two reasons: First, they cannot be usefully fused. Second, because
// the pass generates a lot of these, they can confuse the simple metric
- // used to compare the trees in the next iteration. Thus, giving them a
+ // used to compare the dags in the next iteration. Thus, giving them a
// weight of zero allows the pass to essentially ignore them in
// subsequent iterations when looking for vectorization opportunities
// while still tracking dependency chains that flow through those
@@ -520,7 +526,7 @@ namespace {
return 1;
}
- // Returns the cost of the provided instruction using VTTI.
+ // Returns the cost of the provided instruction using TTI.
// This does not handle loads and stores.
unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
switch (Opcode) {
@@ -531,7 +537,7 @@ namespace {
// generate vector GEPs.
return 0;
case Instruction::Br:
- return VTTI->getCFInstrCost(Opcode);
+ return TTI->getCFInstrCost(Opcode);
case Instruction::PHI:
return 0;
case Instruction::Add:
@@ -552,11 +558,11 @@ namespace {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- return VTTI->getArithmeticInstrCost(Opcode, T1);
+ return TTI->getArithmeticInstrCost(Opcode, T1);
case Instruction::Select:
case Instruction::ICmp:
case Instruction::FCmp:
- return VTTI->getCmpSelInstrCost(Opcode, T1, T2);
+ return TTI->getCmpSelInstrCost(Opcode, T1, T2);
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -570,7 +576,7 @@ namespace {
case Instruction::FPTrunc:
case Instruction::BitCast:
case Instruction::ShuffleVector:
- return VTTI->getCastInstrCost(Opcode, T1, T2);
+ return TTI->getCastInstrCost(Opcode, T1, T2);
}
return 1;
@@ -642,7 +648,7 @@ namespace {
Function *F = I->getCalledFunction();
if (!F) return false;
- unsigned IID = F->getIntrinsicID();
+ Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID();
if (!IID) return false;
switch(IID) {
@@ -660,23 +666,11 @@ namespace {
case Intrinsic::pow:
return Config.VectorizeMath;
case Intrinsic::fma:
+ case Intrinsic::fmuladd:
return Config.VectorizeFMA;
}
}
- // Returns true if J is the second element in some pair referenced by
- // some multimap pair iterator pair.
- template <typename V>
- bool isSecondInIteratorPair(V J, std::pair<
- typename std::multimap<V, V>::iterator,
- typename std::multimap<V, V>::iterator> PairRange) {
- for (typename std::multimap<V, V>::iterator K = PairRange.first;
- K != PairRange.second; ++K)
- if (K->second == J) return true;
-
- return false;
- }
-
bool isPureIEChain(InsertElementInst *IE) {
InsertElementInst *IENext = IE;
do {
@@ -701,11 +695,12 @@ namespace {
DenseMap<Value *, Value *> AllChosenPairs;
DenseSet<ValuePair> AllFixedOrderPairs;
DenseMap<VPPair, unsigned> AllPairConnectionTypes;
- std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps;
+ DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
+ AllConnectedPairDeps;
do {
std::vector<Value *> PairableInsts;
- std::multimap<Value *, Value *> CandidatePairs;
+ DenseMap<Value *, std::vector<Value *> > CandidatePairs;
DenseSet<ValuePair> FixedOrderPairs;
DenseMap<ValuePair, int> CandidatePairCostSavings;
ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
@@ -714,6 +709,14 @@ namespace {
PairableInsts, NonPow2Len);
if (PairableInsts.empty()) continue;
+ // Build the candidate pair set for faster lookups.
+ DenseSet<ValuePair> CandidatePairsSet;
+ for (DenseMap<Value *, std::vector<Value *> >::iterator I =
+ CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
+ for (std::vector<Value *>::iterator J = I->second.begin(),
+ JE = I->second.end(); J != JE; ++J)
+ CandidatePairsSet.insert(ValuePair(I->first, *J));
+
// Now we have a map of all of the pairable instructions and we need to
// select the best possible pairing. A good pairing is one such that the
// users of the pair are also paired. This defines a (directed) forest
@@ -723,30 +726,33 @@ namespace {
// Note that it only matters that both members of the second pair use some
// element of the first pair (to allow for splatting).
- std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps;
+ DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs,
+ ConnectedPairDeps;
DenseMap<VPPair, unsigned> PairConnectionTypes;
- computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs,
- PairConnectionTypes);
+ computeConnectedPairs(CandidatePairs, CandidatePairsSet,
+ PairableInsts, ConnectedPairs, PairConnectionTypes);
if (ConnectedPairs.empty()) continue;
- for (std::multimap<ValuePair, ValuePair>::iterator
+ for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
- I != IE; ++I) {
- ConnectedPairDeps.insert(VPPair(I->second, I->first));
- }
+ I != IE; ++I)
+ for (std::vector<ValuePair>::iterator J = I->second.begin(),
+ JE = I->second.end(); J != JE; ++J)
+ ConnectedPairDeps[*J].push_back(I->first);
// Build the pairable-instruction dependency map
DenseSet<ValuePair> PairableInstUsers;
buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
// There is now a graph of the connected pairs. For each variable, pick
- // the pairing with the largest tree meeting the depth requirement on at
- // least one branch. Then select all pairings that are part of that tree
+ // the pairing with the largest dag meeting the depth requirement on at
+ // least one branch. Then select all pairings that are part of that dag
// and remove them from the list of available pairings and pairable
// variables.
DenseMap<Value *, Value *> ChosenPairs;
- choosePairs(CandidatePairs, CandidatePairCostSavings,
+ choosePairs(CandidatePairs, CandidatePairsSet,
+ CandidatePairCostSavings,
PairableInsts, FixedOrderPairs, PairConnectionTypes,
ConnectedPairs, ConnectedPairDeps,
PairableInstUsers, ChosenPairs);
@@ -780,14 +786,15 @@ namespace {
}
}
- for (std::multimap<ValuePair, ValuePair>::iterator
+ for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
- I != IE; ++I) {
- if (AllPairConnectionTypes.count(*I)) {
- AllConnectedPairs.insert(*I);
- AllConnectedPairDeps.insert(VPPair(I->second, I->first));
- }
- }
+ I != IE; ++I)
+ for (std::vector<ValuePair>::iterator J = I->second.begin(),
+ JE = I->second.end(); J != JE; ++J)
+ if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
+ AllConnectedPairs[I->first].push_back(*J);
+ AllConnectedPairDeps[*J].push_back(I->first);
+ }
} while (ShouldContinue);
if (AllChosenPairs.empty()) return false;
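// A standalone sketch (plain C++; Value and the containers are stand-ins,
// not LLVM types) of the data-structure change running through this diff:
// candidate pairs live in an adjacency map (first element -> partners) plus
// a flat pair set, so "is <I, J> a candidate?" becomes one hash lookup
// instead of a scan over a std::multimap equal_range.
#include <cstddef>
#include <functional>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

using Value = int;
using ValuePair = std::pair<Value, Value>;

struct PairHash {
  std::size_t operator()(const ValuePair &P) const {
    return std::hash<Value>()(P.first) * 31u + std::hash<Value>()(P.second);
  }
};

int main() {
  std::unordered_map<Value, std::vector<Value>> CandidatePairs;
  CandidatePairs[1] = {2, 3};
  CandidatePairs[4] = {5};

  // Build the flat pair set once, as the hunk above does.
  std::unordered_set<ValuePair, PairHash> CandidatePairsSet;
  for (const auto &Entry : CandidatePairs)
    for (Value Partner : Entry.second)
      CandidatePairsSet.insert({Entry.first, Partner});

  // Membership queries are now a single hash lookup.
  bool Has13 = CandidatePairsSet.count({1, 3}) != 0;  // true
  bool Has31 = CandidatePairsSet.count({3, 1}) != 0;  // false: pairs are ordered
  return (Has13 && !Has31) ? 0 : 1;
}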
@@ -903,8 +910,8 @@ namespace {
T2->getScalarType()->isPointerTy()))
return false;
- if (!VTTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
- T2->getPrimitiveSizeInBits() >= Config.VectorBits))
+ if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
+ T2->getPrimitiveSizeInBits() >= Config.VectorBits))
return false;
return true;
@@ -913,7 +920,7 @@ namespace {
// This function returns true if the two provided instructions are compatible
// (meaning that they can be fused into a vector instruction). This assumes
// that I has already been determined to be vectorizable and that J is not
- // in the use tree of I.
+ // in the use dag of I.
bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
bool IsSimpleLoadStore, bool NonPow2Len,
int &CostSavings, int &FixedOrder) {
@@ -935,7 +942,7 @@ namespace {
unsigned MaxTypeBits = std::max(
IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
- if (!VTTI && MaxTypeBits > Config.VectorBits)
+ if (!TTI && MaxTypeBits > Config.VectorBits)
return false;
// FIXME: handle addsub-type operations!
@@ -967,21 +974,26 @@ namespace {
return false;
}
- if (VTTI) {
- unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), I->getType(),
- IAlignment, IAddressSpace);
- unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), J->getType(),
- JAlignment, JAddressSpace);
- unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType,
- BottomAlignment,
- IAddressSpace);
+ if (TTI) {
+ unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
+ IAlignment, IAddressSpace);
+ unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
+ JAlignment, JAddressSpace);
+ unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
+ BottomAlignment,
+ IAddressSpace);
+
+ ICost += TTI->getAddressComputationCost(aTypeI);
+ JCost += TTI->getAddressComputationCost(aTypeJ);
+ VCost += TTI->getAddressComputationCost(VType);
+
if (VCost > ICost + JCost)
return false;
// We don't want to fuse to a type that will be split, even
// if the two input types will also be split and there is no other
// associated cost.
- unsigned VParts = VTTI->getNumberOfParts(VType);
+ unsigned VParts = TTI->getNumberOfParts(VType);
if (VParts > 1)
return false;
else if (!VParts && VCost == ICost + JCost)
@@ -992,11 +1004,17 @@ namespace {
} else {
return false;
}
- } else if (VTTI) {
+ } else if (TTI) {
unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
Type *VT1 = getVecTypeForPair(IT1, JT1),
*VT2 = getVecTypeForPair(IT2, JT2);
+
+ // Note that this procedure is incorrect for insert and extract element
+ // instructions (because combining these often results in a shuffle),
+ // but this cost is ignored (because insert and extract element
+ // instructions are assigned a zero depth factor and are not really
+ // fused in general).
unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
if (VCost > ICost + JCost)
@@ -1005,8 +1023,8 @@ namespace {
// We don't want to fuse to a type that will be split, even
// if the two input types will also be split and there is no other
// associated cost.
- unsigned VParts1 = VTTI->getNumberOfParts(VT1),
- VParts2 = VTTI->getNumberOfParts(VT2);
+ unsigned VParts1 = TTI->getNumberOfParts(VT1),
+ VParts2 = TTI->getNumberOfParts(VT2);
if (VParts1 > 1 || VParts2 > 1)
return false;
else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
@@ -1019,14 +1037,67 @@ namespace {
// vectorized, the second arguments must be equal.
CallInst *CI = dyn_cast<CallInst>(I);
Function *FI;
- if (CI && (FI = CI->getCalledFunction()) &&
- FI->getIntrinsicID() == Intrinsic::powi) {
-
- Value *A1I = CI->getArgOperand(1),
- *A1J = cast<CallInst>(J)->getArgOperand(1);
- const SCEV *A1ISCEV = SE->getSCEV(A1I),
- *A1JSCEV = SE->getSCEV(A1J);
- return (A1ISCEV == A1JSCEV);
+ if (CI && (FI = CI->getCalledFunction())) {
+ Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID();
+ if (IID == Intrinsic::powi) {
+ Value *A1I = CI->getArgOperand(1),
+ *A1J = cast<CallInst>(J)->getArgOperand(1);
+ const SCEV *A1ISCEV = SE->getSCEV(A1I),
+ *A1JSCEV = SE->getSCEV(A1J);
+ return (A1ISCEV == A1JSCEV);
+ }
+
+ if (IID && TTI) {
+ SmallVector<Type*, 4> Tys;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(CI->getArgOperand(i)->getType());
+ unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys);
+
+ Tys.clear();
+ CallInst *CJ = cast<CallInst>(J);
+ for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(CJ->getArgOperand(i)->getType());
+ unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys);
+
+ Tys.clear();
+ assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
+ "Intrinsic argument counts differ");
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ if (IID == Intrinsic::powi && i == 1)
+ Tys.push_back(CI->getArgOperand(i)->getType());
+ else
+ Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
+ CJ->getArgOperand(i)->getType()));
+ }
+
+ Type *RetTy = getVecTypeForPair(IT1, JT1);
+ unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys);
+
+ if (VCost > ICost + JCost)
+ return false;
+
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned RetParts = TTI->getNumberOfParts(RetTy);
+ if (RetParts > 1)
+ return false;
+ else if (!RetParts && VCost == ICost + JCost)
+ return false;
+
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ if (!Tys[i]->isVectorTy())
+ continue;
+
+ unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
+ if (NumParts > 1)
+ return false;
+ else if (!NumParts && VCost == ICost + JCost)
+ return false;
+ }
+
+ CostSavings = ICost + JCost - VCost;
+ }
}
return true;
@@ -1040,7 +1111,7 @@ namespace {
// to contain any memory locations to which J writes. The function returns
// true if J uses I. By default, alias analysis is used to determine
// whether J reads from memory that overlaps with a location in WriteSet.
- // If LoadMoveSet is not null, then it is a previously-computed multimap
+ // If LoadMoveSet is not null, then it is a previously-computed map
// where the key is the memory-based user instruction and the value is
// the instruction to be compared with I. So, if LoadMoveSet is provided,
// then the alias analysis is not used. This is necessary because this
@@ -1050,7 +1121,7 @@ namespace {
bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
AliasSetTracker &WriteSet, Instruction *I,
Instruction *J, bool UpdateUsers,
- std::multimap<Value *, Value *> *LoadMoveSet) {
+ DenseSet<ValuePair> *LoadMoveSetPairs) {
bool UsesI = false;
// This instruction may already be marked as a user due, for example, to
@@ -1068,9 +1139,8 @@ namespace {
}
}
if (!UsesI && J->mayReadFromMemory()) {
- if (LoadMoveSet) {
- VPIteratorPair JPairRange = LoadMoveSet->equal_range(J);
- UsesI = isSecondInIteratorPair<Value*>(I, JPairRange);
+ if (LoadMoveSetPairs) {
+ UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
} else {
for (AliasSetTracker::iterator W = WriteSet.begin(),
WE = WriteSet.end(); W != WE; ++W) {
@@ -1094,10 +1164,11 @@ namespace {
// basic block and collects all candidate pairs for vectorization.
bool BBVectorize::getCandidatePairs(BasicBlock &BB,
BasicBlock::iterator &Start,
- std::multimap<Value *, Value *> &CandidatePairs,
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
DenseSet<ValuePair> &FixedOrderPairs,
DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts, bool NonPow2Len) {
+ size_t TotalPairs = 0;
BasicBlock::iterator E = BB.end();
if (Start == E) return false;
@@ -1143,8 +1214,9 @@ namespace {
PairableInsts.push_back(I);
}
- CandidatePairs.insert(ValuePair(I, J));
- if (VTTI)
+ CandidatePairs[I].push_back(J);
+ ++TotalPairs;
+ if (TTI)
CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
CostSavings));
@@ -1167,7 +1239,8 @@ namespace {
// If we have already found too many pairs, break here and this function
// will be called again starting after the last instruction selected
// during this invocation.
- if (PairableInsts.size() >= Config.MaxInsts) {
+ if (PairableInsts.size() >= Config.MaxInsts ||
+ TotalPairs >= Config.MaxPairs) {
ShouldContinue = true;
break;
}
@@ -1187,11 +1260,12 @@ namespace {
// it looks for pairs such that both members have an input which is an
// output of PI or PJ.
void BBVectorize::computePairsConnectedTo(
- std::multimap<Value *, Value *> &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- ValuePair P) {
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ ValuePair P) {
StoreInst *SI, *SJ;
// For each possible pairing for this variable, look at the uses of
@@ -1209,8 +1283,6 @@ namespace {
continue;
}
- VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
-
// For each use of the first variable, look for uses of the second
// variable...
for (Value::use_iterator J = P.second->use_begin(),
@@ -1219,19 +1291,17 @@ namespace {
P.second == SJ->getPointerOperand())
continue;
- VPIteratorPair JPairRange = CandidatePairs.equal_range(*J);
-
// Look for <I, J>:
- if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+ if (CandidatePairsSet.count(ValuePair(*I, *J))) {
VPPair VP(P, ValuePair(*I, *J));
- ConnectedPairs.insert(VP);
+ ConnectedPairs[VP.first].push_back(VP.second);
PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
}
// Look for <J, I>:
- if (isSecondInIteratorPair<Value*>(*I, JPairRange)) {
+ if (CandidatePairsSet.count(ValuePair(*J, *I))) {
VPPair VP(P, ValuePair(*J, *I));
- ConnectedPairs.insert(VP);
+ ConnectedPairs[VP.first].push_back(VP.second);
PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
}
}
@@ -1244,9 +1314,9 @@ namespace {
P.first == SJ->getPointerOperand())
continue;
- if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+ if (CandidatePairsSet.count(ValuePair(*I, *J))) {
VPPair VP(P, ValuePair(*I, *J));
- ConnectedPairs.insert(VP);
+ ConnectedPairs[VP.first].push_back(VP.second);
PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
}
}
@@ -1263,16 +1333,14 @@ namespace {
P.second == SI->getPointerOperand())
continue;
- VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
-
for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
if ((SJ = dyn_cast<StoreInst>(*J)) &&
P.second == SJ->getPointerOperand())
continue;
- if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+ if (CandidatePairsSet.count(ValuePair(*I, *J))) {
VPPair VP(P, ValuePair(*I, *J));
- ConnectedPairs.insert(VP);
+ ConnectedPairs[VP.first].push_back(VP.second);
PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
}
}
@@ -1283,55 +1351,73 @@ namespace {
// connected if some output of the first pair forms an input to both members
// of the second pair.
void BBVectorize::computeConnectedPairs(
- std::multimap<Value *, Value *> &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes) {
-
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes) {
for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
PE = PairableInsts.end(); PI != PE; ++PI) {
- VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI);
+ DenseMap<Value *, std::vector<Value *> >::iterator PP =
+ CandidatePairs.find(*PI);
+ if (PP == CandidatePairs.end())
+ continue;
- for (std::multimap<Value *, Value *>::iterator P = choiceRange.first;
- P != choiceRange.second; ++P)
- computePairsConnectedTo(CandidatePairs, PairableInsts,
- ConnectedPairs, PairConnectionTypes, *P);
+ for (std::vector<Value *>::iterator P = PP->second.begin(),
+ E = PP->second.end(); P != E; ++P)
+ computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
+ PairableInsts, ConnectedPairs,
+ PairConnectionTypes, ValuePair(*PI, *P));
}
- DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size()
+ DEBUG(size_t TotalPairs = 0;
+ for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
+ ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
+ TotalPairs += I->second.size();
+ dbgs() << "BBV: found " << TotalPairs
<< " pair connections.\n");
}
// This function builds a set of use tuples such that <A, B> is in the set
- // if B is in the use tree of A. If B is in the use tree of A, then B
+ // if B is in the use dag of A. If B is in the use dag of A, then B
// depends on the output of A.
void BBVectorize::buildDepMap(
BasicBlock &BB,
- std::multimap<Value *, Value *> &CandidatePairs,
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
std::vector<Value *> &PairableInsts,
DenseSet<ValuePair> &PairableInstUsers) {
DenseSet<Value *> IsInPair;
- for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(),
- E = CandidatePairs.end(); C != E; ++C) {
+ for (DenseMap<Value *, std::vector<Value *> >::iterator C =
+ CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
IsInPair.insert(C->first);
- IsInPair.insert(C->second);
+ IsInPair.insert(C->second.begin(), C->second.end());
}
- // Iterate through the basic block, recording all Users of each
+ // Iterate through the basic block, recording all users of each
// pairable instruction.
- BasicBlock::iterator E = BB.end();
+ BasicBlock::iterator E = BB.end(), EL =
+ BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
if (IsInPair.find(I) == IsInPair.end()) continue;
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
- for (BasicBlock::iterator J = llvm::next(I); J != E; ++J)
+ for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) {
(void) trackUsesOfI(Users, WriteSet, I, J);
+ if (J == EL)
+ break;
+ }
+
for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
- U != E; ++U)
+ U != E; ++U) {
+ if (IsInPair.find(*U) == IsInPair.end()) continue;
PairableInstUsers.insert(ValuePair(I, *U));
+ }
+
+ if (I == EL)
+ break;
}
}
@@ -1339,8 +1425,9 @@ namespace {
// input of pair Q is an output of pair P. If this is the case, then these
// two pairs cannot be simultaneously fused.
bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
- DenseSet<ValuePair> &PairableInstUsers,
- std::multimap<ValuePair, ValuePair> *PairableInstUserMap) {
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
+ DenseSet<VPPair> *PairableInstUserPairSet) {
     // Two pairs are in conflict if they are mutual users of each other.
bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
@@ -1353,17 +1440,14 @@ namespace {
if (PairableInstUserMap) {
// FIXME: The expensive part of the cycle check is not so much the cycle
// check itself but this edge insertion procedure. This needs some
- // profiling and probably a different data structure (same is true of
- // most uses of std::multimap).
+ // profiling and probably a different data structure.
if (PUsesQ) {
- VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q);
- if (!isSecondInIteratorPair(P, QPairRange))
- PairableInstUserMap->insert(VPPair(Q, P));
+ if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
+ (*PairableInstUserMap)[Q].push_back(P);
}
if (QUsesP) {
- VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P);
- if (!isSecondInIteratorPair(Q, PPairRange))
- PairableInstUserMap->insert(VPPair(P, Q));
+ if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
+ (*PairableInstUserMap)[P].push_back(Q);
}
}
@@ -1373,8 +1457,8 @@ namespace {
// This function walks the use graph of current pairs to see if, starting
// from P, the walk returns to P.
bool BBVectorize::pairWillFormCycle(ValuePair P,
- std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
- DenseSet<ValuePair> &CurrentPairs) {
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+ DenseSet<ValuePair> &CurrentPairs) {
DEBUG(if (DebugCycleCheck)
dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
<< *P.second << "\n");
@@ -1391,36 +1475,41 @@ namespace {
DEBUG(if (DebugCycleCheck)
dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
<< *QTop.second << "\n");
- VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop);
- for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first;
- C != QPairRange.second; ++C) {
- if (C->second == P) {
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
+ PairableInstUserMap.find(QTop);
+ if (QQ == PairableInstUserMap.end())
+ continue;
+
+ for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
+ CE = QQ->second.end(); C != CE; ++C) {
+ if (*C == P) {
DEBUG(dbgs()
<< "BBV: rejected to prevent non-trivial cycle formation: "
- << *C->first.first << " <-> " << *C->first.second << "\n");
+ << QTop.first << " <-> " << C->second << "\n");
return true;
}
- if (CurrentPairs.count(C->second) && !Visited.count(C->second))
- Q.push_back(C->second);
+ if (CurrentPairs.count(*C) && !Visited.count(*C))
+ Q.push_back(*C);
}
} while (!Q.empty());
return false;
}
- // This function builds the initial tree of connected pairs with the
+ // This function builds the initial dag of connected pairs with the
// pair J at the root.
- void BBVectorize::buildInitialTreeFor(
- std::multimap<Value *, Value *> &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &Tree, ValuePair J) {
- // Each of these pairs is viewed as the root node of a Tree. The Tree
+ void BBVectorize::buildInitialDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
+ // Each of these pairs is viewed as the root node of a DAG. The DAG
// is then walked (depth-first). As this happens, we keep track of
- // the pairs that compose the Tree and the maximum depth of the Tree.
+ // the pairs that compose the DAG and the maximum depth of the DAG.
SmallVector<ValuePairWithDepth, 32> Q;
// General depth-first post-order traversal:
Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
@@ -1430,69 +1519,65 @@ namespace {
// Push each child onto the queue:
bool MoreChildren = false;
size_t MaxChildDepth = QTop.second;
- VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first);
- for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first;
- k != qtRange.second; ++k) {
- // Make sure that this child pair is still a candidate:
- bool IsStillCand = false;
- VPIteratorPair checkRange =
- CandidatePairs.equal_range(k->second.first);
- for (std::multimap<Value *, Value *>::iterator m = checkRange.first;
- m != checkRange.second; ++m) {
- if (m->second == k->second.second) {
- IsStillCand = true;
- break;
- }
- }
-
- if (IsStillCand) {
- DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second);
- if (C == Tree.end()) {
- size_t d = getDepthFactor(k->second.first);
- Q.push_back(ValuePairWithDepth(k->second, QTop.second+d));
- MoreChildren = true;
- } else {
- MaxChildDepth = std::max(MaxChildDepth, C->second);
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
+ ConnectedPairs.find(QTop.first);
+ if (QQ != ConnectedPairs.end())
+ for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
+ ke = QQ->second.end(); k != ke; ++k) {
+ // Make sure that this child pair is still a candidate:
+ if (CandidatePairsSet.count(*k)) {
+ DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k);
+ if (C == DAG.end()) {
+ size_t d = getDepthFactor(k->first);
+ Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
+ MoreChildren = true;
+ } else {
+ MaxChildDepth = std::max(MaxChildDepth, C->second);
+ }
}
}
- }
if (!MoreChildren) {
- // Record the current pair as part of the Tree:
- Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
+ // Record the current pair as part of the DAG:
+ DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
Q.pop_back();
}
} while (!Q.empty());
}
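
The DAG construction above is a stack-based post-order walk: a node's depth is recorded only once all of its still-candidate children are recorded, and it equals the maximum recorded child depth (or its own running depth if it has no such children). A hedged, standalone sketch of the same traversal over a plain adjacency list, with unit depth factors and integer node ids standing in for pairs:

#include <algorithm>
#include <cstddef>
#include <map>
#include <utility>
#include <vector>

typedef int Node;                                // stands in for a ValuePair
typedef std::pair<Node, std::size_t> NodeDepth;

std::map<Node, std::size_t>
buildDepths(Node Root, const std::map<Node, std::vector<Node> > &Children) {
  std::map<Node, std::size_t> Depth;
  std::vector<NodeDepth> Stack(1, NodeDepth(Root, 1));
  do {
    NodeDepth Top = Stack.back();
    bool MoreChildren = false;
    std::size_t MaxChildDepth = Top.second;

    std::map<Node, std::vector<Node> >::const_iterator QQ =
      Children.find(Top.first);
    if (QQ != Children.end())
      for (std::vector<Node>::const_iterator K = QQ->second.begin(),
           KE = QQ->second.end(); K != KE; ++K) {
        std::map<Node, std::size_t>::const_iterator C = Depth.find(*K);
        if (C == Depth.end()) {
          Stack.push_back(NodeDepth(*K, Top.second + 1)); // visit child first
          MoreChildren = true;
        } else {
          MaxChildDepth = std::max(MaxChildDepth, C->second);
        }
      }

    if (!MoreChildren) {       // all children recorded: finalize this node
      Depth[Top.first] = MaxChildDepth;
      Stack.pop_back();
    }
  } while (!Stack.empty());
  return Depth;
}
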
- // Given some initial tree, prune it by removing conflicting pairs (pairs
+ // Given some initial dag, prune it by removing conflicting pairs (pairs
// that cannot be simultaneously chosen for vectorization).
- void BBVectorize::pruneTreeFor(
- std::multimap<Value *, Value *> &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &Tree,
- DenseSet<ValuePair> &PrunedTree, ValuePair J,
- bool UseCycleCheck) {
+ void BBVectorize::pruneDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+ DenseSet<VPPair> &PairableInstUserPairSet,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &DAG,
+ DenseSet<ValuePair> &PrunedDAG, ValuePair J,
+ bool UseCycleCheck) {
SmallVector<ValuePairWithDepth, 32> Q;
// General depth-first post-order traversal:
Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
do {
ValuePairWithDepth QTop = Q.pop_back_val();
- PrunedTree.insert(QTop.first);
+ PrunedDAG.insert(QTop.first);
// Visit each child, pruning as necessary...
SmallVector<ValuePairWithDepth, 8> BestChildren;
- VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first);
- for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first;
- K != QTopRange.second; ++K) {
- DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second);
- if (C == Tree.end()) continue;
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
+ ConnectedPairs.find(QTop.first);
+ if (QQ == ConnectedPairs.end())
+ continue;
+
+ for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
+ KE = QQ->second.end(); K != KE; ++K) {
+ DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K);
+ if (C == DAG.end()) continue;
- // This child is in the Tree, now we need to make sure it is the
+ // This child is in the DAG, now we need to make sure it is the
// best of any conflicting children. There could be multiple
// conflicting children, so first, determine if we're keeping
// this child, then delete conflicting children as necessary.
@@ -1506,7 +1591,7 @@ namespace {
// fusing (a,b) we have y .. a/b .. x where y is an input
// to a/b and x is an output to a/b: x and y can no longer
// be legally fused. To prevent this condition, we must
- // make sure that a child pair added to the Tree is not
+ // make sure that a child pair added to the DAG is not
// both an input and output of an already-selected pair.
// Pairing-induced dependencies can also form from more complicated
@@ -1525,7 +1610,8 @@ namespace {
C2->first.second == C->first.first ||
C2->first.second == C->first.second ||
pairsConflict(C2->first, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : 0)) {
+ UseCycleCheck ? &PairableInstUserMap : 0,
+ UseCycleCheck ? &PairableInstUserPairSet : 0)) {
if (C2->second >= C->second) {
CanAdd = false;
break;
@@ -1537,15 +1623,16 @@ namespace {
if (!CanAdd) continue;
// Even worse, this child could conflict with another node already
- // selected for the Tree. If that is the case, ignore this child.
- for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(),
- E2 = PrunedTree.end(); T != E2; ++T) {
+ // selected for the DAG. If that is the case, ignore this child.
+ for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
+ E2 = PrunedDAG.end(); T != E2; ++T) {
if (T->first == C->first.first ||
T->first == C->first.second ||
T->second == C->first.first ||
T->second == C->first.second ||
pairsConflict(*T, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : 0)) {
+ UseCycleCheck ? &PairableInstUserMap : 0,
+ UseCycleCheck ? &PairableInstUserPairSet : 0)) {
CanAdd = false;
break;
}
@@ -1562,7 +1649,8 @@ namespace {
C2->first.second == C->first.first ||
C2->first.second == C->first.second ||
pairsConflict(C2->first, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : 0)) {
+ UseCycleCheck ? &PairableInstUserMap : 0,
+ UseCycleCheck ? &PairableInstUserPairSet : 0)) {
CanAdd = false;
break;
}
@@ -1577,7 +1665,8 @@ namespace {
ChosenPairs.begin(), E2 = ChosenPairs.end();
C2 != E2; ++C2) {
if (pairsConflict(*C2, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : 0)) {
+ UseCycleCheck ? &PairableInstUserMap : 0,
+ UseCycleCheck ? &PairableInstUserPairSet : 0)) {
CanAdd = false;
break;
}
@@ -1589,7 +1678,7 @@ namespace {
// To check for non-trivial cycles formed by the addition of the
// current pair we've formed a list of all relevant pairs, now use a
// graph walk to check for a cycle. We start from the current pair and
- // walk the use tree to see if we again reach the current pair. If we
+ // walk the use dag to see if we again reach the current pair. If we
// do, then the current pair is rejected.
// FIXME: It may be more efficient to use a topological-ordering
@@ -1626,34 +1715,40 @@ namespace {
} while (!Q.empty());
}
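
The pruning loop above applies a keep-the-deepest rule among conflicting children: a new child is dropped if it conflicts with an already-kept child of greater or equal depth, and otherwise the shallower conflicting children are dropped in its favor (ties favor the child seen first). The following sketch isolates just that rule; the checks against the wider DAG, the already-chosen pairs, and the cycle check are deliberately omitted, and the names are illustrative.

#include <cstddef>
#include <vector>

struct Child {
  int Id;               // stands in for the child ValuePair
  std::size_t Depth;    // its depth in the unpruned DAG
};

typedef bool (*ConflictFn)(const Child &, const Child &);

std::vector<Child> pruneChildren(const std::vector<Child> &Candidates,
                                 ConflictFn Conflict) {
  std::vector<Child> Best;
  for (std::size_t i = 0, e = Candidates.size(); i != e; ++i) {
    const Child &C = Candidates[i];

    bool CanAdd = true;
    for (std::size_t j = 0, je = Best.size(); j != je; ++j)
      if (Conflict(Best[j], C) && Best[j].Depth >= C.Depth) {
        CanAdd = false;        // a deeper (or equal) conflicting child wins
        break;
      }
    if (!CanAdd)
      continue;

    std::vector<Child> Kept;
    for (std::size_t j = 0, je = Best.size(); j != je; ++j)
      if (!(Conflict(Best[j], C) && Best[j].Depth < C.Depth))
        Kept.push_back(Best[j]);       // drop shallower conflicting children
    Kept.push_back(C);
    Best.swap(Kept);
  }
  return Best;
}
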
- // This function finds the best tree of mututally-compatible connected
+ // This function finds the best dag of mutually-compatible connected
// pairs, given the choice of root pairs as an iterator range.
- void BBVectorize::findBestTreeFor(
- std::multimap<Value *, Value *> &CandidatePairs,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
- int &BestEffSize, VPIteratorPair ChoiceRange,
- bool UseCycleCheck) {
- for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first;
- J != ChoiceRange.second; ++J) {
+ void BBVectorize::findBestDAGFor(
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+ DenseSet<VPPair> &PairableInstUserPairSet,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
+ int &BestEffSize, Value *II, std::vector<Value *>&JJ,
+ bool UseCycleCheck) {
+ for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
+ J != JE; ++J) {
+ ValuePair IJ(II, *J);
+ if (!CandidatePairsSet.count(IJ))
+ continue;
// Before going any further, make sure that this pair does not
// conflict with any already-selected pairs (see comment below
- // near the Tree pruning for more details).
+ // near the DAG pruning for more details).
DenseSet<ValuePair> ChosenPairSet;
bool DoesConflict = false;
for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
E = ChosenPairs.end(); C != E; ++C) {
- if (pairsConflict(*C, *J, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : 0)) {
+ if (pairsConflict(*C, IJ, PairableInstUsers,
+ UseCycleCheck ? &PairableInstUserMap : 0,
+ UseCycleCheck ? &PairableInstUserPairSet : 0)) {
DoesConflict = true;
break;
}
@@ -1663,40 +1758,42 @@ namespace {
if (DoesConflict) continue;
if (UseCycleCheck &&
- pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet))
+ pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
continue;
- DenseMap<ValuePair, size_t> Tree;
- buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
- PairableInstUsers, ChosenPairs, Tree, *J);
+ DenseMap<ValuePair, size_t> DAG;
+ buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
+ PairableInsts, ConnectedPairs,
+ PairableInstUsers, ChosenPairs, DAG, IJ);
// Because we'll keep the child with the largest depth, the largest
- // depth is still the same in the unpruned Tree.
- size_t MaxDepth = Tree.lookup(*J);
+ // depth is still the same in the unpruned DAG.
+ size_t MaxDepth = DAG.lookup(IJ);
- DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {"
- << *J->first << " <-> " << *J->second << "} of depth " <<
- MaxDepth << " and size " << Tree.size() << "\n");
+ DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
+ << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
+ MaxDepth << " and size " << DAG.size() << "\n");
- // At this point the Tree has been constructed, but, may contain
+ // At this point the DAG has been constructed, but may contain
// contradictory children (meaning that different children of
- // some tree node may be attempting to fuse the same instruction).
- // So now we walk the tree again, in the case of a conflict,
+ // some dag node may be attempting to fuse the same instruction).
+ // So now we walk the dag again, in the case of a conflict,
// keep only the child with the largest depth. To break a tie,
// favor the first child.
- DenseSet<ValuePair> PrunedTree;
- pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
- PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree,
- PrunedTree, *J, UseCycleCheck);
+ DenseSet<ValuePair> PrunedDAG;
+ pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
+ PairableInstUsers, PairableInstUserMap,
+ PairableInstUserPairSet,
+ ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
int EffSize = 0;
- if (VTTI) {
- DenseSet<Value *> PrunedTreeInstrs;
- for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
- E = PrunedTree.end(); S != E; ++S) {
- PrunedTreeInstrs.insert(S->first);
- PrunedTreeInstrs.insert(S->second);
+ if (TTI) {
+ DenseSet<Value *> PrunedDAGInstrs;
+ for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
+ E = PrunedDAG.end(); S != E; ++S) {
+ PrunedDAGInstrs.insert(S->first);
+ PrunedDAGInstrs.insert(S->second);
}
// The set of pairs that have already contributed to the total cost.
@@ -1709,8 +1806,8 @@ namespace {
// The node weights represent the cost savings associated with
// fusing the pair of instructions.
- for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
- E = PrunedTree.end(); S != E; ++S) {
+ for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
+ E = PrunedDAG.end(); S != E; ++S) {
if (!isa<ShuffleVectorInst>(S->first) &&
!isa<InsertElementInst>(S->first) &&
!isa<ExtractElementInst>(S->first))
@@ -1728,15 +1825,17 @@ namespace {
// The edge weights contribute in a negative sense: they represent
// the cost of shuffles.
- VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S);
- if (IP.first != ConnectedPairDeps.end()) {
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS =
+ ConnectedPairDeps.find(*S);
+ if (SS != ConnectedPairDeps.end()) {
unsigned NumDepsDirect = 0, NumDepsSwap = 0;
- for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
- Q != IP.second; ++Q) {
- if (!PrunedTree.count(Q->second))
+ for (std::vector<ValuePair>::iterator T = SS->second.begin(),
+ TE = SS->second.end(); T != TE; ++T) {
+ VPPair Q(*S, *T);
+ if (!PrunedDAG.count(Q.second))
continue;
DenseMap<VPPair, unsigned>::iterator R =
- PairConnectionTypes.find(VPPair(Q->second, Q->first));
+ PairConnectionTypes.find(VPPair(Q.second, Q.first));
assert(R != PairConnectionTypes.end() &&
"Cannot find pair connection type");
if (R->second == PairConnectionDirect)
@@ -1752,24 +1851,35 @@ namespace {
((NumDepsSwap > NumDepsDirect) ||
FixedOrderPairs.count(ValuePair(S->second, S->first)));
- for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
- Q != IP.second; ++Q) {
- if (!PrunedTree.count(Q->second))
+ for (std::vector<ValuePair>::iterator T = SS->second.begin(),
+ TE = SS->second.end(); T != TE; ++T) {
+ VPPair Q(*S, *T);
+ if (!PrunedDAG.count(Q.second))
continue;
DenseMap<VPPair, unsigned>::iterator R =
- PairConnectionTypes.find(VPPair(Q->second, Q->first));
+ PairConnectionTypes.find(VPPair(Q.second, Q.first));
assert(R != PairConnectionTypes.end() &&
"Cannot find pair connection type");
- Type *Ty1 = Q->second.first->getType(),
- *Ty2 = Q->second.second->getType();
+ Type *Ty1 = Q.second.first->getType(),
+ *Ty2 = Q.second.second->getType();
Type *VTy = getVecTypeForPair(Ty1, Ty2);
if ((R->second == PairConnectionDirect && FlipOrder) ||
(R->second == PairConnectionSwap && !FlipOrder) ||
R->second == PairConnectionSplat) {
int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
VTy, VTy);
+
+ if (VTy->getVectorNumElements() == 2) {
+ if (R->second == PairConnectionSplat)
+ ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+ TargetTransformInfo::SK_Broadcast, VTy));
+ else
+ ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+ TargetTransformInfo::SK_Reverse, VTy));
+ }
+
DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
- *Q->second.first << " <-> " << *Q->second.second <<
+ *Q.second.first << " <-> " << *Q.second.second <<
"} -> {" <<
*S->first << " <-> " << *S->second << "} = " <<
ESContrib << "\n");
@@ -1796,7 +1906,7 @@ namespace {
}
if (isa<ExtractElementInst>(*I))
continue;
- if (PrunedTreeInstrs.count(*I))
+ if (PrunedDAGInstrs.count(*I))
continue;
NeedsExtraction = true;
break;
@@ -1804,11 +1914,13 @@ namespace {
if (NeedsExtraction) {
int ESContrib;
- if (Ty1->isVectorTy())
+ if (Ty1->isVectorTy()) {
ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
Ty1, VTy);
- else
- ESContrib = (int) VTTI->getVectorInstrCost(
+ ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+ TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1));
+ } else
+ ESContrib = (int) TTI->getVectorInstrCost(
Instruction::ExtractElement, VTy, 0);
DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
@@ -1826,7 +1938,7 @@ namespace {
}
if (isa<ExtractElementInst>(*I))
continue;
- if (PrunedTreeInstrs.count(*I))
+ if (PrunedDAGInstrs.count(*I))
continue;
NeedsExtraction = true;
break;
@@ -1834,11 +1946,14 @@ namespace {
if (NeedsExtraction) {
int ESContrib;
- if (Ty2->isVectorTy())
+ if (Ty2->isVectorTy()) {
ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
Ty2, VTy);
- else
- ESContrib = (int) VTTI->getVectorInstrCost(
+ ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+ TargetTransformInfo::SK_ExtractSubvector, VTy,
+ Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
+ } else
+ ESContrib = (int) TTI->getVectorInstrCost(
Instruction::ExtractElement, VTy, 1);
DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
*S->second << "} = " << ESContrib << "\n");
@@ -1865,7 +1980,7 @@ namespace {
ValuePair VPR = ValuePair(O2, O1);
// Internal edges are not handled here.
- if (PrunedTree.count(VP) || PrunedTree.count(VPR))
+ if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
continue;
Type *Ty1 = O1->getType(),
@@ -1913,22 +2028,26 @@ namespace {
} else if (IncomingPairs.count(VPR)) {
ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
VTy, VTy);
+
+ if (VTy->getVectorNumElements() == 2)
+ ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+ TargetTransformInfo::SK_Reverse, VTy));
} else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
- ESContrib = (int) VTTI->getVectorInstrCost(
+ ESContrib = (int) TTI->getVectorInstrCost(
Instruction::InsertElement, VTy, 0);
- ESContrib += (int) VTTI->getVectorInstrCost(
+ ESContrib += (int) TTI->getVectorInstrCost(
Instruction::InsertElement, VTy, 1);
} else if (!Ty1->isVectorTy()) {
// O1 needs to be inserted into a vector of size O2, and then
// both need to be shuffled together.
- ESContrib = (int) VTTI->getVectorInstrCost(
+ ESContrib = (int) TTI->getVectorInstrCost(
Instruction::InsertElement, Ty2, 0);
ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
VTy, Ty2);
} else if (!Ty2->isVectorTy()) {
// O2 needs to be inserted into a vector of size O1, and then
// both need to be shuffled together.
- ESContrib = (int) VTTI->getVectorInstrCost(
+ ESContrib = (int) TTI->getVectorInstrCost(
Instruction::InsertElement, Ty1, 0);
ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
VTy, Ty1);
@@ -1955,27 +2074,27 @@ namespace {
if (!HasNontrivialInsts) {
DEBUG(if (DebugPairSelection) dbgs() <<
- "\tNo non-trivial instructions in tree;"
+ "\tNo non-trivial instructions in DAG;"
" override to zero effective size\n");
EffSize = 0;
}
} else {
- for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
- E = PrunedTree.end(); S != E; ++S)
+ for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
+ E = PrunedDAG.end(); S != E; ++S)
EffSize += (int) getDepthFactor(S->first);
}
DEBUG(if (DebugPairSelection)
- dbgs() << "BBV: found pruned Tree for pair {"
- << *J->first << " <-> " << *J->second << "} of depth " <<
- MaxDepth << " and size " << PrunedTree.size() <<
+ dbgs() << "BBV: found pruned DAG for pair {"
+ << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
+ MaxDepth << " and size " << PrunedDAG.size() <<
" (effective size: " << EffSize << ")\n");
- if (((VTTI && !UseChainDepthWithTI) ||
+ if (((TTI && !UseChainDepthWithTI) ||
MaxDepth >= Config.ReqChainDepth) &&
EffSize > 0 && EffSize > BestEffSize) {
BestMaxDepth = MaxDepth;
BestEffSize = EffSize;
- BestTree = PrunedTree;
+ BestDAG = PrunedDAG;
}
}
}
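
One detail of the cost model above: for 2-element vectors, a splat edge may be lowered as a broadcast and a swapped edge as a reverse, so the shuffle contribution is the cheaper of the generic two-source shuffle and the specialized kind. A minimal sketch of that choice, with a hypothetical cost record standing in for the TargetTransformInfo queries (getInstrCost / getShuffleCost):

#include <algorithm>

// Hypothetical per-target shuffle costs; in the pass these come from TTI.
struct ShuffleCosts {
  int Generic;    // generic two-source shufflevector
  int Broadcast;  // TargetTransformInfo::SK_Broadcast
  int Reverse;    // TargetTransformInfo::SK_Reverse
};

int edgeShuffleCost(const ShuffleCosts &C, unsigned NumElts, bool IsSplat) {
  int Cost = C.Generic;
  if (NumElts == 2)
    Cost = std::min(Cost, IsSplat ? C.Broadcast : C.Reverse);
  return Cost;
}
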
@@ -1983,66 +2102,98 @@ namespace {
// Given the list of candidate pairs, this function selects those
// that will be fused into vector instructions.
void BBVectorize::choosePairs(
- std::multimap<Value *, Value *> &CandidatePairs,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *>& ChosenPairs) {
+ DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+ DenseSet<ValuePair> &CandidatePairsSet,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *>& ChosenPairs) {
bool UseCycleCheck =
- CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck;
- std::multimap<ValuePair, ValuePair> PairableInstUserMap;
+ CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
+
+ DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
+ for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
+ E = CandidatePairsSet.end(); I != E; ++I) {
+ std::vector<Value *> &JJ = CandidatePairs2[I->second];
+ if (JJ.empty()) JJ.reserve(32);
+ JJ.push_back(I->first);
+ }
+
+ DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
+ DenseSet<VPPair> PairableInstUserPairSet;
for (std::vector<Value *>::iterator I = PairableInsts.begin(),
E = PairableInsts.end(); I != E; ++I) {
// The number of possible pairings for this variable:
- size_t NumChoices = CandidatePairs.count(*I);
+ size_t NumChoices = CandidatePairs.lookup(*I).size();
if (!NumChoices) continue;
- VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I);
+ std::vector<Value *> &JJ = CandidatePairs[*I];
- // The best pair to choose and its tree:
+ // The best pair to choose and its dag:
size_t BestMaxDepth = 0;
int BestEffSize = 0;
- DenseSet<ValuePair> BestTree;
- findBestTreeFor(CandidatePairs, CandidatePairCostSavings,
+ DenseSet<ValuePair> BestDAG;
+ findBestDAGFor(CandidatePairs, CandidatePairsSet,
+ CandidatePairCostSavings,
PairableInsts, FixedOrderPairs, PairConnectionTypes,
ConnectedPairs, ConnectedPairDeps,
- PairableInstUsers, PairableInstUserMap, ChosenPairs,
- BestTree, BestMaxDepth, BestEffSize, ChoiceRange,
+ PairableInstUsers, PairableInstUserMap,
+ PairableInstUserPairSet, ChosenPairs,
+ BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
UseCycleCheck);
- // A tree has been chosen (or not) at this point. If no tree was
+ if (BestDAG.empty())
+ continue;
+
+ // A dag has been chosen (or not) at this point. If no dag was
// chosen, then this instruction, I, cannot be paired (and is no longer
// considered).
- DEBUG(if (BestTree.size() > 0)
- dbgs() << "BBV: selected pairs in the best tree for: "
- << *cast<Instruction>(*I) << "\n");
+ DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
+ << *cast<Instruction>(*I) << "\n");
- for (DenseSet<ValuePair>::iterator S = BestTree.begin(),
- SE2 = BestTree.end(); S != SE2; ++S) {
- // Insert the members of this tree into the list of chosen pairs.
+ for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
+ SE2 = BestDAG.end(); S != SE2; ++S) {
+ // Insert the members of this dag into the list of chosen pairs.
ChosenPairs.insert(ValuePair(S->first, S->second));
DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
*S->second << "\n");
- // Remove all candidate pairs that have values in the chosen tree.
- for (std::multimap<Value *, Value *>::iterator K =
- CandidatePairs.begin(); K != CandidatePairs.end();) {
- if (K->first == S->first || K->second == S->first ||
- K->second == S->second || K->first == S->second) {
- // Don't remove the actual pair chosen so that it can be used
- // in subsequent tree selections.
- if (!(K->first == S->first && K->second == S->second))
- CandidatePairs.erase(K++);
- else
- ++K;
- } else {
- ++K;
- }
+ // Remove all candidate pairs that have values in the chosen dag.
+ std::vector<Value *> &KK = CandidatePairs[S->first];
+ for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
+ K != KE; ++K) {
+ if (*K == S->second)
+ continue;
+
+ CandidatePairsSet.erase(ValuePair(S->first, *K));
+ }
+
+ std::vector<Value *> &LL = CandidatePairs2[S->second];
+ for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
+ L != LE; ++L) {
+ if (*L == S->first)
+ continue;
+
+ CandidatePairsSet.erase(ValuePair(*L, S->second));
+ }
+
+ std::vector<Value *> &MM = CandidatePairs[S->second];
+ for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
+ M != ME; ++M) {
+ assert(*M != S->first && "Flipped pair in candidate list?");
+ CandidatePairsSet.erase(ValuePair(S->second, *M));
+ }
+
+ std::vector<Value *> &NN = CandidatePairs2[S->first];
+ for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
+ N != NE; ++N) {
+ assert(*N != S->second && "Flipped pair in candidate list?");
+ CandidatePairsSet.erase(ValuePair(*N, S->first));
}
}
}
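
The four loops above (over KK, LL, MM, and NN) remove from the candidate set every pair that shares a value with the newly chosen pair, while leaving the chosen pair itself available for later DAG selections. A standalone sketch of that invalidation step, with integers standing in for Value pointers and the forward/reverse maps playing the roles of CandidatePairs and CandidatePairs2 (names are illustrative):

#include <cstddef>
#include <map>
#include <set>
#include <utility>
#include <vector>

typedef int Val;                      // stands in for Value*
typedef std::pair<Val, Val> Pair;

// Once (S.first, S.second) is chosen, every other candidate pair mentioning
// either value is erased from the pair set. Fwd maps first -> seconds;
// Rev maps second -> firsts.
void invalidateCandidates(const Pair &S,
                          std::map<Val, std::vector<Val> > &Fwd,
                          std::map<Val, std::vector<Val> > &Rev,
                          std::set<Pair> &PairSet) {
  const std::vector<Val> &KK = Fwd[S.first];
  for (std::size_t i = 0, e = KK.size(); i != e; ++i)
    if (KK[i] != S.second)
      PairSet.erase(Pair(S.first, KK[i]));   // other partners of S.first

  const std::vector<Val> &LL = Rev[S.second];
  for (std::size_t i = 0, e = LL.size(); i != e; ++i)
    if (LL[i] != S.first)
      PairSet.erase(Pair(LL[i], S.second));  // pairs ending at S.second

  const std::vector<Val> &MM = Fwd[S.second];
  for (std::size_t i = 0, e = MM.size(); i != e; ++i)
    PairSet.erase(Pair(S.second, MM[i]));    // pairs starting at S.second

  const std::vector<Val> &NN = Rev[S.first];
  for (std::size_t i = 0, e = NN.size(); i != e; ++i)
    PairSet.erase(Pair(NN[i], S.first));     // pairs ending at S.first
}
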
@@ -2550,7 +2701,7 @@ namespace {
continue;
} else if (isa<CallInst>(I)) {
Function *F = cast<CallInst>(I)->getCalledFunction();
- unsigned IID = F->getIntrinsicID();
+ Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID();
if (o == NumOperands-1) {
BasicBlock &BB = *I->getParent();
@@ -2559,8 +2710,7 @@ namespace {
Type *ArgTypeJ = J->getType();
Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
- ReplacedOperands[o] = Intrinsic::getDeclaration(M,
- (Intrinsic::ID) IID, VArgType);
+ ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
continue;
} else if (IID == Intrinsic::powi && o == 1) {
// The second argument of powi is a single integer and we've already
@@ -2647,7 +2797,7 @@ namespace {
 // Returns true if all uses of instruction I (including pairing-induced uses)
 // can be moved after J.
bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
- std::multimap<Value *, Value *> &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
Instruction *I, Instruction *J) {
// Skip to the first instruction past I.
BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
@@ -2655,18 +2805,18 @@ namespace {
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
for (; cast<Instruction>(L) != J; ++L)
- (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet);
+ (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
assert(cast<Instruction>(L) == J &&
"Tracking has not proceeded far enough to check for dependencies");
// If J is now in the use set of I, then trackUsesOfI will return true
// and we have a dependency cycle (and the fusing operation must abort).
- return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet);
+ return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
}
 // Move all uses of instruction I (including pairing-induced uses) after J.
void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
- std::multimap<Value *, Value *> &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
Instruction *&InsertionPt,
Instruction *I, Instruction *J) {
// Skip to the first instruction past I.
@@ -2675,7 +2825,7 @@ namespace {
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
for (; cast<Instruction>(L) != J;) {
- if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) {
+ if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
// Move this instruction
Instruction *InstToMove = L; ++L;
@@ -2695,7 +2845,8 @@ namespace {
// to be moved after J (the second instruction) when the pair is fused.
void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
DenseMap<Value *, Value *> &ChosenPairs,
- std::multimap<Value *, Value *> &LoadMoveSet,
+ DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs,
Instruction *I) {
// Skip to the first instruction past I.
BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
@@ -2708,8 +2859,10 @@ namespace {
// could be before I if this is an inverted input.
for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
if (trackUsesOfI(Users, WriteSet, I, L)) {
- if (L->mayReadFromMemory())
- LoadMoveSet.insert(ValuePair(L, I));
+ if (L->mayReadFromMemory()) {
+ LoadMoveSet[L].push_back(I);
+ LoadMoveSetPairs.insert(ValuePair(L, I));
+ }
}
}
}
@@ -2718,20 +2871,22 @@ namespace {
// are chosen for vectorization, we can end up in a situation where the
// aliasing analysis starts returning different query results as the
// process of fusing instruction pairs continues. Because the algorithm
- // relies on finding the same use trees here as were found earlier, we'll
+ // relies on finding the same use dags here as were found earlier, we'll
// need to precompute the necessary aliasing information here and then
// manually update it during the fusion process.
void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
std::vector<Value *> &PairableInsts,
DenseMap<Value *, Value *> &ChosenPairs,
- std::multimap<Value *, Value *> &LoadMoveSet) {
+ DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+ DenseSet<ValuePair> &LoadMoveSetPairs) {
for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
PIE = PairableInsts.end(); PI != PIE; ++PI) {
DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
if (P == ChosenPairs.end()) continue;
Instruction *I = cast<Instruction>(P->first);
- collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I);
+ collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
+ LoadMoveSetPairs, I);
}
}
@@ -2767,12 +2922,12 @@ namespace {
// because the vector instruction is inserted in the location of the pair's
// second member).
void BBVectorize::fuseChosenPairs(BasicBlock &BB,
- std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs,
- std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) {
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+ DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
LLVMContext& Context = BB.getContext();
// During the vectorization process, the order of the pairs to be fused
@@ -2786,8 +2941,10 @@ namespace {
E = FlippedPairs.end(); P != E; ++P)
ChosenPairs.insert(*P);
- std::multimap<Value *, Value *> LoadMoveSet;
- collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
+ DenseMap<Value *, std::vector<Value *> > LoadMoveSet;
+ DenseSet<ValuePair> LoadMoveSetPairs;
+ collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
+ LoadMoveSet, LoadMoveSetPairs);
DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
@@ -2819,7 +2976,7 @@ namespace {
ChosenPairs.erase(FP);
ChosenPairs.erase(P);
- if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) {
+ if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
DEBUG(dbgs() << "BBV: fusion of: " << *I <<
" <-> " << *J <<
" aborted because of non-trivial dependency cycle\n");
@@ -2836,18 +2993,20 @@ namespace {
// of dependencies connected via swaps, and those directly connected,
// and flip the order if the number of swaps is greater.
bool OrigOrder = true;
- VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J));
- if (IP.first == ConnectedPairDeps.end()) {
- IP = ConnectedPairDeps.equal_range(ValuePair(J, I));
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ =
+ ConnectedPairDeps.find(ValuePair(I, J));
+ if (IJ == ConnectedPairDeps.end()) {
+ IJ = ConnectedPairDeps.find(ValuePair(J, I));
OrigOrder = false;
}
- if (IP.first != ConnectedPairDeps.end()) {
+ if (IJ != ConnectedPairDeps.end()) {
unsigned NumDepsDirect = 0, NumDepsSwap = 0;
- for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
- Q != IP.second; ++Q) {
+ for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
+ TE = IJ->second.end(); T != TE; ++T) {
+ VPPair Q(IJ->first, *T);
DenseMap<VPPair, unsigned>::iterator R =
- PairConnectionTypes.find(VPPair(Q->second, Q->first));
+ PairConnectionTypes.find(VPPair(Q.second, Q.first));
assert(R != PairConnectionTypes.end() &&
"Cannot find pair connection type");
if (R->second == PairConnectionDirect)
@@ -2873,17 +3032,20 @@ namespace {
// If the pair being fused uses the opposite order from that in the pair
// connection map, then we need to flip the types.
- VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L));
- for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
- Q != IP.second; ++Q) {
- DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q);
- assert(R != PairConnectionTypes.end() &&
- "Cannot find pair connection type");
- if (R->second == PairConnectionDirect)
- R->second = PairConnectionSwap;
- else if (R->second == PairConnectionSwap)
- R->second = PairConnectionDirect;
- }
+ DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL =
+ ConnectedPairs.find(ValuePair(H, L));
+ if (HL != ConnectedPairs.end())
+ for (std::vector<ValuePair>::iterator T = HL->second.begin(),
+ TE = HL->second.end(); T != TE; ++T) {
+ VPPair Q(HL->first, *T);
+ DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ if (R->second == PairConnectionDirect)
+ R->second = PairConnectionSwap;
+ else if (R->second == PairConnectionSwap)
+ R->second = PairConnectionDirect;
+ }
bool LBeforeH = !FlipPairOrder;
unsigned NumOperands = I->getNumOperands();
@@ -2915,12 +3077,12 @@ namespace {
Instruction *K1 = 0, *K2 = 0;
replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
- // The use tree of the first original instruction must be moved to after
- // the location of the second instruction. The entire use tree of the
- // first instruction is disjoint from the input tree of the second
+ // The use dag of the first original instruction must be moved to after
+ // the location of the second instruction. The entire use dag of the
+ // first instruction is disjoint from the input dag of the second
// (by definition), and so commutes with it.
- moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
+ moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
if (!isa<StoreInst>(I)) {
L->replaceAllUsesWith(K1);
@@ -2937,17 +3099,23 @@ namespace {
// yet-to-be-fused pair. The loads in question are the keys of the map.
if (I->mayReadFromMemory()) {
std::vector<ValuePair> NewSetMembers;
- VPIteratorPair IPairRange = LoadMoveSet.equal_range(I);
- VPIteratorPair JPairRange = LoadMoveSet.equal_range(J);
- for (std::multimap<Value *, Value *>::iterator N = IPairRange.first;
- N != IPairRange.second; ++N)
- NewSetMembers.push_back(ValuePair(K, N->second));
- for (std::multimap<Value *, Value *>::iterator N = JPairRange.first;
- N != JPairRange.second; ++N)
- NewSetMembers.push_back(ValuePair(K, N->second));
+ DenseMap<Value *, std::vector<Value *> >::iterator II =
+ LoadMoveSet.find(I);
+ if (II != LoadMoveSet.end())
+ for (std::vector<Value *>::iterator N = II->second.begin(),
+ NE = II->second.end(); N != NE; ++N)
+ NewSetMembers.push_back(ValuePair(K, *N));
+ DenseMap<Value *, std::vector<Value *> >::iterator JJ =
+ LoadMoveSet.find(J);
+ if (JJ != LoadMoveSet.end())
+ for (std::vector<Value *>::iterator N = JJ->second.begin(),
+ NE = JJ->second.end(); N != NE; ++N)
+ NewSetMembers.push_back(ValuePair(K, *N));
for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
- AE = NewSetMembers.end(); A != AE; ++A)
- LoadMoveSet.insert(*A);
+ AE = NewSetMembers.end(); A != AE; ++A) {
+ LoadMoveSet[A->first].push_back(A->second);
+ LoadMoveSetPairs.insert(*A);
+ }
}
// Before removing I, set the iterator to the next instruction.
@@ -2972,6 +3140,7 @@ char BBVectorize::ID = 0;
static const char bb_vectorize_name[] = "Basic-Block Vectorization";
INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
@@ -3006,6 +3175,7 @@ VectorizeConfig::VectorizeConfig() {
MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
SplatBreaksChain = ::SplatBreaksChain;
MaxInsts = ::MaxInsts;
+ MaxPairs = ::MaxPairs;
MaxIter = ::MaxIter;
Pow2LenOnly = ::Pow2LenOnly;
NoMemOpBoost = ::NoMemOpBoost;
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a7ef248..acf2b81 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9,10 +9,10 @@
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR. Legalization of the IR is done
-// in the codegen. However, the vectorizes uses (will use) the codegen
+// in the codegen. However, the vectorizer uses (will use) the codegen
// interfaces to generate IR that is likely to result in an optimal binary.
//
-// The loop vectorizer combines consecutive loop iteration into a single
+// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
@@ -20,67 +20,107 @@
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
// of the vectorization.
-// 3. SingleBlockLoopVectorizer - A unit that performs the actual
+// 3. InnerLoopVectorizer - A unit that performs the actual
// widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
+//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
-// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
+// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
+// Vectorizing Compilers.
+//
//===----------------------------------------------------------------------===//
+
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Pass.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Value.h"
-#include "llvm/Function.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Module.h"
-#include "llvm/Type.h"
+
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
+#include <map>
+
using namespace llvm;
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
- cl::desc("Set the default vectorization width. Zero is autoselect."));
+ cl::desc("Sets the SIMD width. Zero is autoselect."));
+
+static cl::opt<unsigned>
+VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
+ cl::desc("Sets the vectorization unroll count. "
+ "Zero is autoselect."));
+
+static cl::opt<bool>
+EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+ cl::desc("Enable if-conversion during vectorization."));
/// We don't vectorize loops with a known constant trip count below this number.
-const unsigned TinyTripCountThreshold = 16;
+static cl::opt<unsigned>
+TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
+ cl::Hidden,
+ cl::desc("Don't vectorize loops with a constant "
+ "trip count that is smaller than this "
+ "value."));
+
+/// We don't unroll loops with a known constant trip count below this number.
+static const unsigned TinyTripCountUnrollThreshold = 128;
/// When performing a runtime memory check, do not check more than this
/// number of pointers. Notice that the check is quadratic!
-const unsigned RuntimeMemoryCheckThreshold = 2;
+static const unsigned RuntimeMemoryCheckThreshold = 4;
+
+/// We use a metadata with this name to indicate that a scalar loop was
+/// vectorized and that we don't need to re-vectorize it if we run into it
+/// again.
+static const char*
+AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized";
namespace {
@@ -88,7 +128,7 @@ namespace {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
-/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
+/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
@@ -97,36 +137,61 @@ class LoopVectorizationCostModel;
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
/// instructions.
-/// SingleBlockLoopVectorizer does not perform any vectorization-legality
+/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
-/// aspects. The SingleBlockLoopVectorizer relies on the
+/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
-class SingleBlockLoopVectorizer {
+class InnerLoopVectorizer {
public:
- /// Ctor.
- SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
- DominatorTree *dt, LPPassManager *Lpm,
- unsigned VecWidth):
- OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth),
- Builder(Se->getContext()), Induction(0), OldInduction(0) { }
+ InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
+ DominatorTree *DT, DataLayout *DL,
+ const TargetLibraryInfo *TLI, unsigned VecWidth,
+ unsigned UnrollFactor)
+ : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
+ VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
+ OldInduction(0), WidenMap(UnrollFactor) {}
// Perform the actual loop widening (vectorization).
void vectorize(LoopVectorizationLegality *Legal) {
- ///Create a new empty loop. Unlink the old loop and connect the new one.
+ // Create a new empty loop. Unlink the old loop and connect the new one.
createEmptyLoop(Legal);
- /// Widen each instruction in the old loop to a new one in the new loop.
- /// Use the Legality module to find the induction and reduction variables.
+ // Widen each instruction in the old loop to a new one in the new loop.
+ // Use the Legality module to find the induction and reduction variables.
vectorizeLoop(Legal);
// Register the new loop and update the analysis passes.
updateAnalysis();
- }
+ }
private:
+ /// A small list of PHINodes.
+ typedef SmallVector<PHINode*, 4> PhiVector;
+ /// When we unroll loops we have multiple vector values for each scalar.
+ /// This data structure holds the unrolled and vectorized values that
+ /// originated from one scalar instruction.
+ typedef SmallVector<Value*, 2> VectorParts;
+
+ /// Add code that checks at runtime if the accessed arrays overlap.
+ /// Returns the comparator value or NULL if no check is needed.
+ Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
+ Instruction *Loc);
/// Create an empty loop, based on the loop ranges of the old loop.
void createEmptyLoop(LoopVectorizationLegality *Legal);
/// Copy and widen the instructions from the old loop.
void vectorizeLoop(LoopVectorizationLegality *Legal);
+
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VectorParts createBlockInMask(BasicBlock *BB);
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
+
+ /// A helper function to vectorize a single BB within the innermost loop.
+ void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
+ PhiVector *PV);
+
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
void updateAnalysis();
@@ -135,6 +200,10 @@ private:
/// of scalars.
void scalarizeInstruction(Instruction *Instr);
+ /// Vectorize Load and Store instructions.
+ void vectorizeMemoryInstruction(Instruction *Instr,
+ LoopVectorizationLegality *Legal);
+
/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -142,37 +211,82 @@ private:
/// element.
Value *getBroadcastInstrs(Value *V);
- /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
- /// for each element in the vector. Starting from zero.
- Value *getConsecutiveVector(Value* Val);
+ /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
+ /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
+ /// The sequence starts at StartIdx.
+ Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
/// When we go over instructions in the basic block we rely on previous
/// values within the current basic block or on loop invariant values.
/// When we widen (vectorize) values we place them in the map. If the values
/// are not within the map, they have to be loop invariant, so we simply
/// broadcast them into a vector.
- Value *getVectorValue(Value *V);
+ VectorParts &getVectorValue(Value *V);
+
+ /// Generate a shuffle sequence that will reverse the vector Vec.
+ Value *reverseVector(Value *Vec);
+
+ /// This is a helper class that holds the vectorizer state. It maps scalar
+ /// instructions to vector instructions. When the code is 'unrolled' then
+ /// a single scalar value is mapped to multiple vector parts. The parts
+ /// are stored in the VectorPart type.
+ struct ValueMap {
+ /// C'tor. UnrollFactor controls the number of vectors ('parts') that
+ /// are mapped.
+ ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
+
+ /// \return True if 'Key' is saved in the Value Map.
+ bool has(Value *Key) const { return MapStorage.count(Key); }
+
+ /// Initializes a new entry in the map. Sets all of the vector parts to the
+ /// same value in 'Val'.
+ /// \return A reference to a vector with splat values.
+ VectorParts &splat(Value *Key, Value *Val) {
+ VectorParts &Entry = MapStorage[Key];
+ Entry.assign(UF, Val);
+ return Entry;
+ }
- /// Get a uniform vector of constant integers. We use this to get
- /// vectors of ones and zeros for the reduction code.
- Constant* getUniformVector(unsigned Val, Type* ScalarTy);
+ ///\return A reference to the value that is stored at 'Key'.
+ VectorParts &get(Value *Key) {
+ VectorParts &Entry = MapStorage[Key];
+ if (Entry.empty())
+ Entry.resize(UF);
+ assert(Entry.size() == UF);
+ return Entry;
+ }
- typedef DenseMap<Value*, Value*> ValueMap;
+ private:
+ /// The unroll factor. Each entry in the map stores this number of vector
+ /// elements.
+ unsigned UF;
+
+ /// Map storage. We use std::map and not DenseMap because insertions to a
+ /// dense map invalidate its iterators.
+ std::map<Value *, VectorParts> MapStorage;
+ };
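
A small, self-contained mock showing how this unroll-aware value map is meant to behave (integers stand in for the IR Value handles; the class name and keys here are illustrative only): splat() fills every unrolled part with the same loop-invariant value, while get() hands back one slot per part for the widening code to fill.

#include <cassert>
#include <map>
#include <vector>

typedef int Part;                         // stands in for Value*
typedef std::vector<Part> VectorParts;

struct MockValueMap {
  explicit MockValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

  bool has(int Key) const { return MapStorage.count(Key); }

  // Loop-invariant value: every unrolled part holds the same value.
  VectorParts &splat(int Key, Part Val) {
    VectorParts &Entry = MapStorage[Key];
    Entry.assign(UF, Val);
    return Entry;
  }

  // Widened value: one slot per unrolled part, filled in by the widening code.
  VectorParts &get(int Key) {
    VectorParts &Entry = MapStorage[Key];
    if (Entry.empty())
      Entry.resize(UF);
    assert(Entry.size() == UF);
    return Entry;
  }

private:
  unsigned UF;
  std::map<int, VectorParts> MapStorage;
};

int main() {
  MockValueMap WidenMap(2);              // unroll factor 2
  WidenMap.splat(7, 42);                 // invariant scalar 7 -> {42, 42}
  VectorParts &Parts = WidenMap.get(7);
  return (Parts[0] == 42 && Parts[1] == 42) ? 0 : 1;
}
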
/// The original loop.
Loop *OrigLoop;
- // Scev analysis to use.
+ /// Scev analysis to use.
ScalarEvolution *SE;
- // Loop Info.
+ /// Loop Info.
LoopInfo *LI;
- // Dominator Tree.
+ /// Dominator Tree.
DominatorTree *DT;
- // Loop Pass Manager;
- LPPassManager *LPM;
- // The vectorization factor to use.
+ /// Data Layout.
+ DataLayout *DL;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// The vectorization SIMD factor to use. Each vector will have this many
+ /// vector elements.
unsigned VF;
+ /// The vectorization unroll factor to use. Each scalar is vectorized to this
+ /// many different vector instructions.
+ unsigned UF;
- // The builder that we use
+ /// The builder that we use
IRBuilder<> Builder;
// --- Vectorization state ---
@@ -189,14 +303,14 @@ private:
BasicBlock *LoopVectorBody;
///The scalar loop body.
BasicBlock *LoopScalarBody;
- ///The first bypass block.
- BasicBlock *LoopBypassBlock;
+ /// A list of all bypass blocks. The first block is the entry of the loop.
+ SmallVector<BasicBlock *, 4> LoopBypassBlocks;
/// The new Induction variable which was added to the new block.
PHINode *Induction;
/// The induction variable of the old basic block.
PHINode *OldInduction;
- // Maps scalars to widened vectors.
+ /// Maps scalars to widened vectors.
ValueMap WidenMap;
};
@@ -207,36 +321,48 @@ private:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
/// will change the order of memory accesses in a way that will change the
/// correctness of the program.
-/// * Scalars checks - The code in canVectorizeBlock checks for a number
-/// of different conditions, such as the availability of a single induction
-/// variable, that all types are supported and vectorize-able, etc.
-/// This code reflects the capabilities of SingleBlockLoopVectorizer.
-/// This class is also used by SingleBlockLoopVectorizer for identifying
+/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
+/// checks for a number of different conditions, such as the availability of a
+/// single induction variable, that all types are supported and vectorize-able,
+/// etc. This code reflects the capabilities of InnerLoopVectorizer.
+/// This class is also used by InnerLoopVectorizer for identifying
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
- LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
- TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
+ LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
+ DominatorTree *DT, TargetTransformInfo* TTI,
+ AliasAnalysis *AA, TargetLibraryInfo *TLI)
+ : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
+ Induction(0) {}
- /// This represents the kinds of reductions that we support.
+ /// This enum represents the kinds of reductions that we support.
enum ReductionKind {
- NoReduction, /// Not a reduction.
- IntegerAdd, /// Sum of numbers.
- IntegerMult, /// Product of numbers.
- IntegerOr, /// Bitwise or logical OR of numbers.
- IntegerAnd, /// Bitwise or logical AND of numbers.
- IntegerXor /// Bitwise or logical XOR of numbers.
+ RK_NoReduction, ///< Not a reduction.
+ RK_IntegerAdd, ///< Sum of integers.
+ RK_IntegerMult, ///< Product of integers.
+ RK_IntegerOr, ///< Bitwise or logical OR of numbers.
+ RK_IntegerAnd, ///< Bitwise or logical AND of numbers.
+ RK_IntegerXor, ///< Bitwise or logical XOR of numbers.
+ RK_FloatAdd, ///< Sum of floats.
+ RK_FloatMult ///< Product of floats.
+ };
+
+ /// This enum represents the kinds of inductions that we support.
+ enum InductionKind {
+ IK_NoInduction, ///< Not an induction variable.
+ IK_IntInduction, ///< Integer induction variable. Step = 1.
+ IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
+ IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem).
+ IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem).
};
/// This POD struct holds information about reduction variables.
struct ReductionDescriptor {
- // Default C'tor
- ReductionDescriptor():
- StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
+ ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
+ Kind(RK_NoReduction) {}
- // C'tor.
- ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
- StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+ ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
+ : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
// The starting value of the reduction.
// It does not have to be zero!
@@ -250,52 +376,113 @@ public:
// This POD struct holds information about the memory runtime legality
// check that a group of pointers do not overlap.
struct RuntimePointerCheck {
+ RuntimePointerCheck() : Need(false) {}
+
+ /// Reset the state of the pointer runtime information.
+ void reset() {
+ Need = false;
+ Pointers.clear();
+ Starts.clear();
+ Ends.clear();
+ }
+
+ /// Insert a pointer and calculate the start and end SCEVs.
+ void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);
+
/// This flag indicates if we need to add the runtime check.
bool Need;
/// Holds the pointers that we need to check.
SmallVector<Value*, 2> Pointers;
+ /// Holds the pointer value at the beginning of the loop.
+ SmallVector<const SCEV*, 2> Starts;
+ /// Holds the pointer value at the end of the loop.
+ SmallVector<const SCEV*, 2> Ends;
+ };
+
+ /// A POD for saving information about induction variables.
+ struct InductionInfo {
+ InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
+ InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
+ /// Start value.
+ Value *StartValue;
+ /// Induction kind.
+ InductionKind IK;
};
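
The runtime memory check built from this information compares every pair of tracked pointer ranges, which is why the comment above warns that the check is quadratic and RuntimeMemoryCheckThreshold stays small. A hedged sketch of that pairwise disjointness test, with plain integer byte ranges standing in for the SCEV-derived Starts/Ends:

#include <cstddef>
#include <cstdint>
#include <vector>

// One tracked pointer: the byte range [Start, End) it accesses in the loop.
// In the pass these bounds come from SCEV; here they are plain integers.
struct PtrRange { std::uint64_t Start, End; };

// Quadratic check: every pair of tracked pointer ranges must be disjoint.
bool rangesDisjoint(const std::vector<PtrRange> &Ptrs) {
  for (std::size_t I = 0, N = Ptrs.size(); I < N; ++I)
    for (std::size_t J = I + 1; J < N; ++J) {
      bool Disjoint = Ptrs[I].End <= Ptrs[J].Start ||
                      Ptrs[J].End <= Ptrs[I].Start;
      if (!Disjoint)
        return false;
    }
  return true;
}
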
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
+ /// InductionList saves induction variables and maps them to the
+ /// induction descriptor.
+ typedef MapVector<PHINode*, InductionInfo> InductionList;
+
+ /// Alias(Multi)Map stores the values (GEPs or underlying objects) and their
+ /// respective Store/Load instruction(s) to calculate aliasing.
+ typedef MapVector<Value*, Instruction* > AliasMap;
+ typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
+
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
/// loop, only that it is legal to do so.
bool canVectorize();
/// Returns the Induction variable.
- PHINode *getInduction() {return Induction;}
+ PHINode *getInduction() { return Induction; }
/// Returns the reduction variables found in the loop.
ReductionList *getReductionVars() { return &Reductions; }
- /// Check if the pointer returned by this GEP is consecutive
- /// when the index is vectorized. This happens when the last
- /// index of the GEP is consecutive, like the induction variable.
+ /// Returns the induction variables found in the loop.
+ InductionList *getInductionVars() { return &Inductions; }
+
+ /// Returns True if V is an induction variable in this loop.
+ bool isInductionVariable(const Value *V);
+
+ /// Return true if the block BB needs to be predicated in order for the loop
+ /// to be vectorized.
+ bool blockNeedsPredication(BasicBlock *BB);
+
+ /// Check if this pointer is consecutive when vectorizing. This happens
+ /// when the last index of the GEP is the induction variable, or that the
+ /// pointer itself is an induction variable.
/// This check allows us to vectorize A[idx] into a wide load/store.
- bool isConsecutiveGep(Value *Ptr);
+ /// Returns:
+ /// 0 - Stride is unknown or non-consecutive.
+ /// 1 - Address is consecutive.
+ /// -1 - Address is consecutive, and decreasing.
+ int isConsecutivePtr(Value *Ptr);
/// Returns true if the value V is uniform within the loop.
bool isUniform(Value *V);
/// Returns true if this instruction will remain scalar after vectorization.
- bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
+ bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
/// Returns the information that we collected about runtime memory check.
- RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
+ RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
/// and we only need to check individual instructions.
- bool canVectorizeBlock(BasicBlock &BB);
+ bool canVectorizeInstrs();
/// When we vectorize loops we may change the order in which
/// we read and write from memory. This method checks if it is
/// legal to vectorize the code, considering only memory constraints.
- /// Returns true if BB is vectorizable
- bool canVectorizeMemory(BasicBlock &BB);
+ /// Returns true if the loop is vectorizable
+ bool canVectorizeMemory();
+
+ /// Return true if we can vectorize this loop using the IF-conversion
+ /// transformation.
+ bool canVectorizeWithIfConvert();
+
+ /// Collect the variables that need to stay uniform after vectorization.
+ void collectLoopUniforms();
+
+ /// Return true if all of the instructions in the block can be speculatively
+ /// executed.
+ bool blockCanBePredicated(BasicBlock *BB);
/// Returns True, if 'Phi' is the kind of reduction variable for type
/// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
@@ -303,10 +490,19 @@ private:
/// Returns true if the instruction I can be a reduction variable of type
/// 'Kind'.
bool isReductionInstr(Instruction *I, ReductionKind Kind);
- /// Returns True, if 'Phi' is an induction variable.
- bool isInductionVariable(PHINode *Phi);
+ /// Returns the induction kind of Phi. This function may return IK_NoInduction
+ /// if the PHI is not an induction variable.
+ InductionKind isInductionVariable(PHINode *Phi);
/// Return true if we can compute the address bounds of Ptr within the loop.
bool hasComputableBounds(Value *Ptr);
+ /// Return true if there is a chance of write reordering.
+ bool hasPossibleGlobalWriteReorder(Value *Object,
+ Instruction *Inst,
+ AliasMultiMap &WriteObjects,
+ unsigned MaxByteWidth);
+ /// Return the AA location for a load or a store.
+ AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);
+
/// The loop that we evaluate.
Loop *TheLoop;
@@ -314,13 +510,27 @@ private:
ScalarEvolution *SE;
/// DataLayout analysis.
DataLayout *DL;
+ /// Dominators.
+ DominatorTree *DT;
+ /// Target Info.
+ TargetTransformInfo *TTI;
+ /// Alias Analysis.
+ AliasAnalysis *AA;
+ /// Target Library Info.
+ TargetLibraryInfo *TLI;
// --- vectorization state --- //
- /// Holds the induction variable.
+ /// Holds the integer induction variable. This is the counter of the
+ /// loop.
PHINode *Induction;
/// Holds the reduction variables.
ReductionList Reductions;
+ /// Holds all of the induction variables that we found in the loop.
+ /// Notice that inductions don't need to start at zero and that induction
+ /// variables can be pointers.
+ InductionList Inductions;
+
/// Allowed outside users. This holds the reduction
/// vars which can be accessed from outside the loop.
SmallPtrSet<Value*, 4> AllowedExit;
@@ -334,23 +544,57 @@ private:
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
-/// In many cases vectorization is not profitable. This can happen because
-/// of a number of reasons. In this class we mainly attempt to predict
-/// the expected speedup/slowdowns due to the supported instruction set.
-/// We use the VectorTargetTransformInfo to query the different backends
-/// for the cost of different operations.
+/// In many cases vectorization is not profitable. This can happen because of
+/// a number of reasons. In this class we mainly attempt to predict the
+/// expected speedup/slowdowns due to the supported instruction set. We use the
+/// TargetTransformInfo to query the different backends for the cost of
+/// different operations.
class LoopVectorizationCostModel {
public:
- /// C'tor.
- LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
- LoopVectorizationLegality *Leg,
- const VectorTargetTransformInfo *Vtti):
- TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
+ LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ const TargetTransformInfo &TTI,
+ DataLayout *DL, const TargetLibraryInfo *TLI)
+ : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
+
+ /// Information about vectorization costs
+ struct VectorizationFactor {
+ unsigned Width; // Vector width with best cost
+ unsigned Cost; // Cost of the loop with that width
+ };
+ /// \return The most profitable vectorization factor and the cost of that VF.
+ /// This method checks every power of two up to VF. If UserVF is not ZERO
+ /// then this vectorization factor will be selected if vectorization is
+ /// possible.
+ VectorizationFactor selectVectorizationFactor(bool OptForSize,
+ unsigned UserVF);
+
+ /// \return The size (in bits) of the widest type in the code that
+ /// needs to be vectorized. We ignore values that remain scalar such as
+ /// 64 bit loop indices.
+ unsigned getWidestType();
+
+ /// \return The most profitable unroll factor.
+ /// If UserUF is non-zero then this method finds the best unroll-factor
+ /// based on register pressure and other parameters.
+ /// VF and LoopCost are the selected vectorization factor and the cost of the
+ /// selected VF.
+ unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
+ unsigned LoopCost);
+
+ /// \brief A struct that represents some properties of the register usage
+ /// of a loop.
+ struct RegisterUsage {
+ /// Holds the number of loop invariant values that are used in the loop.
+ unsigned LoopInvariantRegs;
+ /// Holds the maximum number of concurrent live intervals in the loop.
+ unsigned MaxLocalUsers;
+ /// Holds the number of instructions in the loop.
+ unsigned NumInstructions;
+ };
- /// Returns the most profitable vectorization factor for the loop that is
- /// smaller or equal to the VF argument. This method checks every power
- /// of two up to VF.
- unsigned findBestVectorizationFactor(unsigned VF = 8);
+ /// \return information about the register usage of the loop.
+ RegisterUsage calculateRegisterUsage();
private:
/// Returns the expected execution cost. The unit of the cost does
@@ -368,21 +612,32 @@ private:
/// the scalar type.
static Type* ToVectorTy(Type *Scalar, unsigned VF);
+ /// Returns whether the instruction is a load or store and will be emitted
+ /// as a vector operation.
+ bool isConsecutiveLoadOrStore(Instruction *I);
+
/// The loop that we evaluate.
Loop *TheLoop;
/// Scev analysis.
ScalarEvolution *SE;
-
+ /// Loop Info analysis.
+ LoopInfo *LI;
/// Vectorization legality.
LoopVectorizationLegality *Legal;
/// Vector target information.
- const VectorTargetTransformInfo *VTTI;
+ const TargetTransformInfo &TTI;
+ /// Target data layout information.
+ DataLayout *DL;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
};
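// A minimal sketch of the width-selection idea declared in the cost model
// above (illustration only). 'selectWidthSketch' and 'costAtWidth' are
// hypothetical names; the real selectVectorizationFactor queries
// TargetTransformInfo and also honors UserVF, OptForSize and the widest type
// found in the loop.
static unsigned selectWidthSketch(unsigned MaxVF,
                                  unsigned (*costAtWidth)(unsigned)) {
  unsigned BestVF = 1;
  float BestCostPerLane = (float)costAtWidth(1);
  // Try every power of two up to MaxVF and keep the width with the lowest
  // cost per scalar iteration.
  for (unsigned VF = 2; VF <= MaxVF; VF *= 2) {
    float CostPerLane = (float)costAtWidth(VF) / VF;
    if (CostPerLane < BestCostPerLane) {
      BestCostPerLane = CostPerLane;
      BestVF = VF;
    }
  }
  return BestVF;
}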
+/// The LoopVectorize Pass.
struct LoopVectorize : public LoopPass {
- static char ID; // Pass identification, replacement for typeid
+ /// Pass identification, replacement for typeid
+ static char ID;
- LoopVectorize() : LoopPass(ID) {
+ explicit LoopVectorize() : LoopPass(ID) {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
@@ -391,6 +646,8 @@ struct LoopVectorize : public LoopPass {
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
+ AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// We only vectorize innermost loops.
@@ -400,45 +657,57 @@ struct LoopVectorize : public LoopPass {
SE = &getAnalysis<ScalarEvolution>();
DL = getAnalysisIfAvailable<DataLayout>();
LI = &getAnalysis<LoopInfo>();
- TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+ TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTree>();
+ AA = getAnalysisIfAvailable<AliasAnalysis>();
+ TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL);
+ LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
- // Select the preffered vectorization factor.
- unsigned VF = 1;
- if (VectorizationFactor == 0) {
- const VectorTargetTransformInfo *VTTI = 0;
- if (TTI)
- VTTI = TTI->getVectorTargetTransformInfo();
- // Use the cost model.
- LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
- VF = CM.findBestVectorizationFactor();
-
- if (VF == 1) {
- DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
- return false;
- }
+ // Use the cost model.
+ LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);
+
+ // Check the function attributes to find out if this function should be
+ // optimized for size.
+ Function *F = L->getHeader()->getParent();
+ Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
+ Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
+ unsigned FnIndex = AttributeSet::FunctionIndex;
+ bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
+ bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
+
+ if (NoFloat) {
+ DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat "
+ "attribute is used.\n");
+ return false;
+ }
- } else {
- // Use the user command flag.
- VF = VectorizationFactor;
+ // Select the optimal vectorization factor.
+ LoopVectorizationCostModel::VectorizationFactor VF;
+ VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
+ // Select the unroll factor.
+ unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll,
+ VF.Width, VF.Cost);
+
+ if (VF.Width == 1) {
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ return false;
}
- DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
- L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
- "\n");
+ DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
+ F->getParent()->getModuleIdentifier()<<"\n");
+ DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
- // If we decided that it is *legal* to vectorizer the loop then do it.
- SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
+ // If we decided that it is *legal* to vectorize the loop then do it.
+ InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -449,52 +718,75 @@ struct LoopVectorize : public LoopPass {
LoopPass::getAnalysisUsage(AU);
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
+ AU.addRequired<DominatorTree>();
AU.addRequired<LoopInfo>();
AU.addRequired<ScalarEvolution>();
- AU.addRequired<DominatorTree>();
+ AU.addRequired<TargetTransformInfo>();
AU.addPreserved<LoopInfo>();
AU.addPreserved<DominatorTree>();
}
};
-Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
- // Instructions that access the old induction variable
- // actually want to get the new one.
- if (V == OldInduction)
- V = Induction;
- // Create the types.
- LLVMContext &C = V->getContext();
- Type *VTy = VectorType::get(V->getType(), VF);
- Type *I32 = IntegerType::getInt32Ty(C);
- Constant *Zero = ConstantInt::get(I32, 0);
- Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
- Value *UndefVal = UndefValue::get(VTy);
- // Insert the value into a new vector.
- Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
+// LoopVectorizationCostModel.
+//===----------------------------------------------------------------------===//
+
+void
+LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
+ Loop *Lp, Value *Ptr) {
+ const SCEV *Sc = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+ assert(AR && "Invalid addrec expression");
+ const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
+ const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+ Pointers.push_back(Ptr);
+ Starts.push_back(AR->getStart());
+ Ends.push_back(ScEnd);
+}
+
+Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
+ // Save the current insertion location.
+ Instruction *Loc = Builder.GetInsertPoint();
+
+ // We need to place the broadcast of invariant variables outside the loop.
+ Instruction *Instr = dyn_cast<Instruction>(V);
+ bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
+ bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
+
+ // Place the code for broadcasting invariant variables in the new preheader.
+ if (Invariant)
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
// Broadcast the scalar into all locations in the vector.
- Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
- "broadcast");
- // We are accessing the induction variable. Make sure to promote the
- // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
- if (V == Induction)
- return getConsecutiveVector(Shuf);
+ Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
+
+ // Restore the builder insertion point.
+ if (Invariant)
+ Builder.SetInsertPoint(Loc);
+
return Shuf;
}
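// For illustration only, a hypothetical source loop in which
// getBroadcastInstrs fires on a loop-invariant scalar: 'X' is splat into a
// vector once, in the new vector preheader, and that single splat feeds every
// vector iteration.
void broadcastExample(int *A, int X, int N) { // hypothetical example function
  for (int i = 0; i < N; ++i)
    A[i] += X; // X becomes <X, X, ..., X> (VF lanes) in the vector body.
}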
-Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
+Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
+ bool Negate) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
assert(Val->getType()->getScalarType()->isIntegerTy() &&
"Elem must be an integer");
// Create the types.
Type *ITy = Val->getType()->getScalarType();
VectorType *Ty = cast<VectorType>(Val->getType());
- unsigned VLen = Ty->getNumElements();
+ int VLen = Ty->getNumElements();
SmallVector<Constant*, 8> Indices;
// Create a vector of consecutive numbers from zero to VF.
- for (unsigned i = 0; i < VLen; ++i)
- Indices.push_back(ConstantInt::get(ITy, i));
+ for (int i = 0; i < VLen; ++i) {
+ int Idx = Negate ? (-i): i;
+ Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx));
+ }
// Add the consecutive indices to the vector value.
Constant *Cv = ConstantVector::get(Indices);
@@ -502,20 +794,58 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
return Builder.CreateAdd(Val, Cv, "induction");
}
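// A small model of what getConsecutiveVector produces per lane, assuming Val
// is a splat of a scalar (illustration only; 'consecutiveLane' is a
// hypothetical helper). For VF=4, StartIdx=0: <Val, Val+1, Val+2, Val+3>, and
// with Negate set: <Val, Val-1, Val-2, Val-3>.
static int consecutiveLane(int Val, int StartIdx, bool Negate, int Lane) {
  // Lane i of the result is Val + StartIdx + (Negate ? -i : i).
  return Val + StartIdx + (Negate ? -Lane : Lane);
}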
-bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+ assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+ // Make sure that the pointer does not point to structs.
+ if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
+ return 0;
+
+ // If this value is a pointer induction variable we know it is consecutive.
+ PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
+ if (Phi && Inductions.count(Phi)) {
+ InductionInfo II = Inductions[Phi];
+ if (IK_PtrInduction == II.IK)
+ return 1;
+ else if (IK_ReversePtrInduction == II.IK)
+ return -1;
+ }
+
GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
if (!Gep)
- return false;
+ return 0;
unsigned NumOperands = Gep->getNumOperands();
Value *LastIndex = Gep->getOperand(NumOperands - 1);
+ Value *GpPtr = Gep->getPointerOperand();
+ // If the pointer operand of this GEP is a pointer induction variable and
+ // all of the gep indices are loop invariant then we know it is consecutive.
+ Phi = dyn_cast<PHINode>(GpPtr);
+ if (Phi && Inductions.count(Phi)) {
+
+ // Make sure that the pointer does not point to structs.
+ PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
+ if (GepPtrType->getElementType()->isAggregateType())
+ return 0;
+
+ // Make sure that all of the index operands are loop invariant.
+ for (unsigned i = 1; i < NumOperands; ++i)
+ if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+ return 0;
+
+ InductionInfo II = Inductions[Phi];
+ if (IK_PtrInduction == II.IK)
+ return 1;
+ else if (IK_ReversePtrInduction == II.IK)
+ return -1;
+ }
+
// Check that all of the gep indices are uniform except for the last.
for (unsigned i = 0; i < NumOperands - 1; ++i)
if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
- return false;
+ return 0;
- // We can emit wide load/stores only of the last index is the induction
+ // We can emit wide load/stores only if the last index is the induction
// variable.
const SCEV *Last = SE->getSCEV(LastIndex);
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
@@ -524,44 +854,153 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
// The memory is consecutive because the last index is consecutive
// and all other indices are loop invariant.
if (Step->isOne())
- return true;
+ return 1;
+ if (Step->isAllOnesValue())
+ return -1;
}
- return false;
+ return 0;
}
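// For illustration, the three possible answers on simple accesses, assuming
// 'i' is the loop induction variable (hypothetical source snippets):
//
//   A[i]          ->  1  (consecutive, increasing)
//   A[N - 1 - i]  -> -1  (consecutive, decreasing)
//   A[2 * i]      ->  0  (strided; handled by scalarization instead)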
bool LoopVectorizationLegality::isUniform(Value *V) {
return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}
-Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+InnerLoopVectorizer::VectorParts&
+InnerLoopVectorizer::getVectorValue(Value *V) {
+ assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
- // If we saved a vectorized copy of V, use it.
- Value *&MapEntry = WidenMap[V];
- if (MapEntry)
- return MapEntry;
- // Broadcast V and save the value for future uses.
+ // If we have this scalar in the map, return it.
+ if (WidenMap.has(V))
+ return WidenMap.get(V);
+
+ // If this scalar is unknown, assume that it is a constant or that it is
+ // loop invariant. Broadcast V and save the value for future uses.
Value *B = getBroadcastInstrs(V);
- MapEntry = B;
- return B;
+ return WidenMap.splat(V, B);
}
-Constant*
-SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
- SmallVector<Constant*, 8> Indices;
- // Create a vector of consecutive numbers from zero to VF.
+Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
+ assert(Vec->getType()->isVectorTy() && "Invalid type");
+ SmallVector<Constant*, 8> ShuffleMask;
for (unsigned i = 0; i < VF; ++i)
- Indices.push_back(ConstantInt::get(ScalarTy, Val, true));
+ ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
- // Add the consecutive indices to the vector value.
- return ConstantVector::get(Indices);
+ return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
+ ConstantVector::get(ShuffleMask),
+ "reverse");
}
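// A model of the shuffle mask built above (illustration only; 'reversedLane'
// is a hypothetical helper): for VF = 4 the mask is <3, 2, 1, 0>, so lane j of
// the result is lane (VF - 1 - j) of the input.
static unsigned reversedLane(unsigned VF, unsigned Lane) {
  return VF - 1 - Lane;
}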
-void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+
+void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
+ LoopVectorizationLegality *Legal) {
+ // Attempt to issue a wide load.
+ LoadInst *LI = dyn_cast<LoadInst>(Instr);
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+
+ assert((LI || SI) && "Invalid Load/Store instruction");
+
+ Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ Type *DataTy = VectorType::get(ScalarDataTy, VF);
+ Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+ unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+
+ // If the pointer is loop invariant or if it is non-consecutive,
+ // scalarize the access.
+ int Stride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = Stride < 0;
+ bool UniformLoad = LI && Legal->isUniform(Ptr);
+ if (Stride == 0 || UniformLoad)
+ return scalarizeInstruction(Instr);
+
+ Constant *Zero = Builder.getInt32(0);
+ VectorParts &Entry = WidenMap.get(Instr);
+
+ // Handle consecutive loads/stores.
+ GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+ if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
+ Value *PtrOperand = Gep->getPointerOperand();
+ Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
+ FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(0, FirstBasePtr);
+ Gep2->setName("gep.indvar.base");
+ Ptr = Builder.Insert(Gep2);
+ } else if (Gep) {
+ assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
+ OrigLoop) && "Base ptr must be invariant");
+
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+
+ Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
+ VectorParts &GEPParts = getVectorValue(LastGepOperand);
+ Value *LastIndex = GEPParts[0];
+ LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(NumOperands - 1, LastIndex);
+ Gep2->setName("gep.indvar.idx");
+ Ptr = Builder.Insert(Gep2);
+ } else {
+ // Use the induction element ptr.
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+ VectorParts &PtrVal = getVectorValue(Ptr);
+ Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
+ }
+
+ // Handle Stores:
+ if (SI) {
+ assert(!Legal->isUniform(SI->getPointerOperand()) &&
+ "We do not allow storing to uniform addresses");
+
+ VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+
+ if (Reverse) {
+ // If we store to reverse consecutive memory locations then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal[Part] = reverseVector(StoredVal[Part]);
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
+ PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+ }
+
+ Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+ Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
+ }
+ }
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+
+ if (Reverse) {
+ // If the address is consecutive but reversed, then the
+ // wide load needs to start at the last vector element.
+ PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
+ PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+ }
+
+ Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+ Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
+ cast<LoadInst>(LI)->setAlignment(Alignment);
+ Entry[Part] = Reverse ? reverseVector(LI) : LI;
+ }
+}
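// For illustration only, a hypothetical loop that takes the 'Reverse' path
// above: the addresses decrease by one element per iteration, so each wide
// load/store is anchored at the lowest address of its group and the element
// order is flipped with reverseVector().
void reverseCopy(int *A, const int *B, int N) { // hypothetical example
  for (int i = N - 1; i >= 0; --i)
    A[i] = B[i];
}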
+
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
- SmallVector<Value*, 8> Params;
+ SmallVector<VectorParts, 4> Params;
// Find all of the vectorized parameters.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
@@ -569,7 +1008,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
// If we are accessing the old induction variable, use the new one.
if (SrcOp == OldInduction) {
- Params.push_back(getBroadcastInstrs(Induction));
+ Params.push_back(getVectorValue(SrcOp));
continue;
}
@@ -578,13 +1017,15 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
// If the src is an instruction that appeared earlier in the basic block
// then it should already be vectorized.
- if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
- assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
+ if (SrcInst && OrigLoop->contains(SrcInst)) {
+ assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
// The parameter is a vector value from earlier.
- Params.push_back(WidenMap[SrcInst]);
+ Params.push_back(WidenMap.get(SrcInst));
} else {
// The parameter is a scalar from outside the loop. Maybe even a constant.
- Params.push_back(SrcOp);
+ VectorParts Scalars;
+ Scalars.append(UF, SrcOp);
+ Params.push_back(Scalars);
}
}
@@ -593,112 +1034,185 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- Value *VecResults = 0;
- // If we have a return value, create an empty vector. We place the scalarized
- // instructions in this vector.
- if (!IsVoidRetTy)
- VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
+ Value *UndefVec = IsVoidRetTy ? 0 :
+ UndefValue::get(VectorType::get(Instr->getType(), VF));
+ // Create a new entry in the WidenMap and initialize it to Undef or Null.
+ VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
// For each scalar that we create:
- for (unsigned i = 0; i < VF; ++i) {
- Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
- Cloned->setName(Instr->getName() + ".cloned");
- // Replace the operands of the cloned instrucions with extracted scalars.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- Value *Op = Params[op];
- // Param is a vector. Need to extract the right lane.
- if (Op->getType()->isVectorTy())
- Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
- Cloned->setOperand(op, Op);
+ for (unsigned Width = 0; Width < VF; ++Width) {
+ // For each vector unroll 'part':
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
+ // Replace the operands of the cloned instructions with extracted scalars.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ Value *Op = Params[op][Part];
+ // Param is a vector. Need to extract the right lane.
+ if (Op->getType()->isVectorTy())
+ Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
+ Cloned->setOperand(op, Op);
+ }
+
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
+
+ // If the original scalar returns a value we need to place it in a vector
+ // so that future users will be able to use it.
+ if (!IsVoidRetTy)
+ VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
+ Builder.getInt32(Width));
+ }
+ }
+}
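// For illustration only, a hypothetical store that ends up in
// scalarizeInstruction(): A[2 * i] is strided rather than consecutive, so the
// vectorizer emits VF * UF scalar clones of the store, extracting the stored
// value lane by lane (assuming the loop is otherwise legal to vectorize).
void stridedStore(int *A, const int *B, int N) { // hypothetical example
  for (int i = 0; i < N; ++i)
    A[2 * i] = B[i];
}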
+
+Instruction *
+InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
+ Instruction *Loc) {
+ LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+ Legal->getRuntimePointerCheck();
+
+ if (!PtrRtCheck->Need)
+ return NULL;
+
+ Instruction *MemoryRuntimeCheck = 0;
+ unsigned NumPointers = PtrRtCheck->Pointers.size();
+ SmallVector<Value* , 2> Starts;
+ SmallVector<Value* , 2> Ends;
+
+ SCEVExpander Exp(*SE, "induction");
+
+ // Use this type for pointer arithmetic.
+ Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
+
+ for (unsigned i = 0; i < NumPointers; ++i) {
+ Value *Ptr = PtrRtCheck->Pointers[i];
+ const SCEV *Sc = SE->getSCEV(Ptr);
+
+ if (SE->isLoopInvariant(Sc, OrigLoop)) {
+ DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
+ *Ptr <<"\n");
+ Starts.push_back(Ptr);
+ Ends.push_back(Ptr);
+ } else {
+ DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+
+ Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
+ Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
+ Starts.push_back(Start);
+ Ends.push_back(End);
}
+ }
+
+ IRBuilder<> ChkBuilder(Loc);
+
+ for (unsigned i = 0; i < NumPointers; ++i) {
+ for (unsigned j = i+1; j < NumPointers; ++j) {
+ Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
+ Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
+ Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
+ Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc");
- // Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
+ Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
+ Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
+ Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
+ if (MemoryRuntimeCheck)
+ IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
+ "conflict.rdx");
- // If the original scalar returns a value we need to place it in a vector
- // so that future users will be able to use it.
- if (!IsVoidRetTy)
- VecResults = Builder.CreateInsertElement(VecResults, Cloned,
- Builder.getInt32(i));
+ MemoryRuntimeCheck = cast<Instruction>(IsConflict);
+ }
}
- if (!IsVoidRetTy)
- WidenMap[Instr] = VecResults;
+ return MemoryRuntimeCheck;
}
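// The conflict test emitted above, written as a plain predicate (illustration
// only; 'rangesConflict' is a hypothetical helper using integer addresses):
// two ranges [StartA, EndA] and [StartB, EndB] conflict exactly when each one
// starts no later than the other one ends.
static bool rangesConflict(unsigned long long StartA, unsigned long long EndA,
                           unsigned long long StartB, unsigned long long EndB) {
  return StartA <= EndB && StartB <= EndA;
}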
void
-SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
+InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
scalar remainder.
- [ ] <-- vector loop bypass.
- / |
- / v
-| [ ] <-- vector pre header.
-| |
-| v
-| [ ] \
-| [ ]_| <-- vector loop.
-| |
- \ v
- >[ ] <--- middle-block.
- / |
- / v
-| [ ] <--- new preheader.
-| |
-| v
-| [ ] \
-| [ ]_| <-- old scalar loop to handle remainder.
- \ |
- \ v
- >[ ] <-- exit block.
+ [ ] <-- vector loop bypass (may consist of multiple blocks).
+ / |
+ / v
+ | [ ] <-- vector pre header.
+ | |
+ | v
+ | [ ] \
+ | [ ]_| <-- vector loop.
+ | |
+ \ v
+ >[ ] <--- middle-block.
+ / |
+ / v
+ | [ ] <--- new preheader.
+ | |
+ | v
+ | [ ] \
+ | [ ]_| <-- old scalar loop to handle remainder.
+ \ |
+ \ v
+ >[ ] <-- exit block.
...
*/
+ BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+ BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
+ BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+ assert(ExitBlock && "Must have an exit block");
+
+ // Mark the old scalar loop with metadata that tells us not to vectorize this
+ // loop again if we run into it.
+ MDNode *MD = MDNode::get(OldBasicBlock->getContext(), ArrayRef<Value*>());
+ OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD);
+
+ // Some loops have a single integer induction variable, while other loops
+ // don't. One example is C++ iterators, which often have multiple pointer
+ // induction variables. In the code below we also support the case where we
+ // don't have a single induction variable.
OldInduction = Legal->getInduction();
- assert(OldInduction && "We must have a single phi node.");
- Type *IdxTy = OldInduction->getType();
+ Type *IdxTy = OldInduction ? OldInduction->getType() :
+ DL->getIntPtrType(SE->getContext());
// Find the loop boundaries.
- const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+ const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
// Get the total trip count from the count by adding 1.
ExitCount = SE->getAddExpr(ExitCount,
SE->getConstant(ExitCount->getType(), 1));
- // We may need to extend the index in case there is a type mismatch.
- // We know that the count starts at zero and does not overflow.
- // We are using Zext because it should be less expensive.
- if (ExitCount->getType() != IdxTy)
- ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
- // This is the original scalar-loop preheader.
- BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
- BasicBlock *ExitBlock = OrigLoop->getExitBlock();
- assert(ExitBlock && "Must have an exit block");
+ // Expand the trip count and place the new instructions in the preheader.
+ // Notice that the pre-header does not change, only the loop body.
+ SCEVExpander Exp(*SE, "induction");
+
+ // Count holds the overall loop count (N).
+ Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+ BypassBlock->getTerminator());
- // The loop index does not have to start at Zero. It starts with this value.
- Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
+ // The loop index does not have to start at Zero. Find the original start
+ // value from the induction PHI node. If we don't have an induction variable
+ // then we know that it starts at zero.
+ Value *StartIdx = OldInduction ?
+ OldInduction->getIncomingValueForBlock(BypassBlock):
+ ConstantInt::get(IdxTy, 0);
- assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
assert(BypassBlock && "Invalid loop structure");
+ LoopBypassBlocks.push_back(BypassBlock);
+ // Split the loop into the two-loop structure described above.
BasicBlock *VectorPH =
- BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
- BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
- "vector.body");
-
- BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
- "middle.block");
+ BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+ BasicBlock *VecBody =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+ BasicBlock *MiddleBlock =
+ VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
BasicBlock *ScalarPH =
- MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
- "scalar.preheader");
- // Find the induction variable.
- BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+ MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
// Use this IR builder to create the loop instructions (Phi, Br, Cmp)
// inside the loop.
@@ -706,105 +1220,167 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// Generate the induction variable.
Induction = Builder.CreatePHI(IdxTy, 2, "index");
- Constant *Step = ConstantInt::get(IdxTy, VF);
+ // The loop step is equal to the vectorization factor (num of SIMD elements)
+ // times the unroll factor (num of SIMD instructions).
+ Constant *Step = ConstantInt::get(IdxTy, VF * UF);
- // Expand the trip count and place the new instructions in the preheader.
- // Notice that the pre-header does not change, only the loop body.
- SCEVExpander Exp(*SE, "induction");
- Instruction *Loc = BypassBlock->getTerminator();
+ // This is the IR builder that we use to add all of the logic for bypassing
+ // the new vector loop.
+ IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
- // Count holds the overall loop count (N).
- Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
+ // We may need to extend the index in case there is a type mismatch.
+ // We know that the count starts at zero and does not overflow.
+ if (Count->getType() != IdxTy) {
+ // The exit count can be of pointer type. Convert it to the correct
+ // integer type.
+ if (ExitCount->getType()->isPointerTy())
+ Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
+ else
+ Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
+ }
// Add the start index to the loop count to get the new end index.
- Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
+ Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
// Now we need to generate the expression for N - (N % VF), which is
// the part that the vectorized body will execute.
- Constant *CIVF = ConstantInt::get(IdxTy, VF);
- Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
- Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
- Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
- "end.idx.rnd.down", Loc);
-
- // Now, compare the new count to zero. If it is zero, jump to the scalar part.
- Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
- IdxEndRoundDown,
- StartIdx,
- "cmp.zero", Loc);
+ Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
+ Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
+ Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
+ "end.idx.rnd.down");
+
+ // Now, compare the new count to zero. If it is zero skip the vector loop and
+ // jump to the scalar loop.
+ Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
+ "cmp.zero");
+
+ BasicBlock *LastBypassBlock = BypassBlock;
+
+ // Generate the code that checks at runtime if arrays overlap. We put the
+ // checks into a separate block to make the more common case of few elements
+ // faster.
+ Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
+ BypassBlock->getTerminator());
+ if (MemRuntimeCheck) {
+ // Create a new block containing the memory check.
+ BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
+ "vector.memcheck");
+ LoopBypassBlocks.push_back(CheckBlock);
+
+ // Replace the branch into the memory check block with a conditional branch
+ // for the "few elements case".
+ Instruction *OldTerm = BypassBlock->getTerminator();
+ BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
+ OldTerm->eraseFromParent();
+
+ Cmp = MemRuntimeCheck;
+ LastBypassBlock = CheckBlock;
+ }
- LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
- Legal->getRuntimePointerCheck();
- Value *MemoryRuntimeCheck = 0;
- if (PtrRtCheck->Need) {
- unsigned NumPointers = PtrRtCheck->Pointers.size();
- SmallVector<Value* , 2> Starts;
- SmallVector<Value* , 2> Ends;
-
- // Use this type for pointer arithmetic.
- Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
-
- for (unsigned i=0; i < NumPointers; ++i) {
- Value *Ptr = PtrRtCheck->Pointers[i];
- const SCEV *Sc = SE->getSCEV(Ptr);
-
- if (SE->isLoopInvariant(Sc, OrigLoop)) {
- DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
- *Ptr <<"\n");
- Starts.push_back(Ptr);
- Ends.push_back(Ptr);
- } else {
- DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
- Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc);
- const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
- const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
- assert(!isa<SCEVCouldNotCompute>(ScEnd) && "Invalid scev range.");
- Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc);
- Starts.push_back(Start);
- Ends.push_back(End);
- }
- }
+ LastBypassBlock->getTerminator()->eraseFromParent();
+ BranchInst::Create(MiddleBlock, VectorPH, Cmp,
+ LastBypassBlock);
- for (unsigned i=0; i < NumPointers; ++i) {
- for (unsigned j=i+1; j < NumPointers; ++j) {
- Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
- Starts[0], Ends[1], "bound0", Loc);
- Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
- Starts[1], Ends[0], "bound1", Loc);
- Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
- "found.conflict", Loc);
- if (MemoryRuntimeCheck) {
- MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
- MemoryRuntimeCheck,
- IsConflict,
- "conflict.rdx", Loc);
- } else {
- MemoryRuntimeCheck = IsConflict;
- }
- }
+ // We are going to resume the execution of the scalar loop.
+ // Go over all of the induction variables that we found and fix the
+ // PHIs that are left in the scalar version of the loop.
+ // The starting values of PHI nodes depend on the counter of the last
+ // iteration in the vectorized loop.
+ // If we come from a bypass edge then we need to start from the original
+ // start value.
+
+ // This variable saves the new starting index for the scalar loop.
+ PHINode *ResumeIndex = 0;
+ LoopVectorizationLegality::InductionList::iterator I, E;
+ LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+ for (I = List->begin(), E = List->end(); I != E; ++I) {
+ PHINode *OrigPhi = I->first;
+ LoopVectorizationLegality::InductionInfo II = I->second;
+ PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
+ MiddleBlock->getTerminator());
+ Value *EndValue = 0;
+ switch (II.IK) {
+ case LoopVectorizationLegality::IK_NoInduction:
+ llvm_unreachable("Unknown induction");
+ case LoopVectorizationLegality::IK_IntInduction: {
+ // Handle the integer induction counter:
+ assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
+ assert(OrigPhi == OldInduction && "Unknown integer PHI");
+ // We know what the end value is.
+ EndValue = IdxEndRoundDown;
+ // We also know which PHI node holds it.
+ ResumeIndex = ResumeVal;
+ break;
+ }
+ case LoopVectorizationLegality::IK_ReverseIntInduction: {
+ // Convert the CountRoundDown variable to the PHI size.
+ unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits();
+ unsigned IISize = II.StartValue->getType()->getScalarSizeInBits();
+ Value *CRD = CountRoundDown;
+ if (CRDSize > IISize)
+ CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
+ II.StartValue->getType(), "tr.crd",
+ LoopBypassBlocks.back()->getTerminator());
+ else if (CRDSize < IISize)
+ CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
+ II.StartValue->getType(),
+ "sext.crd",
+ LoopBypassBlocks.back()->getTerminator());
+ // Handle reverse integer induction counter:
+ EndValue =
+ BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
+ break;
+ }
+ case LoopVectorizationLegality::IK_PtrInduction: {
+ // For pointer induction variables, calculate the offset using
+ // the end index.
+ EndValue =
+ GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
+ break;
+ }
+ case LoopVectorizationLegality::IK_ReversePtrInduction: {
+ // The value at the end of the loop for the reverse pointer is calculated
+ // by creating a GEP with a negative index starting from the start value.
+ Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
+ Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown,
+ "rev.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
+ EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx,
+ "rev.ptr.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
+ break;
}
- }// end of need-runtime-check code.
+ }// end of case
- // If we are using memory runtime checks, include them in.
- if (MemoryRuntimeCheck) {
- Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
- "CntOrMem", Loc);
+ // The new PHI merges the original incoming value, in case of a bypass,
+ // or the value at the end of the vectorized loop.
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
+ ResumeVal->addIncoming(EndValue, VecBody);
+
+ // Fix the scalar body counter (PHI node).
+ unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
+ OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
}
- BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
- // Remove the old terminator.
- Loc->eraseFromParent();
+ // If we are generating a new induction variable then we also need to
+ // generate the code that calculates the exit value. This value is not
+ // simply the end of the counter because we may skip the vectorized body
+ // in case of a runtime check.
+ if (!OldInduction) {
+ assert(!ResumeIndex && "Unexpected resume value found");
+ ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
+ MiddleBlock->getTerminator());
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
+ ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+ }
- // We are going to resume the execution of the scalar loop.
- // This PHI decides on what number to start. If we come from the
- // vector loop then we need to start with the end index minus the
- // index modulo VF. If we come from a bypass edge then we need to start
- // from the real start.
- PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx",
- MiddleBlock->getTerminator());
- ResumeIndex->addIncoming(StartIdx, BypassBlock);
- ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+ // Make sure that we found the index where the scalar loop needs to continue.
+ assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
+ "Invalid resume Index");
// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
@@ -828,26 +1404,27 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// Now we have two terminators. Remove the old one from the block.
VecBody->getTerminator()->eraseFromParent();
- // Fix the scalar body iteration count.
- unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
- OldInduction->setIncomingValue(BlockIdx, ResumeIndex);
-
// Get ready to start creating new instructions into the vectorized body.
Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
- // Register the new loop.
+ // Create and register the new vector loop.
Loop* Lp = new Loop();
- LPM->insertLoop(Lp, OrigLoop->getParentLoop());
-
- Lp->addBasicBlockToLoop(VecBody, LI->getBase());
-
Loop *ParentLoop = OrigLoop->getParentLoop();
+
+ // Insert the new loop into the loop nest and register the new basic blocks.
if (ParentLoop) {
+ ParentLoop->addChildLoop(Lp);
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
+ ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase());
ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+ } else {
+ LI->addTopLevelLoop(Lp);
}
+ Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
// Save the state.
LoopVectorPreHeader = VectorPH;
LoopScalarPreHeader = ScalarPH;
@@ -855,32 +1432,164 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
LoopExitBlock = ExitBlock;
LoopVectorBody = VecBody;
LoopScalarBody = OldBasicBlock;
- LoopBypassBlock = BypassBlock;
}
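// A sketch of the resume values computed above for the scalar remainder loop
// (illustration only), assuming a start value 'Start' and the rounded-down
// vector trip count 'CRD':
//   IK_IntInduction         -> end.idx.rnd.down (StartIdx + CRD)
//   IK_ReverseIntInduction  -> Start - CRD
//   IK_PtrInduction         -> Start + CRD        (a GEP by CRD elements)
//   IK_ReversePtrInduction  -> Start + (0 - CRD)  (a GEP by -CRD elements)
// When the vector body is skipped via a bypass edge, each PHI instead resumes
// at its original start value.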
/// This function returns the identity element (or neutral element) for
/// the operation K.
-static unsigned
-getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
+static Constant*
+getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
switch (K) {
- case LoopVectorizationLegality::IntegerXor:
- case LoopVectorizationLegality::IntegerAdd:
- case LoopVectorizationLegality::IntegerOr:
+ case LoopVectorizationLegality::RK_IntegerXor:
+ case LoopVectorizationLegality::RK_IntegerAdd:
+ case LoopVectorizationLegality::RK_IntegerOr:
// Adding, Xoring, Oring zero to a number does not change it.
- return 0;
- case LoopVectorizationLegality::IntegerMult:
+ return ConstantInt::get(Tp, 0);
+ case LoopVectorizationLegality::RK_IntegerMult:
// Multiplying a number by 1 does not change it.
- return 1;
- case LoopVectorizationLegality::IntegerAnd:
+ return ConstantInt::get(Tp, 1);
+ case LoopVectorizationLegality::RK_IntegerAnd:
// AND-ing a number with an all-1 value does not change it.
- return -1;
+ return ConstantInt::get(Tp, -1, true);
+ case LoopVectorizationLegality::RK_FloatMult:
+ // Multiplying a number by 1 does not change it.
+ return ConstantFP::get(Tp, 1.0L);
+ case LoopVectorizationLegality::RK_FloatAdd:
+ // Adding zero to a number does not change it.
+ return ConstantFP::get(Tp, 0.0L);
default:
llvm_unreachable("Unknown reduction kind");
}
}
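// A quick sanity model of the identities above (illustration only;
// 'identityHolds' is a hypothetical helper, ignoring NaNs): folding the
// identity into its reduction operation leaves any value unchanged.
static bool identityHolds(int X, float F) {
  return (X + 0) == X && (X * 1) == X && (X | 0) == X &&
         (X & -1) == X && (X ^ 0) == X &&
         (F * 1.0f) == F && (F + 0.0f) == F;
}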
+static Intrinsic::ID
+getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
+ // If we have an intrinsic call, check if it is trivially vectorizable.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::sqrt:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::log:
+ case Intrinsic::log10:
+ case Intrinsic::log2:
+ case Intrinsic::fabs:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::nearbyint:
+ case Intrinsic::pow:
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd:
+ return II->getIntrinsicID();
+ default:
+ return Intrinsic::not_intrinsic;
+ }
+ }
+
+ if (!TLI)
+ return Intrinsic::not_intrinsic;
+
+ LibFunc::Func Func;
+ Function *F = CI->getCalledFunction();
+ // We're going to make assumptions about the semantics of these functions, so
+ // check that the target knows the function is available in this environment.
+ if (!F || !TLI->getLibFunc(F->getName(), Func))
+ return Intrinsic::not_intrinsic;
+
+ // Otherwise check if we have a call to a function that can be turned into a
+ // vector intrinsic.
+ switch (Func) {
+ default:
+ break;
+ case LibFunc::sin:
+ case LibFunc::sinf:
+ case LibFunc::sinl:
+ return Intrinsic::sin;
+ case LibFunc::cos:
+ case LibFunc::cosf:
+ case LibFunc::cosl:
+ return Intrinsic::cos;
+ case LibFunc::exp:
+ case LibFunc::expf:
+ case LibFunc::expl:
+ return Intrinsic::exp;
+ case LibFunc::exp2:
+ case LibFunc::exp2f:
+ case LibFunc::exp2l:
+ return Intrinsic::exp2;
+ case LibFunc::log:
+ case LibFunc::logf:
+ case LibFunc::logl:
+ return Intrinsic::log;
+ case LibFunc::log10:
+ case LibFunc::log10f:
+ case LibFunc::log10l:
+ return Intrinsic::log10;
+ case LibFunc::log2:
+ case LibFunc::log2f:
+ case LibFunc::log2l:
+ return Intrinsic::log2;
+ case LibFunc::fabs:
+ case LibFunc::fabsf:
+ case LibFunc::fabsl:
+ return Intrinsic::fabs;
+ case LibFunc::floor:
+ case LibFunc::floorf:
+ case LibFunc::floorl:
+ return Intrinsic::floor;
+ case LibFunc::ceil:
+ case LibFunc::ceilf:
+ case LibFunc::ceill:
+ return Intrinsic::ceil;
+ case LibFunc::trunc:
+ case LibFunc::truncf:
+ case LibFunc::truncl:
+ return Intrinsic::trunc;
+ case LibFunc::rint:
+ case LibFunc::rintf:
+ case LibFunc::rintl:
+ return Intrinsic::rint;
+ case LibFunc::nearbyint:
+ case LibFunc::nearbyintf:
+ case LibFunc::nearbyintl:
+ return Intrinsic::nearbyint;
+ case LibFunc::pow:
+ case LibFunc::powf:
+ case LibFunc::powl:
+ return Intrinsic::pow;
+ }
+
+ return Intrinsic::not_intrinsic;
+}
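// For illustration only, a hypothetical loop whose call getIntrinsicIDForCall
// can map to Intrinsic::sin, assuming TargetLibraryInfo reports that 'sinf' is
// available in the target environment (standalone sketch; sinf is declared
// explicitly instead of pulling in <math.h>).
extern "C" float sinf(float);
void applySin(float *A, int N) { // hypothetical example function
  for (int i = 0; i < N; ++i)
    A[i] = sinf(A[i]); // LibFunc::sinf -> Intrinsic::sin, widened per lane.
}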
+
+/// This function translates the reduction kind to an LLVM binary operator.
+static Instruction::BinaryOps
+getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
+ switch (Kind) {
+ case LoopVectorizationLegality::RK_IntegerAdd:
+ return Instruction::Add;
+ case LoopVectorizationLegality::RK_IntegerMult:
+ return Instruction::Mul;
+ case LoopVectorizationLegality::RK_IntegerOr:
+ return Instruction::Or;
+ case LoopVectorizationLegality::RK_IntegerAnd:
+ return Instruction::And;
+ case LoopVectorizationLegality::RK_IntegerXor:
+ return Instruction::Xor;
+ case LoopVectorizationLegality::RK_FloatMult:
+ return Instruction::FMul;
+ case LoopVectorizationLegality::RK_FloatAdd:
+ return Instruction::FAdd;
+ default:
+ llvm_unreachable("Unknown reduction operation");
+ }
+}
+
void
-SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
//===------------------------------------------------===//
//
// Notice: any optimization or new instruction that go
@@ -888,208 +1597,29 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// the cost-model.
//
//===------------------------------------------------===//
- typedef SmallVector<PHINode*, 4> PhiVector;
- BasicBlock &BB = *OrigLoop->getHeader();
- Constant *Zero = ConstantInt::get(
- IntegerType::getInt32Ty(BB.getContext()), 0);
+ Constant *Zero = Builder.getInt32(0);
// In order to support reduction variables we need to be able to vectorize
// Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
- // steages. First, we create a new vector PHI node with no incoming edges.
+ // stages. First, we create a new vector PHI node with no incoming edges.
// We use this value when we vectorize all of the instructions that use the
// PHI. Next, after all of the instructions in the block are complete we
// add the new incoming edges to the PHI. At this point all of the
// instructions in the basic block are vectorized, so we can use them to
// construct the PHI.
- PhiVector PHIsToFix;
+ PhiVector RdxPHIsToFix;
- // For each instruction in the old loop.
- for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
- Instruction *Inst = it;
+ // Scan the loop in a topological order to ensure that defs are vectorized
+ // before users.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
- switch (Inst->getOpcode()) {
- case Instruction::Br:
- // Nothing to do for PHIs and BR, since we already took care of the
- // loop control flow instructions.
- continue;
- case Instruction::PHI:{
- PHINode* P = cast<PHINode>(Inst);
- // Special handling for the induction var.
- if (OldInduction == Inst)
- continue;
- // This is phase one of vectorizing PHIs.
- // This has to be a reduction variable.
- assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
- Type *VecTy = VectorType::get(Inst->getType(), VF);
- WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
- PHIsToFix.push_back(P);
- continue;
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Just widen binops.
- BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
- Value *A = getVectorValue(Inst->getOperand(0));
- Value *B = getVectorValue(Inst->getOperand(1));
-
- // Use this vector value for all users of the original instruction.
- Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
- WidenMap[Inst] = V;
-
- // Update the NSW, NUW and Exact flags.
- BinaryOperator *VecOp = cast<BinaryOperator>(V);
- if (isa<OverflowingBinaryOperator>(BinOp)) {
- VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
- VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
- }
- if (isa<PossiblyExactOperator>(VecOp))
- VecOp->setIsExact(BinOp->isExact());
- break;
- }
- case Instruction::Select: {
- // Widen selects.
- // If the selector is loop invariant we can create a select
- // instruction with a scalar condition. Otherwise, use vector-select.
- Value *Cond = Inst->getOperand(0);
- bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- Cond = getVectorValue(Cond);
- if (InvariantCond)
- Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
-
- Value *Op0 = getVectorValue(Inst->getOperand(1));
- Value *Op1 = getVectorValue(Inst->getOperand(2));
- WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
- break;
- }
-
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Widen compares. Generate vector compares.
- bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
- CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
- Value *A = getVectorValue(Inst->getOperand(0));
- Value *B = getVectorValue(Inst->getOperand(1));
- if (FCmp)
- WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
- else
- WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
- break;
- }
+ // Vectorize all of the blocks in the original loop.
+ for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
+ be = DFS.endRPO(); bb != be; ++bb)
+ vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
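
The replacement above swaps the old single-block instruction scan for a reverse-post-order walk over the loop body, so every definition is widened before any of its users. As a rough illustration of why RPO gives that ordering once the back edge is ignored (which is what the vectorizer does), here is a small standalone C++ sketch over a made-up three-block loop body; the block names and CFG are invented for the example and this is not the LLVM code itself.

#include <cassert>
#include <map>
#include <string>
#include <vector>

// Toy CFG of a loop body: header -> then -> latch, header -> latch.
// The back edge latch -> header is ignored, as the vectorizer does.
static std::map<std::string, std::vector<std::string>> Succs = {
    {"header", {"then", "latch"}}, {"then", {"latch"}}, {"latch", {}}};

static void postOrder(const std::string &BB, std::vector<std::string> &Out,
                      std::map<std::string, bool> &Seen) {
  if (Seen[BB]) return;
  Seen[BB] = true;
  for (const std::string &S : Succs[BB]) postOrder(S, Out, Seen);
  Out.push_back(BB);
}

int main() {
  std::vector<std::string> PO;
  std::map<std::string, bool> Seen;
  postOrder("header", PO, Seen);
  std::vector<std::string> RPO(PO.rbegin(), PO.rend());
  // In RPO every block is visited before its forward successors, so any
  // value defined in a predecessor block is already widened when used.
  assert(RPO.front() == "header" && RPO.back() == "latch");
  return 0;
}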
- case Instruction::Store: {
- // Attempt to issue a wide store.
- StoreInst *SI = dyn_cast<StoreInst>(Inst);
- Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
- Value *Ptr = SI->getPointerOperand();
- unsigned Alignment = SI->getAlignment();
-
- assert(!Legal->isUniform(Ptr) &&
- "We do not allow storing to uniform addresses");
-
- GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
-
- // This store does not use GEPs.
- if (!Legal->isConsecutiveGep(Gep)) {
- scalarizeInstruction(Inst);
- break;
- }
-
- // The last index does not have to be the induction. It can be
- // consecutive and be a function of the index. For example A[I+1];
- unsigned NumOperands = Gep->getNumOperands();
- Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
- LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
- // Create the new GEP with the new induction variable.
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setOperand(NumOperands - 1, LastIndex);
- Ptr = Builder.Insert(Gep2);
- Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
- Value *Val = getVectorValue(SI->getValueOperand());
- Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
- break;
- }
- case Instruction::Load: {
- // Attempt to issue a wide load.
- LoadInst *LI = dyn_cast<LoadInst>(Inst);
- Type *RetTy = VectorType::get(LI->getType(), VF);
- Value *Ptr = LI->getPointerOperand();
- unsigned Alignment = LI->getAlignment();
- GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
-
- // If we don't have a gep, or that the pointer is loop invariant,
- // scalarize the load.
- if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) {
- scalarizeInstruction(Inst);
- break;
- }
-
- // The last index does not have to be the induction. It can be
- // consecutive and be a function of the index. For example A[I+1];
- unsigned NumOperands = Gep->getNumOperands();
- Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
- LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
- // Create the new GEP with the new induction variable.
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setOperand(NumOperands - 1, LastIndex);
- Ptr = Builder.Insert(Gep2);
- Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
- LI = Builder.CreateLoad(Ptr);
- LI->setAlignment(Alignment);
- // Use this vector value for all users of the load.
- WidenMap[Inst] = LI;
- break;
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- /// Vectorize bitcasts.
- CastInst *CI = dyn_cast<CastInst>(Inst);
- Value *A = getVectorValue(Inst->getOperand(0));
- Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
- WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
- break;
- }
-
- default:
- /// All other instructions are unsupported. Scalarize them.
- scalarizeInstruction(Inst);
- break;
- }// end of switch.
- }// end of for_each instr.
-
- // At this point every instruction in the original loop is widended to
+ // At this point every instruction in the original loop is widened to
// a vector form. We are almost done. Now, we need to fix the PHI nodes
// that we vectorized. The PHI nodes are currently empty because we did
// not want to introduce cycles. Notice that the remaining PHI nodes
@@ -1098,38 +1628,36 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// Create the 'reduced' values for each of the induction vars.
// The reduced values are the vector values that we scalarize and combine
// after the loop is finished.
- for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+ for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
it != e; ++it) {
PHINode *RdxPhi = *it;
- PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
assert(RdxPhi && "Unable to recover vectorized PHI");
// Find the reduction variable descriptor.
assert(Legal->getReductionVars()->count(RdxPhi) &&
"Unable to find the reduction variable");
LoopVectorizationLegality::ReductionDescriptor RdxDesc =
- (*Legal->getReductionVars())[RdxPhi];
+ (*Legal->getReductionVars())[RdxPhi];
// We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
// one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader.
- Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
+ Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
// This is the vector-clone of the value that leaves the loop.
- Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
- Type *VecTy = VectorExit->getType();
+ VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
+ Type *VecTy = VectorExit[0]->getType();
// Find the reduction identity variable. Zero for addition, or, xor,
// one for multiplication, -1 for And.
- Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
- VecTy->getScalarType());
+ Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType());
+ Constant *Identity = ConstantVector::getSplat(VF, Iden);
// This vector is the Identity vector where the first element is the
// incoming scalar reduction.
Value *VectorStart = Builder.CreateInsertElement(Identity,
- RdxDesc.StartValue, Zero);
-
+ RdxDesc.StartValue, Zero);
// Fix the vector-loop phi.
// We created the induction variable so we know that the
@@ -1138,10 +1666,17 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
- VecRdxPhi->addIncoming(VectorStart, VecPreheader);
- unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
- Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
- VecRdxPhi->addIncoming(Val, LoopVectorBody);
+ VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+ Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
+ VectorParts &Val = getVectorValue(LoopVal);
+ for (unsigned part = 0; part < UF; ++part) {
+      // Make sure to add the reduction start value only to the
+ // first unroll part.
+ Value *StartVal = (part == 0) ? VectorStart : Identity;
+ cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
+ cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
+ }
// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
@@ -1149,40 +1684,56 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// instructions.
Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
- // This PHINode contains the vectorized reduction variable, or
- // the initial value vector, if we bypass the vector loop.
- PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
- NewPhi->addIncoming(VectorStart, LoopBypassBlock);
- NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
-
- // Extract the first scalar.
- Value *Scalar0 =
- Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
- // Extract and reduce the remaining vector elements.
- for (unsigned i=1; i < VF; ++i) {
- Value *Scalar1 =
- Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
- switch (RdxDesc.Kind) {
- case LoopVectorizationLegality::IntegerAdd:
- Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
- break;
- case LoopVectorizationLegality::IntegerMult:
- Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
- break;
- case LoopVectorizationLegality::IntegerOr:
- Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
- break;
- case LoopVectorizationLegality::IntegerAnd:
- Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
- break;
- case LoopVectorizationLegality::IntegerXor:
- Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
- break;
- default:
- llvm_unreachable("Unknown reduction operation");
- }
+ VectorParts RdxParts;
+ for (unsigned part = 0; part < UF; ++part) {
+ // This PHINode contains the vectorized reduction variable, or
+ // the initial value vector, if we bypass the vector loop.
+ VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
+ PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
+ Value *StartVal = (part == 0) ? VectorStart : Identity;
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
+ NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
+ RdxParts.push_back(NewPhi);
+ }
+
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = RdxParts[0];
+ for (unsigned part = 1; part < UF; ++part) {
+ Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
+ ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx,
+ "bin.rdx");
}
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = ReducedPartRdx;
+ SmallVector<Constant*, 32> ShuffleMask(VF, 0);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i/2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i/2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf =
+ Builder.CreateShuffleVector(TmpVec,
+ UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask),
+ "rdx.shuf");
+
+ Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
+ TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx");
+ }
+
+ // The result is in the first element of the vector.
+ Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+
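
The unrolled reduction tail above replaces VF-1 scalar extracts with log2(VF) shuffle-and-binop rounds, each folding the upper half of the vector onto the lower half until the result sits in lane 0. Below is a minimal scalar model of that ladder, assuming an integer add reduction over an 8-wide "vector" held in a std::vector; it mirrors the shape of the emitted IR, not the IRBuilder calls.

#include <cassert>
#include <vector>

// Model of the log2(VF) reduction ladder: each round folds the upper half
// of the vector onto the lower half, mirroring the shufflevector + binop
// sequence ("rdx.shuf" / "bin.rdx") the vectorizer emits.
static int reduceAdd(std::vector<int> V) {
  assert(!V.empty() && (V.size() & (V.size() - 1)) == 0 &&
         "VF must be a power of two");
  for (size_t Width = V.size(); Width > 1; Width /= 2)
    for (size_t J = 0; J < Width / 2; ++J)
      V[J] += V[J + Width / 2];   // lane J absorbs lane J + Width/2
  return V[0];                    // the result lives in lane 0
}

int main() {
  std::vector<int> Lanes = {1, 2, 3, 4, 5, 6, 7, 8};
  assert(reduceAdd(Lanes) == 36);
  return 0;
}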
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
// We know that the loop is in LCSSA form. We need to update the
@@ -1207,24 +1758,378 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
- int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
- int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
+ int IncomingEdgeBlockIdx =
+ (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}// end of for each redux variable.
+
+ // The Loop exit block may have single value PHI nodes where the incoming
+ // value is 'undef'. While vectorizing we only handled real values that
+ // were defined inside the loop. Here we handle the 'undef case'.
+ // See PR14725.
+ for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+ LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+ PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+ if (!LCSSAPhi) continue;
+ if (LCSSAPhi->getNumIncomingValues() == 1)
+ LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
+ LoopMiddleBlock);
+ }
+}
+
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
+ assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
+ "Invalid edge");
+
+ VectorParts SrcMask = createBlockInMask(Src);
+
+ // The terminator has to be a branch inst!
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+ assert(BI && "Unexpected terminator found");
+
+ if (BI->isConditional()) {
+ VectorParts EdgeMask = getVectorValue(BI->getCondition());
+
+ if (BI->getSuccessor(0) != Dst)
+ for (unsigned part = 0; part < UF; ++part)
+ EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
+
+ for (unsigned part = 0; part < UF; ++part)
+ EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
+ return EdgeMask;
+ }
+
+ return SrcMask;
+}
+
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+ // Loop incoming mask is all-one.
+ if (OrigLoop->getHeader() == BB) {
+ Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
+ return getVectorValue(C);
+ }
+
+  // This is the block mask. We OR all of the incoming edge masks, starting from zero.
+ Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
+ VectorParts BlockMask = getVectorValue(Zero);
+
+ // For each pred:
+ for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
+ VectorParts EM = createEdgeMask(*it, BB);
+ for (unsigned part = 0; part < UF; ++part)
+ BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
+ }
+
+ return BlockMask;
+}
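
createEdgeMask and createBlockInMask implement the mask algebra behind if-conversion: an edge mask is the source block's mask ANDed with the branch condition (negated for the false successor), and a block mask is the OR of its incoming edge masks, with the loop header treated as all-ones. Here is a one-lane standalone sketch of that algebra over a hypothetical header/then/else/merge diamond; the block layout is invented and the real code operates on vector masks per unroll part.

#include <cassert>

// One-lane model of the mask algebra used for if-conversion.
static bool edgeMask(bool SrcMask, bool Cond, bool TakenOnTrue) {
  return SrcMask && (TakenOnTrue ? Cond : !Cond);
}

int main() {
  const bool Conds[] = {false, true};
  for (bool Cond : Conds) {
    bool HeaderMask = true;                         // loop-header mask is all-one
    bool ThenMask = edgeMask(HeaderMask, Cond, true);
    bool ElseMask = edgeMask(HeaderMask, Cond, false);
    bool MergeMask = ThenMask || ElseMask;          // OR of incoming edge masks
    assert(ThenMask == Cond && ElseMask == !Cond && MergeMask);
  }
  return 0;
}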
+
+void
+InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
+ BasicBlock *BB, PhiVector *PV) {
+ // For each instruction in the old loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ VectorParts &Entry = WidenMap.get(it);
+ switch (it->getOpcode()) {
+ case Instruction::Br:
+ // Nothing to do for PHIs and BR, since we already took care of the
+ // loop control flow instructions.
+ continue;
+ case Instruction::PHI:{
+ PHINode* P = cast<PHINode>(it);
+ // Handle reduction variables:
+ if (Legal->getReductionVars()->count(P)) {
+ for (unsigned part = 0; part < UF; ++part) {
+ // This is phase one of vectorizing PHIs.
+ Type *VecTy = VectorType::get(it->getType(), VF);
+ Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
+                                     LoopVectorBody->getFirstInsertionPt());
+ }
+ PV->push_back(P);
+ continue;
+ }
+
+ // Check for PHI nodes that are lowered to vector selects.
+ if (P->getParent() != OrigLoop->getHeader()) {
+ // We know that all PHIs in non header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+ VectorParts Cond = createEdgeMask(P->getIncomingBlock(0),
+ P->getParent());
+
+ for (unsigned part = 0; part < UF; ++part) {
+ VectorParts &In0 = getVectorValue(P->getIncomingValue(0));
+ VectorParts &In1 = getVectorValue(P->getIncomingValue(1));
+ Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part],
+ "predphi");
+ }
+ continue;
+ }
+
+ // This PHINode must be an induction variable.
+ // Make sure that we know about it.
+ assert(Legal->getInductionVars()->count(P) &&
+ "Not an induction variable");
+
+ LoopVectorizationLegality::InductionInfo II =
+ Legal->getInductionVars()->lookup(P);
+
+ switch (II.IK) {
+ case LoopVectorizationLegality::IK_NoInduction:
+ llvm_unreachable("Unknown induction");
+ case LoopVectorizationLegality::IK_IntInduction: {
+ assert(P == OldInduction && "Unexpected PHI");
+ Value *Broadcasted = getBroadcastInstrs(Induction);
+ // After broadcasting the induction variable we need to make the
+ // vector consecutive by adding 0, 1, 2 ...
+ for (unsigned part = 0; part < UF; ++part)
+ Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
+ continue;
+ }
+ case LoopVectorizationLegality::IK_ReverseIntInduction:
+ case LoopVectorizationLegality::IK_PtrInduction:
+ case LoopVectorizationLegality::IK_ReversePtrInduction:
+ // Handle reverse integer and pointer inductions.
+ Value *StartIdx = 0;
+ // If we have a single integer induction variable then use it.
+ // Otherwise, start counting at zero.
+ if (OldInduction) {
+ LoopVectorizationLegality::InductionInfo OldII =
+ Legal->getInductionVars()->lookup(OldInduction);
+ StartIdx = OldII.StartValue;
+ } else {
+ StartIdx = ConstantInt::get(Induction->getType(), 0);
+ }
+ // This is the normalized GEP that starts counting at zero.
+ Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+ "normalized.idx");
+
+ // Handle the reverse integer induction variable case.
+ if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
+ IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
+ Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
+ "resize.norm.idx");
+ Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
+ "reverse.idx");
+
+ // This is a new value so do not hoist it out.
+ Value *Broadcasted = getBroadcastInstrs(ReverseInd);
+ // After broadcasting the induction variable we need to make the
+ // vector consecutive by adding ... -3, -2, -1, 0.
+ for (unsigned part = 0; part < UF; ++part)
+ Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true);
+ continue;
+ }
+
+ // Handle the pointer induction variable case.
+ assert(P->getType()->isPointerTy() && "Unexpected type.");
+
+      // Is this a reverse induction ptr or a consecutive induction ptr?
+ bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
+ II.IK);
+
+ // This is the vector of results. Notice that we don't generate
+ // vector geps because scalar geps result in better code.
+ for (unsigned part = 0; part < UF; ++part) {
+ Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+ for (unsigned int i = 0; i < VF; ++i) {
+ int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
+ Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
+ Value *GlobalIdx;
+ if (!Reverse)
+ GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+ else
+ GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
+
+ Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+ "next.gep");
+ VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+ Builder.getInt32(i),
+ "insert.gep");
+ }
+ Entry[part] = VecVal;
+ }
+ continue;
+ }
+
+ }// End of PHI.
+
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen binops.
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
+ VectorParts &A = getVectorValue(it->getOperand(0));
+ VectorParts &B = getVectorValue(it->getOperand(1));
+
+ // Use this vector value for all users of the original instruction.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+
+ // Update the NSW, NUW and Exact flags. Notice: V can be an Undef.
+ BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
+ if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
+ VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+ VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+ }
+ if (VecOp && isa<PossiblyExactOperator>(VecOp))
+ VecOp->setIsExact(BinOp->isExact());
+
+ Entry[Part] = V;
+ }
+ break;
+ }
+ case Instruction::Select: {
+ // Widen selects.
+ // If the selector is loop invariant we can create a select
+ // instruction with a scalar condition. Otherwise, use vector-select.
+ bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
+ OrigLoop);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ VectorParts &Cond = getVectorValue(it->getOperand(0));
+ VectorParts &Op0 = getVectorValue(it->getOperand(1));
+ VectorParts &Op1 = getVectorValue(it->getOperand(2));
+ Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
+ Builder.getInt32(0));
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Entry[Part] = Builder.CreateSelect(
+ InvariantCond ? ScalarCond : Cond[Part],
+ Op0[Part],
+ Op1[Part]);
+ }
+ break;
+ }
+
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (it->getOpcode() == Instruction::FCmp);
+ CmpInst *Cmp = dyn_cast<CmpInst>(it);
+ VectorParts &A = getVectorValue(it->getOperand(0));
+ VectorParts &B = getVectorValue(it->getOperand(1));
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *C = 0;
+ if (FCmp)
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
+ else
+ C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
+ Entry[Part] = C;
+ }
+ break;
+ }
+
+ case Instruction::Store:
+ case Instruction::Load:
+ vectorizeMemoryInstruction(it, Legal);
+ break;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ CastInst *CI = dyn_cast<CastInst>(it);
+ /// Optimize the special case where the source is the induction
+ /// variable. Notice that we can only optimize the 'trunc' case
+ /// because: a. FP conversions lose precision, b. sext/zext may wrap,
+ /// c. other casts depend on pointer size.
+ if (CI->getOperand(0) == OldInduction &&
+ it->getOpcode() == Instruction::Trunc) {
+ Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
+ CI->getType());
+ Value *Broadcasted = getBroadcastInstrs(ScalarCast);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
+ break;
+ }
+ /// Vectorize casts.
+ Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+
+ VectorParts &A = getVectorValue(it->getOperand(0));
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
+ break;
+ }
+
+ case Instruction::Call: {
+ // Ignore dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(it))
+ break;
+
+ Module *M = BB->getParent()->getParent();
+ CallInst *CI = cast<CallInst>(it);
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Not an intrinsic call!");
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value*, 4> Args;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
+ Args.push_back(Arg[Part]);
+ }
+ Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
+ Function *F = Intrinsic::getDeclaration(M, ID, Tys);
+ Entry[Part] = Builder.CreateCall(F, Args);
+ }
+ break;
+ }
+
+ default:
+ // All other instructions are unsupported. Scalarize them.
+ scalarizeInstruction(it);
+ break;
+ }// end of switch.
+ }// end of for_each instr.
}
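
Throughout the rewritten vectorizeBlockInLoop, each scalar value now maps to UF vector parts (the VectorParts type) instead of a single vector, which is why every per-opcode case loops over `Part`. The following is a tiny standalone model of that bookkeeping, with integers standing in for IR values and UF/VF chosen arbitrarily for the example; it is not the WidenMap implementation itself.

#include <cassert>
#include <map>
#include <vector>

using Part = std::vector<int>;          // stand-in for one vector value
using VectorParts = std::vector<Part>;  // UF vector values per scalar value

int main() {
  const unsigned UF = 2, VF = 4;
  std::map<int, VectorParts> WidenMap;  // scalar value id -> its widened parts

  // "Widen" scalar value #7: UF parts, each VF lanes wide, holding
  // consecutive values 0..UF*VF-1 the way an induction would.
  VectorParts &Entry = WidenMap[7];
  Entry.assign(UF, Part(VF));
  for (unsigned P = 0; P < UF; ++P)
    for (unsigned L = 0; L < VF; ++L)
      Entry[P][L] = int(P * VF + L);

  assert(WidenMap[7].size() == UF && WidenMap[7][1][0] == int(VF));
  return 0;
}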
-void SingleBlockLoopVectorizer::updateAnalysis() {
- // The original basic block.
+void InnerLoopVectorizer::updateAnalysis() {
+ // Forget the original basic block.
SE->forgetLoop(OrigLoop);
// Update the dominator tree information.
- assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
+ assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
- DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
+ DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
+ DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
- DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
+ DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
@@ -1232,45 +2137,94 @@ void SingleBlockLoopVectorizer::updateAnalysis() {
DEBUG(DT->verifyAnalysis());
}
-bool LoopVectorizationLegality::canVectorize() {
- if (!TheLoop->getLoopPreheader()) {
- assert(false && "No preheader!!");
- DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
- return false;
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+ if (!EnableIfConversion)
+ return false;
+
+ assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+ std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
+
+ // Collect the blocks that need predication.
+ for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
+ BasicBlock *BB = LoopBlocks[i];
+
+ // We don't support switch statements inside loops.
+ if (!isa<BranchInst>(BB->getTerminator()))
+ return false;
+
+ // We must have at most two predecessors because we need to convert
+ // all PHIs to selects.
+ unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
+ if (Preds > 2)
+ return false;
+
+ // We must be able to predicate all blocks that need to be predicated.
+ if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
+ return false;
}
- // We can only vectorize single basic block loops.
+ // We can if-convert this loop.
+ return true;
+}
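
canVectorizeWithIfConvert only accepts a multi-block loop when every block ends in a plain branch, has at most two predecessors (so PHIs can become selects), and any block that needs predication can in fact be predicated. A condensed standalone checklist over an invented per-block summary follows; the struct fields are stand-ins for the queries the real code makes.

#include <cassert>
#include <vector>

struct BlockInfo {              // invented summary of one loop block
  bool EndsInBranch;            // terminator is a plain BranchInst
  unsigned NumPreds;            // predecessor count
  bool NeedsPredication;        // does not dominate the latch
  bool CanBePredicated;         // no memory ops or trapping divides
};

static bool canIfConvert(const std::vector<BlockInfo> &Blocks) {
  for (const BlockInfo &B : Blocks) {
    if (!B.EndsInBranch) return false;                    // no switches
    if (B.NumPreds > 2) return false;                     // PHIs -> selects
    if (B.NeedsPredication && !B.CanBePredicated) return false;
  }
  return true;
}

int main() {
  assert(canIfConvert({{true, 1, false, true}, {true, 2, true, true}}));
  assert(!canIfConvert({{true, 2, true, false}}));        // unpredicable block
  return 0;
}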
+
+bool LoopVectorizationLegality::canVectorize() {
+ assert(TheLoop->getLoopPreheader() && "No preheader!!");
+
+ // We can only vectorize innermost loops.
+ if (TheLoop->getSubLoopsVector().size())
+ return false;
+
+ // We must have a single backedge.
+ if (TheLoop->getNumBackEdges() != 1)
+ return false;
+
+ // We must have a single exiting block.
+ if (!TheLoop->getExitingBlock())
+ return false;
+
unsigned NumBlocks = TheLoop->getNumBlocks();
- if (NumBlocks != 1) {
- DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
+
+ // Check if we can if-convert non single-bb loops.
+ if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+ DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
return false;
}
// We need to have a loop header.
- BasicBlock *BB = TheLoop->getHeader();
- DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
+ BasicBlock *Latch = TheLoop->getLoopLatch();
+ DEBUG(dbgs() << "LV: Found a loop: " <<
+ TheLoop->getHeader()->getName() << "\n");
// ScalarEvolution needs to be able to find the exit count.
- const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
+ const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
if (ExitCount == SE->getCouldNotCompute()) {
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
return false;
}
// Do not loop-vectorize loops with a tiny trip count.
- unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
- if (TC > 0u && TC < TinyTripCountThreshold) {
+ unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
+ if (TC > 0u && TC < TinyTripCountVectorThreshold) {
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
"This loop is not worth vectorizing.\n");
return false;
}
+ // Check if we can vectorize the instructions and CFG in this loop.
+ if (!canVectorizeInstrs()) {
+ DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+ return false;
+ }
+
// Go over each instruction and look at memory deps.
- if (!canVectorizeBlock(*BB)) {
- DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+ if (!canVectorizeMemory()) {
+ DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
return false;
}
+ // Collect all of the variables that remain uniform after vectorization.
+ collectLoopUniforms();
+
DEBUG(dbgs() << "LV: We can vectorize this loop" <<
(PtrRtCheck.Need ? " (with a runtime bound check)" : "")
<<"!\n");
@@ -1281,130 +2235,220 @@ bool LoopVectorizationLegality::canVectorize() {
return true;
}
-bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
- // Scan the instructions in the block and look for hazards.
- for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
- Instruction *I = it;
+bool LoopVectorizationLegality::canVectorizeInstrs() {
+ BasicBlock *PreHeader = TheLoop->getLoopPreheader();
+ BasicBlock *Header = TheLoop->getHeader();
- PHINode *Phi = dyn_cast<PHINode>(I);
- if (Phi) {
- // This should not happen because the loop should be normalized.
- if (Phi->getNumIncomingValues() != 2) {
- DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
- return false;
- }
- // We only look at integer phi nodes.
- if (!Phi->getType()->isIntegerTy()) {
- DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
- return false;
- }
+ // If we marked the scalar loop as "already vectorized" then no need
+ // to vectorize it again.
+ if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) {
+ DEBUG(dbgs() << "LV: This loop was vectorized before\n");
+ return false;
+ }
+
+ // For each block in the loop.
+ for (Loop::block_iterator bb = TheLoop->block_begin(),
+ be = TheLoop->block_end(); bb != be; ++bb) {
- if (isInductionVariable(Phi)) {
- if (Induction) {
- DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+ // Scan the instructions in the block and look for hazards.
+ for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+ ++it) {
+
+ if (PHINode *Phi = dyn_cast<PHINode>(it)) {
+ // This should not happen because the loop should be normalized.
+ if (Phi->getNumIncomingValues() != 2) {
+ DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
return false;
}
- DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
- Induction = Phi;
- continue;
- }
- if (AddReductionVar(Phi, IntegerAdd)) {
- DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
- continue;
- }
- if (AddReductionVar(Phi, IntegerMult)) {
- DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
- continue;
- }
- if (AddReductionVar(Phi, IntegerOr)) {
- DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
- continue;
- }
- if (AddReductionVar(Phi, IntegerAnd)) {
- DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
- continue;
- }
- if (AddReductionVar(Phi, IntegerXor)) {
- DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
- continue;
- }
- DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
- return false;
- }// end of PHI handling
+ // Check that this PHI type is allowed.
+ if (!Phi->getType()->isIntegerTy() &&
+ !Phi->getType()->isFloatingPointTy() &&
+ !Phi->getType()->isPointerTy()) {
+          DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
+ return false;
+ }
- // We still don't handle functions.
- CallInst *CI = dyn_cast<CallInst>(I);
- if (CI) {
- DEBUG(dbgs() << "LV: Found a call site.\n");
- return false;
- }
+ // If this PHINode is not in the header block, then we know that we
+ // can convert it to select during if-conversion. No need to check if
+ // the PHIs in this block are induction or reduction variables.
+ if (*bb != Header)
+ continue;
- // We do not re-vectorize vectors.
- if (!VectorType::isValidElementType(I->getType()) &&
- !I->getType()->isVoidTy()) {
- DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
- return false;
- }
+ // This is the value coming from the preheader.
+ Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+ // Check if this is an induction variable.
+ InductionKind IK = isInductionVariable(Phi);
+
+ if (IK_NoInduction != IK) {
+ // Int inductions are special because we only allow one IV.
+ if (IK == IK_IntInduction) {
+ if (Induction) {
+ DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+ return false;
+ }
+ Induction = Phi;
+ }
+
+ DEBUG(dbgs() << "LV: Found an induction variable.\n");
+ Inductions[Phi] = InductionInfo(StartValue, IK);
+ continue;
+ }
- // Reduction instructions are allowed to have exit users.
- // All other instructions must not have external users.
- if (!AllowedExit.count(I))
- //Check that all of the users of the loop are inside the BB.
- for (Value::use_iterator it = I->use_begin(), e = I->use_end();
- it != e; ++it) {
- Instruction *U = cast<Instruction>(*it);
- // This user may be a reduction exit value.
- BasicBlock *Parent = U->getParent();
- if (Parent != &BB) {
- DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+ if (AddReductionVar(Phi, RK_IntegerAdd)) {
+ DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, RK_IntegerMult)) {
+ DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, RK_IntegerOr)) {
+ DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, RK_IntegerAnd)) {
+ DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, RK_IntegerXor)) {
+ DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, RK_FloatMult)) {
+ DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, RK_FloatAdd)) {
+ DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+
+ DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+ return false;
+ }// end of PHI handling
+
+ // We still don't handle functions. However, we can ignore dbg intrinsic
+ // calls and we do handle certain intrinsic and libm functions.
+ CallInst *CI = dyn_cast<CallInst>(it);
+ if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
+ DEBUG(dbgs() << "LV: Found a call site.\n");
+ return false;
+ }
+
+ // Check that the instruction return type is vectorizable.
+ if (!VectorType::isValidElementType(it->getType()) &&
+ !it->getType()->isVoidTy()) {
+ DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+ return false;
+ }
+
+ // Check that the stored type is vectorizable.
+ if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
+ Type *T = ST->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(T))
return false;
+ }
+
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (!AllowedExit.count(it))
+ //Check that all of the users of the loop are inside the BB.
+ for (Value::use_iterator I = it->use_begin(), E = it->use_end();
+ I != E; ++I) {
+ Instruction *U = cast<Instruction>(*I);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(U)) {
+ DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+ return false;
+ }
}
- }
- } // next instr.
+ } // next instr.
+
+ }
if (!Induction) {
- DEBUG(dbgs() << "LV: Did not find an induction var.\n");
- return false;
+ DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ assert(getInductionVars()->size() && "No induction variables");
}
- // Don't vectorize if the memory dependencies do not allow vectorization.
- if (!canVectorizeMemory(BB))
- return false;
+ return true;
+}
+void LoopVectorizationLegality::collectLoopUniforms() {
// We now know that the loop is vectorizable!
// Collect variables that will remain uniform after vectorization.
std::vector<Value*> Worklist;
+ BasicBlock *Latch = TheLoop->getLoopLatch();
// Start with the conditional branch and walk up the block.
- Worklist.push_back(BB.getTerminator()->getOperand(0));
+ Worklist.push_back(Latch->getTerminator()->getOperand(0));
while (Worklist.size()) {
Instruction *I = dyn_cast<Instruction>(Worklist.back());
Worklist.pop_back();
- // Look at instructions inside this block.
- if (!I) continue;
- if (I->getParent() != &BB) continue;
+ // Look at instructions inside this loop.
// Stop when reaching PHI nodes.
- if (isa<PHINode>(I)) {
- assert(I == Induction && "Found a uniform PHI that is not the induction");
- break;
- }
+ // TODO: we need to follow values all over the loop, not only in this block.
+ if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
+ continue;
// This is a known uniform.
Uniforms.insert(I);
// Insert all operands.
- for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
+ for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
Worklist.push_back(I->getOperand(i));
}
}
+}
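
collectLoopUniforms seeds a worklist with the latch's branch condition and walks operands backwards, marking every in-loop, non-PHI instruction it reaches as uniform. Here is a small standalone sketch of that worklist, with integer ids standing in for instructions and an invented operand graph; it only models the traversal, not the real Uniforms set.

#include <cassert>
#include <map>
#include <set>
#include <vector>

struct Instr {                       // invented stand-in for an Instruction
  bool InLoop;
  bool IsPHI;
  std::vector<int> Operands;         // ids of operand instructions
};

int main() {
  // 0: cmp (latch branch condition), 1: add feeding the cmp, 2: header PHI.
  std::map<int, Instr> IR = {{0, {true, false, {1}}},
                             {1, {true, false, {2}}},
                             {2, {true, true, {}}}};
  std::set<int> Uniforms;
  std::vector<int> Worklist = {0};   // start at the latch branch condition
  while (!Worklist.empty()) {
    int Id = Worklist.back();
    Worklist.pop_back();
    const Instr &I = IR[Id];
    if (!I.InLoop || I.IsPHI) continue;      // stop at PHIs / out-of-loop defs
    if (!Uniforms.insert(Id).second) continue;
    for (int Op : I.Operands) Worklist.push_back(Op);
  }
  assert(Uniforms.count(0) && Uniforms.count(1) && !Uniforms.count(2));
  return 0;
}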
- return true;
+AliasAnalysis::Location
+LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
+ if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+ return AA->getLocation(Store);
+ else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
+ return AA->getLocation(Load);
+
+ llvm_unreachable("Should be either load or store instruction");
}
-bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
+bool
+LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
+ Value *Object,
+ Instruction *Inst,
+ AliasMultiMap& WriteObjects,
+ unsigned MaxByteWidth) {
+
+ AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
+
+ std::vector<Instruction*>::iterator
+ it = WriteObjects[Object].begin(),
+ end = WriteObjects[Object].end();
+
+ for (; it != end; ++it) {
+ Instruction* I = *it;
+ if (I == Inst)
+ continue;
+
+ AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
+ if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
+ ThatLoc.getWithNewSize(MaxByteWidth)))
+ return true;
+ }
+ return false;
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory() {
+
+ if (TheLoop->isAnnotatedParallel()) {
+ DEBUG(dbgs()
+ << "LV: A loop annotated parallel, ignore memory dependency "
+ << "checks.\n");
+ return true;
+ }
+
typedef SmallVector<Value*, 16> ValueVector;
typedef SmallPtrSet<Value*, 16> ValueSet;
// Holds the Load and Store *instructions*.
@@ -1413,35 +2457,40 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
PtrRtCheck.Pointers.clear();
PtrRtCheck.Need = false;
- // Scan the BB and collect legal loads and stores.
- for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
- Instruction *I = it;
-
- // If this is a load, save it. If this instruction can read from memory
- // but is not a load, then we quit. Notice that we don't handle function
- // calls that read or write.
- if (I->mayReadFromMemory()) {
- LoadInst *Ld = dyn_cast<LoadInst>(I);
- if (!Ld) return false;
- if (!Ld->isSimple()) {
- DEBUG(dbgs() << "LV: Found a non-simple load.\n");
- return false;
+ // For each block.
+ for (Loop::block_iterator bb = TheLoop->block_begin(),
+ be = TheLoop->block_end(); bb != be; ++bb) {
+
+ // Scan the BB and collect legal loads and stores.
+ for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+ ++it) {
+
+ // If this is a load, save it. If this instruction can read from memory
+ // but is not a load, then we quit. Notice that we don't handle function
+ // calls that read or write.
+ if (it->mayReadFromMemory()) {
+ LoadInst *Ld = dyn_cast<LoadInst>(it);
+ if (!Ld) return false;
+ if (!Ld->isSimple()) {
+ DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+ return false;
+ }
+ Loads.push_back(Ld);
+ continue;
}
- Loads.push_back(Ld);
- continue;
- }
- // Save store instructions. Abort if other instructions write to memory.
- if (I->mayWriteToMemory()) {
- StoreInst *St = dyn_cast<StoreInst>(I);
- if (!St) return false;
- if (!St->isSimple()) {
- DEBUG(dbgs() << "LV: Found a non-simple store.\n");
- return false;
+ // Save 'store' instructions. Abort if other instructions write to memory.
+ if (it->mayWriteToMemory()) {
+ StoreInst *St = dyn_cast<StoreInst>(it);
+ if (!St) return false;
+ if (!St->isSimple()) {
+ DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+ return false;
+ }
+ Stores.push_back(St);
}
- Stores.push_back(St);
- }
- } // next instr.
+ } // next instr.
+ } // next block.
// Now we have two lists that hold the loads and the stores.
// Next, we find the pointers that they use.
@@ -1449,13 +2498,14 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
// Check if we see any stores. If there are no stores, then we don't
// care if the pointers are *restrict*.
if (!Stores.size()) {
- DEBUG(dbgs() << "LV: Found a read-only loop!\n");
- return true;
+ DEBUG(dbgs() << "LV: Found a read-only loop!\n");
+ return true;
}
- // Holds the read and read-write *pointers* that we find.
- ValueVector Reads;
- ValueVector ReadWrites;
+ // Holds the read and read-write *pointers* that we find. These maps hold
+ // unique values for pointers (so no need for multi-map).
+ AliasMap Reads;
+ AliasMap ReadWrites;
// Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
// multiple times on the same object. If the ptr is accessed twice, once
@@ -1466,8 +2516,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
ValueVector::iterator I, IE;
for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
- StoreInst *ST = dyn_cast<StoreInst>(*I);
- assert(ST && "Bad StoreInst");
+ StoreInst *ST = cast<StoreInst>(*I);
Value* Ptr = ST->getPointerOperand();
if (isUniform(Ptr)) {
@@ -1478,12 +2527,11 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
// If we did *not* see this pointer before, insert it to
// the read-write list. At this phase it is only a 'write' list.
if (Seen.insert(Ptr))
- ReadWrites.push_back(Ptr);
+ ReadWrites.insert(std::make_pair(Ptr, ST));
}
for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
- LoadInst *LD = dyn_cast<LoadInst>(*I);
- assert(LD && "Bad LoadInst");
+ LoadInst *LD = cast<LoadInst>(*I);
Value* Ptr = LD->getPointerOperand();
// If we did *not* see this pointer before, insert it to the
// read list. If we *did* see it before, then it is already in
@@ -1493,8 +2541,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
// If the address of i is unknown (for example A[B[i]]) then we may
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
- if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
- Reads.push_back(Ptr);
+ if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
+ Reads.insert(std::make_pair(Ptr, LD));
}
// If we write (or read-write) to a single destination and there are no
@@ -1506,84 +2554,156 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
- bool RT = true;
- for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
- if (hasComputableBounds(*I)) {
- PtrRtCheck.Pointers.push_back(*I);
- DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ bool CanDoRT = true;
+ AliasMap::iterator MI, ME;
+ for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
+ Value *V = (*MI).first;
+ if (hasComputableBounds(V)) {
+ PtrRtCheck.insert(SE, TheLoop, V);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
} else {
- RT = false;
+ CanDoRT = false;
break;
}
- for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
- if (hasComputableBounds(*I)) {
- PtrRtCheck.Pointers.push_back(*I);
- DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ }
+ for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
+ Value *V = (*MI).first;
+ if (hasComputableBounds(V)) {
+ PtrRtCheck.insert(SE, TheLoop, V);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
} else {
- RT = false;
+ CanDoRT = false;
break;
}
+ }
// Check that we did not collect too many pointers or found a
// unsizeable pointer.
- if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
- PtrRtCheck.Pointers.clear();
- RT = false;
+ if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+ PtrRtCheck.reset();
+ CanDoRT = false;
}
- PtrRtCheck.Need = RT;
-
- if (RT) {
+ if (CanDoRT) {
DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
}
+ bool NeedRTCheck = false;
+
+ // Biggest vectorized access possible, vector width * unroll factor.
+ // TODO: We're being very pessimistic here, find a way to know the
+ // real access width before getting here.
+ unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
+ TTI->getMaximumUnrollFactor();
// Now that the pointers are in two lists (Reads and ReadWrites), we
// can check that there are no conflicts between each of the writes and
// between the writes to the reads.
- ValueSet WriteObjects;
+ // Note that WriteObjects duplicates the stores (indexed now by underlying
+ // objects) to avoid pointing to elements inside ReadWrites.
+ // TODO: Maybe create a new type where they can interact without duplication.
+ AliasMultiMap WriteObjects;
ValueVector TempObjects;
// Check that the read-writes do not conflict with other read-write
// pointers.
- for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
- GetUnderlyingObjects(*I, TempObjects, DL);
- for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
- it != e; ++it) {
- if (!isIdentifiedObject(*it)) {
- DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
- return RT;
+ bool AllWritesIdentified = true;
+ for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
+ Value *Val = (*MI).first;
+ Instruction *Inst = (*MI).second;
+
+ GetUnderlyingObjects(Val, TempObjects, DL);
+ for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
+ UI != UE; ++UI) {
+ if (!isIdentifiedObject(*UI)) {
+ DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n");
+ NeedRTCheck = true;
+ AllWritesIdentified = false;
}
- if (!WriteObjects.insert(*it)) {
+
+ // Never seen it before, can't alias.
+ if (WriteObjects[*UI].empty()) {
+ DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n");
+ WriteObjects[*UI].push_back(Inst);
+ continue;
+ }
+ // Direct alias found.
+ if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
+ DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+ << **UI <<"\n");
+ return false;
+ }
+ DEBUG(dbgs() << "LV: Found a conflicting global value:"
+ << **UI <<"\n");
+ DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n");
+ DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
+
+ // If global alias, make sure they do alias.
+ if (hasPossibleGlobalWriteReorder(*UI,
+ Inst,
+ WriteObjects,
+ MaxByteWidth)) {
DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
- << **it <<"\n");
- return RT;
+              << **UI <<"\n");
+ return false;
}
+
+ // Didn't alias, insert into map for further reference.
+ WriteObjects[*UI].push_back(Inst);
}
TempObjects.clear();
}
/// Check that the reads don't conflict with the read-writes.
- for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
- GetUnderlyingObjects(*I, TempObjects, DL);
- for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
- it != e; ++it) {
- if (!isIdentifiedObject(*it)) {
- DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
- return RT;
+ for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
+ Value *Val = (*MI).first;
+ GetUnderlyingObjects(Val, TempObjects, DL);
+ for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
+ UI != UE; ++UI) {
+ // If all of the writes are identified then we don't care if the read
+ // pointer is identified or not.
+ if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
+ DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n");
+ NeedRTCheck = true;
}
- if (WriteObjects.count(*it)) {
- DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
- << **it <<"\n");
- return RT;
+
+ // Never seen it before, can't alias.
+ if (WriteObjects[*UI].empty())
+ continue;
+ // Direct alias found.
+ if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
+        DEBUG(dbgs() << "LV: Found a possible read-write reorder:"
+ << **UI <<"\n");
+ return false;
+ }
+ DEBUG(dbgs() << "LV: Found a global value: "
+ << **UI <<"\n");
+ Instruction *Inst = (*MI).second;
+ DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n");
+ DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
+
+ // If global alias, make sure they do alias.
+ if (hasPossibleGlobalWriteReorder(*UI,
+ Inst,
+ WriteObjects,
+ MaxByteWidth)) {
+ DEBUG(dbgs() << "LV: Found a possible read-write reorder:"
+                     << **UI <<"\n");
+ return false;
}
}
TempObjects.clear();
}
- // It is safe to vectorize and we don't need any runtime checks.
- DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
- PtrRtCheck.Pointers.clear();
- PtrRtCheck.Need = false;
+ PtrRtCheck.Need = NeedRTCheck;
+ if (NeedRTCheck && !CanDoRT) {
+ DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
+ "the array bounds.\n");
+ PtrRtCheck.reset();
+ return false;
+ }
+
+ DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
+ " need a runtime memory check.\n");
return true;
}
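
The reworked canVectorizeMemory buckets accesses by their underlying objects, rejects the loop when writes to one object could be reordered against other writes or reads to it, and otherwise falls back to a runtime pointer check when the objects are merely unidentified. Below is a heavily simplified standalone model of just the conflict test, keyed by invented object ids; it omits the alias-analysis and runtime-check paths entirely.

#include <cassert>
#include <map>
#include <vector>

struct Access { int Object; bool IsWrite; };   // invented: underlying object id

// Returns true if the accesses can be reordered freely, i.e. no object is
// written by one access and touched (read or written) by another.
static bool noConflicts(const std::vector<Access> &Accesses) {
  std::map<int, std::pair<int, int>> Count;    // object -> {writes, reads}
  for (const Access &A : Accesses)
    (A.IsWrite ? Count[A.Object].first : Count[A.Object].second)++;
  for (const auto &KV : Count) {
    int Writes = KV.second.first, Reads = KV.second.second;
    if (Writes > 1 || (Writes == 1 && Reads > 0))
      return false;          // possible write-write or read-write reorder
  }
  return true;
}

int main() {
  assert(noConflicts({{1, true}, {2, false}}));     // disjoint objects
  assert(!noConflicts({{1, true}, {1, false}}));    // read-write on same object
  return 0;
}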
@@ -1592,38 +2712,43 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
if (Phi->getNumIncomingValues() != 2)
return false;
- // Find the possible incoming reduction variable.
- BasicBlock *BB = Phi->getParent();
- int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
- int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
- Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
+ // Reduction variables are only found in the loop header block.
+ if (Phi->getParent() != TheLoop->getHeader())
+ return false;
+
+ // Obtain the reduction start value from the value that comes from the loop
+ // preheader.
+ Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
// ExitInstruction is the single value which is used outside the loop.
// We only allow for a single reduction value to be used outside the loop.
// This includes users of the reduction, variables (which form a cycle
// which ends in the phi node).
Instruction *ExitInstruction = 0;
+ // Indicates that we found a binary operation in our scan.
+ bool FoundBinOp = false;
// Iter is our iterator. We start with the PHI node and scan for all of the
- // users of this instruction. All users must be instructions which can be
+ // users of this instruction. All users must be instructions that can be
// used as reduction variables (such as ADD). We may have a single
- // out-of-block user. They cycle must end with the original PHI.
- // Also, we can't have multiple block-local users.
+ // out-of-block user. The cycle must end with the original PHI.
Instruction *Iter = Phi;
while (true) {
- // Any reduction instr must be of one of the allowed kinds.
- if (!isReductionInstr(Iter, Kind))
+ // If the instruction has no users then this is a broken
+ // chain and can't be a reduction variable.
+ if (Iter->use_empty())
return false;
- // Did we found a user inside this block ?
+ // Did we find a user inside this loop already ?
bool FoundInBlockUser = false;
- // Did we reach the initial PHI node ?
+ // Did we reach the initial PHI node already ?
bool FoundStartPHI = false;
- // If the instruction has no users then this is a broken
- // chain and can't be a reduction variable.
- if (Iter->use_empty())
- return false;
+ // Is this a bin op ?
+ FoundBinOp |= !isa<PHINode>(Iter);
+
+ // Remember the current instruction.
+ Instruction *OldIter = Iter;
// For each of the *users* of iter.
for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
@@ -1634,75 +2759,171 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
FoundStartPHI = true;
continue;
}
+
// Check if we found the exit user.
BasicBlock *Parent = U->getParent();
- if (Parent != BB) {
- // We must have a single exit instruction.
+ if (!TheLoop->contains(Parent)) {
+ // Exit if you find multiple outside users.
if (ExitInstruction != 0)
return false;
ExitInstruction = Iter;
}
+
+ // We allow in-loop PHINodes which are not the original reduction PHI
+ // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
+ // structure) then don't skip this PHI.
+ if (isa<PHINode>(Iter) && isa<PHINode>(U) &&
+ U->getParent() != TheLoop->getHeader() &&
+ TheLoop->contains(U) &&
+ Iter->hasNUsesOrMore(2))
+ continue;
+
// We can't have multiple inside users.
if (FoundInBlockUser)
return false;
FoundInBlockUser = true;
+
+ // Any reduction instr must be of one of the allowed kinds.
+ if (!isReductionInstr(U, Kind))
+ return false;
+
+      // Reductions of instructions such as Div and Sub are only
+ // possible if the LHS is the reduction variable.
+ if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter)
+ return false;
+
Iter = U;
}
+ // If all uses were skipped this can't be a reduction variable.
+ if (Iter == OldIter)
+ return false;
+
// We found a reduction var if we have reached the original
// phi node and we only have a single instruction with out-of-loop
// users.
- if (FoundStartPHI && ExitInstruction) {
- // This instruction is allowed to have out-of-loop users.
- AllowedExit.insert(ExitInstruction);
-
- // Save the description of this reduction variable.
- ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
- Reductions[Phi] = RD;
- return true;
- }
+ if (FoundStartPHI) {
+ // This instruction is allowed to have out-of-loop users.
+ AllowedExit.insert(ExitInstruction);
+
+ // Save the description of this reduction variable.
+ ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+ Reductions[Phi] = RD;
+ // We've ended the cycle. This is a reduction variable if we have an
+ // outside user and it has a binary op.
+ return FoundBinOp && ExitInstruction;
+ }
}
}
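
AddReductionVar follows the def-use chain from the header PHI through allowed binary operations until it arrives back at the PHI, tolerating exactly one value that escapes the loop (the exit instruction) and requiring at least one real binop. Below is a deliberately linear standalone sketch of that walk; the Node layout is invented, and the real code additionally handles intermediate PHIs, commutativity, and operand-order restrictions.

#include <cassert>
#include <vector>

struct Node {                   // invented: one user on the reduction chain
  bool IsAllowedBinOp;          // e.g. add for an add-reduction
  bool HasOutsideUser;          // value also escapes the loop
  int Next;                     // next in-loop user, -1 when back at the PHI
};

static bool isReductionChain(const std::vector<Node> &Chain) {
  int Outside = 0;
  bool SawBinOp = false;
  for (int I = 0; I != -1; I = Chain[I].Next) {
    if (!Chain[I].IsAllowedBinOp) return false;
    SawBinOp = true;
    if (Chain[I].HasOutsideUser && ++Outside > 1) return false;
  }
  return SawBinOp && Outside == 1;   // one exit value, at least one binop
}

int main() {
  // phi -> add (feeds the backedge and the exit) -> back to phi.
  assert(isReductionChain({{true, true, -1}}));
  // A chain containing a disallowed operation is rejected.
  assert(!isReductionChain({{false, true, -1}}));
  return 0;
}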
bool
LoopVectorizationLegality::isReductionInstr(Instruction *I,
ReductionKind Kind) {
- switch (I->getOpcode()) {
- default:
- return false;
- case Instruction::PHI:
- // possibly.
- return true;
- case Instruction::Add:
- case Instruction::Sub:
- return Kind == IntegerAdd;
- case Instruction::Mul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- return Kind == IntegerMult;
- case Instruction::And:
- return Kind == IntegerAnd;
- case Instruction::Or:
- return Kind == IntegerOr;
- case Instruction::Xor:
- return Kind == IntegerXor;
- }
+ bool FP = I->getType()->isFloatingPointTy();
+ bool FastMath = (FP && I->isCommutative() && I->isAssociative());
+
+ switch (I->getOpcode()) {
+ default:
+ return false;
+ case Instruction::PHI:
+ if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
+ return false;
+ // possibly.
+ return true;
+ case Instruction::Sub:
+ case Instruction::Add:
+ return Kind == RK_IntegerAdd;
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::Mul:
+ return Kind == RK_IntegerMult;
+ case Instruction::And:
+ return Kind == RK_IntegerAnd;
+ case Instruction::Or:
+ return Kind == RK_IntegerOr;
+ case Instruction::Xor:
+ return Kind == RK_IntegerXor;
+ case Instruction::FMul:
+ return Kind == RK_FloatMult && FastMath;
+ case Instruction::FAdd:
+ return Kind == RK_FloatAdd && FastMath;
+ }
}
-bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
- // Check that the PHI is consecutive and starts at zero.
+LoopVectorizationLegality::InductionKind
+LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+ Type *PhiTy = Phi->getType();
+  // We only handle integer and pointer induction variables.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+ return IK_NoInduction;
+
+ // Check that the PHI is consecutive.
const SCEV *PhiScev = SE->getSCEV(Phi);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
if (!AR) {
DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
- return false;
+ return IK_NoInduction;
}
const SCEV *Step = AR->getStepRecurrence(*SE);
- if (!Step->isOne()) {
- DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
+ // Integer inductions need to have a stride of one.
+ if (PhiTy->isIntegerTy()) {
+ if (Step->isOne())
+ return IK_IntInduction;
+ if (Step->isAllOnesValue())
+ return IK_ReverseIntInduction;
+ return IK_NoInduction;
+ }
+
+ // Calculate the pointer stride and check if it is consecutive.
+ const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+ if (!C)
+ return IK_NoInduction;
+
+ assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
+ uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
+ if (C->getValue()->equalsInt(Size))
+ return IK_PtrInduction;
+ else if (C->getValue()->equalsInt(0 - Size))
+ return IK_ReversePtrInduction;
+
+ return IK_NoInduction;
+}
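
isInductionVariable now classifies the PHI by its SCEV stride: +1 and -1 for forward and reverse integer inductions, and plus or minus the element size for forward and reverse pointer inductions. The sketch below is a standalone classifier over (is-pointer, stride, element-size) inputs; the inputs are plain numbers standing in for the SCEV queries.

#include <cassert>
#include <cstdint>

enum InductionKind {
  IK_NoInduction,
  IK_IntInduction,
  IK_ReverseIntInduction,
  IK_PtrInduction,
  IK_ReversePtrInduction
};

// Classify a PHI from its per-iteration stride (in units for integers,
// in bytes for pointers) and the pointee size for pointer PHIs.
static InductionKind classify(bool IsPointer, int64_t Stride,
                              uint64_t ElemSize) {
  if (!IsPointer) {
    if (Stride == 1)  return IK_IntInduction;
    if (Stride == -1) return IK_ReverseIntInduction;
    return IK_NoInduction;
  }
  if (Stride == int64_t(ElemSize))  return IK_PtrInduction;
  if (Stride == -int64_t(ElemSize)) return IK_ReversePtrInduction;
  return IK_NoInduction;
}

int main() {
  assert(classify(false, 1, 0) == IK_IntInduction);
  assert(classify(false, -1, 0) == IK_ReverseIntInduction);
  assert(classify(true, 8, 8) == IK_PtrInduction);      // e.g. a double*
  assert(classify(true, -4, 4) == IK_ReversePtrInduction);
  assert(classify(false, 2, 0) == IK_NoInduction);      // stride 2 not handled
  return 0;
}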
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+ Value *In0 = const_cast<Value*>(V);
+ PHINode *PN = dyn_cast_or_null<PHINode>(In0);
+ if (!PN)
return false;
+
+ return Inductions.count(PN);
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
+ assert(TheLoop->contains(BB) && "Unknown block used");
+
+ // Blocks that do not dominate the latch need predication.
+ BasicBlock* Latch = TheLoop->getLoopLatch();
+ return !DT->dominates(BB, Latch);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // We don't predicate loads/stores at the moment.
+ if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
+ return false;
+
+ // The instructions below can trap.
+ switch (it->getOpcode()) {
+ default: continue;
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ return false;
+ }
}
+
return true;
}
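A small example, purely illustrative, of the two checks above: the conditional block below does not dominate the loop latch, so it needs predication, and because it contains no memory operations and no trapping divisions it can be predicated (if-converted).

// Illustrative only: the 'if' body is a block that needs predication and
// passes blockCanBePredicated (no loads/stores, no div/rem inside it).
int sumPositives(const int *A, int N) {
  int S = 0;
  for (int i = 0; i < N; ++i) {
    int V = A[i];               // the load stays in a block dominating the latch
    if (V > 0)                  // conditionally executed block
      S += V;                   // a plain add: safe to turn into a select
  }
  return S;
}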
@@ -1715,11 +2936,64 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
return AR->isAffine();
}
-unsigned
-LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
- if (!VTTI) {
- DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
- return 1;
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
+ unsigned UserVF) {
+  // Width 1 means no vectorization.
+ VectorizationFactor Factor = { 1U, 0U };
+ if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
+ DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
+ return Factor;
+ }
+
+ // Find the trip count.
+ unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
+ DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
+
+ unsigned WidestType = getWidestType();
+ unsigned WidestRegister = TTI.getRegisterBitWidth(true);
+ unsigned MaxVectorSize = WidestRegister / WidestType;
+ DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
+ DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
+
+ if (MaxVectorSize == 0) {
+ DEBUG(dbgs() << "LV: The target has no vector registers.\n");
+ MaxVectorSize = 1;
+ }
+
+ assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
+ " into one vector!");
+
+ unsigned VF = MaxVectorSize;
+
+ // If we optimize the program for size, avoid creating the tail loop.
+ if (OptForSize) {
+ // If we are unable to calculate the trip count then don't try to vectorize.
+ if (TC < 2) {
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
+ return Factor;
+ }
+
+ // Find the maximum SIMD width that can fit within the trip count.
+ VF = TC % MaxVectorSize;
+
+ if (VF == 0)
+ VF = MaxVectorSize;
+
+ // If the trip count that we found modulo the vectorization factor is not
+ // zero then we require a tail.
+ if (VF < 2) {
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
+ return Factor;
+ }
+ }
+
+ if (UserVF != 0) {
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
+
+ Factor.Width = UserVF;
+ return Factor;
}
float Cost = expectedCost(1);
@@ -1739,23 +3013,278 @@ LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
}
DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
- return Width;
+ Factor.Width = Width;
+ Factor.Cost = Width * Cost;
+ return Factor;
}
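A worked example of the width selection above, with hypothetical target numbers (a 256-bit vector unit and 32-bit elements); the tail-avoidance step only applies when optimizing for size.

// Hypothetical numbers; mirrors the VF selection above for an -Os build.
unsigned exampleSelectWidth(unsigned TC /* known trip count */) {
  unsigned WidestType = 32;                             // widest in-loop type, bits
  unsigned WidestRegister = 256;                        // vector register width, bits
  unsigned MaxVectorSize = WidestRegister / WidestType; // 8 lanes

  // Avoid a scalar tail: pick a width that divides the trip count.
  unsigned VF = TC % MaxVectorSize;  // e.g. TC == 20 -> 20 % 8 == 4
  if (VF == 0)
    VF = MaxVectorSize;              // trip count is a multiple of 8 lanes
  if (VF < 2)
    return 1;                        // e.g. TC == 17 -> remainder 1 -> no vectorization
  return VF;                         // TC == 20 -> tail-free 4-wide loop
}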
-unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
- // We can only estimate the cost of single basic block loops.
- assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
+unsigned LoopVectorizationCostModel::getWidestType() {
+ unsigned MaxWidth = 8;
+
+ // For each block.
+ for (Loop::block_iterator bb = TheLoop->block_begin(),
+ be = TheLoop->block_end(); bb != be; ++bb) {
+ BasicBlock *BB = *bb;
+
+ // For each instruction in the loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ Type *T = it->getType();
- BasicBlock *BB = TheLoop->getHeader();
+ // Only examine Loads, Stores and PHINodes.
+ if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
+ continue;
+
+ // Examine PHI nodes that are reduction variables.
+ if (PHINode *PN = dyn_cast<PHINode>(it))
+ if (!Legal->getReductionVars()->count(PN))
+ continue;
+
+ // Examine the stored values.
+ if (StoreInst *ST = dyn_cast<StoreInst>(it))
+ T = ST->getValueOperand()->getType();
+
+ // Ignore loaded pointer types and stored pointer types that are not
+ // consecutive. However, we do want to take consecutive stores/loads of
+ // pointer vectors into account.
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
+ continue;
+
+ MaxWidth = std::max(MaxWidth,
+ (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
+ }
+ }
+
+ return MaxWidth;
+}
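For example (illustrative, not from the patch), in the loop below getWidestType would return 64: only loads, stores and reduction PHIs are examined, and the stored double is the widest of those.

// Illustrative only: the widest examined type here is the stored double (64
// bits); the 32-bit float load and the index arithmetic do not raise it.
void widenCopy(const float *A, double *B, int N) {
  for (int i = 0; i < N; ++i)
    B[i] = A[i];   // float load (32 bits), double store (64 bits)
}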
+
+unsigned
+LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
+ unsigned UserUF,
+ unsigned VF,
+ unsigned LoopCost) {
+
+ // -- The unroll heuristics --
+ // We unroll the loop in order to expose ILP and reduce the loop overhead.
+ // There are many micro-architectural considerations that we can't predict
+ // at this level. For example frontend pressure (on decode or fetch) due to
+ // code size, or the number and capabilities of the execution ports.
+ //
+ // We use the following heuristics to select the unroll factor:
+  // 1. If the code has reductions then we unroll in order to break the cross
+ // iteration dependency.
+ // 2. If the loop is really small then we unroll in order to reduce the loop
+ // overhead.
+ // 3. We don't unroll if we think that we will spill registers to memory due
+ // to the increased register pressure.
+
+ // Use the user preference, unless 'auto' is selected.
+ if (UserUF != 0)
+ return UserUF;
+
+ // When we optimize for size we don't unroll.
+ if (OptForSize)
+ return 1;
+
+ // Do not unroll loops with a relatively small trip count.
+ unsigned TC = SE->getSmallConstantTripCount(TheLoop,
+ TheLoop->getLoopLatch());
+ if (TC > 1 && TC < TinyTripCountUnrollThreshold)
+ return 1;
+
+ unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
+ DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
+ " vector registers\n");
+
+ LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
+ // We divide by these constants so assume that we have at least one
+ // instruction that uses at least one register.
+ R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
+ R.NumInstructions = std::max(R.NumInstructions, 1U);
+
+ // We calculate the unroll factor using the following formula.
+ // Subtract the number of loop invariants from the number of available
+ // registers. These registers are used by all of the unrolled instances.
+ // Next, divide the remaining registers by the number of registers that is
+ // required by the loop, in order to estimate how many parallel instances
+ // fit without causing spills.
+ unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
+
+ // Clamp the unroll factor ranges to reasonable factors.
+ unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
+
+ // If we did not calculate the cost for VF (because the user selected the VF)
+ // then we calculate the cost of VF here.
+ if (LoopCost == 0)
+ LoopCost = expectedCost(VF);
+
+  // Clamp the calculated UF to be between 1 and the max unroll factor
+ // that the target allows.
+ if (UF > MaxUnrollSize)
+ UF = MaxUnrollSize;
+ else if (UF < 1)
+ UF = 1;
+
+ if (Legal->getReductionVars()->size()) {
+ DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
+ return UF;
+ }
+
+ // We want to unroll tiny loops in order to reduce the loop overhead.
+ // We assume that the cost overhead is 1 and we use the cost model
+ // to estimate the cost of the loop and unroll until the cost of the
+ // loop overhead is about 5% of the cost of the loop.
+ DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
+ if (LoopCost < 20) {
+ DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
+ unsigned NewUF = 20/LoopCost + 1;
+ return std::min(NewUF, UF);
+ }
+
+ DEBUG(dbgs() << "LV: Not Unrolling. \n");
+ return 1;
+}
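To make the formula concrete, here is a hypothetical evaluation of the register-pressure based unroll factor; all numbers are invented for illustration.

// Hypothetical numbers for the unroll-factor formula above.
unsigned exampleUnrollFactor() {
  unsigned TargetVectorRegisters = 16; // vector registers the target exposes
  unsigned LoopInvariantRegs = 2;      // shared by all unrolled copies
  unsigned MaxLocalUsers = 5;          // peak in-loop register pressure
  unsigned MaxUnrollSize = 4;          // target's maximum unroll factor

  unsigned UF = (TargetVectorRegisters - LoopInvariantRegs) / MaxLocalUsers;
  // (16 - 2) / 5 == 2, already inside [1, MaxUnrollSize], so UF stays 2.
  if (UF > MaxUnrollSize)
    UF = MaxUnrollSize;
  else if (UF < 1)
    UF = 1;
  return UF;
}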
+
+LoopVectorizationCostModel::RegisterUsage
+LoopVectorizationCostModel::calculateRegisterUsage() {
+ // This function calculates the register usage by measuring the highest number
+ // of values that are alive at a single location. Obviously, this is a very
+  // rough estimation. We scan the loop in topological order and
+ // assign a number to each instruction. We use RPO to ensure that defs are
+ // met before their users. We assume that each instruction that has in-loop
+ // users starts an interval. We record every time that an in-loop value is
+ // used, so we have a list of the first and last occurrences of each
+ // instruction. Next, we transpose this data structure into a multi map that
+ // holds the list of intervals that *end* at a specific location. This multi
+ // map allows us to perform a linear search. We scan the instructions linearly
+ // and record each time that a new interval starts, by placing it in a set.
+ // If we find this value in the multi-map then we remove it from the set.
+ // The max register usage is the maximum size of the set.
+ // We also search for instructions that are defined outside the loop, but are
+ // used inside the loop. We need this number separately from the max-interval
+ // usage number because when we unroll, loop-invariant values do not take
+  // more registers.
+ LoopBlocksDFS DFS(TheLoop);
+ DFS.perform(LI);
+
+ RegisterUsage R;
+ R.NumInstructions = 0;
+
+ // Each 'key' in the map opens a new interval. The values
+ // of the map are the index of the 'last seen' usage of the
+ // instruction that is the key.
+ typedef DenseMap<Instruction*, unsigned> IntervalMap;
+  // Maps an instruction index to the corresponding instruction.
+ DenseMap<unsigned, Instruction*> IdxToInstr;
+ // Marks the end of each interval.
+ IntervalMap EndPoint;
+  // Saves the set of instructions that are used in the loop.
+ SmallSet<Instruction*, 8> Ends;
+ // Saves the list of values that are used in the loop but are
+ // defined outside the loop, such as arguments and constants.
+ SmallPtrSet<Value*, 8> LoopInvariants;
+
+ unsigned Index = 0;
+ for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
+ be = DFS.endRPO(); bb != be; ++bb) {
+ R.NumInstructions += (*bb)->size();
+ for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+ ++it) {
+ Instruction *I = it;
+ IdxToInstr[Index++] = I;
+
+ // Save the end location of each USE.
+ for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+ Value *U = I->getOperand(i);
+ Instruction *Instr = dyn_cast<Instruction>(U);
+
+ // Ignore non-instruction values such as arguments, constants, etc.
+ if (!Instr) continue;
+
+ // If this instruction is outside the loop then record it and continue.
+ if (!TheLoop->contains(Instr)) {
+ LoopInvariants.insert(Instr);
+ continue;
+ }
+
+ // Overwrite previous end points.
+ EndPoint[Instr] = Index;
+ Ends.insert(Instr);
+ }
+ }
+ }
+
+ // Saves the list of intervals that end with the index in 'key'.
+ typedef SmallVector<Instruction*, 2> InstrList;
+ DenseMap<unsigned, InstrList> TransposeEnds;
+
+ // Transpose the EndPoints to a list of values that end at each index.
+ for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
+ it != e; ++it)
+ TransposeEnds[it->second].push_back(it->first);
+
+ SmallSet<Instruction*, 8> OpenIntervals;
+ unsigned MaxUsage = 0;
+
+
+ DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+ for (unsigned int i = 0; i < Index; ++i) {
+ Instruction *I = IdxToInstr[i];
+ // Ignore instructions that are never used within the loop.
+ if (!Ends.count(I)) continue;
+
+ // Remove all of the instructions that end at this location.
+ InstrList &List = TransposeEnds[i];
+ for (unsigned int j=0, e = List.size(); j < e; ++j)
+ OpenIntervals.erase(List[j]);
+
+    // Count the number of live intervals.
+ MaxUsage = std::max(MaxUsage, OpenIntervals.size());
+
+ DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
+ OpenIntervals.size() <<"\n");
+
+ // Add the current instruction to the list of open intervals.
+ OpenIntervals.insert(I);
+ }
+
+ unsigned Invariant = LoopInvariants.size();
+ DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
+ DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
+ DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
+
+ R.LoopInvariantRegs = Invariant;
+ R.MaxLocalUsers = MaxUsage;
+ return R;
+}
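The quantity estimated above is simply the peak number of simultaneously live in-loop values. A minimal, self-contained sketch of that idea follows; it is deliberately brute-force (the code above does a single RPO sweep with a transposed end-point map), and the Interval type is hypothetical.

#include <algorithm>
#include <vector>

// Hypothetical per-value interval: index of the definition and of the last use.
struct Interval { unsigned Def, LastUse; };

// Peak number of simultaneously live values; plays the role of MaxLocalUsers.
unsigned peakLiveValues(const std::vector<Interval> &Ivs) {
  unsigned End = 0;
  for (const Interval &I : Ivs)
    End = std::max(End, I.LastUse);

  unsigned Max = 0;
  for (unsigned Idx = 0; Idx <= End; ++Idx) {
    unsigned Live = 0;
    for (const Interval &I : Ivs)
      if (I.Def <= Idx && Idx <= I.LastUse)
        ++Live;                 // interval covers this point in the schedule
    Max = std::max(Max, Live);
  }
  return Max;
}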
+
+unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
unsigned Cost = 0;
- // For each instruction in the old loop.
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- Instruction *Inst = it;
- unsigned C = getInstructionCost(Inst, VF);
- Cost += C;
- DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
- " For instruction: "<< *Inst << "\n");
+ // For each block.
+ for (Loop::block_iterator bb = TheLoop->block_begin(),
+ be = TheLoop->block_end(); bb != be; ++bb) {
+ unsigned BlockCost = 0;
+ BasicBlock *BB = *bb;
+
+ // For each instruction in the old loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // Skip dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(it))
+ continue;
+
+ unsigned C = getInstructionCost(it, VF);
+      BlockCost += C;
+ DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
+ VF << " For instruction: "<< *it << "\n");
+ }
+
+ // We assume that if-converted blocks have a 50% chance of being executed.
+ // When the code is scalar then some of the blocks are avoided due to CF.
+ // When the code is vectorized we execute all code paths.
+ if (Legal->blockNeedsPredication(*bb) && VF == 1)
+ BlockCost /= 2;
+
+ Cost += BlockCost;
}
return Cost;
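As a hypothetical example of the accumulation above: a loop whose always-executed block costs 8 and whose if-converted block costs 6 is charged 8 + 6/2 = 11 when scalar and 8 + 6 = 14 when vectorized, reflecting that vector code executes both paths. The numbers and the helper below are illustrative only.

// Hypothetical per-block costs for the 50% predication discount above.
unsigned exampleLoopCost(bool Vectorized /* VF > 1 */) {
  unsigned HeaderCost = 8;            // always executed
  unsigned PredicatedBlockCost = 6;   // block that needs predication
  if (!Vectorized)
    PredicatedBlockCost /= 2;         // scalar code skips it half the time
  return HeaderCost + PredicatedBlockCost;  // 11 scalar vs. 14 vectorized
}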
@@ -1763,8 +3292,6 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
- assert(VTTI && "Invalid vector target transformation info");
-
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
if (Legal->isUniformAfterVectorization(I))
@@ -1773,147 +3300,173 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
Type *RetTy = I->getType();
Type *VectorTy = ToVectorTy(RetTy, VF);
-
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
- case Instruction::GetElementPtr:
- // We mark this instruction as zero-cost because scalar GEPs are usually
- // lowered to the intruction addressing mode. At the moment we don't
- // generate vector geps.
- return 0;
- case Instruction::Br: {
- return VTTI->getCFInstrCost(I->getOpcode());
- }
- case Instruction::PHI:
- return 0;
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
- }
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(I);
- const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
- bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
- Type *CondTy = SI->getCondition()->getType();
- if (ScalarCond)
- CondTy = VectorType::get(CondTy, VF);
-
- return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
- }
- case Instruction::ICmp:
- case Instruction::FCmp: {
- Type *ValTy = I->getOperand(0)->getType();
- VectorTy = ToVectorTy(ValTy, VF);
- return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
- }
- case Instruction::Store: {
- StoreInst *SI = cast<StoreInst>(I);
- Type *ValTy = SI->getValueOperand()->getType();
- VectorTy = ToVectorTy(ValTy, VF);
-
- if (VF == 1)
- return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
- SI->getAlignment(), SI->getPointerAddressSpace());
-
- // Scalarized stores.
- if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
- unsigned Cost = 0;
- unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
- ValTy);
- // The cost of extracting from the value vector.
- Cost += VF * (ExtCost);
- // The cost of the scalar stores.
- Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
- ValTy->getScalarType(),
- SI->getAlignment(),
- SI->getPointerAddressSpace());
- return Cost;
- }
-
- // Wide stores.
- return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
- SI->getPointerAddressSpace());
- }
- case Instruction::Load: {
- LoadInst *LI = cast<LoadInst>(I);
-
- if (VF == 1)
- return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
- LI->getAlignment(),
- LI->getPointerAddressSpace());
-
- // Scalarized loads.
- if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
- unsigned Cost = 0;
- unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
- // The cost of inserting the loaded value into the result vector.
- Cost += VF * (InCost);
- // The cost of the scalar stores.
- Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
- RetTy->getScalarType(),
- LI->getAlignment(),
- LI->getPointerAddressSpace());
- return Cost;
+ case Instruction::GetElementPtr:
+ // We mark this instruction as zero-cost because the cost of GEPs in
+ // vectorized code depends on whether the corresponding memory instruction
+ // is scalarized or not. Therefore, we handle GEPs with the memory
+ // instruction cost.
+ return 0;
+ case Instruction::Br: {
+ return TTI.getCFInstrCost(I->getOpcode());
+ }
+ case Instruction::PHI:
+ //TODO: IF-converted IFs become selects.
+ return 0;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this is shifts on x86.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_AnyValue;
+
+ if (isa<ConstantInt>(I->getOperand(1)))
+ Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+
+ return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
+ }
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+ bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+ Type *CondTy = SI->getCondition()->getType();
+ if (!ScalarCond)
+ CondTy = VectorType::get(CondTy, VF);
+
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ Type *ValTy = I->getOperand(0)->getType();
+ VectorTy = ToVectorTy(ValTy, VF);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
+ }
+ case Instruction::Store:
+ case Instruction::Load: {
+ StoreInst *SI = dyn_cast<StoreInst>(I);
+ LoadInst *LI = dyn_cast<LoadInst>(I);
+ Type *ValTy = (SI ? SI->getValueOperand()->getType() :
+ LI->getType());
+ VectorTy = ToVectorTy(ValTy, VF);
+
+ unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
+ unsigned AS = SI ? SI->getPointerAddressSpace() :
+ LI->getPointerAddressSpace();
+ Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
+ // We add the cost of address computation here instead of with the gep
+    // instruction because only here do we know whether the operation is
+ // scalarized.
+ if (VF == 1)
+ return TTI.getAddressComputationCost(VectorTy) +
+ TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+
+ // Scalarized loads/stores.
+ int Stride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = Stride < 0;
+ if (0 == Stride) {
+ unsigned Cost = 0;
+ // The cost of extracting from the value vector and pointer vector.
+ Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+ for (unsigned i = 0; i < VF; ++i) {
+ // The cost of extracting the pointer operand.
+ Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
+ // In case of STORE, the cost of ExtractElement from the vector.
+ // In case of LOAD, the cost of InsertElement into the returned
+ // vector.
+ Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
+ Instruction::InsertElement,
+ VectorTy, i);
}
- // Wide loads.
- return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
- LI->getPointerAddressSpace());
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
- return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+ // The cost of the scalar loads/stores.
+ Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
+ Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
+ Alignment, AS);
+ return Cost;
}
- default: {
- // We are scalarizing the instruction. Return the cost of the scalar
- // instruction, plus the cost of insert and extract into vector
- // elements, times the vector width.
- unsigned Cost = 0;
- bool IsVoid = RetTy->isVoidTy();
+ // Wide load/stores.
+ unsigned Cost = TTI.getAddressComputationCost(VectorTy);
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
- unsigned InsCost = (IsVoid ? 0 :
- VTTI->getInstrCost(Instruction::InsertElement,
- VectorTy));
-
- unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
- VectorTy);
+ if (Reverse)
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+ VectorTy, 0);
+ return Cost;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+    // We optimize the truncation of induction variables.
+ // The cost of these is the same as the scalar operation.
+ if (I->getOpcode() == Instruction::Trunc &&
+ Legal->isInductionVariable(I->getOperand(0)))
+ return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
+ I->getOperand(0)->getType());
+
+ Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+ return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(I);
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Not an intrinsic call!");
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type*, 4> Tys;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+ return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+ }
+ default: {
+ // We are scalarizing the instruction. Return the cost of the scalar
+ // instruction, plus the cost of insert and extract into vector
+ // elements, times the vector width.
+ unsigned Cost = 0;
+
+ if (!RetTy->isVoidTy() && VF != 1) {
+ unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
+ VectorTy);
+ unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+ VectorTy);
// The cost of inserting the results plus extracting each one of the
// operands.
Cost += VF * (InsCost + ExtCost * I->getNumOperands());
-
- // The cost of executing VF copies of the scalar instruction.
- Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
- return Cost;
}
+
+ // The cost of executing VF copies of the scalar instruction. This opcode
+ // is unknown. Assume that it is the same as 'mul'.
+ Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
+ return Cost;
+ }
}// end of switch.
}
@@ -1923,12 +3476,11 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
return VectorType::get(Scalar, VF);
}
-} // namespace
-
char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
@@ -1939,3 +3491,14 @@ namespace llvm {
}
}
+bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
+ // Check for a store.
+ if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
+ return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
+
+ // Check for a load.
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
+
+ return false;
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
index d26973a..19eefd2 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -1,4 +1,4 @@
//===-- Vectorize.cpp -----------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,13 +13,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm/Transforms/Vectorize.h"
#include "llvm-c/Initialization.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/PassManager.h"
+#include "llvm-c/Transforms/Vectorize.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/Verifier.h"
-#include "llvm/Transforms/Vectorize.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
using namespace llvm;