178 files changed, 7255 insertions, 2888 deletions
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 0a83c3d..17c9b86 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -27,6 +27,7 @@ add_llvm_library(LLVMAnalysis
   LoopPass.cpp
   MemoryBuiltins.cpp
   MemoryDependenceAnalysis.cpp
+  PHITransAddr.cpp
   PointerTracking.cpp
   PostDominators.cpp
   ProfileEstimatorPass.cpp
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index a276c64..10a8b11 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -25,6 +25,16 @@
 #include "llvm/Support/CallSite.h"
 using namespace llvm;
 
+/// As its comment mentions, PointerMayBeCaptured can be expensive.
+/// However, it's not easy for BasicAA to cache the result, because
+/// it's an ImmutablePass. To work around this, bound queries at a
+/// fixed number of uses.
+///
+/// TODO: Write a new FunctionPass AliasAnalysis so that it can keep
+/// a cache. Then we can move the code from BasicAliasAnalysis into
+/// that path, and remove this threshold.
+static int const Threshold = 20;
+
 /// PointerMayBeCaptured - Return true if this pointer value may be captured
 /// by the enclosing function (which is required to exist).  This routine can
 /// be expensive, so consider caching the results.  The boolean ReturnCaptures
@@ -35,11 +45,17 @@ using namespace llvm;
 bool llvm::PointerMayBeCaptured(const Value *V,
                                 bool ReturnCaptures, bool StoreCaptures) {
   assert(isa<PointerType>(V->getType()) && "Capture is for pointers only!");
-  SmallVector<Use*, 16> Worklist;
-  SmallSet<Use*, 16> Visited;
+  SmallVector<Use*, Threshold> Worklist;
+  SmallSet<Use*, Threshold> Visited;
+  int Count = 0;
 
   for (Value::use_const_iterator UI = V->use_begin(), UE = V->use_end();
        UI != UE; ++UI) {
+    // If there are lots of uses, conservatively say that the value
+    // is captured to avoid taking too much compile time.
+    if (Count++ >= Threshold)
+      return true;
+
     Use *U = &UI.getUse();
     Visited.insert(U);
     Worklist.push_back(U);
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 96f738e..eaf90d0 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -432,7 +432,7 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
   // Instead of loading constant c string, use corresponding integer value
   // directly if string length is small enough.
   std::string Str;
-  if (TD && GetConstantStringInfo(CE->getOperand(0), Str) && !Str.empty()) {
+  if (TD && GetConstantStringInfo(CE, Str) && !Str.empty()) {
     unsigned StrLen = Str.length();
     const Type *Ty = cast<PointerType>(CE->getType())->getElementType();
     unsigned NumBits = Ty->getPrimitiveSizeInBits();
@@ -569,9 +569,16 @@ static Constant *SymbolicallyEvaluateGEP(Constant *const *Ops, unsigned NumOps,
   SmallVector<Constant*, 32> NewIdxs;
   do {
     if (const SequentialType *ATy = dyn_cast<SequentialType>(Ty)) {
-      // The only pointer indexing we'll do is on the first index of the GEP.
-      if (isa<PointerType>(ATy) && !NewIdxs.empty())
-        break;
+      if (isa<PointerType>(ATy)) {
+        // The only pointer indexing we'll do is on the first index of the GEP.
+        if (!NewIdxs.empty())
+          break;
+       
+        // Only handle pointers to sized types, not pointers to functions.
+        if (!ATy->getElementType()->isSized())
+          return 0;
+      }
+        
       // Determine which element of the array the offset points into.
       APInt ElemSize(BitWidth, TD->getTypeAllocSize(ATy->getElementType()));
       if (ElemSize == 0)
diff --git a/lib/Analysis/DebugInfo.cpp b/lib/Analysis/DebugInfo.cpp
index 41d803c..1c9f500 100644
--- a/lib/Analysis/DebugInfo.cpp
+++ b/lib/Analysis/DebugInfo.cpp
@@ -866,7 +866,9 @@ DISubprogram DIFactory::CreateSubprogram(DIDescriptor Context,
                                          DICompileUnit CompileUnit,
                                          unsigned LineNo, DIType Type,
                                          bool isLocalToUnit,
-                                         bool isDefinition) {
+                                         bool isDefinition,
+                                         unsigned VK, unsigned VIndex,
+                                         DIType ContainingType) {
 
   Value *Elts[] = {
     GetTagConstant(dwarf::DW_TAG_subprogram),
@@ -879,9 +881,38 @@ DISubprogram DIFactory::CreateSubprogram(DIDescriptor Context,
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
     Type.getNode(),
     ConstantInt::get(Type::getInt1Ty(VMContext), isLocalToUnit),
-    ConstantInt::get(Type::getInt1Ty(VMContext), isDefinition)
+    ConstantInt::get(Type::getInt1Ty(VMContext), isDefinition),
+    ConstantInt::get(Type::getInt32Ty(VMContext), (unsigned)VK),
+    ConstantInt::get(Type::getInt32Ty(VMContext), VIndex),
+    ContainingType.getNode()
   };
-  return DISubprogram(MDNode::get(VMContext, &Elts[0], 11));
+  return DISubprogram(MDNode::get(VMContext, &Elts[0], 14));
+}
+
+/// CreateSubprogramDefinition - Create new subprogram descriptor for the
+/// given declaration. 
+DISubprogram DIFactory::CreateSubprogramDefinition(DISubprogram &SPDeclaration) {
+  if (SPDeclaration.isDefinition())
+    return DISubprogram(SPDeclaration.getNode());
+
+  MDNode *DeclNode = SPDeclaration.getNode();
+  Value *Elts[] = {
+    GetTagConstant(dwarf::DW_TAG_subprogram),
+    llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+    DeclNode->getElement(2), // Context
+    DeclNode->getElement(3), // Name
+    DeclNode->getElement(4), // DisplayName
+    DeclNode->getElement(5), // LinkageName
+    DeclNode->getElement(6), // CompileUnit
+    DeclNode->getElement(7), // LineNo
+    DeclNode->getElement(8), // Type
+    DeclNode->getElement(9), // isLocalToUnit
+    ConstantInt::get(Type::getInt1Ty(VMContext), true),
+    DeclNode->getElement(11), // Virtuality
+    DeclNode->getElement(12), // VIndex
+    DeclNode->getElement(13)  // Containting Type
+  };
+  return DISubprogram(MDNode::get(VMContext, &Elts[0], 14));
 }
 
 /// CreateGlobalVariable - Create a new descriptor for the specified global.
@@ -1019,6 +1050,37 @@ Instruction *DIFactory::InsertDeclare(Value *Storage, DIVariable D,
   return CallInst::Create(DeclareFn, Args, Args+2, "", InsertAtEnd);
 }
 
+/// InsertDbgValueIntrinsic - Insert a new llvm.dbg.value intrinsic call.
+Instruction *DIFactory::InsertDbgValueIntrinsic(Value *V, Value *Offset,
+                                                DIVariable D,
+                                                Instruction *InsertBefore) {
+  assert(V && "no value passed to dbg.value");
+  assert(Offset->getType() == Type::getInt64Ty(V->getContext()) &&
+         "offset must be i64");
+  if (!ValueFn)
+    ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
+
+  Value *Elts[] = { V };
+  Value *Args[] = { MDNode::get(V->getContext(), Elts, 1), Offset,
+                    D.getNode() };
+  return CallInst::Create(ValueFn, Args, Args+3, "", InsertBefore);
+}
+
+/// InsertDbgValueIntrinsic - Insert a new llvm.dbg.value intrinsic call.
+Instruction *DIFactory::InsertDbgValueIntrinsic(Value *V, Value *Offset,
+                                                DIVariable D,
+                                                BasicBlock *InsertAtEnd) {
+  assert(V && "no value passed to dbg.value");
+  assert(Offset->getType() == Type::getInt64Ty(V->getContext()) &&
+         "offset must be i64");
+  if (!ValueFn)
+    ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
+
+  Value *Elts[] = { V };
+  Value *Args[] = { MDNode::get(V->getContext(), Elts, 1), Offset,
+                    D.getNode() };
+  return CallInst::Create(ValueFn, Args, Args+3, "", InsertAtEnd);
+}
 
 //===----------------------------------------------------------------------===//
 // DebugInfoFinder implementations.
diff --git a/lib/Analysis/IPA/Andersens.cpp b/lib/Analysis/IPA/Andersens.cpp
index e12db81..4d5b312 100644
--- a/lib/Analysis/IPA/Andersens.cpp
+++ b/lib/Analysis/IPA/Andersens.cpp
@@ -121,8 +121,6 @@ namespace {
 
       return *LHS == *RHS;
     }
-
-    static bool isPod() { return true; }
   };
 
   class Andersens : public ModulePass, public AliasAnalysis,
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index 37747b6..627dbbb 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -53,7 +53,7 @@ static bool containsAddRecFromDifferentLoop(const SCEV *S, Loop *L) {
       if (newLoop == L)
         return false;
       // if newLoop is an outer loop of L, this is OK.
-      if (!LoopInfo::isNotAlreadyContainedIn(L, newLoop))
+      if (newLoop->contains(L->getHeader()))
         return false;
     }
     return true;
@@ -307,6 +307,7 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
   for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I)
     AddUsersIfInteresting(I);
 
+  Processed.clear();
   return false;
 }
 
@@ -369,7 +370,7 @@ void IVUsers::dump() const {
 void IVUsers::releaseMemory() {
   IVUsesByStride.clear();
   StrideOrder.clear();
-  Processed.clear();
+  IVUses.clear();
 }
 
 void IVStrideUse::deleted() {
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 4de756c..34089ee 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -316,12 +316,12 @@ bool Loop::hasDedicatedExits() const {
 
 /// getUniqueExitBlocks - Return all unique successor blocks of this loop.
 /// These are the blocks _outside of the current loop_ which are branched to.
-/// This assumes that loop is in canonical form.
+/// This assumes that loop exits are in canonical form.
 ///
 void
 Loop::getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const {
-  assert(isLoopSimplifyForm() &&
-         "getUniqueExitBlocks assumes the loop is in canonical form!");
+  assert(hasDedicatedExits() &&
+         "getUniqueExitBlocks assumes the loop has canonical form exits!");
 
   // Sort the blocks vector so that we can use binary search to do quick
   // lookups.
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index ae6f970..a0c7706 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/PHITransAddr.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/PredIteratorCache.h"
@@ -172,7 +173,7 @@ MemDepResult MemoryDependenceAnalysis::
 getPointerDependencyFrom(Value *MemPtr, uint64_t MemSize, bool isLoad, 
                          BasicBlock::iterator ScanIt, BasicBlock *BB) {
 
-  Value *invariantTag = 0;
+  Value *InvariantTag = 0;
 
   // Walk backwards through the basic block, looking for dependencies.
   while (ScanIt != BB->begin()) {
@@ -180,34 +181,36 @@ getPointerDependencyFrom(Value *MemPtr, uint64_t MemSize, bool isLoad,
 
     // If we're in an invariant region, no dependencies can be found before
     // we pass an invariant-begin marker.
-    if (invariantTag == Inst) {
-      invariantTag = 0;
+    if (InvariantTag == Inst) {
+      InvariantTag = 0;
       continue;
-    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    }
+    
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+      // Debug intrinsics don't cause dependences.
+      if (isa<DbgInfoIntrinsic>(Inst)) continue;
+      
       // If we pass an invariant-end marker, then we've just entered an
       // invariant region and can start ignoring dependencies.
       if (II->getIntrinsicID() == Intrinsic::invariant_end) {
-        uint64_t invariantSize = ~0ULL;
-        if (ConstantInt *CI = dyn_cast<ConstantInt>(II->getOperand(2)))
-          invariantSize = CI->getZExtValue();
-        
-        AliasAnalysis::AliasResult R =
-          AA->alias(II->getOperand(3), invariantSize, MemPtr, MemSize);
+        // FIXME: This only considers queries directly on the invariant-tagged
+        // pointer, not on query pointers that are indexed off of them.  It'd
+        // be nice to handle that at some point.
+        AliasAnalysis::AliasResult R = 
+          AA->alias(II->getOperand(3), ~0U, MemPtr, ~0U);
         if (R == AliasAnalysis::MustAlias) {
-          invariantTag = II->getOperand(1);
+          InvariantTag = II->getOperand(1);
           continue;
         }
       
       // If we reach a lifetime begin or end marker, then the query ends here
       // because the value is undefined.
-      } else if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
-                 II->getIntrinsicID() == Intrinsic::lifetime_end) {
-        uint64_t invariantSize = ~0ULL;
-        if (ConstantInt *CI = dyn_cast<ConstantInt>(II->getOperand(1)))
-          invariantSize = CI->getZExtValue();
-
+      } else if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
+        // FIXME: This only considers queries directly on the invariant-tagged
+        // pointer, not on query pointers that are indexed off of them.  It'd
+        // be nice to handle that at some point.
         AliasAnalysis::AliasResult R =
-          AA->alias(II->getOperand(2), invariantSize, MemPtr, MemSize);
+          AA->alias(II->getOperand(2), ~0U, MemPtr, ~0U);
         if (R == AliasAnalysis::MustAlias)
           return MemDepResult::getDef(II);
       }
@@ -215,10 +218,7 @@ getPointerDependencyFrom(Value *MemPtr, uint64_t MemSize, bool isLoad,
 
     // If we're querying on a load and we're in an invariant region, we're done
     // at this point. Nothing a load depends on can live in an invariant region.
-    if (isLoad && invariantTag) continue;
-
-    // Debug intrinsics don't cause dependences.
-    if (isa<DbgInfoIntrinsic>(Inst)) continue;
+    if (isLoad && InvariantTag) continue;
 
     // Values depend on loads if the pointers are must aliased.  This means that
     // a load depends on another must aliased load from the same value.
@@ -243,7 +243,7 @@ getPointerDependencyFrom(Value *MemPtr, uint64_t MemSize, bool isLoad,
     if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
       // There can't be stores to the value we care about inside an 
       // invariant region.
-      if (invariantTag) continue;
+      if (InvariantTag) continue;
       
       // If alias analysis can tell that this store is guaranteed to not modify
       // the query pointer, ignore it.  Use getModRefInfo to handle cases where
@@ -292,7 +292,7 @@ getPointerDependencyFrom(Value *MemPtr, uint64_t MemSize, bool isLoad,
     case AliasAnalysis::Mod:
       // If we're in an invariant region, we can ignore calls that ONLY
       // modify the pointer.
-      if (invariantTag) continue;
+      if (InvariantTag) continue;
       return MemDepResult::getClobber(Inst);
     case AliasAnalysis::Ref:
       // If the call is known to never store to the pointer, and if this is a
@@ -374,21 +374,22 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
       IntrinsicID = II->getIntrinsicID();
 
     switch (IntrinsicID) {
-      case Intrinsic::lifetime_start:
-      case Intrinsic::lifetime_end:
-      case Intrinsic::invariant_start:
-        MemPtr = QueryInst->getOperand(2);
-        MemSize = cast<ConstantInt>(QueryInst->getOperand(1))->getZExtValue();
-        break;
-      case Intrinsic::invariant_end:
-        MemPtr = QueryInst->getOperand(3);
-        MemSize = cast<ConstantInt>(QueryInst->getOperand(2))->getZExtValue();
-        break;
-      default:
-        CallSite QueryCS = CallSite::get(QueryInst);
-        bool isReadOnly = AA->onlyReadsMemory(QueryCS);
-        LocalCache = getCallSiteDependencyFrom(QueryCS, isReadOnly, ScanPos,
-                                               QueryParent);
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::invariant_start:
+      MemPtr = QueryInst->getOperand(2);
+      MemSize = cast<ConstantInt>(QueryInst->getOperand(1))->getZExtValue();
+      break;
+    case Intrinsic::invariant_end:
+      MemPtr = QueryInst->getOperand(3);
+      MemSize = cast<ConstantInt>(QueryInst->getOperand(2))->getZExtValue();
+      break;
+    default:
+      CallSite QueryCS = CallSite::get(QueryInst);
+      bool isReadOnly = AA->onlyReadsMemory(QueryCS);
+      LocalCache = getCallSiteDependencyFrom(QueryCS, isReadOnly, ScanPos,
+                                             QueryParent);
+      break;
     }
   } else {
     // Non-memory instruction.
@@ -421,7 +422,7 @@ static void AssertSorted(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
   if (Count == 0) return;
 
   for (unsigned i = 1; i != unsigned(Count); ++i)
-    assert(Cache[i-1] <= Cache[i] && "Cache isn't sorted!");
+    assert(!(Cache[i] < Cache[i-1]) && "Cache isn't sorted!");
 }
 #endif
 
@@ -462,8 +463,8 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
     // determine what is dirty, seeding our initial DirtyBlocks worklist.
     for (NonLocalDepInfo::iterator I = Cache.begin(), E = Cache.end();
        I != E; ++I)
-      if (I->second.isDirty())
-        DirtyBlocks.push_back(I->first);
+      if (I->getResult().isDirty())
+        DirtyBlocks.push_back(I->getBB());
     
     // Sort the cache so that we can do fast binary search lookups below.
     std::sort(Cache.begin(), Cache.end());
@@ -501,27 +502,27 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
     DEBUG(AssertSorted(Cache, NumSortedEntries));
     NonLocalDepInfo::iterator Entry = 
       std::upper_bound(Cache.begin(), Cache.begin()+NumSortedEntries,
-                       std::make_pair(DirtyBB, MemDepResult()));
-    if (Entry != Cache.begin() && prior(Entry)->first == DirtyBB)
+                       NonLocalDepEntry(DirtyBB));
+    if (Entry != Cache.begin() && prior(Entry)->getBB() == DirtyBB)
       --Entry;
     
-    MemDepResult *ExistingResult = 0;
+    NonLocalDepEntry *ExistingResult = 0;
     if (Entry != Cache.begin()+NumSortedEntries && 
-        Entry->first == DirtyBB) {
+        Entry->getBB() == DirtyBB) {
       // If we already have an entry, and if it isn't already dirty, the block
       // is done.
-      if (!Entry->second.isDirty())
+      if (!Entry->getResult().isDirty())
         continue;
       
       // Otherwise, remember this slot so we can update the value.
-      ExistingResult = &Entry->second;
+      ExistingResult = &*Entry;
     }
     
     // If the dirty entry has a pointer, start scanning from it so we don't have
     // to rescan the entire block.
     BasicBlock::iterator ScanPos = DirtyBB->end();
     if (ExistingResult) {
-      if (Instruction *Inst = ExistingResult->getInst()) {
+      if (Instruction *Inst = ExistingResult->getResult().getInst()) {
         ScanPos = Inst;
         // We're removing QueryInst's use of Inst.
         RemoveFromReverseMap(ReverseNonLocalDeps, Inst,
@@ -545,9 +546,9 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
     // If we had a dirty entry for the block, update it.  Otherwise, just add
     // a new entry.
     if (ExistingResult)
-      *ExistingResult = Dep;
+      ExistingResult->setResult(Dep, 0);
     else
-      Cache.push_back(std::make_pair(DirtyBB, Dep));
+      Cache.push_back(NonLocalDepEntry(DirtyBB, Dep, 0));
     
     // If the block has a dependency (i.e. it isn't completely transparent to
     // the value), remember the association!
@@ -587,17 +588,20 @@ getNonLocalPointerDependency(Value *Pointer, bool isLoad, BasicBlock *FromBB,
   const Type *EltTy = cast<PointerType>(Pointer->getType())->getElementType();
   uint64_t PointeeSize = AA->getTypeStoreSize(EltTy);
   
+  PHITransAddr Address(Pointer, TD);
+  
   // This is the set of blocks we've inspected, and the pointer we consider in
   // each block.  Because of critical edges, we currently bail out if querying
   // a block with multiple different pointers.  This can happen during PHI
   // translation.
   DenseMap<BasicBlock*, Value*> Visited;
-  if (!getNonLocalPointerDepFromBB(Pointer, PointeeSize, isLoad, FromBB,
+  if (!getNonLocalPointerDepFromBB(Address, PointeeSize, isLoad, FromBB,
                                    Result, Visited, true))
     return;
   Result.clear();
-  Result.push_back(std::make_pair(FromBB,
-                                  MemDepResult::getClobber(FromBB->begin())));
+  Result.push_back(NonLocalDepEntry(FromBB,
+                                    MemDepResult::getClobber(FromBB->begin()),
+                                    Pointer));
 }
 
 /// GetNonLocalInfoForBlock - Compute the memdep value for BB with
@@ -613,30 +617,30 @@ GetNonLocalInfoForBlock(Value *Pointer, uint64_t PointeeSize,
   // the cache set.  If so, find it.
   NonLocalDepInfo::iterator Entry =
     std::upper_bound(Cache->begin(), Cache->begin()+NumSortedEntries,
-                     std::make_pair(BB, MemDepResult()));
-  if (Entry != Cache->begin() && prior(Entry)->first == BB)
+                     NonLocalDepEntry(BB));
+  if (Entry != Cache->begin() && (Entry-1)->getBB() == BB)
     --Entry;
   
-  MemDepResult *ExistingResult = 0;
-  if (Entry != Cache->begin()+NumSortedEntries && Entry->first == BB)
-    ExistingResult = &Entry->second;
+  NonLocalDepEntry *ExistingResult = 0;
+  if (Entry != Cache->begin()+NumSortedEntries && Entry->getBB() == BB)
+    ExistingResult = &*Entry;
   
   // If we have a cached entry, and it is non-dirty, use it as the value for
   // this dependency.
-  if (ExistingResult && !ExistingResult->isDirty()) {
+  if (ExistingResult && !ExistingResult->getResult().isDirty()) {
     ++NumCacheNonLocalPtr;
-    return *ExistingResult;
+    return ExistingResult->getResult();
   }    
   
   // Otherwise, we have to scan for the value.  If we have a dirty cache
   // entry, start scanning from its position, otherwise we scan from the end
   // of the block.
   BasicBlock::iterator ScanPos = BB->end();
-  if (ExistingResult && ExistingResult->getInst()) {
-    assert(ExistingResult->getInst()->getParent() == BB &&
+  if (ExistingResult && ExistingResult->getResult().getInst()) {
+    assert(ExistingResult->getResult().getInst()->getParent() == BB &&
            "Instruction invalidated?");
     ++NumCacheDirtyNonLocalPtr;
-    ScanPos = ExistingResult->getInst();
+    ScanPos = ExistingResult->getResult().getInst();
     
     // Eliminating the dirty entry from 'Cache', so update the reverse info.
     ValueIsLoadPair CacheKey(Pointer, isLoad);
@@ -652,9 +656,9 @@ GetNonLocalInfoForBlock(Value *Pointer, uint64_t PointeeSize,
   // If we had a dirty entry for the block, update it.  Otherwise, just add
   // a new entry.
   if (ExistingResult)
-    *ExistingResult = Dep;
+    ExistingResult->setResult(Dep, Pointer);
   else
-    Cache->push_back(std::make_pair(BB, Dep));
+    Cache->push_back(NonLocalDepEntry(BB, Dep, Pointer));
   
   // If the block has a dependency (i.e. it isn't completely transparent to
   // the value), remember the reverse association because we just added it
@@ -683,7 +687,7 @@ SortNonLocalDepInfoCache(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
     break;
   case 2: {
     // Two new entries, insert the last one into place.
-    MemoryDependenceAnalysis::NonLocalDepEntry Val = Cache.back();
+    NonLocalDepEntry Val = Cache.back();
     Cache.pop_back();
     MemoryDependenceAnalysis::NonLocalDepInfo::iterator Entry =
       std::upper_bound(Cache.begin(), Cache.end()-1, Val);
@@ -693,7 +697,7 @@ SortNonLocalDepInfoCache(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
   case 1:
     // One new entry, Just insert the new value at the appropriate position.
     if (Cache.size() != 1) {
-      MemoryDependenceAnalysis::NonLocalDepEntry Val = Cache.back();
+      NonLocalDepEntry Val = Cache.back();
       Cache.pop_back();
       MemoryDependenceAnalysis::NonLocalDepInfo::iterator Entry =
         std::upper_bound(Cache.begin(), Cache.end(), Val);
@@ -707,275 +711,6 @@ SortNonLocalDepInfoCache(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
   }
 }
 
-/// isPHITranslatable - Return true if the specified computation is derived from
-/// a PHI node in the current block and if it is simple enough for us to handle.
-static bool isPHITranslatable(Instruction *Inst) {
-  if (isa<PHINode>(Inst))
-    return true;
-  
-  // We can handle bitcast of a PHI, but the PHI needs to be in the same block
-  // as the bitcast.
-  if (BitCastInst *BC = dyn_cast<BitCastInst>(Inst)) {
-    Instruction *OpI = dyn_cast<Instruction>(BC->getOperand(0));
-    if (OpI == 0 || OpI->getParent() != Inst->getParent())
-      return true;
-    return isPHITranslatable(OpI);
-  }
-  
-  // We can translate a GEP if all of its operands defined in this block are phi
-  // translatable. 
-  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
-    for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) {
-      Instruction *OpI = dyn_cast<Instruction>(GEP->getOperand(i));
-      if (OpI == 0 || OpI->getParent() != Inst->getParent())
-        continue;
-      
-      if (!isPHITranslatable(OpI))
-        return false;
-    }
-    return true;
-  }
-  
-  if (Inst->getOpcode() == Instruction::Add &&
-      isa<ConstantInt>(Inst->getOperand(1))) {
-    Instruction *OpI = dyn_cast<Instruction>(Inst->getOperand(0));
-    if (OpI == 0 || OpI->getParent() != Inst->getParent())
-      return true;
-    return isPHITranslatable(OpI);
-  }
-
-  //   cerr << "MEMDEP: Could not PHI translate: " << *Pointer;
-  //   if (isa<BitCastInst>(PtrInst) || isa<GetElementPtrInst>(PtrInst))
-  //     cerr << "OP:\t\t\t\t" << *PtrInst->getOperand(0);
-  
-  return false;
-}
-
-/// GetPHITranslatedValue - Given a computation that satisfied the
-/// isPHITranslatable predicate, see if we can translate the computation into
-/// the specified predecessor block.  If so, return that value.
-Value *MemoryDependenceAnalysis::
-GetPHITranslatedValue(Value *InVal, BasicBlock *CurBB, BasicBlock *Pred,
-                      const TargetData *TD) const {  
-  // If the input value is not an instruction, or if it is not defined in CurBB,
-  // then we don't need to phi translate it.
-  Instruction *Inst = dyn_cast<Instruction>(InVal);
-  if (Inst == 0 || Inst->getParent() != CurBB)
-    return InVal;
-  
-  if (PHINode *PN = dyn_cast<PHINode>(Inst))
-    return PN->getIncomingValueForBlock(Pred);
-  
-  // Handle bitcast of PHI.
-  if (BitCastInst *BC = dyn_cast<BitCastInst>(Inst)) {
-    // PHI translate the input operand.
-    Value *PHIIn = GetPHITranslatedValue(BC->getOperand(0), CurBB, Pred, TD);
-    if (PHIIn == 0) return 0;
-    
-    // Constants are trivial to phi translate.
-    if (Constant *C = dyn_cast<Constant>(PHIIn))
-      return ConstantExpr::getBitCast(C, BC->getType());
-    
-    // Otherwise we have to see if a bitcasted version of the incoming pointer
-    // is available.  If so, we can use it, otherwise we have to fail.
-    for (Value::use_iterator UI = PHIIn->use_begin(), E = PHIIn->use_end();
-         UI != E; ++UI) {
-      if (BitCastInst *BCI = dyn_cast<BitCastInst>(*UI))
-        if (BCI->getType() == BC->getType())
-          return BCI;
-    }
-    return 0;
-  }
-
-  // Handle getelementptr with at least one PHI translatable operand.
-  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
-    SmallVector<Value*, 8> GEPOps;
-    BasicBlock *CurBB = GEP->getParent();
-    for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) {
-      Value *GEPOp = GEP->getOperand(i);
-      // No PHI translation is needed of operands whose values are live in to
-      // the predecessor block.
-      if (!isa<Instruction>(GEPOp) ||
-          cast<Instruction>(GEPOp)->getParent() != CurBB) {
-        GEPOps.push_back(GEPOp);
-        continue;
-      }
-      
-      // If the operand is a phi node, do phi translation.
-      Value *InOp = GetPHITranslatedValue(GEPOp, CurBB, Pred, TD);
-      if (InOp == 0) return 0;
-      
-      GEPOps.push_back(InOp);
-    }
-    
-    // Simplify the GEP to handle 'gep x, 0' -> x etc.
-    if (Value *V = SimplifyGEPInst(&GEPOps[0], GEPOps.size(), TD))
-      return V;
-
-    // Scan to see if we have this GEP available.
-    Value *APHIOp = GEPOps[0];
-    for (Value::use_iterator UI = APHIOp->use_begin(), E = APHIOp->use_end();
-         UI != E; ++UI) {
-      if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(*UI))
-        if (GEPI->getType() == GEP->getType() &&
-            GEPI->getNumOperands() == GEPOps.size() &&
-            GEPI->getParent()->getParent() == CurBB->getParent()) {
-          bool Mismatch = false;
-          for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
-            if (GEPI->getOperand(i) != GEPOps[i]) {
-              Mismatch = true;
-              break;
-            }
-          if (!Mismatch)
-            return GEPI;
-        }
-    }
-    return 0;
-  }
-  
-  // Handle add with a constant RHS.
-  if (Inst->getOpcode() == Instruction::Add &&
-      isa<ConstantInt>(Inst->getOperand(1))) {
-    // PHI translate the LHS.
-    Value *LHS;
-    Constant *RHS = cast<ConstantInt>(Inst->getOperand(1));
-    Instruction *OpI = dyn_cast<Instruction>(Inst->getOperand(0));
-    bool isNSW = cast<BinaryOperator>(Inst)->hasNoSignedWrap();
-    bool isNUW = cast<BinaryOperator>(Inst)->hasNoUnsignedWrap();
-    
-    if (OpI == 0 || OpI->getParent() != Inst->getParent())
-      LHS = Inst->getOperand(0);
-    else {
-      LHS = GetPHITranslatedValue(Inst->getOperand(0), CurBB, Pred, TD);
-      if (LHS == 0)
-        return 0;
-    }
-    
-    // If the PHI translated LHS is an add of a constant, fold the immediates.
-    if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(LHS))
-      if (BOp->getOpcode() == Instruction::Add)
-        if (ConstantInt *CI = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
-          LHS = BOp->getOperand(0);
-          RHS = ConstantExpr::getAdd(RHS, CI);
-          isNSW = isNUW = false;
-        }
-    
-    // See if the add simplifies away.
-    if (Value *Res = SimplifyAddInst(LHS, RHS, isNSW, isNUW, TD))
-      return Res;
-    
-    // Otherwise, see if we have this add available somewhere.
-    for (Value::use_iterator UI = LHS->use_begin(), E = LHS->use_end();
-         UI != E; ++UI) {
-      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(*UI))
-        if (BO->getOperand(0) == LHS && BO->getOperand(1) == RHS &&
-            BO->getParent()->getParent() == CurBB->getParent())
-          return BO;
-    }
-    
-    return 0;
-  }
-  
-  return 0;
-}
-
-/// GetAvailablePHITranslatePointer - Return the value computed by
-/// PHITranslatePointer if it dominates PredBB, otherwise return null.
-Value *MemoryDependenceAnalysis::
-GetAvailablePHITranslatedValue(Value *V,
-                               BasicBlock *CurBB, BasicBlock *PredBB,
-                               const TargetData *TD,
-                               const DominatorTree &DT) const {
-  // See if PHI translation succeeds.
-  V = GetPHITranslatedValue(V, CurBB, PredBB, TD);
-  if (V == 0) return 0;
-  
-  // Make sure the value is live in the predecessor.
-  if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
-    if (!DT.dominates(Inst->getParent(), PredBB))
-      return 0;
-  return V;
-}
-
-
-/// InsertPHITranslatedPointer - Insert a computation of the PHI translated
-/// version of 'V' for the edge PredBB->CurBB into the end of the PredBB
-/// block.  All newly created instructions are added to the NewInsts list.
-///
-Value *MemoryDependenceAnalysis::
-InsertPHITranslatedPointer(Value *InVal, BasicBlock *CurBB,
-                           BasicBlock *PredBB, const TargetData *TD,
-                           const DominatorTree &DT,
-                           SmallVectorImpl<Instruction*> &NewInsts) const {
-  // See if we have a version of this value already available and dominating
-  // PredBB.  If so, there is no need to insert a new copy.
-  if (Value *Res = GetAvailablePHITranslatedValue(InVal, CurBB, PredBB, TD, DT))
-    return Res;
-  
-  // If we don't have an available version of this value, it must be an
-  // instruction.
-  Instruction *Inst = cast<Instruction>(InVal);
-  
-  // Handle bitcast of PHI translatable value.
-  if (BitCastInst *BC = dyn_cast<BitCastInst>(Inst)) {
-    Value *OpVal = InsertPHITranslatedPointer(BC->getOperand(0),
-                                              CurBB, PredBB, TD, DT, NewInsts);
-    if (OpVal == 0) return 0;
-      
-    // Otherwise insert a bitcast at the end of PredBB.
-    BitCastInst *New = new BitCastInst(OpVal, InVal->getType(),
-                                       InVal->getName()+".phi.trans.insert",
-                                       PredBB->getTerminator());
-    NewInsts.push_back(New);
-    return New;
-  }
-  
-  // Handle getelementptr with at least one PHI operand.
-  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
-    SmallVector<Value*, 8> GEPOps;
-    BasicBlock *CurBB = GEP->getParent();
-    for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) {
-      Value *OpVal = InsertPHITranslatedPointer(GEP->getOperand(i),
-                                                CurBB, PredBB, TD, DT, NewInsts);
-      if (OpVal == 0) return 0;
-      GEPOps.push_back(OpVal);
-    }
-    
-    GetElementPtrInst *Result = 
-      GetElementPtrInst::Create(GEPOps[0], GEPOps.begin()+1, GEPOps.end(),
-                                InVal->getName()+".phi.trans.insert",
-                                PredBB->getTerminator());
-    Result->setIsInBounds(GEP->isInBounds());
-    NewInsts.push_back(Result);
-    return Result;
-  }
-  
-#if 0
-  // FIXME: This code works, but it is unclear that we actually want to insert
-  // a big chain of computation in order to make a value available in a block.
-  // This needs to be evaluated carefully to consider its cost trade offs.
-  
-  // Handle add with a constant RHS.
-  if (Inst->getOpcode() == Instruction::Add &&
-      isa<ConstantInt>(Inst->getOperand(1))) {
-    // PHI translate the LHS.
-    Value *OpVal = InsertPHITranslatedPointer(Inst->getOperand(0),
-                                              CurBB, PredBB, TD, DT, NewInsts);
-    if (OpVal == 0) return 0;
-    
-    BinaryOperator *Res = BinaryOperator::CreateAdd(OpVal, Inst->getOperand(1),
-                                           InVal->getName()+".phi.trans.insert",
-                                                    PredBB->getTerminator());
-    Res->setHasNoSignedWrap(cast<BinaryOperator>(Inst)->hasNoSignedWrap());
-    Res->setHasNoUnsignedWrap(cast<BinaryOperator>(Inst)->hasNoUnsignedWrap());
-    NewInsts.push_back(Res);
-    return Res;
-  }
-#endif
-  
-  return 0;
-}
-
 /// getNonLocalPointerDepFromBB - Perform a dependency query based on
 /// pointer/pointeesize starting at the end of StartBB.  Add any clobber/def
 /// results to the results vector and keep track of which blocks are visited in
@@ -989,14 +724,14 @@ InsertPHITranslatedPointer(Value *InVal, BasicBlock *CurBB,
 /// not compute dependence information for some reason.  This should be treated
 /// as a clobber dependence on the first instruction in the predecessor block.
 bool MemoryDependenceAnalysis::
-getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
+getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, uint64_t PointeeSize,
                             bool isLoad, BasicBlock *StartBB,
                             SmallVectorImpl<NonLocalDepEntry> &Result,
                             DenseMap<BasicBlock*, Value*> &Visited,
                             bool SkipFirstBlock) {
   
   // Look up the cached info for Pointer.
-  ValueIsLoadPair CacheKey(Pointer, isLoad);
+  ValueIsLoadPair CacheKey(Pointer.getAddr(), isLoad);
   
   std::pair<BBSkipFirstBlockPair, NonLocalDepInfo> *CacheInfo =
     &NonLocalPointerDeps[CacheKey];
@@ -1013,8 +748,9 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
     if (!Visited.empty()) {
       for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end();
            I != E; ++I) {
-        DenseMap<BasicBlock*, Value*>::iterator VI = Visited.find(I->first);
-        if (VI == Visited.end() || VI->second == Pointer) continue;
+        DenseMap<BasicBlock*, Value*>::iterator VI = Visited.find(I->getBB());
+        if (VI == Visited.end() || VI->second == Pointer.getAddr())
+          continue;
         
         // We have a pointer mismatch in a block.  Just return clobber, saying
         // that something was clobbered in this result.  We could also do a
@@ -1025,8 +761,8 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
     
     for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end();
          I != E; ++I) {
-      Visited.insert(std::make_pair(I->first, Pointer));
-      if (!I->second.isNonLocal())
+      Visited.insert(std::make_pair(I->getBB(), Pointer.getAddr()));
+      if (!I->getResult().isNonLocal())
         Result.push_back(*I);
     }
     ++NumCacheCompleteNonLocalPtr;
@@ -1065,30 +801,27 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
       // Get the dependency info for Pointer in BB.  If we have cached
       // information, we will use it, otherwise we compute it.
       DEBUG(AssertSorted(*Cache, NumSortedEntries));
-      MemDepResult Dep = GetNonLocalInfoForBlock(Pointer, PointeeSize, isLoad,
-                                                 BB, Cache, NumSortedEntries);
+      MemDepResult Dep = GetNonLocalInfoForBlock(Pointer.getAddr(), PointeeSize,
+                                                 isLoad, BB, Cache,
+                                                 NumSortedEntries);
       
       // If we got a Def or Clobber, add this to the list of results.
       if (!Dep.isNonLocal()) {
-        Result.push_back(NonLocalDepEntry(BB, Dep));
+        Result.push_back(NonLocalDepEntry(BB, Dep, Pointer.getAddr()));
         continue;
       }
     }
     
     // If 'Pointer' is an instruction defined in this block, then we need to do
     // phi translation to change it into a value live in the predecessor block.
-    // If phi translation fails, then we can't continue dependence analysis.
-    Instruction *PtrInst = dyn_cast<Instruction>(Pointer);
-    bool NeedsPHITranslation = PtrInst && PtrInst->getParent() == BB;
-    
-    // If no PHI translation is needed, just add all the predecessors of this
-    // block to scan them as well.
-    if (!NeedsPHITranslation) {
+    // If not, we just add the predecessors to the worklist and scan them with
+    // the same Pointer.
+    if (!Pointer.NeedsPHITranslationFromBlock(BB)) {
       SkipFirstBlock = false;
       for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
         // Verify that we haven't looked at this block yet.
         std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
-          InsertRes = Visited.insert(std::make_pair(*PI, Pointer));
+          InsertRes = Visited.insert(std::make_pair(*PI, Pointer.getAddr()));
         if (InsertRes.second) {
           // First time we've looked at *PI.
           Worklist.push_back(*PI);
@@ -1098,16 +831,17 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
         // If we have seen this block before, but it was with a different
         // pointer then we have a phi translation failure and we have to treat
         // this as a clobber.
-        if (InsertRes.first->second != Pointer)
+        if (InsertRes.first->second != Pointer.getAddr())
           goto PredTranslationFailure;
       }
       continue;
     }
     
-    // If we do need to do phi translation, then there are a bunch of different
-    // cases, because we have to find a Value* live in the predecessor block. We
-    // know that PtrInst is defined in this block at least.
-
+    // We do need to do phi translation, if we know ahead of time we can't phi
+    // translate this value, don't even try.
+    if (!Pointer.IsPotentiallyPHITranslatable())
+      goto PredTranslationFailure;
+    
     // We may have added values to the cache list before this PHI translation.
     // If so, we haven't done anything to ensure that the cache remains sorted.
     // Sort it now (if needed) so that recursive invocations of
@@ -1117,19 +851,17 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
       SortNonLocalDepInfoCache(*Cache, NumSortedEntries);
       NumSortedEntries = Cache->size();
     }
-    
-    // If this is a computation derived from a PHI node, use the suitably
-    // translated incoming values for each pred as the phi translated version.
-    if (!isPHITranslatable(PtrInst))
-      goto PredTranslationFailure;
-
     Cache = 0;
-      
+    
     for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
       BasicBlock *Pred = *PI;
-      // Get the PHI translated pointer in this predecessor.  This can fail and
-      // return null if not translatable.
-      Value *PredPtr = GetPHITranslatedValue(PtrInst, BB, Pred, TD);
+      
+      // Get the PHI translated pointer in this predecessor.  This can fail if
+      // not translatable, in which case the getAddr() returns null.
+      PHITransAddr PredPointer(Pointer);
+      PredPointer.PHITranslateValue(BB, Pred);
+
+      Value *PredPtrVal = PredPointer.getAddr();
       
       // Check to see if we have already visited this pred block with another
       // pointer.  If so, we can't do this lookup.  This failure can occur
@@ -1137,12 +869,12 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
       // the successor translates to a pointer value different than the
       // pointer the block was first analyzed with.
       std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
-        InsertRes = Visited.insert(std::make_pair(Pred, PredPtr));
+        InsertRes = Visited.insert(std::make_pair(Pred, PredPtrVal));
 
       if (!InsertRes.second) {
         // If the predecessor was visited with PredPtr, then we already did
         // the analysis and can ignore it.
-        if (InsertRes.first->second == PredPtr)
+        if (InsertRes.first->second == PredPtrVal)
           continue;
         
         // Otherwise, the block was previously analyzed with a different
@@ -1155,10 +887,11 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
       // predecessor, then we have to assume that the pointer is clobbered in
       // that predecessor.  We can still do PRE of the load, which would insert
       // a computation of the pointer in this predecessor.
-      if (PredPtr == 0) {
+      if (PredPtrVal == 0) {
         // Add the entry to the Result list.
         NonLocalDepEntry Entry(Pred,
-                               MemDepResult::getClobber(Pred->getTerminator()));
+                               MemDepResult::getClobber(Pred->getTerminator()),
+                               PredPtrVal);
         Result.push_back(Entry);
 
         // Add it to the cache for this CacheKey so that subsequent queries get
@@ -1167,27 +900,27 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
         MemoryDependenceAnalysis::NonLocalDepInfo::iterator It =
           std::upper_bound(Cache->begin(), Cache->end(), Entry);
         
-        if (It != Cache->begin() && prior(It)->first == Pred)
+        if (It != Cache->begin() && (It-1)->getBB() == Pred)
           --It;
 
-        if (It == Cache->end() || It->first != Pred) {
+        if (It == Cache->end() || It->getBB() != Pred) {
           Cache->insert(It, Entry);
           // Add it to the reverse map.
           ReverseNonLocalPtrDeps[Pred->getTerminator()].insert(CacheKey);
-        } else if (!It->second.isDirty()) {
+        } else if (!It->getResult().isDirty()) {
           // noop
-        } else if (It->second.getInst() == Pred->getTerminator()) {
+        } else if (It->getResult().getInst() == Pred->getTerminator()) {
           // Same instruction, clear the dirty marker.
-          It->second = Entry.second;
-        } else if (It->second.getInst() == 0) {
+          It->setResult(Entry.getResult(), PredPtrVal);
+        } else if (It->getResult().getInst() == 0) {
           // Dirty, with no instruction, just add this.
-          It->second = Entry.second;
+          It->setResult(Entry.getResult(), PredPtrVal);
           ReverseNonLocalPtrDeps[Pred->getTerminator()].insert(CacheKey);
         } else {
           // Otherwise, dirty with a different instruction.
-          RemoveFromReverseMap(ReverseNonLocalPtrDeps, It->second.getInst(),
-                               CacheKey);
-          It->second = Entry.second;
+          RemoveFromReverseMap(ReverseNonLocalPtrDeps,
+                               It->getResult().getInst(), CacheKey);
+          It->setResult(Entry.getResult(),PredPtrVal);
           ReverseNonLocalPtrDeps[Pred->getTerminator()].insert(CacheKey);
         }
         Cache = 0;
@@ -1201,7 +934,7 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
       
       // If we have a problem phi translating, fall through to the code below
       // to handle the failure condition.
-      if (getNonLocalPointerDepFromBB(PredPtr, PointeeSize, isLoad, Pred,
+      if (getNonLocalPointerDepFromBB(PredPointer, PointeeSize, isLoad, Pred,
                                       Result, Visited))
         goto PredTranslationFailure;
     }
@@ -1245,12 +978,12 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
     
     for (NonLocalDepInfo::reverse_iterator I = Cache->rbegin(); ; ++I) {
       assert(I != Cache->rend() && "Didn't find current block??");
-      if (I->first != BB)
+      if (I->getBB() != BB)
         continue;
       
-      assert(I->second.isNonLocal() &&
+      assert(I->getResult().isNonLocal() &&
              "Should only be here with transparent block");
-      I->second = MemDepResult::getClobber(BB->begin());
+      I->setResult(MemDepResult::getClobber(BB->begin()), Pointer.getAddr());
       ReverseNonLocalPtrDeps[BB->begin()].insert(CacheKey);
       Result.push_back(*I);
       break;
@@ -1276,9 +1009,9 @@ RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair P) {
   NonLocalDepInfo &PInfo = It->second.second;
   
   for (unsigned i = 0, e = PInfo.size(); i != e; ++i) {
-    Instruction *Target = PInfo[i].second.getInst();
+    Instruction *Target = PInfo[i].getResult().getInst();
     if (Target == 0) continue;  // Ignore non-local dep results.
-    assert(Target->getParent() == PInfo[i].first);
+    assert(Target->getParent() == PInfo[i].getBB());
     
     // Eliminating the dirty entry from 'Cache', so update the reverse info.
     RemoveFromReverseMap(ReverseNonLocalPtrDeps, Target, P);
@@ -1315,7 +1048,7 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
     NonLocalDepInfo &BlockMap = NLDI->second.first;
     for (NonLocalDepInfo::iterator DI = BlockMap.begin(), DE = BlockMap.end();
          DI != DE; ++DI)
-      if (Instruction *Inst = DI->second.getInst())
+      if (Instruction *Inst = DI->getResult().getInst())
         RemoveFromReverseMap(ReverseNonLocalDeps, Inst, RemInst);
     NonLocalDeps.erase(NLDI);
   }
@@ -1403,10 +1136,10 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
       
       for (NonLocalDepInfo::iterator DI = INLD.first.begin(), 
            DE = INLD.first.end(); DI != DE; ++DI) {
-        if (DI->second.getInst() != RemInst) continue;
+        if (DI->getResult().getInst() != RemInst) continue;
         
         // Convert to a dirty entry for the subsequent instruction.
-        DI->second = NewDirtyVal;
+        DI->setResult(NewDirtyVal, DI->getAddress());
         
         if (Instruction *NextI = NewDirtyVal.getInst())
           ReverseDepsToAdd.push_back(std::make_pair(NextI, *I));
@@ -1445,10 +1178,10 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
       // Update any entries for RemInst to use the instruction after it.
       for (NonLocalDepInfo::iterator DI = NLPDI.begin(), DE = NLPDI.end();
            DI != DE; ++DI) {
-        if (DI->second.getInst() != RemInst) continue;
+        if (DI->getResult().getInst() != RemInst) continue;
         
         // Convert to a dirty entry for the subsequent instruction.
-        DI->second = NewDirtyVal;
+        DI->setResult(NewDirtyVal, DI->getAddress());
         
         if (Instruction *NewDirtyInst = NewDirtyVal.getInst())
           ReversePtrDepsToAdd.push_back(std::make_pair(NewDirtyInst, P));
@@ -1489,7 +1222,7 @@ void MemoryDependenceAnalysis::verifyRemoved(Instruction *D) const {
     const NonLocalDepInfo &Val = I->second.second;
     for (NonLocalDepInfo::const_iterator II = Val.begin(), E = Val.end();
          II != E; ++II)
-      assert(II->second.getInst() != D && "Inst occurs as NLPD value");
+      assert(II->getResult().getInst() != D && "Inst occurs as NLPD value");
   }
   
   for (NonLocalDepMapType::const_iterator I = NonLocalDeps.begin(),
@@ -1498,7 +1231,7 @@ void MemoryDependenceAnalysis::verifyRemoved(Instruction *D) const {
     const PerInstNLInfo &INLD = I->second;
     for (NonLocalDepInfo::const_iterator II = INLD.first.begin(),
          EE = INLD.first.end(); II  != EE; ++II)
-      assert(II->second.getInst() != D && "Inst occurs in data structures");
+      assert(II->getResult().getInst() != D && "Inst occurs in data structures");
   }
   
   for (ReverseDepMapType::const_iterator I = ReverseLocalDeps.begin(),
diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp
new file mode 100644
index 0000000..07e2919
--- /dev/null
+++ b/lib/Analysis/PHITransAddr.cpp
@@ -0,0 +1,432 @@
+//===- PHITransAddr.cpp - PHI Translation for Addresses -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PHITransAddr class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static bool CanPHITrans(Instruction *Inst) {
+  if (isa<PHINode>(Inst) ||
+      isa<BitCastInst>(Inst) ||
+      isa<GetElementPtrInst>(Inst))
+    return true;
+  
+  if (Inst->getOpcode() == Instruction::Add &&
+      isa<ConstantInt>(Inst->getOperand(1)))
+    return true;
+  
+  //   cerr << "MEMDEP: Could not PHI translate: " << *Pointer;
+  //   if (isa<BitCastInst>(PtrInst) || isa<GetElementPtrInst>(PtrInst))
+  //     cerr << "OP:\t\t\t\t" << *PtrInst->getOperand(0);
+  return false;
+}
+
+void PHITransAddr::dump() const {
+  if (Addr == 0) {
+    errs() << "PHITransAddr: null\n";
+    return;
+  }
+  errs() << "PHITransAddr: " << *Addr << "\n";
+  for (unsigned i = 0, e = InstInputs.size(); i != e; ++i)
+    errs() << "  Input #" << i << " is " << *InstInputs[i] << "\n";
+}
+
+
+static bool VerifySubExpr(Value *Expr,
+                          SmallVectorImpl<Instruction*> &InstInputs) {
+  // If this is a non-instruction value, there is nothing to do.
+  Instruction *I = dyn_cast<Instruction>(Expr);
+  if (I == 0) return true;
+  
+  // If it's an instruction, it is either in Tmp or its operands recursively
+  // are.
+  SmallVectorImpl<Instruction*>::iterator Entry =
+    std::find(InstInputs.begin(), InstInputs.end(), I);
+  if (Entry != InstInputs.end()) {
+    InstInputs.erase(Entry);
+    return true;
+  }
+  
+  // If it isn't in the InstInputs list it is a subexpr incorporated into the
+  // address.  Sanity check that it is phi translatable.
+  if (!CanPHITrans(I)) {
+    errs() << "Non phi translatable instruction found in PHITransAddr, either "
+              "something is missing from InstInputs or CanPHITrans is wrong:\n";
+    errs() << *I << '\n';
+    return false;
+  }
+  
+  // Validate the operands of the instruction.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (!VerifySubExpr(I->getOperand(i), InstInputs))
+      return false;
+
+  return true;
+}
+
+/// Verify - Check internal consistency of this data structure.  If the
+/// structure is valid, it returns true.  If invalid, it prints errors and
+/// returns false.
+bool PHITransAddr::Verify() const {
+  if (Addr == 0) return true;
+  
+  SmallVector<Instruction*, 8> Tmp(InstInputs.begin(), InstInputs.end());  
+  
+  if (!VerifySubExpr(Addr, Tmp))
+    return false;
+  
+  if (!Tmp.empty()) {
+    errs() << "PHITransAddr inconsistent, contains extra instructions:\n";
+    for (unsigned i = 0, e = InstInputs.size(); i != e; ++i)
+      errs() << "  InstInput #" << i << " is " << *InstInputs[i] << "\n";
+    return false;
+  }
+  
+  // a-ok.
+  return true;
+}
+
+
+/// IsPotentiallyPHITranslatable - If this needs PHI translation, return true
+/// if we have some hope of doing it.  This should be used as a filter to
+/// avoid calling PHITranslateValue in hopeless situations.
+bool PHITransAddr::IsPotentiallyPHITranslatable() const {
+  // If the input value is not an instruction, or if it is not defined in CurBB,
+  // then we don't need to phi translate it.
+  Instruction *Inst = dyn_cast<Instruction>(Addr);
+  return Inst == 0 || CanPHITrans(Inst);
+}
+
+
+static void RemoveInstInputs(Value *V, 
+                             SmallVectorImpl<Instruction*> &InstInputs) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0) return;
+  
+  // If the instruction is in the InstInputs list, remove it.
+  SmallVectorImpl<Instruction*>::iterator Entry =
+    std::find(InstInputs.begin(), InstInputs.end(), I);
+  if (Entry != InstInputs.end()) {
+    InstInputs.erase(Entry);
+    return;
+  }
+  
+  assert(!isa<PHINode>(I) && "Error, removing something that isn't an input");
+  
+  // Otherwise, it must have instruction inputs itself.  Zap them recursively.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+    if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(i)))
+      RemoveInstInputs(Op, InstInputs);
+  }
+}
+
+Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
+                                         BasicBlock *PredBB) {
+  // If this is a non-instruction value, it can't require PHI translation.
+  Instruction *Inst = dyn_cast<Instruction>(V);
+  if (Inst == 0) return V;
+  
+  // Determine whether 'Inst' is an input to our PHI translatable expression.
+  bool isInput = std::count(InstInputs.begin(), InstInputs.end(), Inst);
+
+  // Handle inputs instructions if needed.
+  if (isInput) {
+    if (Inst->getParent() != CurBB) {
+      // If it is an input defined in a different block, then it remains an
+      // input.
+      return Inst;
+    }
+
+    // If 'Inst' is defined in this block and is an input that needs to be phi
+    // translated, we need to incorporate the value into the expression or fail.
+
+    // In either case, the instruction itself isn't an input any longer.
+    InstInputs.erase(std::find(InstInputs.begin(), InstInputs.end(), Inst));
+    
+    // If this is a PHI, go ahead and translate it.
+    if (PHINode *PN = dyn_cast<PHINode>(Inst))
+      return AddAsInput(PN->getIncomingValueForBlock(PredBB));
+    
+    // If this is a non-phi value, and it is analyzable, we can incorporate it
+    // into the expression by making all instruction operands be inputs.
+    if (!CanPHITrans(Inst))
+      return 0;
+   
+    // All instruction operands are now inputs (and of course, they may also be
+    // defined in this block, so they may need to be phi translated themselves.
+    for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i)
+      if (Instruction *Op = dyn_cast<Instruction>(Inst->getOperand(i)))
+        InstInputs.push_back(Op);
+  }
+
+  // Ok, it must be an intermediate result (either because it started that way
+  // or because we just incorporated it into the expression).  See if its
+  // operands need to be phi translated, and if so, reconstruct it.
+  
+  if (BitCastInst *BC = dyn_cast<BitCastInst>(Inst)) {
+    Value *PHIIn = PHITranslateSubExpr(BC->getOperand(0), CurBB, PredBB);
+    if (PHIIn == 0) return 0;
+    if (PHIIn == BC->getOperand(0))
+      return BC;
+    
+    // Find an available version of this cast.
+    
+    // Constants are trivial to find.
+    if (Constant *C = dyn_cast<Constant>(PHIIn))
+      return AddAsInput(ConstantExpr::getBitCast(C, BC->getType()));
+    
+    // Otherwise we have to see if a bitcasted version of the incoming pointer
+    // is available.  If so, we can use it, otherwise we have to fail.
+    for (Value::use_iterator UI = PHIIn->use_begin(), E = PHIIn->use_end();
+         UI != E; ++UI) {
+      if (BitCastInst *BCI = dyn_cast<BitCastInst>(*UI))
+        if (BCI->getType() == BC->getType())
+          return BCI;
+    }
+    return 0;
+  }
+  
+  // Handle getelementptr with at least one PHI translatable operand.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+    SmallVector<Value*, 8> GEPOps;
+    bool AnyChanged = false;
+    for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) {
+      Value *GEPOp = PHITranslateSubExpr(GEP->getOperand(i), CurBB, PredBB);
+      if (GEPOp == 0) return 0;
+      
+      AnyChanged |= GEPOp != GEP->getOperand(i);
+      GEPOps.push_back(GEPOp);
+    }
+    
+    if (!AnyChanged)
+      return GEP;
+    
+    // Simplify the GEP to handle 'gep x, 0' -> x etc.
+    if (Value *V = SimplifyGEPInst(&GEPOps[0], GEPOps.size(), TD)) {
+      for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
+        RemoveInstInputs(GEPOps[i], InstInputs);
+      
+      return AddAsInput(V);
+    }
+    
+    // Scan to see if we have this GEP available.
+    Value *APHIOp = GEPOps[0];
+    for (Value::use_iterator UI = APHIOp->use_begin(), E = APHIOp->use_end();
+         UI != E; ++UI) {
+      if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(*UI))
+        if (GEPI->getType() == GEP->getType() &&
+            GEPI->getNumOperands() == GEPOps.size() &&
+            GEPI->getParent()->getParent() == CurBB->getParent()) {
+          bool Mismatch = false;
+          for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
+            if (GEPI->getOperand(i) != GEPOps[i]) {
+              Mismatch = true;
+              break;
+            }
+          if (!Mismatch)
+            return GEPI;
+        }
+    }
+    return 0;
+  }
+  
+  // Handle add with a constant RHS.
+  if (Inst->getOpcode() == Instruction::Add &&
+      isa<ConstantInt>(Inst->getOperand(1))) {
+    // PHI translate the LHS.
+    Constant *RHS = cast<ConstantInt>(Inst->getOperand(1));
+    bool isNSW = cast<BinaryOperator>(Inst)->hasNoSignedWrap();
+    bool isNUW = cast<BinaryOperator>(Inst)->hasNoUnsignedWrap();
+    
+    Value *LHS = PHITranslateSubExpr(Inst->getOperand(0), CurBB, PredBB);
+    if (LHS == 0) return 0;
+    
+    // If the PHI translated LHS is an add of a constant, fold the immediates.
+    if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(LHS))
+      if (BOp->getOpcode() == Instruction::Add)
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
+          LHS = BOp->getOperand(0);
+          RHS = ConstantExpr::getAdd(RHS, CI);
+          isNSW = isNUW = false;
+          
+          // If the old 'LHS' was an input, add the new 'LHS' as an input.
+          if (std::count(InstInputs.begin(), InstInputs.end(), BOp)) {
+            RemoveInstInputs(BOp, InstInputs);
+            AddAsInput(LHS);
+          }
+        }
+    
+    // See if the add simplifies away.
+    if (Value *Res = SimplifyAddInst(LHS, RHS, isNSW, isNUW, TD)) {
+      // If we simplified the operands, the LHS is no longer an input, but Res
+      // is.
+      RemoveInstInputs(LHS, InstInputs);
+      return AddAsInput(Res);
+    }
+
+    // If we didn't modify the add, just return it.
+    if (LHS == Inst->getOperand(0) && RHS == Inst->getOperand(1))
+      return Inst;
+    
+    // Otherwise, see if we have this add available somewhere.
+    for (Value::use_iterator UI = LHS->use_begin(), E = LHS->use_end();
+         UI != E; ++UI) {
+      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(*UI))
+        if (BO->getOpcode() == Instruction::Add &&
+            BO->getOperand(0) == LHS && BO->getOperand(1) == RHS &&
+            BO->getParent()->getParent() == CurBB->getParent())
+          return BO;
+    }
+    
+    return 0;
+  }
+  
+  // Otherwise, we failed.
+  return 0;
+}
+
+
+/// PHITranslateValue - PHI translate the current address up the CFG from
+/// CurBB to Pred, updating our state the reflect any needed changes.  This
+/// returns true on failure and sets Addr to null.
+bool PHITransAddr::PHITranslateValue(BasicBlock *CurBB, BasicBlock *PredBB) {
+  assert(Verify() && "Invalid PHITransAddr!");
+  Addr = PHITranslateSubExpr(Addr, CurBB, PredBB);
+  assert(Verify() && "Invalid PHITransAddr!");
+  return Addr == 0;
+}
+
+/// GetAvailablePHITranslatedSubExpr - Return the value computed by
+/// PHITranslateSubExpr if it dominates PredBB, otherwise return null.
+Value *PHITransAddr::
+GetAvailablePHITranslatedSubExpr(Value *V, BasicBlock *CurBB,BasicBlock *PredBB,
+                                 const DominatorTree &DT) const {
+  PHITransAddr Tmp(V, TD);
+  Tmp.PHITranslateValue(CurBB, PredBB);
+  
+  // See if PHI translation succeeds.
+  V = Tmp.getAddr();
+  
+  // Make sure the value is live in the predecessor.
+  if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
+    if (!DT.dominates(Inst->getParent(), PredBB))
+      return 0;
+  return V;
+}
+
+
+/// PHITranslateWithInsertion - PHI translate this value into the specified
+/// predecessor block, inserting a computation of the value if it is
+/// unavailable.
+///
+/// All newly created instructions are added to the NewInsts list.  This
+/// returns null on failure.
+///
+Value *PHITransAddr::
+PHITranslateWithInsertion(BasicBlock *CurBB, BasicBlock *PredBB,
+                          const DominatorTree &DT,
+                          SmallVectorImpl<Instruction*> &NewInsts) {
+  unsigned NISize = NewInsts.size();
+  
+  // Attempt to PHI translate with insertion.
+  Addr = InsertPHITranslatedSubExpr(Addr, CurBB, PredBB, DT, NewInsts);
+  
+  // If successful, return the new value.
+  if (Addr) return Addr;
+  
+  // If not, destroy any intermediate instructions inserted.
+  while (NewInsts.size() != NISize)
+    NewInsts.pop_back_val()->eraseFromParent();
+  return 0;
+}
+
+
+/// InsertPHITranslatedPointer - Insert a computation of the PHI translated
+/// version of 'V' for the edge PredBB->CurBB into the end of the PredBB
+/// block.  All newly created instructions are added to the NewInsts list.
+/// This returns null on failure.
+///
+Value *PHITransAddr::
+InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB,
+                           BasicBlock *PredBB, const DominatorTree &DT,
+                           SmallVectorImpl<Instruction*> &NewInsts) {
+  // See if we have a version of this value already available and dominating
+  // PredBB.  If so, there is no need to insert a new instance of it.
+  if (Value *Res = GetAvailablePHITranslatedSubExpr(InVal, CurBB, PredBB, DT))
+    return Res;
+
+  // If we don't have an available version of this value, it must be an
+  // instruction.
+  Instruction *Inst = cast<Instruction>(InVal);
+  
+  // Handle bitcast of PHI translatable value.
+  if (BitCastInst *BC = dyn_cast<BitCastInst>(Inst)) {
+    Value *OpVal = InsertPHITranslatedSubExpr(BC->getOperand(0),
+                                              CurBB, PredBB, DT, NewInsts);
+    if (OpVal == 0) return 0;
+    
+    // Otherwise insert a bitcast at the end of PredBB.
+    BitCastInst *New = new BitCastInst(OpVal, InVal->getType(),
+                                       InVal->getName()+".phi.trans.insert",
+                                       PredBB->getTerminator());
+    NewInsts.push_back(New);
+    return New;
+  }
+  
+  // Handle getelementptr with at least one PHI operand.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+    SmallVector<Value*, 8> GEPOps;
+    BasicBlock *CurBB = GEP->getParent();
+    for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) {
+      Value *OpVal = InsertPHITranslatedSubExpr(GEP->getOperand(i),
+                                                CurBB, PredBB, DT, NewInsts);
+      if (OpVal == 0) return 0;
+      GEPOps.push_back(OpVal);
+    }
+    
+    GetElementPtrInst *Result = 
+    GetElementPtrInst::Create(GEPOps[0], GEPOps.begin()+1, GEPOps.end(),
+                              InVal->getName()+".phi.trans.insert",
+                              PredBB->getTerminator());
+    Result->setIsInBounds(GEP->isInBounds());
+    NewInsts.push_back(Result);
+    return Result;
+  }
+  
+#if 0
+  // FIXME: This code works, but it is unclear that we actually want to insert
+  // a big chain of computation in order to make a value available in a block.
+  // This needs to be evaluated carefully to consider its cost trade offs.
+  
+  // Handle add with a constant RHS.
+  if (Inst->getOpcode() == Instruction::Add &&
+      isa<ConstantInt>(Inst->getOperand(1))) {
+    // PHI translate the LHS.
+    Value *OpVal = InsertPHITranslatedSubExpr(Inst->getOperand(0),
+                                              CurBB, PredBB, DT, NewInsts);
+    if (OpVal == 0) return 0;
+    
+    BinaryOperator *Res = BinaryOperator::CreateAdd(OpVal, Inst->getOperand(1),
+                                           InVal->getName()+".phi.trans.insert",
+                                                    PredBB->getTerminator());
+    Res->setHasNoSignedWrap(cast<BinaryOperator>(Inst)->hasNoSignedWrap());
+    Res->setHasNoUnsignedWrap(cast<BinaryOperator>(Inst)->hasNoUnsignedWrap());
+    NewInsts.push_back(Res);
+    return Res;
+  }
+#endif
+  
+  return 0;
+}
diff --git a/lib/Analysis/ProfileEstimatorPass.cpp b/lib/Analysis/ProfileEstimatorPass.cpp
index e767891..8148429 100644
--- a/lib/Analysis/ProfileEstimatorPass.cpp
+++ b/lib/Analysis/ProfileEstimatorPass.cpp
@@ -35,6 +35,7 @@ namespace {
     LoopInfo *LI;
     std::set<BasicBlock*>  BBToVisit;
     std::map<Loop*,double> LoopExitWeights;
+    std::map<Edge,double>  MinimalWeight;
   public:
     static char ID; // Class identification, replacement for typeinfo
     explicit ProfileEstimatorPass(const double execcount = 0)
@@ -91,7 +92,7 @@ static void inline printEdgeError(ProfileInfo::Edge e, const char *M) {
 
 void inline ProfileEstimatorPass::printEdgeWeight(Edge E) {
   DEBUG(errs() << "-- Weight of Edge " << E << ":"
-               << format("%g", getEdgeWeight(E)) << "\n");
+               << format("%20.20g", getEdgeWeight(E)) << "\n");
 }
 
 // recurseBasicBlock() - This calculates the ProfileInfo estimation for a
@@ -174,6 +175,12 @@ void ProfileEstimatorPass::recurseBasicBlock(BasicBlock *BB) {
         double w = getEdgeWeight(*ei);
         if (w == MissingValue) {
           Edges.push_back(*ei);
+          // Check if there is a necessary minimal weight, if yes, subtract it 
+          // from weight.
+          if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
+            incoming -= MinimalWeight[*ei];
+            DEBUG(errs() << "Reserving " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
+          }
         } else {
           incoming -= w;
         }
@@ -191,11 +198,43 @@ void ProfileEstimatorPass::recurseBasicBlock(BasicBlock *BB) {
         printEdgeWeight(edge);
       }
     }
-    // Distribute remaining weight onto the exit edges.
+
+    // Distribute remaining weight to the exting edges. To prevent fractions
+    // from building up and provoking precision problems the weight which is to
+    // be distributed is split and the rounded, the last edge gets a somewhat
+    // bigger value, but we are close enough for an estimation.
+    double fraction = floor(incoming/Edges.size());
     for (SmallVector<Edge, 8>::iterator ei = Edges.begin(), ee = Edges.end();
          ei != ee; ++ei) {
-      EdgeInformation[BB->getParent()][*ei] += incoming/Edges.size();
+      double w = 0;
+      if (ei != (ee-1)) {
+        w = fraction;
+        incoming -= fraction;
+      } else {
+        w = incoming;
+      }
+      EdgeInformation[BB->getParent()][*ei] += w;
+      // Read necessary minimal weight.
+      if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
+        EdgeInformation[BB->getParent()][*ei] += MinimalWeight[*ei];
+        DEBUG(errs() << "Additionally " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
+      }
       printEdgeWeight(*ei);
+      
+      // Add minimal weight to paths to all exit edges, this is used to ensure
+      // that enough flow is reaching this edges.
+      Path p;
+      const BasicBlock *Dest = GetPath(BB, (*ei).first, p, GetPathToDest);
+      while (Dest != BB) {
+        const BasicBlock *Parent = p.find(Dest)->second;
+        Edge e = getEdge(Parent, Dest);
+        if (MinimalWeight.find(e) == MinimalWeight.end()) {
+          MinimalWeight[e] = 0;
+        }
+        MinimalWeight[e] += w;
+        DEBUG(errs() << "Minimal Weight for " << e << ": " << format("%.20g",MinimalWeight[e]) << "\n");
+        Dest = Parent;
+      }
     }
     // Increase flow into the loop.
     BBWeight *= (ExecCount+1);
@@ -203,7 +242,7 @@ void ProfileEstimatorPass::recurseBasicBlock(BasicBlock *BB) {
 
   BlockInformation[BB->getParent()][BB] = BBWeight;
   // Up until now we considered only the loop exiting edges, now we have a
-  // definite block weight and must ditribute this onto the outgoing edges.
+  // definite block weight and must distribute this onto the outgoing edges.
   // Since there may be already flow attached to some of the edges, read this
   // flow first and remember the edges that have still now flow attached.
   Edges.clear();
@@ -225,15 +264,32 @@ void ProfileEstimatorPass::recurseBasicBlock(BasicBlock *BB) {
         BBWeight -= getEdgeWeight(edge);
       } else {
         Edges.push_back(edge);
+        // If minimal weight is necessary, reserve weight by subtracting weight
+        // from block weight, this is readded later on.
+        if (MinimalWeight.find(edge) != MinimalWeight.end()) {
+          BBWeight -= MinimalWeight[edge];
+          DEBUG(errs() << "Reserving " << format("%.20g",MinimalWeight[edge]) << " at " << edge << "\n");
+        }
       }
     }
   }
 
+  double fraction = floor(BBWeight/Edges.size());
   // Finally we know what flow is still not leaving the block, distribute this
   // flow onto the empty edges.
   for (SmallVector<Edge, 8>::iterator ei = Edges.begin(), ee = Edges.end();
        ei != ee; ++ei) {
-    EdgeInformation[BB->getParent()][*ei] += BBWeight/Edges.size();
+    if (ei != (ee-1)) {
+      EdgeInformation[BB->getParent()][*ei] += fraction;
+      BBWeight -= fraction;
+    } else {
+      EdgeInformation[BB->getParent()][*ei] += BBWeight;
+    }
+    // Readd minial necessary weight.
+    if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
+      EdgeInformation[BB->getParent()][*ei] += MinimalWeight[*ei];
+      DEBUG(errs() << "Additionally " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
+    }
     printEdgeWeight(*ei);
   }
 
@@ -260,20 +316,24 @@ bool ProfileEstimatorPass::runOnFunction(Function &F) {
   for (Function::iterator bi = F.begin(), be = F.end(); bi != be; ++bi)
     BBToVisit.insert(bi);
 
+  // Clear Minimal Edges.
+  MinimalWeight.clear();
+
   DEBUG(errs() << "Working on function " << F.getNameStr() << "\n");
 
   // Since the entry block is the first one and has no predecessors, the edge
   // (0,entry) is inserted with the starting weight of 1.
   BasicBlock *entry = &F.getEntryBlock();
-  BlockInformation[&F][entry] = 1;
+  BlockInformation[&F][entry] = pow(2.0, 32.0);
   Edge edge = getEdge(0,entry);
-  EdgeInformation[&F][edge] = 1;
+  EdgeInformation[&F][edge] = BlockInformation[&F][entry];
   printEdgeWeight(edge);
 
   // Since recurseBasicBlock() maybe returns with a block which was not fully
-  // estimated, use recurseBasicBlock() until everything is calculated. 
+  // estimated, use recurseBasicBlock() until everything is calculated.
+  bool cleanup = false;
   recurseBasicBlock(entry);
-  while (BBToVisit.size() > 0) {
+  while (BBToVisit.size() > 0 && !cleanup) {
     // Remember number of open blocks, this is later used to check if progress
     // was made.
     unsigned size = BBToVisit.size();
@@ -287,21 +347,65 @@ bool ProfileEstimatorPass::runOnFunction(Function &F) {
       if (BBToVisit.size() < size) break;
     }
 
-    // If there was not a single block resovled, make some assumptions.
+    // If there was not a single block resolved, make some assumptions.
     if (BBToVisit.size() == size) {
-      BasicBlock *BB = *(BBToVisit.begin());
-      // Since this BB was not calculated because of missing incoming edges,
-      // set these edges to zero.
-      for (pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-           bbi != bbe; ++bbi) {
-        Edge e = getEdge(*bbi,BB);
-        double w = getEdgeWeight(e);
-        if (w == MissingValue) {
-          EdgeInformation[&F][e] = 0;
-          DEBUG(errs() << "Assuming edge weight: ");
-          printEdgeWeight(e);
+      bool found = false;
+      for (std::set<BasicBlock*>::iterator BBI = BBToVisit.begin(), BBE = BBToVisit.end(); 
+           (BBI != BBE) && (!found); ++BBI) {
+        BasicBlock *BB = *BBI;
+        // Try each predecessor if it can be assumend.
+        for (pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
+             (bbi != bbe) && (!found); ++bbi) {
+          Edge e = getEdge(*bbi,BB);
+          double w = getEdgeWeight(e);
+          // Check that edge from predecessor is still free.
+          if (w == MissingValue) {
+            // Check if there is a circle from this block to predecessor.
+            Path P;
+            const BasicBlock *Dest = GetPath(BB, *bbi, P, GetPathToDest);
+            if (Dest != *bbi) {
+              // If there is no circle, just set edge weight to 0
+              EdgeInformation[&F][e] = 0;
+              DEBUG(errs() << "Assuming edge weight: ");
+              printEdgeWeight(e);
+              found = true;
+            }
+          }
         }
       }
+      if (!found) {
+        cleanup = true;
+        DEBUG(errs() << "No assumption possible in Fuction "<<F.getName()<<", setting all to zero\n");
+      }
+    }
+  }
+  // In case there was no safe way to assume edges, set as a last measure, 
+  // set _everything_ to zero.
+  if (cleanup) {
+    FunctionInformation[&F] = 0;
+    BlockInformation[&F].clear();
+    EdgeInformation[&F].clear();
+    for (Function::const_iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+      const BasicBlock *BB = &(*FI);
+      BlockInformation[&F][BB] = 0;
+      pred_const_iterator predi = pred_begin(BB), prede = pred_end(BB);
+      if (predi == prede) {
+        Edge e = getEdge(0,BB);
+        setEdgeWeight(e,0);
+      }
+      for (;predi != prede; ++predi) {
+        Edge e = getEdge(*predi,BB);
+        setEdgeWeight(e,0);
+      }
+      succ_const_iterator succi = succ_begin(BB), succe = succ_end(BB);
+      if (succi == succe) {
+        Edge e = getEdge(BB,0);
+        setEdgeWeight(e,0);
+      }
+      for (;succi != succe; ++succi) {
+        Edge e = getEdge(*succi,BB);
+        setEdgeWeight(e,0);
+      }
     }
   }
 
diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp
index 7f24f5a..c49c6e1 100644
--- a/lib/Analysis/ProfileInfo.cpp
+++ b/lib/Analysis/ProfileInfo.cpp
@@ -11,26 +11,52 @@
 // "no profile" implementation.
 //
 //===----------------------------------------------------------------------===//
-
+#define DEBUG_TYPE "profile-info"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CFG.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Format.h"
+#include "llvm/ADT/SmallSet.h"
 #include <set>
+#include <queue>
+#include <limits>
 using namespace llvm;
 
 // Register the ProfileInfo interface, providing a nice name to refer to.
 static RegisterAnalysisGroup<ProfileInfo> Z("Profile Information");
-char ProfileInfo::ID = 0;
 
-ProfileInfo::~ProfileInfo() {}
+namespace llvm {
+
+template <>
+ProfileInfoT<MachineFunction, MachineBasicBlock>::ProfileInfoT() {}
+template <>
+ProfileInfoT<MachineFunction, MachineBasicBlock>::~ProfileInfoT() {}
+
+template <>
+ProfileInfoT<Function, BasicBlock>::ProfileInfoT() {
+  MachineProfile = 0;
+}
+template <>
+ProfileInfoT<Function, BasicBlock>::~ProfileInfoT() {
+  if (MachineProfile) delete MachineProfile;
+}
+
+template<>
+char ProfileInfoT<Function,BasicBlock>::ID = 0;
+
+template<>
+char ProfileInfoT<MachineFunction, MachineBasicBlock>::ID = 0;
+
+template<>
+const double ProfileInfoT<Function,BasicBlock>::MissingValue = -1;
 
-const double ProfileInfo::MissingValue = -1;
+template<> const
+double ProfileInfoT<MachineFunction, MachineBasicBlock>::MissingValue = -1;
 
-double ProfileInfo::getExecutionCount(const BasicBlock *BB) {
+template<> double
+ProfileInfoT<Function,BasicBlock>::getExecutionCount(const BasicBlock *BB) {
   std::map<const Function*, BlockCounts>::iterator J =
     BlockInformation.find(BB->getParent());
   if (J != BlockInformation.end()) {
@@ -39,36 +65,74 @@ double ProfileInfo::getExecutionCount(const BasicBlock *BB) {
       return I->second;
   }
 
+  double Count = MissingValue;
+
   pred_const_iterator PI = pred_begin(BB), PE = pred_end(BB);
 
   // Are there zero predecessors of this block?
   if (PI == PE) {
-    // If this is the entry block, look for the Null -> Entry edge.
-    if (BB == &BB->getParent()->getEntryBlock())
-      return getEdgeWeight(getEdge(0, BB));
-    else
-      return 0;   // Otherwise, this is a dead block.
+    Edge e = getEdge(0,BB);
+    Count = getEdgeWeight(e);
+  } else {
+    // Otherwise, if there are predecessors, the execution count of this block is
+    // the sum of the edge frequencies from the incoming edges.
+    std::set<const BasicBlock*> ProcessedPreds;
+    Count = 0;
+    for (; PI != PE; ++PI)
+      if (ProcessedPreds.insert(*PI).second) {
+        double w = getEdgeWeight(getEdge(*PI, BB));
+        if (w == MissingValue) {
+          Count = MissingValue;
+          break;
+        }
+        Count += w;
+      }
   }
 
-  // Otherwise, if there are predecessors, the execution count of this block is
-  // the sum of the edge frequencies from the incoming edges.
-  std::set<const BasicBlock*> ProcessedPreds;
-  double Count = 0;
-  for (; PI != PE; ++PI)
-    if (ProcessedPreds.insert(*PI).second) {
-      double w = getEdgeWeight(getEdge(*PI, BB));
-      if (w == MissingValue) {
-        Count = MissingValue;
-        break;
-      }
-      Count += w;
+  // If the predecessors did not suffice to get block weight, try successors.
+  if (Count == MissingValue) {
+
+    succ_const_iterator SI = succ_begin(BB), SE = succ_end(BB);
+
+    // Are there zero successors of this block?
+    if (SI == SE) {
+      Edge e = getEdge(BB,0);
+      Count = getEdgeWeight(e);
+    } else {
+      std::set<const BasicBlock*> ProcessedSuccs;
+      Count = 0;
+      for (; SI != SE; ++SI)
+        if (ProcessedSuccs.insert(*SI).second) {
+          double w = getEdgeWeight(getEdge(BB, *SI));
+          if (w == MissingValue) {
+            Count = MissingValue;
+            break;
+          }
+          Count += w;
+        }
     }
+  }
 
   if (Count != MissingValue) BlockInformation[BB->getParent()][BB] = Count;
   return Count;
 }
 
-double ProfileInfo::getExecutionCount(const Function *F) {
+template<>
+double ProfileInfoT<MachineFunction, MachineBasicBlock>::
+        getExecutionCount(const MachineBasicBlock *MBB) {
+  std::map<const MachineFunction*, BlockCounts>::iterator J =
+    BlockInformation.find(MBB->getParent());
+  if (J != BlockInformation.end()) {
+    BlockCounts::iterator I = J->second.find(MBB);
+    if (I != J->second.end())
+      return I->second;
+  }
+
+  return MissingValue;
+}
+
+template<>
+double ProfileInfoT<Function,BasicBlock>::getExecutionCount(const Function *F) {
   std::map<const Function*, double>::iterator J =
     FunctionInformation.find(F);
   if (J != FunctionInformation.end())
@@ -83,35 +147,211 @@ double ProfileInfo::getExecutionCount(const Function *F) {
   return Count;
 }
 
+template<>
+double ProfileInfoT<MachineFunction, MachineBasicBlock>::
+        getExecutionCount(const MachineFunction *MF) {
+  std::map<const MachineFunction*, double>::iterator J =
+    FunctionInformation.find(MF);
+  if (J != FunctionInformation.end())
+    return J->second;
+
+  double Count = getExecutionCount(&MF->front());
+  if (Count != MissingValue) FunctionInformation[MF] = Count;
+  return Count;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::
+        setExecutionCount(const BasicBlock *BB, double w) {
+  DEBUG(errs() << "Creating Block " << BB->getName() 
+               << " (weight: " << format("%.20g",w) << ")\n");
+  BlockInformation[BB->getParent()][BB] = w;
+}
+
+template<>
+void ProfileInfoT<MachineFunction, MachineBasicBlock>::
+        setExecutionCount(const MachineBasicBlock *MBB, double w) {
+  DEBUG(errs() << "Creating Block " << MBB->getBasicBlock()->getName()
+               << " (weight: " << format("%.20g",w) << ")\n");
+  BlockInformation[MBB->getParent()][MBB] = w;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::addEdgeWeight(Edge e, double w) {
+  double oldw = getEdgeWeight(e);
+  assert (oldw != MissingValue && "Adding weight to Edge with no previous weight");
+  DEBUG(errs() << "Adding to Edge " << e
+               << " (new weight: " << format("%.20g",oldw + w) << ")\n");
+  EdgeInformation[getFunction(e)][e] = oldw + w;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::
+        addExecutionCount(const BasicBlock *BB, double w) {
+  double oldw = getExecutionCount(BB);
+  assert (oldw != MissingValue && "Adding weight to Block with no previous weight");
+  DEBUG(errs() << "Adding to Block " << BB->getName()
+               << " (new weight: " << format("%.20g",oldw + w) << ")\n");
+  BlockInformation[BB->getParent()][BB] = oldw + w;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::removeBlock(const BasicBlock *BB) {
+  std::map<const Function*, BlockCounts>::iterator J =
+    BlockInformation.find(BB->getParent());
+  if (J == BlockInformation.end()) return;
+
+  DEBUG(errs() << "Deleting " << BB->getName() << "\n");
+  J->second.erase(BB);
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::removeEdge(Edge e) {
+  std::map<const Function*, EdgeWeights>::iterator J =
+    EdgeInformation.find(getFunction(e));
+  if (J == EdgeInformation.end()) return;
+
+  DEBUG(errs() << "Deleting" << e << "\n");
+  J->second.erase(e);
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::
+        replaceEdge(const Edge &oldedge, const Edge &newedge) {
+  double w;
+  if ((w = getEdgeWeight(newedge)) == MissingValue) {
+    w = getEdgeWeight(oldedge);
+    DEBUG(errs() << "Replacing " << oldedge << " with " << newedge  << "\n");
+  } else {
+    w += getEdgeWeight(oldedge);
+    DEBUG(errs() << "Adding " << oldedge << " to " << newedge  << "\n");
+  }
+  setEdgeWeight(newedge,w);
+  removeEdge(oldedge);
+}
+
+template<>
+const BasicBlock *ProfileInfoT<Function,BasicBlock>::
+        GetPath(const BasicBlock *Src, const BasicBlock *Dest,
+                Path &P, unsigned Mode) {
+  const BasicBlock *BB = 0;
+  bool hasFoundPath = false;
+
+  std::queue<const BasicBlock *> BFS;
+  BFS.push(Src);
+
+  while(BFS.size() && !hasFoundPath) {
+    BB = BFS.front();
+    BFS.pop();
+
+    succ_const_iterator Succ = succ_begin(BB), End = succ_end(BB);
+    if (Succ == End) {
+      P[0] = BB;
+      if (Mode & GetPathToExit) {
+        hasFoundPath = true;
+        BB = 0;
+      }
+    }
+    for(;Succ != End; ++Succ) {
+      if (P.find(*Succ) != P.end()) continue;
+      Edge e = getEdge(BB,*Succ);
+      if ((Mode & GetPathWithNewEdges) && (getEdgeWeight(e) != MissingValue)) continue;
+      P[*Succ] = BB;
+      BFS.push(*Succ);
+      if ((Mode & GetPathToDest) && *Succ == Dest) {
+        hasFoundPath = true;
+        BB = *Succ;
+        break;
+      }
+      if ((Mode & GetPathToValue) && (getExecutionCount(*Succ) != MissingValue)) {
+        hasFoundPath = true;
+        BB = *Succ;
+        break;
+      }
+    }
+  }
+
+  return BB;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::
+        divertFlow(const Edge &oldedge, const Edge &newedge) {
+  DEBUG(errs() << "Diverting " << oldedge << " via " << newedge );
+
+  // First check if the old edge was taken, if not, just delete it...
+  if (getEdgeWeight(oldedge) == 0) {
+    removeEdge(oldedge);
+    return;
+  }
+
+  Path P;
+  P[newedge.first] = 0;
+  P[newedge.second] = newedge.first;
+  const BasicBlock *BB = GetPath(newedge.second,oldedge.second,P,GetPathToExit | GetPathToDest);
+
+  double w = getEdgeWeight (oldedge);
+  DEBUG(errs() << ", Weight: " << format("%.20g",w) << "\n");
+  do {
+    const BasicBlock *Parent = P.find(BB)->second;
+    Edge e = getEdge(Parent,BB);
+    double oldw = getEdgeWeight(e);
+    double oldc = getExecutionCount(e.first);
+    setEdgeWeight(e, w+oldw);
+    if (Parent != oldedge.first) {
+      setExecutionCount(e.first, w+oldc);
+    }
+    BB = Parent;
+  } while (BB != newedge.first);
+  removeEdge(oldedge);
+}
+
 /// Replaces all occurences of RmBB in the ProfilingInfo with DestBB.
 /// This checks all edges of the function the blocks reside in and replaces the
 /// occurences of RmBB with DestBB.
-void ProfileInfo::replaceAllUses(const BasicBlock *RmBB, 
-                                 const BasicBlock *DestBB) {
-  DEBUG(errs() << "Replacing " << RmBB->getNameStr()
-               << " with " << DestBB->getNameStr() << "\n");
+template<>
+void ProfileInfoT<Function,BasicBlock>::
+        replaceAllUses(const BasicBlock *RmBB, const BasicBlock *DestBB) {
+  DEBUG(errs() << "Replacing " << RmBB->getName()
+               << " with " << DestBB->getName() << "\n");
   const Function *F = DestBB->getParent();
   std::map<const Function*, EdgeWeights>::iterator J =
     EdgeInformation.find(F);
   if (J == EdgeInformation.end()) return;
 
-  for (EdgeWeights::iterator I = J->second.begin(), E = J->second.end();
-       I != E; ++I) {
-    Edge e = I->first;
-    Edge newedge; bool foundedge = false;
+  Edge e, newedge;
+  bool erasededge = false;
+  EdgeWeights::iterator I = J->second.begin(), E = J->second.end();
+  while(I != E) {
+    e = (I++)->first;
+    bool foundedge = false; bool eraseedge = false;
     if (e.first == RmBB) {
-      newedge = getEdge(DestBB, e.second);
-      foundedge = true;
+      if (e.second == DestBB) {
+        eraseedge = true;
+      } else {
+        newedge = getEdge(DestBB, e.second);
+        foundedge = true;
+      }
     }
     if (e.second == RmBB) {
-      newedge = getEdge(e.first, DestBB);
-      foundedge = true;
+      if (e.first == DestBB) {
+        eraseedge = true;
+      } else {
+        newedge = getEdge(e.first, DestBB);
+        foundedge = true;
+      }
     }
     if (foundedge) {
-      double w = getEdgeWeight(e);
-      EdgeInformation[F][newedge] = w;
-      DEBUG(errs() << "Replacing " << e << " with " << newedge  << "\n");
-      J->second.erase(e);
+      replaceEdge(e, newedge);
+    }
+    if (eraseedge) {
+      if (erasededge) {
+        Edge newedge = getEdge(DestBB, DestBB);
+        replaceEdge(e, newedge);
+      } else {
+        removeEdge(e);
+        erasededge = true;
+      }
     }
   }
 }
@@ -119,10 +359,11 @@ void ProfileInfo::replaceAllUses(const BasicBlock *RmBB,
 /// Splits an edge in the ProfileInfo and redirects flow over NewBB.
 /// Since its possible that there is more than one edge in the CFG from FristBB
 /// to SecondBB its necessary to redirect the flow proporionally.
-void ProfileInfo::splitEdge(const BasicBlock *FirstBB,
-                            const BasicBlock *SecondBB,
-                            const BasicBlock *NewBB,
-                            bool MergeIdenticalEdges) {
+template<>
+void ProfileInfoT<Function,BasicBlock>::splitEdge(const BasicBlock *FirstBB,
+                                                  const BasicBlock *SecondBB,
+                                                  const BasicBlock *NewBB,
+                                                  bool MergeIdenticalEdges) {
   const Function *F = FirstBB->getParent();
   std::map<const Function*, EdgeWeights>::iterator J =
     EdgeInformation.find(F);
@@ -153,7 +394,7 @@ void ProfileInfo::splitEdge(const BasicBlock *FirstBB,
 
   // We know now how many edges there are from FirstBB to SecondBB, reroute a
   // proportional part of the edge weight over NewBB.
-  double neww = w / succ_count;
+  double neww = floor(w / succ_count);
   ECs[n1] += neww;
   ECs[n2] += neww;
   BlockInformation[F][NewBB] += neww;
@@ -164,14 +405,672 @@ void ProfileInfo::splitEdge(const BasicBlock *FirstBB,
   }
 }
 
-raw_ostream& llvm::operator<<(raw_ostream &O, ProfileInfo::Edge E) {
+template<>
+void ProfileInfoT<Function,BasicBlock>::splitBlock(const BasicBlock *Old,
+                                                   const BasicBlock* New) {
+  const Function *F = Old->getParent();
+  std::map<const Function*, EdgeWeights>::iterator J =
+    EdgeInformation.find(F);
+  if (J == EdgeInformation.end()) return;
+
+  DEBUG(errs() << "Splitting " << Old->getName() << " to " << New->getName() << "\n");
+
+  std::set<Edge> Edges;
+  for (EdgeWeights::iterator ewi = J->second.begin(), ewe = J->second.end(); 
+       ewi != ewe; ++ewi) {
+    Edge old = ewi->first;
+    if (old.first == Old) {
+      Edges.insert(old);
+    }
+  }
+  for (std::set<Edge>::iterator EI = Edges.begin(), EE = Edges.end(); 
+       EI != EE; ++EI) {
+    Edge newedge = getEdge(New, EI->second);
+    replaceEdge(*EI, newedge);
+  }
+
+  double w = getExecutionCount(Old);
+  setEdgeWeight(getEdge(Old, New), w);
+  setExecutionCount(New, w);
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::splitBlock(const BasicBlock *BB,
+                                                   const BasicBlock* NewBB,
+                                                   BasicBlock *const *Preds,
+                                                   unsigned NumPreds) {
+  const Function *F = BB->getParent();
+  std::map<const Function*, EdgeWeights>::iterator J =
+    EdgeInformation.find(F);
+  if (J == EdgeInformation.end()) return;
+
+  DEBUG(errs() << "Splitting " << NumPreds << " Edges from " << BB->getName() 
+               << " to " << NewBB->getName() << "\n");
+
+  // Collect weight that was redirected over NewBB.
+  double newweight = 0;
+  
+  std::set<const BasicBlock *> ProcessedPreds;
+  // For all requestes Predecessors.
+  for (unsigned pred = 0; pred < NumPreds; ++pred) {
+    const BasicBlock * Pred = Preds[pred];
+    if (ProcessedPreds.insert(Pred).second) {
+      // Create edges and read old weight.
+      Edge oldedge = getEdge(Pred, BB);
+      Edge newedge = getEdge(Pred, NewBB);
+
+      // Remember how much weight was redirected.
+      newweight += getEdgeWeight(oldedge);
+    
+      replaceEdge(oldedge,newedge);
+    }
+  }
+
+  Edge newedge = getEdge(NewBB,BB);
+  setEdgeWeight(newedge, newweight);
+  setExecutionCount(NewBB, newweight);
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::transfer(const Function *Old,
+                                                 const Function *New) {
+  DEBUG(errs() << "Replacing Function " << Old->getName() << " with "
+               << New->getName() << "\n");
+  std::map<const Function*, EdgeWeights>::iterator J =
+    EdgeInformation.find(Old);
+  if(J != EdgeInformation.end()) {
+    EdgeInformation[New] = J->second;
+  }
+  EdgeInformation.erase(Old);
+  BlockInformation.erase(Old);
+  FunctionInformation.erase(Old);
+}
+
+static double readEdgeOrRemember(ProfileInfo::Edge edge, double w,
+                                 ProfileInfo::Edge &tocalc, unsigned &uncalc) {
+  if (w == ProfileInfo::MissingValue) {
+    tocalc = edge;
+    uncalc++;
+    return 0;
+  } else {
+    return w;
+  }
+}
+
+template<>
+bool ProfileInfoT<Function,BasicBlock>::
+        CalculateMissingEdge(const BasicBlock *BB, Edge &removed,
+                             bool assumeEmptySelf) {
+  Edge edgetocalc;
+  unsigned uncalculated = 0;
+
+  // collect weights of all incoming and outgoing edges, rememer edges that
+  // have no value
+  double incount = 0;
+  SmallSet<const BasicBlock*,8> pred_visited;
+  pred_const_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
+  if (bbi==bbe) {
+    Edge e = getEdge(0,BB);
+    incount += readEdgeOrRemember(e, getEdgeWeight(e) ,edgetocalc,uncalculated);
+  }
+  for (;bbi != bbe; ++bbi) {
+    if (pred_visited.insert(*bbi)) {
+      Edge e = getEdge(*bbi,BB);
+      incount += readEdgeOrRemember(e, getEdgeWeight(e) ,edgetocalc,uncalculated);
+    }
+  }
+
+  double outcount = 0;
+  SmallSet<const BasicBlock*,8> succ_visited;
+  succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB);
+  if (sbbi==sbbe) {
+    Edge e = getEdge(BB,0);
+    if (getEdgeWeight(e) == MissingValue) {
+      double w = getExecutionCount(BB);
+      if (w != MissingValue) {
+        setEdgeWeight(e,w);
+        removed = e;
+      }
+    }
+    outcount += readEdgeOrRemember(e, getEdgeWeight(e), edgetocalc, uncalculated);
+  }
+  for (;sbbi != sbbe; ++sbbi) {
+    if (succ_visited.insert(*sbbi)) {
+      Edge e = getEdge(BB,*sbbi);
+      outcount += readEdgeOrRemember(e, getEdgeWeight(e), edgetocalc, uncalculated);
+    }
+  }
+
+  // if exactly one edge weight was missing, calculate it and remove it from
+  // spanning tree
+  if (uncalculated == 0 ) {
+    return true;
+  } else
+  if (uncalculated == 1) {
+    if (incount < outcount) {
+      EdgeInformation[BB->getParent()][edgetocalc] = outcount-incount;
+    } else {
+      EdgeInformation[BB->getParent()][edgetocalc] = incount-outcount;
+    }
+    DEBUG(errs() << "--Calc Edge Counter for " << edgetocalc << ": "
+                 << format("%.20g", getEdgeWeight(edgetocalc)) << "\n");
+    removed = edgetocalc;
+    return true;
+  } else 
+  if (uncalculated == 2 && assumeEmptySelf && edgetocalc.first == edgetocalc.second && incount == outcount) {
+    setEdgeWeight(edgetocalc, incount * 10);
+    removed = edgetocalc;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+static void readEdge(ProfileInfo *PI, ProfileInfo::Edge e, double &calcw, std::set<ProfileInfo::Edge> &misscount) {
+  double w = PI->getEdgeWeight(e);
+  if (w != ProfileInfo::MissingValue) {
+    calcw += w;
+  } else {
+    misscount.insert(e);
+  }
+}
+
+template<>
+bool ProfileInfoT<Function,BasicBlock>::EstimateMissingEdges(const BasicBlock *BB) {
+  bool hasNoSuccessors = false;
+
+  double inWeight = 0;
+  std::set<Edge> inMissing;
+  std::set<const BasicBlock*> ProcessedPreds;
+  pred_const_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
+  if (bbi == bbe) {
+    readEdge(this,getEdge(0,BB),inWeight,inMissing);
+  }
+  for( ; bbi != bbe; ++bbi ) {
+    if (ProcessedPreds.insert(*bbi).second) {
+      readEdge(this,getEdge(*bbi,BB),inWeight,inMissing);
+    }
+  }
+
+  double outWeight = 0;
+  std::set<Edge> outMissing;
+  std::set<const BasicBlock*> ProcessedSuccs;
+  succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB);
+  if (sbbi == sbbe) {
+    readEdge(this,getEdge(BB,0),outWeight,outMissing);
+    hasNoSuccessors = true;
+  }
+  for ( ; sbbi != sbbe; ++sbbi ) {
+    if (ProcessedSuccs.insert(*sbbi).second) {
+      readEdge(this,getEdge(BB,*sbbi),outWeight,outMissing);
+    }
+  }
+
+  double share;
+  std::set<Edge>::iterator ei,ee;
+  if (inMissing.size() == 0 && outMissing.size() > 0) {
+    ei = outMissing.begin();
+    ee = outMissing.end();
+    share = inWeight/outMissing.size();
+    setExecutionCount(BB,inWeight);
+  } else
+  if (inMissing.size() > 0 && outMissing.size() == 0 && outWeight == 0) {
+    ei = inMissing.begin();
+    ee = inMissing.end();
+    share = 0;
+    setExecutionCount(BB,0);
+  } else
+  if (inMissing.size() == 0 && outMissing.size() == 0) {
+    setExecutionCount(BB,outWeight);
+    return true;
+  } else {
+    return false;
+  }
+  for ( ; ei != ee; ++ei ) {
+    setEdgeWeight(*ei,share);
+  }
+  return true;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::repair(const Function *F) {
+//  if (getExecutionCount(&(F->getEntryBlock())) == 0) {
+//    for (Function::const_iterator FI = F->begin(), FE = F->end();
+//         FI != FE; ++FI) {
+//      const BasicBlock* BB = &(*FI);
+//      {
+//        pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB);
+//        if (NBB == End) {
+//          setEdgeWeight(getEdge(0,BB),0);
+//        }
+//        for(;NBB != End; ++NBB) {
+//          setEdgeWeight(getEdge(*NBB,BB),0);
+//        }
+//      }
+//      {
+//        succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
+//        if (NBB == End) {
+//          setEdgeWeight(getEdge(0,BB),0);
+//        }
+//        for(;NBB != End; ++NBB) {
+//          setEdgeWeight(getEdge(*NBB,BB),0);
+//        }
+//      }
+//    }
+//    return;
+//  }
+  // The set of BasicBlocks that are still unvisited.
+  std::set<const BasicBlock*> Unvisited;
+
+  // The set of return edges (Edges with no successors).
+  std::set<Edge> ReturnEdges;
+  double ReturnWeight = 0;
+  
+  // First iterate over the whole function and collect:
+  // 1) The blocks in this function in the Unvisited set.
+  // 2) The return edges in the ReturnEdges set.
+  // 3) The flow that is leaving the function already via return edges.
+
+  // Data structure for searching the function.
+  std::queue<const BasicBlock *> BFS;
+  const BasicBlock *BB = &(F->getEntryBlock());
+  BFS.push(BB);
+  Unvisited.insert(BB);
+
+  while (BFS.size()) {
+    BB = BFS.front(); BFS.pop();
+    succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
+    if (NBB == End) {
+      Edge e = getEdge(BB,0);
+      double w = getEdgeWeight(e);
+      if (w == MissingValue) {
+        // If the return edge has no value, try to read value from block.
+        double bw = getExecutionCount(BB);
+        if (bw != MissingValue) {
+          setEdgeWeight(e,bw);
+          ReturnWeight += bw;
+        } else {
+          // If both return edge and block provide no value, collect edge.
+          ReturnEdges.insert(e);
+        }
+      } else {
+        // If the return edge has a proper value, collect it.
+        ReturnWeight += w;
+      }
+    }
+    for (;NBB != End; ++NBB) {
+      if (Unvisited.insert(*NBB).second) {
+        BFS.push(*NBB);
+      }
+    }
+  }
+
+  while (Unvisited.size() > 0) {
+    unsigned oldUnvisitedCount = Unvisited.size();
+    bool FoundPath = false;
+
+    // If there is only one edge left, calculate it.
+    if (ReturnEdges.size() == 1) {
+      ReturnWeight = getExecutionCount(&(F->getEntryBlock())) - ReturnWeight;
+
+      Edge e = *ReturnEdges.begin();
+      setEdgeWeight(e,ReturnWeight);
+      setExecutionCount(e.first,ReturnWeight);
+
+      Unvisited.erase(e.first);
+      ReturnEdges.erase(e);
+      continue;
+    }
+
+    // Calculate all blocks where only one edge is missing, this may also
+    // resolve furhter return edges.
+    std::set<const BasicBlock *>::iterator FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE) {
+      const BasicBlock *BB = *FI; ++FI;
+      Edge e;
+      if(CalculateMissingEdge(BB,e,true)) {
+        if (BlockInformation[F].find(BB) == BlockInformation[F].end()) {
+          setExecutionCount(BB,getExecutionCount(BB));
+        }
+        Unvisited.erase(BB);
+        if (e.first != 0 && e.second == 0) {
+          ReturnEdges.erase(e);
+          ReturnWeight += getEdgeWeight(e);
+        }
+      }
+    }
+    if (oldUnvisitedCount > Unvisited.size()) continue;
+
+    // Estimate edge weights by dividing the flow proportionally.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE) {
+      const BasicBlock *BB = *FI; ++FI;
+      const BasicBlock *Dest = 0;
+      bool AllEdgesHaveSameReturn = true;
+      // Check each Successor, these must all end up in the same or an empty
+      // return block otherwise its dangerous to do an estimation on them.
+      for (succ_const_iterator Succ = succ_begin(BB), End = succ_end(BB);
+           Succ != End; ++Succ) {
+        Path P;
+        GetPath(*Succ, 0, P, GetPathToExit);
+        if (Dest && Dest != P[0]) {
+          AllEdgesHaveSameReturn = false;
+        }
+        Dest = P[0];
+      }
+      if (AllEdgesHaveSameReturn) {
+        if(EstimateMissingEdges(BB)) {
+          Unvisited.erase(BB);
+          break;
+        }
+      }
+    }
+    if (oldUnvisitedCount > Unvisited.size()) continue;
+
+    // Check if there is a path to an block that has a known value and redirect
+    // flow accordingly.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE && !FoundPath) {
+      // Fetch path.
+      const BasicBlock *BB = *FI; ++FI;
+      Path P;
+      const BasicBlock *Dest = GetPath(BB, 0, P, GetPathToValue);
+
+      // Calculate incoming flow.
+      double iw = 0; unsigned inmissing = 0; unsigned incount = 0; unsigned invalid = 0;
+      std::set<const BasicBlock *> Processed;
+      for (pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB);
+           NBB != End; ++NBB) {
+        if (Processed.insert(*NBB).second) {
+          Edge e = getEdge(*NBB, BB);
+          double ew = getEdgeWeight(e);
+          if (ew != MissingValue) {
+            iw += ew;
+            invalid++;
+          } else {
+            // If the path contains the successor, this means its a backedge,
+            // do not count as missing.
+            if (P.find(*NBB) == P.end())
+              inmissing++;
+          }
+          incount++;
+        }
+      }
+      if (inmissing == incount) continue;
+      if (invalid == 0) continue;
+
+      // Subtract (already) outgoing flow.
+      Processed.clear();
+      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
+           NBB != End; ++NBB) {
+        if (Processed.insert(*NBB).second) {
+          Edge e = getEdge(BB, *NBB);
+          double ew = getEdgeWeight(e);
+          if (ew != MissingValue) {
+            iw -= ew;
+          }
+        }
+      }
+      if (iw < 0) continue;
+
+      // Check the recieving end of the path if it can handle the flow.
+      double ow = getExecutionCount(Dest);
+      Processed.clear();
+      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
+           NBB != End; ++NBB) {
+        if (Processed.insert(*NBB).second) {
+          Edge e = getEdge(BB, *NBB);
+          double ew = getEdgeWeight(e);
+          if (ew != MissingValue) {
+            ow -= ew;
+          }
+        }
+      }
+      if (ow < 0) continue;
+
+      // Determine how much flow shall be used.
+      double ew = getEdgeWeight(getEdge(P[Dest],Dest));
+      if (ew != MissingValue) {
+        ew = ew<ow?ew:ow;
+        ew = ew<iw?ew:iw;
+      } else {
+        if (inmissing == 0)
+          ew = iw;
+      }
+
+      // Create flow.
+      if (ew != MissingValue) {
+        do {
+          Edge e = getEdge(P[Dest],Dest);
+          if (getEdgeWeight(e) == MissingValue) {
+            setEdgeWeight(e,ew);
+            FoundPath = true;
+          }
+          Dest = P[Dest];
+        } while (Dest != BB);
+      }
+    }
+    if (FoundPath) continue;
+
+    // Calculate a block with self loop.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE && !FoundPath) {
+      const BasicBlock *BB = *FI; ++FI;
+      bool SelfEdgeFound = false;
+      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
+           NBB != End; ++NBB) {
+        if (*NBB == BB) {
+          SelfEdgeFound = true;
+          break;
+        }
+      }
+      if (SelfEdgeFound) {
+        Edge e = getEdge(BB,BB);
+        if (getEdgeWeight(e) == MissingValue) {
+          double iw = 0;
+          std::set<const BasicBlock *> Processed;
+          for (pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB);
+               NBB != End; ++NBB) {
+            if (Processed.insert(*NBB).second) {
+              Edge e = getEdge(*NBB, BB);
+              double ew = getEdgeWeight(e);
+              if (ew != MissingValue) {
+                iw += ew;
+              }
+            }
+          }
+          setEdgeWeight(e,iw * 10);
+          FoundPath = true;
+        }
+      }
+    }
+    if (FoundPath) continue;
+
+    // Determine backedges, set them to zero.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE && !FoundPath) {
+      const BasicBlock *BB = *FI; ++FI;
+      const BasicBlock *Dest;
+      Path P;
+      bool BackEdgeFound = false;
+      for (pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB);
+           NBB != End; ++NBB) {
+        Dest = GetPath(BB, *NBB, P, GetPathToDest | GetPathWithNewEdges);
+        if (Dest == *NBB) {
+          BackEdgeFound = true;
+          break;
+        }
+      }
+      if (BackEdgeFound) {
+        Edge e = getEdge(Dest,BB);
+        double w = getEdgeWeight(e);
+        if (w == MissingValue) {
+          setEdgeWeight(e,0);
+          FoundPath = true;
+        }
+        do {
+          Edge e = getEdge(P[Dest], Dest);
+          double w = getEdgeWeight(e);
+          if (w == MissingValue) {
+            setEdgeWeight(e,0);
+            FoundPath = true;
+          }
+          Dest = P[Dest];
+        } while (Dest != BB);
+      }
+    }
+    if (FoundPath) continue;
+
+    // Channel flow to return block.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE && !FoundPath) {
+      const BasicBlock *BB = *FI; ++FI;
+
+      Path P;
+      const BasicBlock *Dest = GetPath(BB, 0, P, GetPathToExit | GetPathWithNewEdges);
+      Dest = P[0];
+      if (!Dest) continue;
+
+      if (getEdgeWeight(getEdge(Dest,0)) == MissingValue) {
+        // Calculate incoming flow.
+        double iw = 0;
+        std::set<const BasicBlock *> Processed;
+        for (pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB);
+             NBB != End; ++NBB) {
+          if (Processed.insert(*NBB).second) {
+            Edge e = getEdge(*NBB, BB);
+            double ew = getEdgeWeight(e);
+            if (ew != MissingValue) {
+              iw += ew;
+            }
+          }
+        }
+        do {
+          Edge e = getEdge(P[Dest], Dest);
+          double w = getEdgeWeight(e);
+          if (w == MissingValue) {
+            setEdgeWeight(e,iw);
+            FoundPath = true;
+          } else {
+            assert(0 && "Edge should not have value already!");
+          }
+          Dest = P[Dest];
+        } while (Dest != BB);
+      }
+    }
+    if (FoundPath) continue;
+
+    // Speculatively set edges to zero.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE && !FoundPath) {
+      const BasicBlock *BB = *FI; ++FI;
+
+      for (pred_const_iterator NBB = pred_begin(BB), End = pred_end(BB);
+           NBB != End; ++NBB) {
+        Edge e = getEdge(*NBB,BB);
+        double w = getEdgeWeight(e);
+        if (w == MissingValue) {
+          setEdgeWeight(e,0);
+          FoundPath = true;
+          break;
+        }
+      }
+    }
+    if (FoundPath) continue;
+
+    errs() << "{";
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE) {
+      const BasicBlock *BB = *FI; ++FI;
+      errs() << BB->getName();
+      if (FI != FE)
+        errs() << ",";
+    }
+    errs() << "}";
+
+    errs() << "ASSERT: could not repair function";
+    assert(0 && "could not repair function");
+  }
+
+  EdgeWeights J = EdgeInformation[F];
+  for (EdgeWeights::iterator EI = J.begin(), EE = J.end(); EI != EE; ++EI) {
+    Edge e = EI->first;
+
+    bool SuccFound = false;
+    if (e.first != 0) {
+      succ_const_iterator NBB = succ_begin(e.first), End = succ_end(e.first);
+      if (NBB == End) {
+        if (0 == e.second) {
+          SuccFound = true;
+        }
+      }
+      for (;NBB != End; ++NBB) {
+        if (*NBB == e.second) {
+          SuccFound = true;
+          break;
+        }
+      }
+      if (!SuccFound) {
+        removeEdge(e);
+      }
+    }
+  }
+}
+
+raw_ostream& operator<<(raw_ostream &O, const Function *F) {
+  return O << F->getName();
+}
+
+raw_ostream& operator<<(raw_ostream &O, const MachineFunction *MF) {
+  return O << MF->getFunction()->getName() << "(MF)";
+}
+
+raw_ostream& operator<<(raw_ostream &O, const BasicBlock *BB) {
+  return O << BB->getName();
+}
+
+raw_ostream& operator<<(raw_ostream &O, const MachineBasicBlock *MBB) {
+  return O << MBB->getBasicBlock()->getName() << "(MB)";
+}
+
+raw_ostream& operator<<(raw_ostream &O, std::pair<const BasicBlock *, const BasicBlock *> E) {
   O << "(";
-  O << (E.first ? E.first->getNameStr() : "0");
+
+  if (E.first)
+    O << E.first;
+  else
+    O << "0";
+
+  O << ",";
+
+  if (E.second)
+    O << E.second;
+  else
+    O << "0";
+
+  return O << ")";
+}
+
+raw_ostream& operator<<(raw_ostream &O, std::pair<const MachineBasicBlock *, const MachineBasicBlock *> E) {
+  O << "(";
+
+  if (E.first)
+    O << E.first;
+  else
+    O << "0";
+
   O << ",";
-  O << (E.second ? E.second->getNameStr() : "0");
+
+  if (E.second)
+    O << E.second;
+  else
+    O << "0";
+
   return O << ")";
 }
 
+} // namespace llvm
+
 //===----------------------------------------------------------------------===//
 //  NoProfile ProfileInfo implementation
 //
diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp
index 9e1dfb6..cbd0430 100644
--- a/lib/Analysis/ProfileInfoLoaderPass.cpp
+++ b/lib/Analysis/ProfileInfoLoaderPass.cpp
@@ -74,6 +74,8 @@ X("profile-loader", "Load profile information from llvmprof.out", false, true);
 
 static RegisterAnalysisGroup<ProfileInfo> Y(X);
 
+const PassInfo *llvm::ProfileLoaderPassID = &X;
+
 ModulePass *llvm::createProfileLoaderPass() { return new LoaderPass(); }
 
 /// createProfileLoaderPass - This function returns a Pass that loads the
@@ -112,46 +114,9 @@ void LoaderPass::recurseBasicBlock(const BasicBlock *BB) {
     recurseBasicBlock(*bbi);
   }
 
-  Edge edgetocalc;
-  unsigned uncalculated = 0;
-
-  // collect weights of all incoming and outgoing edges, rememer edges that
-  // have no value
-  double incount = 0;
-  SmallSet<const BasicBlock*,8> pred_visited;
-  pred_const_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-  if (bbi==bbe) {
-    readEdgeOrRemember(getEdge(0, BB),edgetocalc,uncalculated,incount);
-  }
-  for (;bbi != bbe; ++bbi) {
-    if (pred_visited.insert(*bbi)) {
-      readEdgeOrRemember(getEdge(*bbi, BB),edgetocalc,uncalculated,incount);
-    }
-  }
-
-  double outcount = 0;
-  SmallSet<const BasicBlock*,8> succ_visited;
-  succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB);
-  if (sbbi==sbbe) {
-    readEdgeOrRemember(getEdge(BB, 0),edgetocalc,uncalculated,outcount);
-  }
-  for (;sbbi != sbbe; ++sbbi) {
-    if (succ_visited.insert(*sbbi)) {
-      readEdgeOrRemember(getEdge(BB, *sbbi),edgetocalc,uncalculated,outcount);
-    }
-  }
-
-  // if exactly one edge weight was missing, calculate it and remove it from
-  // spanning tree
-  if (uncalculated == 1) {
-    if (incount < outcount) {
-      EdgeInformation[BB->getParent()][edgetocalc] = outcount-incount;
-    } else {
-      EdgeInformation[BB->getParent()][edgetocalc] = incount-outcount;
-    }
-    DEBUG(errs() << "--Calc Edge Counter for " << edgetocalc << ": "
-                 << format("%g", getEdgeWeight(edgetocalc)) << "\n");
-    SpanningTree.erase(edgetocalc);
+  Edge tocalc;
+  if (CalculateMissingEdge(BB, tocalc)) {
+    SpanningTree.erase(tocalc);
   }
 }
 
@@ -219,9 +184,9 @@ bool LoaderPass::runOnModule(Module &M) {
         }
       }
       while (SpanningTree.size() > 0) {
-#if 0
+
         unsigned size = SpanningTree.size();
-#endif
+
         BBisUnvisited.clear();
         for (std::set<Edge>::iterator ei = SpanningTree.begin(),
              ee = SpanningTree.end(); ei != ee; ++ei) {
@@ -231,17 +196,16 @@ bool LoaderPass::runOnModule(Module &M) {
         while (BBisUnvisited.size() > 0) {
           recurseBasicBlock(*BBisUnvisited.begin());
         }
-#if 0
+
         if (SpanningTree.size() == size) {
           DEBUG(errs()<<"{");
           for (std::set<Edge>::iterator ei = SpanningTree.begin(),
                ee = SpanningTree.end(); ei != ee; ++ei) {
-            DEBUG(errs()<<"("<<(ei->first?ei->first->getName():"0")<<","
-                        <<(ei->second?ei->second->getName():"0")<<"),");
+            DEBUG(errs()<< *ei <<",");
           }
           assert(0 && "No edge calculated!");
         }
-#endif
+
       }
     }
     if (ReadCount != Counters.size()) {
diff --git a/lib/Analysis/ProfileVerifierPass.cpp b/lib/Analysis/ProfileVerifierPass.cpp
index 5f36294..36a80ba 100644
--- a/lib/Analysis/ProfileVerifierPass.cpp
+++ b/lib/Analysis/ProfileVerifierPass.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/InstIterator.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/Debug.h"
 #include <set>
 using namespace llvm;
@@ -29,44 +30,45 @@ static cl::opt<bool,false>
 ProfileVerifierDisableAssertions("profile-verifier-noassert",
      cl::desc("Disable assertions"));
 
-namespace {
-  class ProfileVerifierPass : public FunctionPass {
+namespace llvm {
+  template<class FType, class BType>
+  class ProfileVerifierPassT : public FunctionPass {
 
     struct DetailedBlockInfo {
-      const BasicBlock *BB;
-      double            BBWeight;
-      double            inWeight;
-      int               inCount;
-      double            outWeight;
-      int               outCount;
+      const BType *BB;
+      double      BBWeight;
+      double      inWeight;
+      int         inCount;
+      double      outWeight;
+      int         outCount;
     };
 
-    ProfileInfo *PI;
-    std::set<const BasicBlock*> BBisVisited;
-    std::set<const Function*>   FisVisited;
+    ProfileInfoT<FType, BType> *PI;
+    std::set<const BType*> BBisVisited;
+    std::set<const FType*>   FisVisited;
     bool DisableAssertions;
 
     // When debugging is enabled, the verifier prints a whole slew of debug
     // information, otherwise its just the assert. These are all the helper
     // functions.
     bool PrintedDebugTree;
-    std::set<const BasicBlock*> BBisPrinted;
+    std::set<const BType*> BBisPrinted;
     void debugEntry(DetailedBlockInfo*);
-    void printDebugInfo(const BasicBlock *BB);
+    void printDebugInfo(const BType *BB);
 
   public:
     static char ID; // Class identification, replacement for typeinfo
 
-    explicit ProfileVerifierPass () : FunctionPass(&ID) {
+    explicit ProfileVerifierPassT () : FunctionPass(&ID) {
       DisableAssertions = ProfileVerifierDisableAssertions;
     }
-    explicit ProfileVerifierPass (bool da) : FunctionPass(&ID), 
-                                             DisableAssertions(da) {
+    explicit ProfileVerifierPassT (bool da) : FunctionPass(&ID), 
+                                              DisableAssertions(da) {
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.setPreservesAll();
-      AU.addRequired<ProfileInfo>();
+      AU.addRequired<ProfileInfoT<FType, BType> >();
     }
 
     const char *getPassName() const {
@@ -74,271 +76,302 @@ namespace {
     }
 
     /// run - Verify the profile information.
-    bool runOnFunction(Function &F);
-    void recurseBasicBlock(const BasicBlock*);
+    bool runOnFunction(FType &F);
+    void recurseBasicBlock(const BType*);
 
-    bool   exitReachable(const Function*);
-    double ReadOrAssert(ProfileInfo::Edge);
+    bool   exitReachable(const FType*);
+    double ReadOrAssert(typename ProfileInfoT<FType, BType>::Edge);
     void   CheckValue(bool, const char*, DetailedBlockInfo*);
   };
-}  // End of anonymous namespace
-
-char ProfileVerifierPass::ID = 0;
-static RegisterPass<ProfileVerifierPass>
-X("profile-verifier", "Verify profiling information", false, true);
 
-namespace llvm {
-  FunctionPass *createProfileVerifierPass() {
-    return new ProfileVerifierPass(ProfileVerifierDisableAssertions); 
-  }
-}
-
-void ProfileVerifierPass::printDebugInfo(const BasicBlock *BB) {
-
-  if (BBisPrinted.find(BB) != BBisPrinted.end()) return;
-
-  double BBWeight = PI->getExecutionCount(BB);
-  if (BBWeight == ProfileInfo::MissingValue) { BBWeight = 0; }
-  double inWeight = 0;
-  int inCount = 0;
-  std::set<const BasicBlock*> ProcessedPreds;
-  for ( pred_const_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-        bbi != bbe; ++bbi ) {
-    if (ProcessedPreds.insert(*bbi).second) {
-      ProfileInfo::Edge E = PI->getEdge(*bbi,BB);
-      double EdgeWeight = PI->getEdgeWeight(E);
-      if (EdgeWeight == ProfileInfo::MissingValue) { EdgeWeight = 0; }
-      errs() << "calculated in-edge " << E << ": " << EdgeWeight << "\n";
-      inWeight += EdgeWeight;
-      inCount++;
+  typedef ProfileVerifierPassT<Function, BasicBlock> ProfileVerifierPass;
+
+  template<class FType, class BType>
+  void ProfileVerifierPassT<FType, BType>::printDebugInfo(const BType *BB) {
+
+    if (BBisPrinted.find(BB) != BBisPrinted.end()) return;
+
+    double BBWeight = PI->getExecutionCount(BB);
+    if (BBWeight == ProfileInfoT<FType, BType>::MissingValue) { BBWeight = 0; }
+    double inWeight = 0;
+    int inCount = 0;
+    std::set<const BType*> ProcessedPreds;
+    for ( pred_const_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
+          bbi != bbe; ++bbi ) {
+      if (ProcessedPreds.insert(*bbi).second) {
+        typename ProfileInfoT<FType, BType>::Edge E = PI->getEdge(*bbi,BB);
+        double EdgeWeight = PI->getEdgeWeight(E);
+        if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) { EdgeWeight = 0; }
+        errs() << "calculated in-edge " << E << ": " 
+               << format("%20.20g",EdgeWeight) << "\n";
+        inWeight += EdgeWeight;
+        inCount++;
+      }
     }
-  }
-  double outWeight = 0;
-  int outCount = 0;
-  std::set<const BasicBlock*> ProcessedSuccs;
-  for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-        bbi != bbe; ++bbi ) {
-    if (ProcessedSuccs.insert(*bbi).second) {
-      ProfileInfo::Edge E = PI->getEdge(BB,*bbi);
-      double EdgeWeight = PI->getEdgeWeight(E);
-      if (EdgeWeight == ProfileInfo::MissingValue) { EdgeWeight = 0; }
-      errs() << "calculated out-edge " << E << ": " << EdgeWeight << "\n";
-      outWeight += EdgeWeight;
-      outCount++;
+    double outWeight = 0;
+    int outCount = 0;
+    std::set<const BType*> ProcessedSuccs;
+    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
+          bbi != bbe; ++bbi ) {
+      if (ProcessedSuccs.insert(*bbi).second) {
+        typename ProfileInfoT<FType, BType>::Edge E = PI->getEdge(BB,*bbi);
+        double EdgeWeight = PI->getEdgeWeight(E);
+        if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) { EdgeWeight = 0; }
+        errs() << "calculated out-edge " << E << ": " 
+               << format("%20.20g",EdgeWeight) << "\n";
+        outWeight += EdgeWeight;
+        outCount++;
+      }
+    }
+    errs() << "Block " << BB->getNameStr()                << " in " 
+           << BB->getParent()->getNameStr()               << ":"
+           << "BBWeight="  << format("%20.20g",BBWeight)  << ","
+           << "inWeight="  << format("%20.20g",inWeight)  << ","
+           << "inCount="   << inCount                     << ","
+           << "outWeight=" << format("%20.20g",outWeight) << ","
+           << "outCount"   << outCount                    << "\n";
+
+    // mark as visited and recurse into subnodes
+    BBisPrinted.insert(BB);
+    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB); 
+          bbi != bbe; ++bbi ) {
+      printDebugInfo(*bbi);
     }
   }
-  errs()<<"Block "<<BB->getNameStr()<<" in "<<BB->getParent()->getNameStr()
-        <<",BBWeight="<<BBWeight<<",inWeight="<<inWeight<<",inCount="<<inCount
-        <<",outWeight="<<outWeight<<",outCount"<<outCount<<"\n";
-
-  // mark as visited and recurse into subnodes
-  BBisPrinted.insert(BB);
-  for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB); 
-        bbi != bbe; ++bbi ) {
-    printDebugInfo(*bbi);
-  }
-}
 
-void ProfileVerifierPass::debugEntry (DetailedBlockInfo *DI) {
-  errs() << "TROUBLE: Block " << DI->BB->getNameStr() << " in "
-         << DI->BB->getParent()->getNameStr()  << ":";
-  errs() << "BBWeight="  << DI->BBWeight   << ",";
-  errs() << "inWeight="  << DI->inWeight   << ",";
-  errs() << "inCount="   << DI->inCount    << ",";
-  errs() << "outWeight=" << DI->outWeight  << ",";
-  errs() << "outCount="  << DI->outCount   << "\n";
-  if (!PrintedDebugTree) {
-    PrintedDebugTree = true;
-    printDebugInfo(&(DI->BB->getParent()->getEntryBlock()));
+  template<class FType, class BType>
+  void ProfileVerifierPassT<FType, BType>::debugEntry (DetailedBlockInfo *DI) {
+    errs() << "TROUBLE: Block " << DI->BB->getNameStr()       << " in "
+           << DI->BB->getParent()->getNameStr()               << ":"
+           << "BBWeight="  << format("%20.20g",DI->BBWeight)  << ","
+           << "inWeight="  << format("%20.20g",DI->inWeight)  << ","
+           << "inCount="   << DI->inCount                     << ","
+           << "outWeight=" << format("%20.20g",DI->outWeight) << ","
+           << "outCount="  << DI->outCount                    << "\n";
+    if (!PrintedDebugTree) {
+      PrintedDebugTree = true;
+      printDebugInfo(&(DI->BB->getParent()->getEntryBlock()));
+    }
   }
-}
 
-// This compares A and B but considering maybe small differences.
-static bool Equals(double A, double B) { 
-  double maxRelativeError = 0.0000001;
-  if (A == B)
-    return true;
-  double relativeError;
-  if (fabs(B) > fabs(A)) 
-    relativeError = fabs((A - B) / B);
-  else 
-    relativeError = fabs((A - B) / A);
-  if (relativeError <= maxRelativeError) return true; 
-  return false; 
-}
+  // This compares A and B for equality.
+  static bool Equals(double A, double B) {
+    return A == B;
+  }
 
-// This checks if the function "exit" is reachable from an given function
-// via calls, this is necessary to check if a profile is valid despite the
-// counts not fitting exactly.
-bool ProfileVerifierPass::exitReachable(const Function *F) {
-  if (!F) return false;
+  // This checks if the function "exit" is reachable from an given function
+  // via calls, this is necessary to check if a profile is valid despite the
+  // counts not fitting exactly.
+  template<class FType, class BType>
+  bool ProfileVerifierPassT<FType, BType>::exitReachable(const FType *F) {
+    if (!F) return false;
 
-  if (FisVisited.count(F)) return false;
+    if (FisVisited.count(F)) return false;
 
-  Function *Exit = F->getParent()->getFunction("exit");
-  if (Exit == F) {
-    return true;
-  }
+    FType *Exit = F->getParent()->getFunction("exit");
+    if (Exit == F) {
+      return true;
+    }
 
-  FisVisited.insert(F);
-  bool exits = false;
-  for (const_inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
-    if (const CallInst *CI = dyn_cast<CallInst>(&*I)) {
-      exits |= exitReachable(CI->getCalledFunction());
-      if (exits) break;
+    FisVisited.insert(F);
+    bool exits = false;
+    for (const_inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+      if (const CallInst *CI = dyn_cast<CallInst>(&*I)) {
+        FType *F = CI->getCalledFunction();
+        if (F) {
+          exits |= exitReachable(F);
+        } else {
+          // This is a call to a pointer, all bets are off...
+          exits = true;
+        }
+        if (exits) break;
+      }
     }
+    return exits;
   }
-  return exits;
-}
 
-#define ASSERTMESSAGE(M) \
-    errs() << (M) << "\n"; \
-    if (!DisableAssertions) assert(0 && (M));
-
-double ProfileVerifierPass::ReadOrAssert(ProfileInfo::Edge E) {
-  double EdgeWeight = PI->getEdgeWeight(E);
-  if (EdgeWeight == ProfileInfo::MissingValue) {
-    errs() << "Edge " << E << " in Function " 
-           << ProfileInfo::getFunction(E)->getNameStr() << ": ";
-    ASSERTMESSAGE("ASSERT:Edge has missing value");
-    return 0;
-  } else {
-    return EdgeWeight;
+  #define ASSERTMESSAGE(M) \
+    { errs() << "ASSERT:" << (M) << "\n"; \
+      if (!DisableAssertions) assert(0 && (M)); }
+
+  template<class FType, class BType>
+  double ProfileVerifierPassT<FType, BType>::ReadOrAssert(typename ProfileInfoT<FType, BType>::Edge E) {
+    double EdgeWeight = PI->getEdgeWeight(E);
+    if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) {
+      errs() << "Edge " << E << " in Function " 
+             << ProfileInfoT<FType, BType>::getFunction(E)->getNameStr() << ": ";
+      ASSERTMESSAGE("Edge has missing value");
+      return 0;
+    } else {
+      if (EdgeWeight < 0) {
+        errs() << "Edge " << E << " in Function " 
+               << ProfileInfoT<FType, BType>::getFunction(E)->getNameStr() << ": ";
+        ASSERTMESSAGE("Edge has negative value");
+      }
+      return EdgeWeight;
+    }
   }
-}
 
-void ProfileVerifierPass::CheckValue(bool Error, const char *Message,
-                                     DetailedBlockInfo *DI) {
-  if (Error) {
-    DEBUG(debugEntry(DI));
-    errs() << "Block " << DI->BB->getNameStr() << " in Function " 
-           << DI->BB->getParent()->getNameStr() << ": ";
-    ASSERTMESSAGE(Message);
+  template<class FType, class BType>
+  void ProfileVerifierPassT<FType, BType>::CheckValue(bool Error, 
+                                                      const char *Message,
+                                                      DetailedBlockInfo *DI) {
+    if (Error) {
+      DEBUG(debugEntry(DI));
+      errs() << "Block " << DI->BB->getNameStr() << " in Function " 
+             << DI->BB->getParent()->getNameStr() << ": ";
+      ASSERTMESSAGE(Message);
+    }
+    return;
   }
-  return;
-}
 
-// This calculates the Information for a block and then recurses into the
-// successors.
-void ProfileVerifierPass::recurseBasicBlock(const BasicBlock *BB) {
-
-  // Break the recursion by remembering all visited blocks.
-  if (BBisVisited.find(BB) != BBisVisited.end()) return;
-
-  // Use a data structure to store all the information, this can then be handed
-  // to debug printers.
-  DetailedBlockInfo DI;
-  DI.BB = BB;
-  DI.outCount = DI.inCount = 0;
-  DI.inWeight = DI.outWeight = 0.0;
-
-  // Read predecessors.
-  std::set<const BasicBlock*> ProcessedPreds;
-  pred_const_iterator bpi = pred_begin(BB), bpe = pred_end(BB);
-  // If there are none, check for (0,BB) edge.
-  if (bpi == bpe) {
-    DI.inWeight += ReadOrAssert(PI->getEdge(0,BB));
-    DI.inCount++;
-  }
-  for (;bpi != bpe; ++bpi) {
-    if (ProcessedPreds.insert(*bpi).second) {
-      DI.inWeight += ReadOrAssert(PI->getEdge(*bpi,BB));
+  // This calculates the Information for a block and then recurses into the
+  // successors.
+  template<class FType, class BType>
+  void ProfileVerifierPassT<FType, BType>::recurseBasicBlock(const BType *BB) {
+
+    // Break the recursion by remembering all visited blocks.
+    if (BBisVisited.find(BB) != BBisVisited.end()) return;
+
+    // Use a data structure to store all the information, this can then be handed
+    // to debug printers.
+    DetailedBlockInfo DI;
+    DI.BB = BB;
+    DI.outCount = DI.inCount = 0;
+    DI.inWeight = DI.outWeight = 0;
+
+    // Read predecessors.
+    std::set<const BType*> ProcessedPreds;
+    pred_const_iterator bpi = pred_begin(BB), bpe = pred_end(BB);
+    // If there are none, check for (0,BB) edge.
+    if (bpi == bpe) {
+      DI.inWeight += ReadOrAssert(PI->getEdge(0,BB));
       DI.inCount++;
     }
-  }
+    for (;bpi != bpe; ++bpi) {
+      if (ProcessedPreds.insert(*bpi).second) {
+        DI.inWeight += ReadOrAssert(PI->getEdge(*bpi,BB));
+        DI.inCount++;
+      }
+    }
 
-  // Read successors.
-  std::set<const BasicBlock*> ProcessedSuccs;
-  succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-  // If there is an (0,BB) edge, consider it too. (This is done not only when
-  // there are no successors, but every time; not every function contains
-  // return blocks with no successors (think loop latch as return block)).
-  double w = PI->getEdgeWeight(PI->getEdge(BB,0));
-  if (w != ProfileInfo::MissingValue) {
-    DI.outWeight += w;
-    DI.outCount++;
-  }
-  for (;bbi != bbe; ++bbi) {
-    if (ProcessedSuccs.insert(*bbi).second) {
-      DI.outWeight += ReadOrAssert(PI->getEdge(BB,*bbi));
+    // Read successors.
+    std::set<const BType*> ProcessedSuccs;
+    succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
+    // If there is an (0,BB) edge, consider it too. (This is done not only when
+    // there are no successors, but every time; not every function contains
+    // return blocks with no successors (think loop latch as return block)).
+    double w = PI->getEdgeWeight(PI->getEdge(BB,0));
+    if (w != ProfileInfoT<FType, BType>::MissingValue) {
+      DI.outWeight += w;
       DI.outCount++;
     }
-  }
+    for (;bbi != bbe; ++bbi) {
+      if (ProcessedSuccs.insert(*bbi).second) {
+        DI.outWeight += ReadOrAssert(PI->getEdge(BB,*bbi));
+        DI.outCount++;
+      }
+    }
 
-  // Read block weight.
-  DI.BBWeight = PI->getExecutionCount(BB);
-  CheckValue(DI.BBWeight == ProfileInfo::MissingValue,
-             "ASSERT:BasicBlock has missing value", &DI);
-
-  // Check if this block is a setjmp target.
-  bool isSetJmpTarget = false;
-  if (DI.outWeight > DI.inWeight) {
-    for (BasicBlock::const_iterator i = BB->begin(), ie = BB->end();
-         i != ie; ++i) {
-      if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
-        Function *F = CI->getCalledFunction();
-        if (F && (F->getNameStr() == "_setjmp")) {
-          isSetJmpTarget = true; break;
+    // Read block weight.
+    DI.BBWeight = PI->getExecutionCount(BB);
+    CheckValue(DI.BBWeight == ProfileInfoT<FType, BType>::MissingValue,
+               "BasicBlock has missing value", &DI);
+    CheckValue(DI.BBWeight < 0,
+               "BasicBlock has negative value", &DI);
+
+    // Check if this block is a setjmp target.
+    bool isSetJmpTarget = false;
+    if (DI.outWeight > DI.inWeight) {
+      for (typename BType::const_iterator i = BB->begin(), ie = BB->end();
+           i != ie; ++i) {
+        if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
+          FType *F = CI->getCalledFunction();
+          if (F && (F->getNameStr() == "_setjmp")) {
+            isSetJmpTarget = true; break;
+          }
         }
       }
     }
-  }
-  // Check if this block is eventually reaching exit.
-  bool isExitReachable = false;
-  if (DI.inWeight > DI.outWeight) {
-    for (BasicBlock::const_iterator i = BB->begin(), ie = BB->end();
-         i != ie; ++i) {
-      if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
-        FisVisited.clear();
-        isExitReachable |= exitReachable(CI->getCalledFunction());
-        if (isExitReachable) break;
+    // Check if this block is eventually reaching exit.
+    bool isExitReachable = false;
+    if (DI.inWeight > DI.outWeight) {
+      for (typename BType::const_iterator i = BB->begin(), ie = BB->end();
+           i != ie; ++i) {
+        if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
+          FType *F = CI->getCalledFunction();
+          if (F) {
+            FisVisited.clear();
+            isExitReachable |= exitReachable(F);
+          } else {
+            // This is a call to a pointer, all bets are off...
+            isExitReachable = true;
+          }
+          if (isExitReachable) break;
+        }
       }
     }
-  }
 
-  if (DI.inCount > 0 && DI.outCount == 0) {
-     // If this is a block with no successors.
-    if (!isSetJmpTarget) {
-      CheckValue(!Equals(DI.inWeight,DI.BBWeight), 
-                 "ASSERT:inWeight and BBWeight do not match", &DI);
+    if (DI.inCount > 0 && DI.outCount == 0) {
+       // If this is a block with no successors.
+      if (!isSetJmpTarget) {
+        CheckValue(!Equals(DI.inWeight,DI.BBWeight), 
+                   "inWeight and BBWeight do not match", &DI);
+      }
+    } else if (DI.inCount == 0 && DI.outCount > 0) {
+      // If this is a block with no predecessors.
+      if (!isExitReachable)
+        CheckValue(!Equals(DI.BBWeight,DI.outWeight), 
+                   "BBWeight and outWeight do not match", &DI);
+    } else {
+      // If this block has successors and predecessors.
+      if (DI.inWeight > DI.outWeight && !isExitReachable)
+        CheckValue(!Equals(DI.inWeight,DI.outWeight), 
+                   "inWeight and outWeight do not match", &DI);
+      if (DI.inWeight < DI.outWeight && !isSetJmpTarget)
+        CheckValue(!Equals(DI.inWeight,DI.outWeight), 
+                   "inWeight and outWeight do not match", &DI);
     }
-  } else if (DI.inCount == 0 && DI.outCount > 0) {
-    // If this is a block with no predecessors.
-    if (!isExitReachable)
-      CheckValue(!Equals(DI.BBWeight,DI.outWeight), 
-                 "ASSERT:BBWeight and outWeight do not match", &DI);
-  } else {
-    // If this block has successors and predecessors.
-    if (DI.inWeight > DI.outWeight && !isExitReachable)
-      CheckValue(!Equals(DI.inWeight,DI.outWeight), 
-                 "ASSERT:inWeight and outWeight do not match", &DI);
-    if (DI.inWeight < DI.outWeight && !isSetJmpTarget)
-      CheckValue(!Equals(DI.inWeight,DI.outWeight), 
-                 "ASSERT:inWeight and outWeight do not match", &DI);
-  }
 
 
-  // Mark this block as visited, rescurse into successors.
-  BBisVisited.insert(BB);
-  for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB); 
-        bbi != bbe; ++bbi ) {
-    recurseBasicBlock(*bbi);
+    // Mark this block as visited, rescurse into successors.
+    BBisVisited.insert(BB);
+    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB); 
+          bbi != bbe; ++bbi ) {
+      recurseBasicBlock(*bbi);
+    }
   }
-}
 
-bool ProfileVerifierPass::runOnFunction(Function &F) {
-  PI = &getAnalysis<ProfileInfo>();
+  template<class FType, class BType>
+  bool ProfileVerifierPassT<FType, BType>::runOnFunction(FType &F) {
+    PI = getAnalysisIfAvailable<ProfileInfoT<FType, BType> >();
+    if (!PI)
+      ASSERTMESSAGE("No ProfileInfo available");
+
+    // Prepare global variables.
+    PrintedDebugTree = false;
+    BBisVisited.clear();
+
+    // Fetch entry block and recurse into it.
+    const BType *entry = &F.getEntryBlock();
+    recurseBasicBlock(entry);
+
+    if (PI->getExecutionCount(&F) != PI->getExecutionCount(entry))
+      ASSERTMESSAGE("Function count and entry block count do not match");
 
-  // Prepare global variables.
-  PrintedDebugTree = false;
-  BBisVisited.clear();
+    return false;
+  }
+
+  template<class FType, class BType>
+  char ProfileVerifierPassT<FType, BType>::ID = 0;
+}
 
-  // Fetch entry block and recurse into it.
-  const BasicBlock *entry = &F.getEntryBlock();
-  recurseBasicBlock(entry);
+static RegisterPass<ProfileVerifierPass>
+X("profile-verifier", "Verify profiling information", false, true);
 
-  if (!DisableAssertions)
-    assert((PI->getExecutionCount(&F)==PI->getExecutionCount(entry)) &&
-           "Function count and entry block count do not match");
-  return false;
+namespace llvm {
+  FunctionPass *createProfileVerifierPass() {
+    return new ProfileVerifierPass(ProfileVerifierDisableAssertions); 
+  }
 }
+
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index d674ee8..7157d47 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -357,7 +357,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
   // without the other.
   SplitAddRecs(Ops, Ty, SE);
 
-  // Decend down the pointer's type and attempt to convert the other
+  // Descend down the pointer's type and attempt to convert the other
   // operands into GEP indices, at each level. The first index in a GEP
   // indexes into the array implied by the pointer operand; the rest of
   // the indices index into the element or field type selected by the
@@ -628,7 +628,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
     BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
     BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
     BasicBlock::iterator NewInsertPt =
-      next(BasicBlock::iterator(cast<Instruction>(V)));
+      llvm::next(BasicBlock::iterator(cast<Instruction>(V)));
     while (isa<PHINode>(NewInsertPt)) ++NewInsertPt;
     V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), 0,
                       NewInsertPt);
@@ -844,7 +844,7 @@ Value *SCEVExpander::expand(const SCEV *S) {
       if (L && S->hasComputableLoopEvolution(L))
         InsertPt = L->getHeader()->getFirstNonPHI();
       while (isInsertedInstruction(InsertPt))
-        InsertPt = next(BasicBlock::iterator(InsertPt));
+        InsertPt = llvm::next(BasicBlock::iterator(InsertPt));
       break;
     }
 
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 31d3ccc..22c6e3b 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -659,7 +659,7 @@ unsigned llvm::ComputeNumSignBits(Value *V, const TargetData *TD,
   switch (Operator::getOpcode(V)) {
   default: break;
   case Instruction::SExt:
-    Tmp = TyBits-cast<IntegerType>(U->getOperand(0)->getType())->getBitWidth();
+    Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits();
     return ComputeNumSignBits(U->getOperand(0), TD, Depth+1) + Tmp;
     
   case Instruction::AShr:
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 1b7c9c6..cad1d3b 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -540,6 +540,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(arm_apcscc);
   KEYWORD(arm_aapcscc);
   KEYWORD(arm_aapcs_vfpcc);
+  KEYWORD(msp430_intrcc);
 
   KEYWORD(cc);
   KEYWORD(c);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index a92dbf8..0333eed 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -581,6 +581,37 @@ bool LLParser::ParseStandaloneMetadata() {
   return false;
 }
 
+/// ParseInlineMetadata:
+///   !{type %instr}
+///   !{...} MDNode
+///   !"foo" MDString
+bool LLParser::ParseInlineMetadata(Value *&V, PerFunctionState &PFS) {
+  assert(Lex.getKind() == lltok::Metadata && "Only for Metadata");
+  V = 0;
+
+  Lex.Lex();
+  if (Lex.getKind() == lltok::lbrace) {
+    Lex.Lex();
+    if (ParseTypeAndValue(V, PFS) ||
+        ParseToken(lltok::rbrace, "expected end of metadata node"))
+      return true;
+
+    Value *Vals[] = { V };
+    V = MDNode::get(Context, Vals, 1);
+    return false;
+  }
+
+  // Standalone metadata reference
+  // !{ ..., !42, ... }
+  if (!ParseMDNode((MetadataBase *&)V))
+    return false;
+
+  // MDString:
+  // '!' STRINGCONSTANT
+  if (ParseMDString((MetadataBase *&)V)) return true;
+  return false;
+}
+
 /// ParseAlias:
 ///   ::= GlobalVar '=' OptionalVisibility 'alias' OptionalLinkage Aliasee
 /// Aliasee
@@ -1043,6 +1074,7 @@ bool LLParser::ParseOptionalVisibility(unsigned &Res) {
 ///   ::= 'arm_apcscc'
 ///   ::= 'arm_aapcscc'
 ///   ::= 'arm_aapcs_vfpcc'
+///   ::= 'msp430_intrcc'
 ///   ::= 'cc' UINT
 ///
 bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
@@ -1056,6 +1088,7 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
   case lltok::kw_arm_apcscc:     CC = CallingConv::ARM_APCS; break;
   case lltok::kw_arm_aapcscc:    CC = CallingConv::ARM_AAPCS; break;
   case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break;
+  case lltok::kw_msp430_intrcc:  CC = CallingConv::MSP430_INTR; break;
   case lltok::kw_cc: {
       unsigned ArbitraryCC;
       Lex.Lex();
@@ -1377,15 +1410,23 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
     // Parse the argument.
     LocTy ArgLoc;
     PATypeHolder ArgTy(Type::getVoidTy(Context));
-    unsigned ArgAttrs1, ArgAttrs2;
+    unsigned ArgAttrs1 = Attribute::None;
+    unsigned ArgAttrs2 = Attribute::None;
     Value *V;
-    if (ParseType(ArgTy, ArgLoc) ||
-        ParseOptionalAttrs(ArgAttrs1, 0) ||
-        ParseValue(ArgTy, V, PFS) ||
-        // FIXME: Should not allow attributes after the argument, remove this in
-        // LLVM 3.0.
-        ParseOptionalAttrs(ArgAttrs2, 3))
+    if (ParseType(ArgTy, ArgLoc))
       return true;
+
+    if (Lex.getKind() == lltok::Metadata) {
+      if (ParseInlineMetadata(V, PFS))
+        return true;
+    } else {
+      if (ParseOptionalAttrs(ArgAttrs1, 0) ||
+          ParseValue(ArgTy, V, PFS) ||
+          // FIXME: Should not allow attributes after the argument, remove this
+          // in LLVM 3.0.
+          ParseOptionalAttrs(ArgAttrs2, 3))
+        return true;
+    }
     ArgList.push_back(ParamInfo(ArgLoc, V, ArgAttrs1|ArgAttrs2));
   }
 
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index 1112dc4..d14b1cb 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -279,7 +279,9 @@ namespace llvm {
       LocTy Loc;
       return ParseTypeAndBasicBlock(BB, Loc, PFS);
     }
-  
+
+    bool ParseInlineMetadata(Value *&V, PerFunctionState &PFS);
+
     struct ParamInfo {
       LocTy Loc;
       Value *V;
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 797c32e..1165766 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -69,6 +69,7 @@ namespace lltok {
     kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
     kw_x86_stdcallcc, kw_x86_fastcallcc,
     kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
+    kw_msp430_intrcc,
 
     kw_signext,
     kw_zeroext,
diff --git a/lib/Bitcode/Reader/Deserialize.cpp b/lib/Bitcode/Reader/Deserialize.cpp
index 67607ef..b8e720a 100644
--- a/lib/Bitcode/Reader/Deserialize.cpp
+++ b/lib/Bitcode/Reader/Deserialize.cpp
@@ -413,7 +413,7 @@ uintptr_t Deserializer::ReadInternalRefPtr() {
   return GetFinalPtr(E);
 }
 
-void Deserializer::BPEntry::SetPtr(BPNode*& FreeList, void* P) {
+void BPEntry::SetPtr(BPNode*& FreeList, void* P) {
   BPNode* Last = NULL;
   
   for (BPNode* N = Head; N != NULL; N=N->Next) {
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 8e3f8e7..bb61682 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -38,16 +38,19 @@ DebugMod("agg-antidep-debugmod",
                       cl::desc("Debug control for aggressive anti-dep breaker"),
                       cl::init(0), cl::Hidden);
 
-AggressiveAntiDepState::AggressiveAntiDepState(MachineBasicBlock *BB) :
-  GroupNodes(TargetRegisterInfo::FirstVirtualRegister, 0) {
-  // Initialize all registers to be in their own group. Initially we
-  // assign the register to the same-indexed GroupNode.
-  for (unsigned i = 0; i < TargetRegisterInfo::FirstVirtualRegister; ++i)
+AggressiveAntiDepState::AggressiveAntiDepState(const unsigned TargetRegs,
+                                               MachineBasicBlock *BB) :
+  NumTargetRegs(TargetRegs), GroupNodes(TargetRegs, 0) {
+
+  const unsigned BBSize = BB->size();
+  for (unsigned i = 0; i < NumTargetRegs; ++i) {
+    // Initialize all registers to be in their own group. Initially we
+    // assign the register to the same-indexed GroupNode.
     GroupNodeIndices[i] = i;
-
-  // Initialize the indices to indicate that no registers are live.
-  std::fill(KillIndices, array_endof(KillIndices), ~0u);
-  std::fill(DefIndices, array_endof(DefIndices), BB->size());
+    // Initialize the indices to indicate that no registers are live.
+    KillIndices[i] = ~0u;
+    DefIndices[i] = BBSize;
+  }
 }
 
 unsigned AggressiveAntiDepState::GetGroup(unsigned Reg)
@@ -64,7 +67,7 @@ void AggressiveAntiDepState::GetGroupRegs(
   std::vector<unsigned> &Regs,
   std::multimap<unsigned, AggressiveAntiDepState::RegisterReference> *RegRefs)
 {
-  for (unsigned Reg = 0; Reg != TargetRegisterInfo::FirstVirtualRegister; ++Reg) {
+  for (unsigned Reg = 0; Reg != NumTargetRegs; ++Reg) {
     if ((GetGroup(Reg) == Group) && (RegRefs->count(Reg) > 0))
       Regs.push_back(Reg);
   }
@@ -137,7 +140,7 @@ AggressiveAntiDepBreaker::~AggressiveAntiDepBreaker() {
 
 void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
   assert(State == NULL);
-  State = new AggressiveAntiDepState(BB);
+  State = new AggressiveAntiDepState(TRI->getNumRegs(), BB);
 
   bool IsReturnBlock = (!BB->empty() && BB->back().getDesc().isReturn());
   unsigned *KillIndices = State->GetKillIndices();
@@ -220,7 +223,7 @@ void AggressiveAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
   DEBUG(errs() << "\tRegs:");
 
   unsigned *DefIndices = State->GetDefIndices();
-  for (unsigned Reg = 0; Reg != TargetRegisterInfo::FirstVirtualRegister; ++Reg) {
+  for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg) {
     // If Reg is current live, then mark that it can't be renamed as
     // we don't know the extent of its live-range anymore (now that it
     // has been scheduled). If it is not live but was defined in the
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h
index 8154d2d..d385a21 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.h
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -44,6 +44,10 @@ namespace llvm {
     } RegisterReference;
 
   private:
+    /// NumTargetRegs - Number of non-virtual target registers
+    /// (i.e. TRI->getNumRegs()).
+    const unsigned NumTargetRegs;
+
     /// GroupNodes - Implements a disjoint-union data structure to
     /// form register groups. A node is represented by an index into
     /// the vector. A node can "point to" itself to indicate that it
@@ -69,7 +73,7 @@ namespace llvm {
     unsigned DefIndices[TargetRegisterInfo::FirstVirtualRegister];
 
   public:
-    AggressiveAntiDepState(MachineBasicBlock *BB);
+    AggressiveAntiDepState(const unsigned TargetRegs, MachineBasicBlock *BB);
     
     /// GetKillIndices - Return the kill indices.
     unsigned *GetKillIndices() { return KillIndices; }
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 993cdbf..44fd176 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1374,6 +1374,7 @@ void AsmPrinter::processDebugLoc(const MachineInstr *MI,
       unsigned L = DW->RecordSourceLine(CurDLT.Line, CurDLT.Col,
                                         CurDLT.Scope);
       printLabel(L);
+      O << '\n';
       DW->BeginScope(MI, L);
       PrevDLT = CurDLT;
     }
@@ -1837,15 +1838,16 @@ void AsmPrinter::EmitComments(const MachineInstr &MI) const {
 
     // Print source line info.
     O.PadToColumn(MAI->getCommentColumn());
-    O << MAI->getCommentString() << " SrcLine ";
-    if (DLT.Scope) {
-      DICompileUnit CU(DLT.Scope);
-      if (!CU.isNull())
-        O << CU.getFilename() << " ";
-    }
-    O << DLT.Line;
+    O << MAI->getCommentString() << ' ';
+    DIScope Scope(DLT.Scope);
+    // Omit the directory, because it's likely to be long and uninteresting.
+    if (!Scope.isNull())
+      O << Scope.getFilename();
+    else
+      O << "<unknown>";
+    O << ':' << DLT.Line;
     if (DLT.Col != 0)
-      O << ":" << DLT.Col;
+      O << ':' << DLT.Col;
     Newline = true;
   }
 
@@ -1857,35 +1859,40 @@ void AsmPrinter::EmitComments(const MachineInstr &MI) const {
 
   // We assume a single instruction only has a spill or reload, not
   // both.
+  const MachineMemOperand *MMO;
   if (TM.getInstrInfo()->isLoadFromStackSlotPostFE(&MI, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
+      MMO = *MI.memoperands_begin();
       if (Newline) O << '\n';
       O.PadToColumn(MAI->getCommentColumn());
-      O << MAI->getCommentString() << " Reload";
+      O << MAI->getCommentString() << ' ' << MMO->getSize() << "-byte Reload";
       Newline = true;
     }
   }
-  else if (TM.getInstrInfo()->hasLoadFromStackSlot(&MI, FI)) {
+  else if (TM.getInstrInfo()->hasLoadFromStackSlot(&MI, MMO, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
       if (Newline) O << '\n';
       O.PadToColumn(MAI->getCommentColumn());
-      O << MAI->getCommentString() << " Folded Reload";
+      O << MAI->getCommentString() << ' '
+        << MMO->getSize() << "-byte Folded Reload";
       Newline = true;
     }
   }
   else if (TM.getInstrInfo()->isStoreToStackSlotPostFE(&MI, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
+      MMO = *MI.memoperands_begin();
       if (Newline) O << '\n';
       O.PadToColumn(MAI->getCommentColumn());
-      O << MAI->getCommentString() << " Spill";
+      O << MAI->getCommentString() << ' ' << MMO->getSize() << "-byte Spill";
       Newline = true;
     }
   }
-  else if (TM.getInstrInfo()->hasStoreToStackSlot(&MI, FI)) {
+  else if (TM.getInstrInfo()->hasStoreToStackSlot(&MI, MMO, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
       if (Newline) O << '\n';
       O.PadToColumn(MAI->getCommentColumn());
-      O << MAI->getCommentString() << " Folded Spill";
+      O << MAI->getCommentString() << ' '
+        << MMO->getSize() << "-byte Folded Spill";
       Newline = true;
     }
   }
diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h
index dc6a70a..cad8b89 100644
--- a/lib/CodeGen/AsmPrinter/DIE.h
+++ b/lib/CodeGen/AsmPrinter/DIE.h
@@ -274,7 +274,7 @@ namespace llvm {
   };
 
   //===--------------------------------------------------------------------===//
-  /// DIEString - A string value DIE.
+  /// DIEString - A string value DIE. This DIE keeps string reference only.
   ///
   class DIEString : public DIEValue {
     const StringRef Str;
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index c2e1e05..c200a46 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -330,8 +330,8 @@ void DwarfDebug::addSInt(DIE *Die, unsigned Attribute,
   Die->addValue(Attribute, Form, Value);
 }
 
-/// addString - Add a string attribute data and value.
-///
+/// addString - Add a string attribute data and value. DIEString only
+/// keeps string reference. 
 void DwarfDebug::addString(DIE *Die, unsigned Attribute, unsigned Form,
                            const StringRef String) {
   DIEValue *Value = new DIEString(String);
@@ -393,7 +393,7 @@ void DwarfDebug::addSourceLine(DIE *Die, const DIVariable *V) {
     return;
 
   unsigned Line = V->getLineNumber();
-  unsigned FileID = findCompileUnit(V->getCompileUnit()).getID();
+  unsigned FileID = findCompileUnit(V->getCompileUnit())->getID();
   assert(FileID && "Invalid file id");
   addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
   addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
@@ -407,7 +407,7 @@ void DwarfDebug::addSourceLine(DIE *Die, const DIGlobal *G) {
     return;
 
   unsigned Line = G->getLineNumber();
-  unsigned FileID = findCompileUnit(G->getCompileUnit()).getID();
+  unsigned FileID = findCompileUnit(G->getCompileUnit())->getID();
   assert(FileID && "Invalid file id");
   addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
   addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
@@ -425,7 +425,7 @@ void DwarfDebug::addSourceLine(DIE *Die, const DISubprogram *SP) {
 
 
   unsigned Line = SP->getLineNumber();
-  unsigned FileID = findCompileUnit(SP->getCompileUnit()).getID();
+  unsigned FileID = findCompileUnit(SP->getCompileUnit())->getID();
   assert(FileID && "Invalid file id");
   addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
   addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
@@ -440,7 +440,7 @@ void DwarfDebug::addSourceLine(DIE *Die, const DIType *Ty) {
     return;
 
   unsigned Line = Ty->getLineNumber();
-  unsigned FileID = findCompileUnit(CU).getID();
+  unsigned FileID = findCompileUnit(CU)->getID();
   assert(FileID && "Invalid file id");
   addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
   addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
@@ -738,13 +738,49 @@ void DwarfDebug::addAddress(DIE *Die, unsigned Attribute,
   addBlock(Die, Attribute, 0, Block);
 }
 
+/// addToContextOwner - Add Die into the list of its context owner's children.
+void DwarfDebug::addToContextOwner(DIE *Die, DIDescriptor Context) {
+  if (Context.isNull())
+    ModuleCU->addDie(Die);
+  else if (Context.isType()) {
+    DIE *ContextDIE = getOrCreateTypeDIE(DIType(Context.getNode()));
+    ContextDIE->addChild(Die);
+  } else if (DIE *ContextDIE = ModuleCU->getDIE(Context.getNode()))
+    ContextDIE->addChild(Die);
+  else 
+    ModuleCU->addDie(Die);
+}
+
+/// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
+/// given DIType.
+DIE *DwarfDebug::getOrCreateTypeDIE(DIType Ty) {
+  DIE *TyDIE = ModuleCU->getDIE(Ty.getNode());
+  if (TyDIE)
+    return TyDIE;
+
+  // Create new type.
+  TyDIE = new DIE(dwarf::DW_TAG_base_type);
+  ModuleCU->insertDIE(Ty.getNode(), TyDIE);
+  if (Ty.isBasicType())
+    constructTypeDIE(*TyDIE, DIBasicType(Ty.getNode()));
+  else if (Ty.isCompositeType())
+    constructTypeDIE(*TyDIE, DICompositeType(Ty.getNode()));
+  else {
+    assert(Ty.isDerivedType() && "Unknown kind of DIType");
+    constructTypeDIE(*TyDIE, DIDerivedType(Ty.getNode()));
+  }
+
+  addToContextOwner(TyDIE, Ty.getContext());
+  return TyDIE;
+}
+
 /// addType - Add a new type attribute to the specified entity.
-void DwarfDebug::addType(CompileUnit *DW_Unit, DIE *Entity, DIType Ty) {
+void DwarfDebug::addType(DIE *Entity, DIType Ty) {
   if (Ty.isNull())
     return;
 
   // Check for pre-existence.
-  DIEEntry *Entry = DW_Unit->getDIEEntry(Ty.getNode());
+  DIEEntry *Entry = ModuleCU->getDIEEntry(Ty.getNode());
 
   // If it exists then use the existing value.
   if (Entry) {
@@ -754,36 +790,17 @@ void DwarfDebug::addType(CompileUnit *DW_Unit, DIE *Entity, DIType Ty) {
 
   // Set up proxy.
   Entry = createDIEEntry();
-  DW_Unit->insertDIEEntry(Ty.getNode(), Entry);
+  ModuleCU->insertDIEEntry(Ty.getNode(), Entry);
 
   // Construct type.
-  DIE *Buffer = new DIE(dwarf::DW_TAG_base_type);
-  if (Ty.isBasicType())
-    constructTypeDIE(DW_Unit, *Buffer, DIBasicType(Ty.getNode()));
-  else if (Ty.isCompositeType())
-    constructTypeDIE(DW_Unit, *Buffer, DICompositeType(Ty.getNode()));
-  else {
-    assert(Ty.isDerivedType() && "Unknown kind of DIType");
-    constructTypeDIE(DW_Unit, *Buffer, DIDerivedType(Ty.getNode()));
-  }
+  DIE *Buffer = getOrCreateTypeDIE(Ty);
 
-  // Add debug information entry to entity and appropriate context.
-  DIE *Die = NULL;
-  DIDescriptor Context = Ty.getContext();
-  if (!Context.isNull())
-    Die = DW_Unit->getDIE(Context.getNode());
-
-  if (Die)
-    Die->addChild(Buffer);
-  else
-    DW_Unit->addDie(Buffer);
   Entry->setEntry(Buffer);
   Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry);
 }
 
 /// constructTypeDIE - Construct basic type die from DIBasicType.
-void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
-                                  DIBasicType BTy) {
+void DwarfDebug::constructTypeDIE(DIE &Buffer, DIBasicType BTy) {
   // Get core information.
   StringRef Name = BTy.getName();
   Buffer.setTag(dwarf::DW_TAG_base_type);
@@ -798,8 +815,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
 }
 
 /// constructTypeDIE - Construct derived type die from DIDerivedType.
-void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
-                                  DIDerivedType DTy) {
+void DwarfDebug::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
   // Get core information.
   StringRef Name = DTy.getName();
   uint64_t Size = DTy.getSizeInBits() >> 3;
@@ -812,7 +828,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
 
   // Map to main type, void will not have a type.
   DIType FromTy = DTy.getTypeDerivedFrom();
-  addType(DW_Unit, &Buffer, FromTy);
+  addType(&Buffer, FromTy);
 
   // Add name if not anonymous or intermediate type.
   if (!Name.empty())
@@ -828,8 +844,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
 }
 
 /// constructTypeDIE - Construct type DIE from DICompositeType.
-void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
-                                  DICompositeType CTy) {
+void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
   // Get core information.
   StringRef Name = CTy.getName();
 
@@ -840,7 +855,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
   switch (Tag) {
   case dwarf::DW_TAG_vector_type:
   case dwarf::DW_TAG_array_type:
-    constructArrayTypeDIE(DW_Unit, Buffer, &CTy);
+    constructArrayTypeDIE(Buffer, &CTy);
     break;
   case dwarf::DW_TAG_enumeration_type: {
     DIArray Elements = CTy.getTypeArray();
@@ -850,7 +865,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
       DIE *ElemDie = NULL;
       DIEnumerator Enum(Elements.getElement(i).getNode());
       if (!Enum.isNull()) {
-        ElemDie = constructEnumTypeDIE(DW_Unit, &Enum);
+        ElemDie = constructEnumTypeDIE(&Enum);
         Buffer.addChild(ElemDie);
       }
     }
@@ -860,7 +875,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
     // Add return type.
     DIArray Elements = CTy.getTypeArray();
     DIDescriptor RTy = Elements.getElement(0);
-    addType(DW_Unit, &Buffer, DIType(RTy.getNode()));
+    addType(&Buffer, DIType(RTy.getNode()));
 
     // Add prototype flag.
     addUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
@@ -869,7 +884,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
     for (unsigned i = 1, N = Elements.getNumElements(); i < N; ++i) {
       DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
       DIDescriptor Ty = Elements.getElement(i);
-      addType(DW_Unit, Arg, DIType(Ty.getNode()));
+      addType(Arg, DIType(Ty.getNode()));
       Buffer.addChild(Arg);
     }
   }
@@ -891,11 +906,9 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
         continue;
       DIE *ElemDie = NULL;
       if (Element.getTag() == dwarf::DW_TAG_subprogram)
-        ElemDie = createSubprogramDIE(DW_Unit,
-                                      DISubprogram(Element.getNode()));
+        ElemDie = createSubprogramDIE(DISubprogram(Element.getNode()));
       else
-        ElemDie = createMemberDIE(DW_Unit,
-                                  DIDerivedType(Element.getNode()));
+        ElemDie = createMemberDIE(DIDerivedType(Element.getNode()));
       Buffer.addChild(ElemDie);
     }
 
@@ -944,33 +957,32 @@ void DwarfDebug::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){
   addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy);
   if (L)
     addSInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, L);
-  if (H)
-    addSInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, H);
+  addSInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, H);
 
   Buffer.addChild(DW_Subrange);
 }
 
 /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-void DwarfDebug::constructArrayTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+void DwarfDebug::constructArrayTypeDIE(DIE &Buffer,
                                        DICompositeType *CTy) {
   Buffer.setTag(dwarf::DW_TAG_array_type);
   if (CTy->getTag() == dwarf::DW_TAG_vector_type)
     addUInt(&Buffer, dwarf::DW_AT_GNU_vector, dwarf::DW_FORM_flag, 1);
 
   // Emit derived type.
-  addType(DW_Unit, &Buffer, CTy->getTypeDerivedFrom());
+  addType(&Buffer, CTy->getTypeDerivedFrom());
   DIArray Elements = CTy->getTypeArray();
 
   // Get an anonymous type for index type.
-  DIE *IdxTy = DW_Unit->getIndexTyDie();
+  DIE *IdxTy = ModuleCU->getIndexTyDie();
   if (!IdxTy) {
     // Construct an anonymous type for index type.
     IdxTy = new DIE(dwarf::DW_TAG_base_type);
     addUInt(IdxTy, dwarf::DW_AT_byte_size, 0, sizeof(int32_t));
     addUInt(IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
             dwarf::DW_ATE_signed);
-    DW_Unit->addDie(IdxTy);
-    DW_Unit->setIndexTyDie(IdxTy);
+    ModuleCU->addDie(IdxTy);
+    ModuleCU->setIndexTyDie(IdxTy);
   }
 
   // Add subranges to array type.
@@ -982,7 +994,7 @@ void DwarfDebug::constructArrayTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
 }
 
 /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-DIE *DwarfDebug::constructEnumTypeDIE(CompileUnit *DW_Unit, DIEnumerator *ETy) {
+DIE *DwarfDebug::constructEnumTypeDIE(DIEnumerator *ETy) {
   DIE *Enumerator = new DIE(dwarf::DW_TAG_enumerator);
   StringRef Name = ETy->getName();
   addString(Enumerator, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
@@ -992,8 +1004,7 @@ DIE *DwarfDebug::constructEnumTypeDIE(CompileUnit *DW_Unit, DIEnumerator *ETy) {
 }
 
 /// createGlobalVariableDIE - Create new DIE using GV.
-DIE *DwarfDebug::createGlobalVariableDIE(CompileUnit *DW_Unit,
-                                         const DIGlobalVariable &GV) {
+DIE *DwarfDebug::createGlobalVariableDIE(const DIGlobalVariable &GV) {
   // If the global variable was optmized out then no need to create debug info
   // entry.
   if (!GV.getGlobal()) return NULL;
@@ -1014,7 +1025,7 @@ DIE *DwarfDebug::createGlobalVariableDIE(CompileUnit *DW_Unit,
     addString(GVDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
               LinkageName);
   }
-  addType(DW_Unit, GVDie, GV.getType());
+  addType(GVDie, GV.getType());
   if (!GV.isLocalToUnit())
     addUInt(GVDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
   addSourceLine(GVDie, &GV);
@@ -1030,13 +1041,13 @@ DIE *DwarfDebug::createGlobalVariableDIE(CompileUnit *DW_Unit,
 }
 
 /// createMemberDIE - Create new member DIE.
-DIE *DwarfDebug::createMemberDIE(CompileUnit *DW_Unit, const DIDerivedType &DT){
+DIE *DwarfDebug::createMemberDIE(const DIDerivedType &DT) {
   DIE *MemberDie = new DIE(DT.getTag());
   StringRef Name = DT.getName();
   if (!Name.empty())
     addString(MemberDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
   
-  addType(DW_Unit, MemberDie, DT.getTypeDerivedFrom());
+  addType(MemberDie, DT.getTypeDerivedFrom());
 
   addSourceLine(MemberDie, &DT);
 
@@ -1073,21 +1084,27 @@ DIE *DwarfDebug::createMemberDIE(CompileUnit *DW_Unit, const DIDerivedType &DT){
   addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, MemLocationDie);
 
   if (DT.isProtected())
-    addUInt(MemberDie, dwarf::DW_AT_accessibility, 0,
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
             dwarf::DW_ACCESS_protected);
   else if (DT.isPrivate())
-    addUInt(MemberDie, dwarf::DW_AT_accessibility, 0,
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
             dwarf::DW_ACCESS_private);
-
+  else if (DT.getTag() == dwarf::DW_TAG_inheritance)
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+            dwarf::DW_ACCESS_public);
+  if (DT.isVirtual())
+    addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag,
+            dwarf::DW_VIRTUALITY_virtual);
   return MemberDie;
 }
 
 /// createSubprogramDIE - Create new DIE using SP.
-DIE *DwarfDebug::createSubprogramDIE(CompileUnit *DW_Unit,
-                                     const DISubprogram &SP,
-                                     bool IsConstructor,
-                                     bool IsInlined) {
-  DIE *SPDie = new DIE(dwarf::DW_TAG_subprogram);
+DIE *DwarfDebug::createSubprogramDIE(const DISubprogram &SP, bool MakeDecl) {
+  DIE *SPDie = ModuleCU->getDIE(SP.getNode());
+  if (SPDie)
+    return SPDie;
+
+  SPDie = new DIE(dwarf::DW_TAG_subprogram);
   addString(SPDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, SP.getName());
 
   StringRef LinkageName = SP.getLinkageName();
@@ -1103,9 +1120,6 @@ DIE *DwarfDebug::createSubprogramDIE(CompileUnit *DW_Unit,
   }
   addSourceLine(SPDie, &SP);
 
-  DICompositeType SPTy = SP.getType();
-  DIArray Args = SPTy.getTypeArray();
-
   // Add prototyped tag, if C or ObjC.
   unsigned Lang = SP.getCompileUnit().getLanguage();
   if (Lang == dwarf::DW_LANG_C99 || Lang == dwarf::DW_LANG_C89 ||
@@ -1113,98 +1127,56 @@ DIE *DwarfDebug::createSubprogramDIE(CompileUnit *DW_Unit,
     addUInt(SPDie, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
 
   // Add Return Type.
+  DICompositeType SPTy = SP.getType();
+  DIArray Args = SPTy.getTypeArray();
   unsigned SPTag = SPTy.getTag();
-  if (!IsConstructor) {
-    if (Args.isNull() || SPTag != dwarf::DW_TAG_subroutine_type)
-      addType(DW_Unit, SPDie, SPTy);
-    else
-      addType(DW_Unit, SPDie, DIType(Args.getElement(0).getNode()));
+
+  if (Args.isNull() || SPTag != dwarf::DW_TAG_subroutine_type)
+    addType(SPDie, SPTy);
+  else
+    addType(SPDie, DIType(Args.getElement(0).getNode()));
+
+  unsigned VK = SP.getVirtuality();
+  if (VK) {
+    addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag, VK);
+    DIEBlock *Block = new DIEBlock();
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addUInt(Block, 0, dwarf::DW_FORM_data1, SP.getVirtualIndex());
+    addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block);
+    ContainingTypeMap.insert(std::make_pair(SPDie, WeakVH(SP.getContainingType().getNode())));
   }
 
-  if (!SP.isDefinition()) {
+  if (MakeDecl || !SP.isDefinition()) {
     addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
 
     // Add arguments. Do not add arguments for subprogram definition. They will
-    // be handled through RecordVariable.
+    // be handled while processing variables.
+    DICompositeType SPTy = SP.getType();
+    DIArray Args = SPTy.getTypeArray();
+    unsigned SPTag = SPTy.getTag();
+
     if (SPTag == dwarf::DW_TAG_subroutine_type)
       for (unsigned i = 1, N =  Args.getNumElements(); i < N; ++i) {
         DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
-        addType(DW_Unit, Arg, DIType(Args.getElement(i).getNode()));
+        addType(Arg, DIType(Args.getElement(i).getNode()));
         addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); // ??
         SPDie->addChild(Arg);
       }
   }
 
   // DW_TAG_inlined_subroutine may refer to this DIE.
-  DW_Unit->insertDIE(SP.getNode(), SPDie);
+  ModuleCU->insertDIE(SP.getNode(), SPDie);
   return SPDie;
 }
 
 /// findCompileUnit - Get the compile unit for the given descriptor.
 ///
-CompileUnit &DwarfDebug::findCompileUnit(DICompileUnit Unit) const {
+CompileUnit *DwarfDebug::findCompileUnit(DICompileUnit Unit) {
   DenseMap<Value *, CompileUnit *>::const_iterator I =
     CompileUnitMap.find(Unit.getNode());
-  assert(I != CompileUnitMap.end() && "Missing compile unit.");
-  return *I->second;
-}
-
-/// createDbgScopeVariable - Create a new scope variable.
-///
-DIE *DwarfDebug::createDbgScopeVariable(DbgVariable *DV, CompileUnit *Unit) {
-  // Get the descriptor.
-  const DIVariable &VD = DV->getVariable();
-  StringRef Name = VD.getName();
-  if (Name.empty())
-    return NULL;
-
-  // Translate tag to proper Dwarf tag.  The result variable is dropped for
-  // now.
-  unsigned Tag;
-  switch (VD.getTag()) {
-  case dwarf::DW_TAG_return_variable:
-    return NULL;
-  case dwarf::DW_TAG_arg_variable:
-    Tag = dwarf::DW_TAG_formal_parameter;
-    break;
-  case dwarf::DW_TAG_auto_variable:    // fall thru
-  default:
-    Tag = dwarf::DW_TAG_variable;
-    break;
-  }
-
-  // Define variable debug information entry.
-  DIE *VariableDie = new DIE(Tag);
-  addString(VariableDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
-
-  // Add source line info if available.
-  addSourceLine(VariableDie, &VD);
-
-  // Add variable type.
-  // FIXME: isBlockByrefVariable should be reformulated in terms of complex
-  // addresses instead.
-  if (VD.isBlockByrefVariable())
-    addType(Unit, VariableDie, getBlockByrefType(VD.getType(), Name));
-  else
-    addType(Unit, VariableDie, VD.getType());
-
-  // Add variable address.
-  // Variables for abstract instances of inlined functions don't get a
-  // location.
-  MachineLocation Location;
-  unsigned FrameReg;
-  int Offset = RI->getFrameIndexReference(*MF, DV->getFrameIndex(), FrameReg);
-  Location.set(FrameReg, Offset);
-
-
-  if (VD.hasComplexAddress())
-    addComplexAddress(DV, VariableDie, dwarf::DW_AT_location, Location);
-  else if (VD.isBlockByrefVariable())
-    addBlockByrefAddress(DV, VariableDie, dwarf::DW_AT_location, Location);
-  else
-    addAddress(VariableDie, dwarf::DW_AT_location, Location);
-
-  return VariableDie;
+  if (I == CompileUnitMap.end())
+    return constructCompileUnit(Unit.getNode());
+  return I->second;
 }
 
 /// getUpdatedDbgScope - Find or create DbgScope assicated with the instruction.
@@ -1295,6 +1267,28 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(MDNode *SPNode) {
 
  DIE *SPDie = ModuleCU->getDIE(SPNode);
  assert (SPDie && "Unable to find subprogram DIE!");
+ DISubprogram SP(SPNode);
+ if (SP.isDefinition() && !SP.getContext().isCompileUnit()) {
+   addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+  // Add arguments. 
+   DICompositeType SPTy = SP.getType();
+   DIArray Args = SPTy.getTypeArray();
+   unsigned SPTag = SPTy.getTag();
+   if (SPTag == dwarf::DW_TAG_subroutine_type)
+     for (unsigned i = 1, N =  Args.getNumElements(); i < N; ++i) {
+       DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+       addType(Arg, DIType(Args.getElement(i).getNode()));
+       addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); // ??
+       SPDie->addChild(Arg);
+     }
+   DIE *SPDeclDie = SPDie;
+   SPDie = new DIE(dwarf::DW_TAG_subprogram);
+   addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4, 
+               SPDeclDie);
+   
+   ModuleCU->addDie(SPDie);
+ }
+   
  addLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
           DWLabel("func_begin", SubprogramCount));
  addLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
@@ -1305,19 +1299,6 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(MDNode *SPNode) {
  if (!DISubprogram(SPNode).isLocalToUnit())
    addUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
 
- // If there are global variables at this scope then add their dies.
- for (SmallVector<WeakVH, 4>::iterator SGI = ScopedGVs.begin(),
-        SGE = ScopedGVs.end(); SGI != SGE; ++SGI) {
-   MDNode *N = dyn_cast_or_null<MDNode>(*SGI);
-   if (!N) continue;
-   DIGlobalVariable GV(N);
-   if (GV.getContext().getNode() == SPNode) {
-     DIE *ScopedGVDie = createGlobalVariableDIE(ModuleCU, GV);
-     if (ScopedGVDie)
-       SPDie->addChild(ScopedGVDie);
-   }
- }
- 
  return SPDie;
 }
 
@@ -1401,8 +1382,7 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) {
 
 
 /// constructVariableDIE - Construct a DIE for the given DbgVariable.
-DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV,
-                                      DbgScope *Scope, CompileUnit *Unit) {
+DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
   // Get the descriptor.
   const DIVariable &VD = DV->getVariable();
   StringRef Name = VD.getName();
@@ -1451,9 +1431,9 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV,
     // FIXME: isBlockByrefVariable should be reformulated in terms of complex
     // addresses instead.
     if (VD.isBlockByrefVariable())
-      addType(Unit, VariableDie, getBlockByrefType(VD.getType(), Name));
+      addType(VariableDie, getBlockByrefType(VD.getType(), Name));
     else
-      addType(Unit, VariableDie, VD.getType());
+      addType(VariableDie, VD.getType());
   }
 
   // Add variable address.
@@ -1522,7 +1502,7 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
   // Add variables to scope.
   SmallVector<DbgVariable *, 8> &Variables = Scope->getVariables();
   for (unsigned i = 0, N = Variables.size(); i < N; ++i) {
-    DIE *VariableDIE = constructVariableDIE(Variables[i], Scope, ModuleCU);
+    DIE *VariableDIE = constructVariableDIE(Variables[i], Scope);
     if (VariableDIE)
       ScopeDIE->addChild(VariableDIE);
   }
@@ -1579,7 +1559,7 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef DirName, StringRef FileName)
   return SrcId;
 }
 
-void DwarfDebug::constructCompileUnit(MDNode *N) {
+CompileUnit *DwarfDebug::constructCompileUnit(MDNode *N) {
   DICompileUnit DIUnit(N);
   StringRef FN = DIUnit.getFilename();
   StringRef Dir = DIUnit.getDirectory();
@@ -1618,6 +1598,7 @@ void DwarfDebug::constructCompileUnit(MDNode *N) {
 
   CompileUnitMap[DIUnit.getNode()] = Unit;
   CompileUnits.push_back(Unit);
+  return Unit;
 }
 
 void DwarfDebug::constructGlobalVariableDIE(MDNode *N) {
@@ -1631,14 +1612,16 @@ void DwarfDebug::constructGlobalVariableDIE(MDNode *N) {
   if (ModuleCU->getDIE(DI_GV.getNode()))
     return;
 
-  DIE *VariableDie = createGlobalVariableDIE(ModuleCU, DI_GV);
+  DIE *VariableDie = createGlobalVariableDIE(DI_GV);
+  if (!VariableDie)
+    return;
 
   // Add to map.
   ModuleCU->insertDIE(N, VariableDie);
 
   // Add to context owner.
-  ModuleCU->getCUDie()->addChild(VariableDie);
-
+  addToContextOwner(VariableDie, DI_GV.getContext());
+  
   // Expose as global. FIXME - need to check external flag.
   ModuleCU->addGlobal(DI_GV.getName(), VariableDie);
 
@@ -1663,13 +1646,15 @@ void DwarfDebug::constructSubprogramDIE(MDNode *N) {
     // class type.
     return;
 
-  DIE *SubprogramDie = createSubprogramDIE(ModuleCU, SP);
+  DIE *SubprogramDie = createSubprogramDIE(SP);
 
   // Add to map.
   ModuleCU->insertDIE(N, SubprogramDie);
 
   // Add to context owner.
-  ModuleCU->getCUDie()->addChild(SubprogramDie);
+  if (SP.getContext().getNode() == SP.getCompileUnit().getNode())
+    if (TopLevelDIEs.insert(SubprogramDie))
+      TopLevelDIEsVector.push_back(SubprogramDie);
 
   // Expose as global.
   ModuleCU->addGlobal(SP.getName(), SubprogramDie);
@@ -1709,21 +1694,16 @@ void DwarfDebug::beginModule(Module *M, MachineModuleInfo *mmi) {
   if (!ModuleCU)
     ModuleCU = CompileUnits[0];
 
-  // Create DIEs for each of the externally visible global variables.
-  for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
-         E = DbgFinder.global_variable_end(); I != E; ++I) {
-    DIGlobalVariable GV(*I);
-    if (GV.getContext().getNode() != GV.getCompileUnit().getNode())
-      ScopedGVs.push_back(*I);
-    else
-      constructGlobalVariableDIE(*I);
-  }
-
   // Create DIEs for each subprogram.
   for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
          E = DbgFinder.subprogram_end(); I != E; ++I)
     constructSubprogramDIE(*I);
 
+  // Create DIEs for each global variable.
+  for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
+         E = DbgFinder.global_variable_end(); I != E; ++I)
+    constructGlobalVariableDIE(*I);
+
   MMI = mmi;
   shouldEmit = true;
   MMI->setDebugInfoAvailability(true);
@@ -1770,6 +1750,22 @@ void DwarfDebug::endModule() {
     addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
   }
 
+  // Insert top level DIEs.
+  for (SmallVector<DIE *, 4>::iterator TI = TopLevelDIEsVector.begin(),
+         TE = TopLevelDIEsVector.end(); TI != TE; ++TI)
+    ModuleCU->getCUDie()->addChild(*TI);
+
+  for (DenseMap<DIE *, WeakVH>::iterator CI = ContainingTypeMap.begin(),
+         CE = ContainingTypeMap.end(); CI != CE; ++CI) {
+    DIE *SPDie = CI->first;
+    MDNode *N = dyn_cast_or_null<MDNode>(CI->second);
+    if (!N) continue;
+    DIE *NDie = ModuleCU->getDIE(N);
+    if (!NDie) continue;
+    addDIEEntry(SPDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie);
+    addDIEEntry(NDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie);
+  }
+
   // Standard sections final addresses.
   Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering().getTextSection());
   EmitLabel("text_end", 0);
@@ -1898,6 +1894,7 @@ void DwarfDebug::endScope(const MachineInstr *MI) {
 
   unsigned Label = MMI->NextLabelID();
   Asm->printLabel(Label);
+  O << '\n';
 
   SmallVector<DbgScope *, 2> &SD = I->second;
   for (SmallVector<DbgScope *, 2>::iterator SDI = SD.begin(), SDE = SD.end();
@@ -2092,17 +2089,15 @@ void DwarfDebug::endFunction(MachineFunction *MF) {
                                                MMI->getFrameMoves()));
 
   // Clear debug info
-  if (CurrentFnDbgScope) {
-    CurrentFnDbgScope = NULL;
-    DbgScopeMap.clear();
-    DbgScopeBeginMap.clear();
-    DbgScopeEndMap.clear();
-    ConcreteScopes.clear();
-    AbstractScopesList.clear();
-  }
+  CurrentFnDbgScope = NULL;
+  DbgScopeMap.clear();
+  DbgScopeBeginMap.clear();
+  DbgScopeEndMap.clear();
+  ConcreteScopes.clear();
+  AbstractScopesList.clear();
 
   Lines.clear();
-
+  
   if (TimePassesIsEnabled)
     DebugTimer->stopTimer();
 }
@@ -2337,13 +2332,16 @@ void DwarfDebug::emitDIE(DIE *Die) {
   }
 }
 
-/// emitDebugInfo / emitDebugInfoPerCU - Emit the debug info section.
+/// emitDebugInfo - Emit the debug info section.
 ///
-void DwarfDebug::emitDebugInfoPerCU(CompileUnit *Unit) {
-  DIE *Die = Unit->getCUDie();
+void DwarfDebug::emitDebugInfo() {
+  // Start debug info section.
+  Asm->OutStreamer.SwitchSection(
+                            Asm->getObjFileLowering().getDwarfInfoSection());
+  DIE *Die = ModuleCU->getCUDie();
 
   // Emit the compile units header.
-  EmitLabel("info_begin", Unit->getID());
+  EmitLabel("info_begin", ModuleCU->getID());
 
   // Emit size of content not including length itself
   unsigned ContentSize = Die->getSize() +
@@ -2364,17 +2362,10 @@ void DwarfDebug::emitDebugInfoPerCU(CompileUnit *Unit) {
   Asm->EmitInt8(0); Asm->EOL("Extra Pad For GDB");
   Asm->EmitInt8(0); Asm->EOL("Extra Pad For GDB");
   Asm->EmitInt8(0); Asm->EOL("Extra Pad For GDB");
-  EmitLabel("info_end", Unit->getID());
+  EmitLabel("info_end", ModuleCU->getID());
 
   Asm->EOL();
-}
-
-void DwarfDebug::emitDebugInfo() {
-  // Start debug info section.
-  Asm->OutStreamer.SwitchSection(
-                            Asm->getObjFileLowering().getDwarfInfoSection());
 
-  emitDebugInfoPerCU(ModuleCU);
 }
 
 /// emitAbbreviations - Emit the abbreviation section.
@@ -2534,9 +2525,9 @@ void DwarfDebug::emitDebugLines() {
         std::pair<unsigned, unsigned> SourceID =
           getSourceDirectoryAndFileIds(LineInfo.getSourceID());
         O << '\t' << MAI->getCommentString() << ' '
-          << getSourceDirectoryName(SourceID.first) << ' '
+          << getSourceDirectoryName(SourceID.first) << '/'
           << getSourceFileName(SourceID.second)
-          <<" :" << utostr_32(LineInfo.getLine()) << '\n';
+          << ':' << utostr_32(LineInfo.getLine()) << '\n';
       }
 
       // Define the line address.
@@ -2672,24 +2663,30 @@ DwarfDebug::emitFunctionDebugFrame(const FunctionDebugFrameInfo&DebugFrameInfo){
   Asm->EOL();
 }
 
-void DwarfDebug::emitDebugPubNamesPerCU(CompileUnit *Unit) {
-  EmitDifference("pubnames_end", Unit->getID(),
-                 "pubnames_begin", Unit->getID(), true);
+/// emitDebugPubNames - Emit visible names into a debug pubnames section.
+///
+void DwarfDebug::emitDebugPubNames() {
+  // Start the dwarf pubnames section.
+  Asm->OutStreamer.SwitchSection(
+                          Asm->getObjFileLowering().getDwarfPubNamesSection());
+
+  EmitDifference("pubnames_end", ModuleCU->getID(),
+                 "pubnames_begin", ModuleCU->getID(), true);
   Asm->EOL("Length of Public Names Info");
 
-  EmitLabel("pubnames_begin", Unit->getID());
+  EmitLabel("pubnames_begin", ModuleCU->getID());
 
   Asm->EmitInt16(dwarf::DWARF_VERSION); Asm->EOL("DWARF Version");
 
   EmitSectionOffset("info_begin", "section_info",
-                    Unit->getID(), 0, true, false);
+                    ModuleCU->getID(), 0, true, false);
   Asm->EOL("Offset of Compilation Unit Info");
 
-  EmitDifference("info_end", Unit->getID(), "info_begin", Unit->getID(),
+  EmitDifference("info_end", ModuleCU->getID(), "info_begin", ModuleCU->getID(),
                  true);
   Asm->EOL("Compilation Unit Length");
 
-  const StringMap<DIE*> &Globals = Unit->getGlobals();
+  const StringMap<DIE*> &Globals = ModuleCU->getGlobals();
   for (StringMap<DIE*>::const_iterator
          GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
     const char *Name = GI->getKeyData();
@@ -2700,21 +2697,11 @@ void DwarfDebug::emitDebugPubNamesPerCU(CompileUnit *Unit) {
   }
 
   Asm->EmitInt32(0); Asm->EOL("End Mark");
-  EmitLabel("pubnames_end", Unit->getID());
+  EmitLabel("pubnames_end", ModuleCU->getID());
 
   Asm->EOL();
 }
 
-/// emitDebugPubNames - Emit visible names into a debug pubnames section.
-///
-void DwarfDebug::emitDebugPubNames() {
-  // Start the dwarf pubnames section.
-  Asm->OutStreamer.SwitchSection(
-                          Asm->getObjFileLowering().getDwarfPubNamesSection());
-
-  emitDebugPubNamesPerCU(ModuleCU);
-}
-
 void DwarfDebug::emitDebugPubTypes() {
   // Start the dwarf pubnames section.
   Asm->OutStreamer.SwitchSection(
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 679d9b9..12ad322 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -154,12 +154,14 @@ class DwarfDebug : public Dwarf {
   /// (at the end of the module) as DW_AT_inline.
   SmallPtrSet<DIE *, 4> InlinedSubprogramDIEs;
 
+  DenseMap<DIE *, WeakVH> ContainingTypeMap;
+
   /// AbstractSubprogramDIEs - Collection of abstruct subprogram DIEs.
   SmallPtrSet<DIE *, 4> AbstractSubprogramDIEs;
 
-  /// ScopedGVs - Tracks global variables that are not at file scope.
-  /// For example void f() { static int b = 42; }
-  SmallVector<WeakVH, 4> ScopedGVs;
+  /// TopLevelDIEs - Collection of top level DIEs. 
+  SmallPtrSet<DIE *, 4> TopLevelDIEs;
+  SmallVector<DIE *, 4> TopLevelDIEsVector;
 
   typedef SmallVector<DbgScope *, 2> ScopeVector;
   typedef DenseMap<const MachineInstr *, ScopeVector>
@@ -307,53 +309,52 @@ class DwarfDebug : public Dwarf {
   void addBlockByrefAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
                             const MachineLocation &Location);
 
+  /// addToContextOwner - Add Die into the list of its context owner's children.
+  void addToContextOwner(DIE *Die, DIDescriptor Context);
+
   /// addType - Add a new type attribute to the specified entity.
-  void addType(CompileUnit *DW_Unit, DIE *Entity, DIType Ty);
+  void addType(DIE *Entity, DIType Ty);
+
+  /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
+  /// given DIType.
+  DIE *getOrCreateTypeDIE(DIType Ty);
 
   void addPubTypes(DISubprogram SP);
 
   /// constructTypeDIE - Construct basic type die from DIBasicType.
-  void constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+  void constructTypeDIE(DIE &Buffer,
                         DIBasicType BTy);
 
   /// constructTypeDIE - Construct derived type die from DIDerivedType.
-  void constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+  void constructTypeDIE(DIE &Buffer,
                         DIDerivedType DTy);
 
   /// constructTypeDIE - Construct type DIE from DICompositeType.
-  void constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+  void constructTypeDIE(DIE &Buffer,
                         DICompositeType CTy);
 
   /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
   void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy);
 
   /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-  void constructArrayTypeDIE(CompileUnit *DW_Unit, DIE &Buffer, 
+  void constructArrayTypeDIE(DIE &Buffer, 
                              DICompositeType *CTy);
 
   /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-  DIE *constructEnumTypeDIE(CompileUnit *DW_Unit, DIEnumerator *ETy);
+  DIE *constructEnumTypeDIE(DIEnumerator *ETy);
 
   /// createGlobalVariableDIE - Create new DIE using GV.
-  DIE *createGlobalVariableDIE(CompileUnit *DW_Unit,
-                               const DIGlobalVariable &GV);
+  DIE *createGlobalVariableDIE(const DIGlobalVariable &GV);
 
   /// createMemberDIE - Create new member DIE.
-  DIE *createMemberDIE(CompileUnit *DW_Unit, const DIDerivedType &DT);
+  DIE *createMemberDIE(const DIDerivedType &DT);
 
   /// createSubprogramDIE - Create new DIE using SP.
-  DIE *createSubprogramDIE(CompileUnit *DW_Unit,
-                           const DISubprogram &SP,
-                           bool IsConstructor = false,
-                           bool IsInlined = false);
+  DIE *createSubprogramDIE(const DISubprogram &SP, bool MakeDecl = false);
 
   /// findCompileUnit - Get the compile unit for the given descriptor. 
   ///
-  CompileUnit &findCompileUnit(DICompileUnit Unit) const;
-
-  /// createDbgScopeVariable - Create a new scope variable.
-  ///
-  DIE *createDbgScopeVariable(DbgVariable *DV, CompileUnit *Unit);
+  CompileUnit *findCompileUnit(DICompileUnit Unit);
 
   /// getUpdatedDbgScope - Find or create DbgScope assicated with 
   /// the instruction. Initialize scope and update scope hierarchy.
@@ -384,7 +385,7 @@ class DwarfDebug : public Dwarf {
   DIE *constructInlinedScopeDIE(DbgScope *Scope);
 
   /// constructVariableDIE - Construct a DIE for the given DbgVariable.
-  DIE *constructVariableDIE(DbgVariable *DV, DbgScope *S, CompileUnit *Unit);
+  DIE *constructVariableDIE(DbgVariable *DV, DbgScope *S);
 
   /// constructScopeDIE - Construct a DIE for this scope.
   DIE *constructScopeDIE(DbgScope *Scope);
@@ -405,10 +406,8 @@ class DwarfDebug : public Dwarf {
   ///
   void computeSizeAndOffsets();
 
-  /// EmitDebugInfo / emitDebugInfoPerCU - Emit the debug info section.
+  /// EmitDebugInfo - Emit the debug info section.
   ///
-  void emitDebugInfoPerCU(CompileUnit *Unit);
-
   void emitDebugInfo();
 
   /// emitAbbreviations - Emit the abbreviation section.
@@ -432,8 +431,6 @@ class DwarfDebug : public Dwarf {
   /// section.
   void emitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo);
 
-  void emitDebugPubNamesPerCU(CompileUnit *Unit);
-
   /// emitDebugPubNames - Emit visible names into a debug pubnames section.
   ///
   void emitDebugPubNames();
@@ -488,7 +485,7 @@ class DwarfDebug : public Dwarf {
   /// as well.
   unsigned GetOrCreateSourceID(StringRef DirName, StringRef FileName);
 
-  void constructCompileUnit(MDNode *N);
+  CompileUnit *constructCompileUnit(MDNode *N);
 
   void constructGlobalVariableDIE(MDNode *N);
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/DwarfException.cpp
index 1c8b8f4..3fd077f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfException.cpp
@@ -292,14 +292,13 @@ void DwarfException::EmitFDE(const FunctionEHFrameInfo &EHFrameInfo) {
       Asm->EmitULEB128Bytes(is4Byte ? 4 : 8);
       Asm->EOL("Augmentation size");
 
+      // We force 32-bits here because we've encoded our LSDA in the CIE with
+      // `dwarf::DW_EH_PE_sdata4'. And the CIE and FDE should agree.
       if (EHFrameInfo.hasLandingPads)
-        EmitReference("exception", EHFrameInfo.Number, true, false);
-      else {
-        if (is4Byte)
-          Asm->EmitInt32((int)0);
-        else
-          Asm->EmitInt64((int)0);
-      }
+        EmitReference("exception", EHFrameInfo.Number, true, true);
+      else
+        Asm->EmitInt32((int)0);
+
       Asm->EOL("Language Specific Data Area");
     } else {
       Asm->EmitULEB128Bytes(0);
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index aff1665..aa01c5b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -119,7 +119,6 @@ class DwarfException : public Dwarf {
     static inline unsigned getTombstoneKey() { return -2U; }
     static unsigned getHashValue(const unsigned &Key) { return Key; }
     static bool isEqual(unsigned LHS, unsigned RHS) { return LHS == RHS; }
-    static bool isPod() { return true; }
   };
 
   /// PadRange - Structure holding a try-range and the associated landing pad.
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 8a62eb2..3887e6d 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -427,7 +427,7 @@ static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
 static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,
                     const TargetInstrInfo *TII) {
   MachineFunction *MF = CurMBB->getParent();
-  MachineFunction::iterator I = next(MachineFunction::iterator(CurMBB));
+  MachineFunction::iterator I = llvm::next(MachineFunction::iterator(CurMBB));
   MachineBasicBlock *TBB = 0, *FBB = 0;
   SmallVector<MachineOperand, 4> Cond;
   if (I != MF->end() &&
@@ -805,7 +805,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
   // a compile-time infinite loop repeatedly doing and undoing the same
   // transformations.)
 
-  for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
+  for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
        I != E; ++I) {
     if (I->pred_size() >= 2 && I->pred_size() < TailMergeThreshold) {
       SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
@@ -833,7 +833,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
               continue;
             // This is the QBB case described above
             if (!FBB)
-              FBB = next(MachineFunction::iterator(PBB));
+              FBB = llvm::next(MachineFunction::iterator(PBB));
           }
           // Failing case:  the only way IBB can be reached from PBB is via
           // exception handling.  Happens for landing pads.  Would be nice
@@ -1140,7 +1140,7 @@ ReoptimizeBlock:
       // falls through into MBB and we can't understand the prior block's branch
       // condition.
       if (MBB->empty()) {
-        bool PredHasNoFallThrough = TII->BlockHasNoFallThrough(PrevBB);
+        bool PredHasNoFallThrough = !PrevBB.canFallThrough();
         if (PredHasNoFallThrough || !PriorUnAnalyzable ||
             !PrevBB.isSuccessor(MBB)) {
           // If the prior block falls through into us, turn it into an
@@ -1239,7 +1239,7 @@ ReoptimizeBlock:
           // B elsewhere
           // next:
           if (CurFallsThru) {
-            MachineBasicBlock *NextBB = next(MachineFunction::iterator(MBB));
+            MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
             CurCond.clear();
             TII->InsertBranch(*MBB, NextBB, 0, CurCond);
           }
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 6f86614..7a969f0 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_library(LLVMCodeGen
   AggressiveAntiDepBreaker.cpp
   BranchFolding.cpp
+  CalcSpillWeights.cpp
   CodePlacementOpt.cpp
   CriticalAntiDepBreaker.cpp
   DeadMachineInstructionElim.cpp
@@ -35,7 +36,9 @@ add_llvm_library(LLVMCodeGen
   MachinePassRegistry.cpp
   MachineRegisterInfo.cpp
   MachineSink.cpp
+  MachineSSAUpdater.cpp
   MachineVerifier.cpp
+  MaxStackAlignment.cpp
   ObjectCodeEmitter.cpp
   OcamlGC.cpp
   PHIElimination.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
new file mode 100644
index 0000000..dcffb8a2
--- /dev/null
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -0,0 +1,154 @@
+//===------------------------ CalcSpillWeights.cpp ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "calcspillweights"
+
+#include "llvm/Function.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+char CalculateSpillWeights::ID = 0;
+static RegisterPass<CalculateSpillWeights> X("calcspillweights",
+                                             "Calculate spill weights");
+
+void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const {
+  au.addRequired<LiveIntervals>();
+  au.addRequired<MachineLoopInfo>();
+  au.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(au);
+}
+
+bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &fn) {
+
+  DEBUG(errs() << "********** Compute Spill Weights **********\n"
+               << "********** Function: "
+               << fn.getFunction()->getName() << '\n');
+
+  LiveIntervals *lis = &getAnalysis<LiveIntervals>();
+  MachineLoopInfo *loopInfo = &getAnalysis<MachineLoopInfo>();
+  const TargetInstrInfo *tii = fn.getTarget().getInstrInfo();
+  MachineRegisterInfo *mri = &fn.getRegInfo();
+
+  SmallSet<unsigned, 4> processed;
+  for (MachineFunction::iterator mbbi = fn.begin(), mbbe = fn.end();
+       mbbi != mbbe; ++mbbi) {
+    MachineBasicBlock* mbb = mbbi;
+    SlotIndex mbbEnd = lis->getMBBEndIdx(mbb);
+    MachineLoop* loop = loopInfo->getLoopFor(mbb);
+    unsigned loopDepth = loop ? loop->getLoopDepth() : 0;
+    bool isExiting = loop ? loop->isLoopExiting(mbb) : false;
+
+    for (MachineBasicBlock::const_iterator mii = mbb->begin(), mie = mbb->end();
+         mii != mie; ++mii) {
+      const MachineInstr *mi = mii;
+      if (tii->isIdentityCopy(*mi))
+        continue;
+
+      if (mi->getOpcode() == TargetInstrInfo::IMPLICIT_DEF)
+        continue;
+
+      for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
+        const MachineOperand &mopi = mi->getOperand(i);
+        if (!mopi.isReg() || mopi.getReg() == 0)
+          continue;
+        unsigned reg = mopi.getReg();
+        if (!TargetRegisterInfo::isVirtualRegister(mopi.getReg()))
+          continue;
+        // Multiple uses of reg by the same instruction. It should not
+        // contribute to spill weight again.
+        if (!processed.insert(reg))
+          continue;
+
+        bool hasDef = mopi.isDef();
+        bool hasUse = !hasDef;
+        for (unsigned j = i+1; j != e; ++j) {
+          const MachineOperand &mopj = mi->getOperand(j);
+          if (!mopj.isReg() || mopj.getReg() != reg)
+            continue;
+          hasDef |= mopj.isDef();
+          hasUse |= mopj.isUse();
+          if (hasDef && hasUse)
+            break;
+        }
+
+        LiveInterval &regInt = lis->getInterval(reg);
+        float weight = lis->getSpillWeight(hasDef, hasUse, loopDepth);
+        if (hasDef && isExiting) {
+          // Looks like this is a loop count variable update.
+          SlotIndex defIdx = lis->getInstructionIndex(mi).getDefIndex();
+          const LiveRange *dlr =
+            lis->getInterval(reg).getLiveRangeContaining(defIdx);
+          if (dlr->end > mbbEnd)
+            weight *= 3.0F;
+        }
+        regInt.weight += weight;
+      }
+      processed.clear();
+    }
+  }
+
+  for (LiveIntervals::iterator I = lis->begin(), E = lis->end(); I != E; ++I) {
+    LiveInterval &li = *I->second;
+    if (TargetRegisterInfo::isVirtualRegister(li.reg)) {
+      // If the live interval length is essentially zero, i.e. in every live
+      // range the use follows def immediately, it doesn't make sense to spill
+      // it and hope it will be easier to allocate for this li.
+      if (isZeroLengthInterval(&li)) {
+        li.weight = HUGE_VALF;
+        continue;
+      }
+
+      bool isLoad = false;
+      SmallVector<LiveInterval*, 4> spillIs;
+      if (lis->isReMaterializable(li, spillIs, isLoad)) {
+        // If all of the definitions of the interval are re-materializable,
+        // it is a preferred candidate for spilling. If non of the defs are
+        // loads, then it's potentially very cheap to re-materialize.
+        // FIXME: this gets much more complicated once we support non-trivial
+        // re-materialization.
+        if (isLoad)
+          li.weight *= 0.9F;
+        else
+          li.weight *= 0.5F;
+      }
+
+      // Slightly prefer live interval that has been assigned a preferred reg.
+      std::pair<unsigned, unsigned> Hint = mri->getRegAllocationHint(li.reg);
+      if (Hint.first || Hint.second)
+        li.weight *= 1.01F;
+
+      // Divide the weight of the interval by its size.  This encourages
+      // spilling of intervals that are large and have few uses, and
+      // discourages spilling of small intervals with many uses.
+      li.weight /= lis->getApproximateInstructionCount(li) * SlotIndex::NUM;
+    }
+  }
+  
+  return false;
+}
+
+/// Returns true if the given live interval is zero length.
+bool CalculateSpillWeights::isZeroLengthInterval(LiveInterval *li) const {
+  for (LiveInterval::Ranges::const_iterator
+       i = li->ranges.begin(), e = li->ranges.end(); i != e; ++i)
+    if (i->end.getPrevIndex() > i->start)
+      return false;
+  return true;
+}
diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp
index e9844d8..ff71f6b 100644
--- a/lib/CodeGen/CodePlacementOpt.cpp
+++ b/lib/CodeGen/CodePlacementOpt.cpp
@@ -182,7 +182,7 @@ bool CodePlacementOpt::EliminateUnconditionalJumpsToTop(MachineFunction &MF,
       // Move it and all the blocks that can reach it via fallthrough edges
       // exclusively, to keep existing fallthrough edges intact.
       MachineFunction::iterator Begin = Pred;
-      MachineFunction::iterator End = next(Begin);
+      MachineFunction::iterator End = llvm::next(Begin);
       while (Begin != MF.begin()) {
         MachineFunction::iterator Prior = prior(Begin);
         if (Prior == MF.begin())
@@ -255,7 +255,8 @@ bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF,
   // to the top of the loop to avoid loosing that fallthrough. Otherwise append
   // them to the bottom, even if it previously had a fallthrough, on the theory
   // that it's worth an extra branch to keep the loop contiguous.
-  MachineFunction::iterator InsertPt = next(MachineFunction::iterator(BotMBB));
+  MachineFunction::iterator InsertPt =
+    llvm::next(MachineFunction::iterator(BotMBB));
   bool InsertAtTop = false;
   if (TopMBB != MF.begin() &&
       !HasFallthrough(prior(MachineFunction::iterator(TopMBB))) &&
@@ -268,7 +269,7 @@ bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF,
   // with the loop header.
   SmallPtrSet<MachineBasicBlock *, 8> ContiguousBlocks;
   for (MachineFunction::iterator I = TopMBB,
-       E = next(MachineFunction::iterator(BotMBB)); I != E; ++I)
+       E = llvm::next(MachineFunction::iterator(BotMBB)); I != E; ++I)
     ContiguousBlocks.insert(I);
 
   // Find non-contigous blocks and fix them.
@@ -301,7 +302,7 @@ bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF,
       // Process this block and all loop blocks contiguous with it, to keep
       // them in their relative order.
       MachineFunction::iterator Begin = BB;
-      MachineFunction::iterator End = next(MachineFunction::iterator(BB));
+      MachineFunction::iterator End = llvm::next(MachineFunction::iterator(BB));
       for (; End != MF.end(); ++End) {
         if (!L->contains(End)) break;
         if (!HasAnalyzableTerminator(End)) break;
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 1b39fec..3c7961c2 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -43,8 +43,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
             static_cast<const TargetRegisterClass *>(0));
 
   // Initialize the indices to indicate that no registers are live.
-  std::fill(KillIndices, array_endof(KillIndices), ~0u);
-  std::fill(DefIndices, array_endof(DefIndices), BB->size());
+  const unsigned BBSize = BB->size();
+  for (unsigned i = 0; i < TRI->getNumRegs(); ++i) {
+    KillIndices[i] = ~0u;
+    DefIndices[i] = BBSize;
+  }
 
   // Clear "do not change" set.
   KeepRegs.clear();
@@ -122,7 +125,7 @@ void CriticalAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
   // may have been rescheduled and its lifetime may overlap with registers
   // in ways not reflected in our current liveness state. For each such
   // register, adjust the liveness state to be conservatively correct.
-  for (unsigned Reg = 0; Reg != TargetRegisterInfo::FirstVirtualRegister; ++Reg)
+  for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg)
     if (DefIndices[Reg] < InsertPosIndex && DefIndices[Reg] >= Count) {
       assert(KillIndices[Reg] == ~0u && "Clobbered register is live!");
       // Mark this register to be non-renamable.
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 242cba5..297dd31 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -74,6 +74,9 @@ EnableFastISelOption("fast-isel", cl::Hidden,
 static cl::opt<bool> EnableSplitGEPGVN("split-gep-gvn", cl::Hidden,
     cl::desc("Split GEPs and run no-load GVN"));
 
+static cl::opt<bool> PreAllocTailDup("pre-regalloc-taildup", cl::Hidden,
+    cl::desc("Pre-register allocation tail duplication"));
+
 LLVMTargetMachine::LLVMTargetMachine(const Target &T,
                                      const std::string &TargetTriple)
   : TargetMachine(T) {
@@ -302,6 +305,13 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
                    /* allowDoubleDefs= */ true);
   }
 
+  // Pre-ra tail duplication.
+  if (OptLevel != CodeGenOpt::None &&
+      !DisableTailDuplicate && PreAllocTailDup) {
+    PM.add(createTailDuplicatePass(true));
+    printAndVerify(PM, "After Pre-RegAlloc TailDuplicate");
+  }
+
   // Run pre-ra passes.
   if (addPreRegAlloc(PM, OptLevel))
     printAndVerify(PM, "After PreRegAlloc passes",
@@ -348,7 +358,7 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
 
   // Tail duplication.
   if (OptLevel != CodeGenOpt::None && !DisableTailDuplicate) {
-    PM.add(createTailDuplicatePass());
+    PM.add(createTailDuplicatePass(false));
     printAndVerify(PM, "After TailDuplicate");
   }
 
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 8d632cb..cc286aa 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -847,7 +847,7 @@ void LiveInterval::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
       if (vni->isUnused()) {
         OS << "x";
       } else {
-        if (!vni->isDefAccurate())
+        if (!vni->isDefAccurate() && !vni->isPHIDef())
           OS << "?";
         else
           OS << vni->def;
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 24adf36..8806439f 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -149,44 +149,69 @@ void LiveIntervals::dumpInstrs() const {
   printInstrs(errs());
 }
 
-/// conflictsWithPhysRegDef - Returns true if the specified register
-/// is defined during the duration of the specified interval.
-bool LiveIntervals::conflictsWithPhysRegDef(const LiveInterval &li,
-                                            VirtRegMap &vrm, unsigned reg) {
-  for (LiveInterval::Ranges::const_iterator
-         I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) {
-    for (SlotIndex index = I->start.getBaseIndex(),
-           end = I->end.getPrevSlot().getBaseIndex().getNextIndex();
-           index != end;
-           index = index.getNextIndex()) {
-      // skip deleted instructions
-      while (index != end && !getInstructionFromIndex(index))
-        index = index.getNextIndex();
-      if (index == end) break;
+bool LiveIntervals::conflictsWithPhysReg(const LiveInterval &li,
+                                         VirtRegMap &vrm, unsigned reg) {
+  // We don't handle fancy stuff crossing basic block boundaries
+  if (li.ranges.size() != 1)
+    return true;
+  const LiveRange &range = li.ranges.front();
+  SlotIndex idx = range.start.getBaseIndex();
+  SlotIndex end = range.end.getPrevSlot().getBaseIndex().getNextIndex();
+
+  // Skip deleted instructions
+  MachineInstr *firstMI = getInstructionFromIndex(idx);
+  while (!firstMI && idx != end) {
+    idx = idx.getNextIndex();
+    firstMI = getInstructionFromIndex(idx);
+  }
+  if (!firstMI)
+    return false;
 
-      MachineInstr *MI = getInstructionFromIndex(index);
-      unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
-      if (tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg))
-        if (SrcReg == li.reg || DstReg == li.reg)
-          continue;
-      for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
-        MachineOperand& mop = MI->getOperand(i);
-        if (!mop.isReg())
-          continue;
-        unsigned PhysReg = mop.getReg();
-        if (PhysReg == 0 || PhysReg == li.reg)
+  // Find last instruction in range
+  SlotIndex lastIdx = end.getPrevIndex();
+  MachineInstr *lastMI = getInstructionFromIndex(lastIdx);
+  while (!lastMI && lastIdx != idx) {
+    lastIdx = lastIdx.getPrevIndex();
+    lastMI = getInstructionFromIndex(lastIdx);
+  }
+  if (!lastMI)
+    return false;
+
+  // Range cannot cross basic block boundaries or terminators
+  MachineBasicBlock *MBB = firstMI->getParent();
+  if (MBB != lastMI->getParent() || lastMI->getDesc().isTerminator())
+    return true;
+
+  MachineBasicBlock::const_iterator E = lastMI;
+  ++E;
+  for (MachineBasicBlock::const_iterator I = firstMI; I != E; ++I) {
+    const MachineInstr &MI = *I;
+
+    // Allow copies to and from li.reg
+    unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+    if (tii_->isMoveInstr(MI, SrcReg, DstReg, SrcSubReg, DstSubReg))
+      if (SrcReg == li.reg || DstReg == li.reg)
+        continue;
+
+    // Check for operands using reg
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e;  ++i) {
+      const MachineOperand& mop = MI.getOperand(i);
+      if (!mop.isReg())
+        continue;
+      unsigned PhysReg = mop.getReg();
+      if (PhysReg == 0 || PhysReg == li.reg)
+        continue;
+      if (TargetRegisterInfo::isVirtualRegister(PhysReg)) {
+        if (!vrm.hasPhys(PhysReg))
           continue;
-        if (TargetRegisterInfo::isVirtualRegister(PhysReg)) {
-          if (!vrm.hasPhys(PhysReg))
-            continue;
-          PhysReg = vrm.getPhys(PhysReg);
-        }
-        if (PhysReg && tri_->regsOverlap(PhysReg, reg))
-          return true;
+        PhysReg = vrm.getPhys(PhysReg);
       }
+      if (PhysReg && tri_->regsOverlap(PhysReg, reg))
+        return true;
     }
   }
 
+  // No conflicts found.
   return false;
 }
 
@@ -201,15 +226,9 @@ bool LiveIntervals::conflictsWithPhysRegRef(LiveInterval &li,
            end = I->end.getPrevSlot().getBaseIndex().getNextIndex();
            index != end;
            index = index.getNextIndex()) {
-      // Skip deleted instructions.
-      MachineInstr *MI = 0;
-      while (index != end) {
-        MI = getInstructionFromIndex(index);
-        if (MI)
-          break;
-        index = index.getNextIndex();
-      }
-      if (index == end) break;
+      MachineInstr *MI = getInstructionFromIndex(index);
+      if (!MI)
+        continue;               // skip deleted instructions
 
       if (JoinedCopies.count(MI))
         continue;
@@ -374,8 +393,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
       // Value#0 is now defined by the 2-addr instruction.
       OldValNo->def  = RedefIndex;
       OldValNo->setCopy(0);
-      if (MO.isEarlyClobber())
-        OldValNo->setHasRedefByEC(true);
       
       // Add the new live interval which replaces the range for the input copy.
       LiveRange LR(DefIndex, RedefIndex, ValNo);
@@ -411,7 +428,7 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
         interval.removeRange(Start, End);        
         assert(interval.ranges.size() == 1 &&
                "Newly discovered PHI interval has >1 ranges.");
-        MachineBasicBlock *killMBB = getMBBFromIndex(interval.endIndex());
+        MachineBasicBlock *killMBB = getMBBFromIndex(VNI->def);
         VNI->addKill(indexes_->getTerminatorGap(killMBB));
         VNI->setHasPHIKill(true);
         DEBUG({
@@ -422,7 +439,7 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
         // Replace the interval with one of a NEW value number.  Note that this
         // value number isn't actually defined by an instruction, weird huh? :)
         LiveRange LR(Start, End,
-                     interval.getNextValue(SlotIndex(getMBBStartIdx(mbb), true),
+                     interval.getNextValue(SlotIndex(getMBBStartIdx(Killer->getParent()), true),
                        0, false, VNInfoAllocator));
         LR.valno->setIsPHIDef(true);
         DEBUG(errs() << " replace range with " << LR);
@@ -513,8 +530,6 @@ void LiveIntervals::handlePhysicalRegisterDef(MachineBasicBlock *MBB,
         if (mi->isRegTiedToUseOperand(DefIdx)) {
           // Two-address instruction.
           end = baseIndex.getDefIndex();
-          assert(!mi->getOperand(DefIdx).isEarlyClobber() &&
-                 "Two address instruction is an early clobber?"); 
         } else {
           // Another instruction redefines the register before it is ever read.
           // Then the register is essentially dead at the instruction that defines
@@ -730,8 +745,16 @@ unsigned LiveIntervals::getVNInfoSourceReg(const VNInfo *VNI) const {
   if (VNI->getCopy()->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
     // If it's extracting out of a physical register, return the sub-register.
     unsigned Reg = VNI->getCopy()->getOperand(1).getReg();
-    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      unsigned SrcSubReg = VNI->getCopy()->getOperand(2).getImm();
+      unsigned DstSubReg = VNI->getCopy()->getOperand(0).getSubReg();
+      if (SrcSubReg == DstSubReg)
+        // %reg1034:3<def> = EXTRACT_SUBREG %EDX, 3
+        // reg1034 can still be coalesced to EDX.
+        return Reg;
+      assert(DstSubReg == 0);
       Reg = tri_->getSubReg(Reg, VNI->getCopy()->getOperand(2).getImm());
+    }
     return Reg;
   } else if (VNI->getCopy()->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
              VNI->getCopy()->getOpcode() == TargetInstrInfo::SUBREG_TO_REG)
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 68f80ac..3c88e37 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -720,6 +720,51 @@ bool LiveVariables::VarInfo::isLiveIn(const MachineBasicBlock &MBB,
   return findKill(&MBB);
 }
 
+bool LiveVariables::isLiveOut(unsigned Reg, const MachineBasicBlock &MBB) {
+  LiveVariables::VarInfo &VI = getVarInfo(Reg);
+
+  // Loop over all of the successors of the basic block, checking to see if
+  // the value is either live in the block, or if it is killed in the block.
+  std::vector<MachineBasicBlock*> OpSuccBlocks;
+  for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
+         E = MBB.succ_end(); SI != E; ++SI) {
+    MachineBasicBlock *SuccMBB = *SI;
+
+    // Is it alive in this successor?
+    unsigned SuccIdx = SuccMBB->getNumber();
+    if (VI.AliveBlocks.test(SuccIdx))
+      return true;
+    OpSuccBlocks.push_back(SuccMBB);
+  }
+
+  // Check to see if this value is live because there is a use in a successor
+  // that kills it.
+  switch (OpSuccBlocks.size()) {
+  case 1: {
+    MachineBasicBlock *SuccMBB = OpSuccBlocks[0];
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (VI.Kills[i]->getParent() == SuccMBB)
+        return true;
+    break;
+  }
+  case 2: {
+    MachineBasicBlock *SuccMBB1 = OpSuccBlocks[0], *SuccMBB2 = OpSuccBlocks[1];
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (VI.Kills[i]->getParent() == SuccMBB1 ||
+          VI.Kills[i]->getParent() == SuccMBB2)
+        return true;
+    break;
+  }
+  default:
+    std::sort(OpSuccBlocks.begin(), OpSuccBlocks.end());
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (std::binary_search(OpSuccBlocks.begin(), OpSuccBlocks.end(),
+                             VI.Kills[i]->getParent()))
+        return true;
+  }
+  return false;
+}
+
 /// addNewBlock - Add a new basic block BB as an empty succcessor to DomBB. All
 /// variables that are live out of DomBB will be marked as passing live through
 /// BB.
diff --git a/lib/CodeGen/LowerSubregs.cpp b/lib/CodeGen/LowerSubregs.cpp
index 30636a8..80eb6cd 100644
--- a/lib/CodeGen/LowerSubregs.cpp
+++ b/lib/CodeGen/LowerSubregs.cpp
@@ -312,7 +312,7 @@ bool LowerSubregsInstructionPass::runOnMachineFunction(MachineFunction &MF) {
        mbbi != mbbe; ++mbbi) {
     for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
          mi != me;) {
-      MachineBasicBlock::iterator nmi = next(mi);
+      MachineBasicBlock::iterator nmi = llvm::next(mi);
       MachineInstr *MI = mi;
       if (MI->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
         MadeChange |= LowerExtract(MI);
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index e55e369..a58286d 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -290,7 +290,7 @@ void MachineBasicBlock::updateTerminator() {
     } else {
       // The block has a fallthrough conditional branch.
       MachineBasicBlock *MBBA = *succ_begin();
-      MachineBasicBlock *MBBB = *next(succ_begin());
+      MachineBasicBlock *MBBB = *llvm::next(succ_begin());
       if (MBBA == TBB) std::swap(MBBB, MBBA);
       if (isLayoutSuccessor(TBB)) {
         if (TII->ReverseBranchCondition(Cond)) {
@@ -359,15 +359,10 @@ bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const {
 
 bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {
   MachineFunction::const_iterator I(this);
-  return next(I) == MachineFunction::const_iterator(MBB);
+  return llvm::next(I) == MachineFunction::const_iterator(MBB);
 }
 
 bool MachineBasicBlock::canFallThrough() {
-  MachineBasicBlock *TBB = 0, *FBB = 0;
-  SmallVector<MachineOperand, 4> Cond;
-  const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo();
-  bool BranchUnAnalyzable = TII->AnalyzeBranch(*this, TBB, FBB, Cond, true);
-
   MachineFunction::iterator Fallthrough = this;
   ++Fallthrough;
   // If FallthroughBlock is off the end of the function, it can't fall through.
@@ -378,16 +373,21 @@ bool MachineBasicBlock::canFallThrough() {
   if (!isSuccessor(Fallthrough))
     return false;
 
-  // If we couldn't analyze the branch, examine the last instruction.
-  // If the block doesn't end in a known control barrier, assume fallthrough
-  // is possible. The isPredicable check is needed because this code can be
-  // called during IfConversion, where an instruction which is normally a
-  // Barrier is predicated and thus no longer an actual control barrier. This
-  // is over-conservative though, because if an instruction isn't actually
-  // predicated we could still treat it like a barrier.
-  if (BranchUnAnalyzable)
+  // Analyze the branches, if any, at the end of the block.
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo();
+  if (TII->AnalyzeBranch(*this, TBB, FBB, Cond, true)) {
+    // If we couldn't analyze the branch, examine the last instruction.
+    // If the block doesn't end in a known control barrier, assume fallthrough
+    // is possible. The isPredicable check is needed because this code can be
+    // called during IfConversion, where an instruction which is normally a
+    // Barrier is predicated and thus no longer an actual control barrier. This
+    // is over-conservative though, because if an instruction isn't actually
+    // predicated we could still treat it like a barrier.
     return empty() || !back().getDesc().isBarrier() ||
            back().getDesc().isPredicable();
+  }
 
   // If there is no branch, control always falls through.
   if (TBB == 0) return true;
@@ -461,18 +461,19 @@ bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA,
   bool MadeChange = false;
   bool AddedFallThrough = false;
 
-  MachineFunction::iterator FallThru = next(MachineFunction::iterator(this));
+  MachineFunction::iterator FallThru =
+    llvm::next(MachineFunction::iterator(this));
   
-  // If this block ends with a conditional branch that falls through to its
-  // successor, set DestB as the successor.
   if (isCond) {
+    // If this block ends with a conditional branch that falls through to its
+    // successor, set DestB as the successor.
     if (DestB == 0 && FallThru != getParent()->end()) {
       DestB = FallThru;
       AddedFallThrough = true;
     }
   } else {
     // If this is an unconditional branch with no explicit dest, it must just be
-    // a fallthrough into DestB.
+    // a fallthrough into DestA.
     if (DestA == 0 && FallThru != getParent()->end()) {
       DestA = FallThru;
       AddedFallThrough = true;
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index d20f446..dd6fd7e 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -328,7 +328,7 @@ void MachineFunction::print(raw_ostream &OS) const {
       if (I->second)
         OS << " in reg%" << I->second;
 
-      if (next(I) != E)
+      if (llvm::next(I) != E)
         OS << ", ";
     }
     OS << '\n';
@@ -342,7 +342,7 @@ void MachineFunction::print(raw_ostream &OS) const {
       else
         OS << "%physreg" << *I;
 
-      if (next(I) != E)
+      if (llvm::next(I) != E)
         OS << " ";
     }
     OS << '\n';
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index f73a5a3..12b974d 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -1058,6 +1058,22 @@ bool MachineInstr::isInvariantLoad(AliasAnalysis *AA) const {
   return true;
 }
 
+/// isConstantValuePHI - If the specified instruction is a PHI that always
+/// merges together the same virtual register, return the register, otherwise
+/// return 0.
+unsigned MachineInstr::isConstantValuePHI() const {
+  if (getOpcode() != TargetInstrInfo::PHI)
+    return 0;
+  assert(getNumOperands() >= 3 &&
+         "It's illegal to have a PHI without source operands");
+
+  unsigned Reg = getOperand(1).getReg();
+  for (unsigned i = 3, e = getNumOperands(); i < e; i += 2)
+    if (getOperand(i).getReg() != Reg)
+      return 0;
+  return Reg;
+}
+
 void MachineInstr::dump() const {
   errs() << "  " << *this;
 }
@@ -1150,9 +1166,14 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
     DebugLocTuple DLT = MF->getDebugLocTuple(debugLoc);
     DIScope Scope(DLT.Scope);
     OS << " dbg:";
+    // Omit the directory, since it's usually long and uninteresting.
     if (!Scope.isNull())
-      OS << Scope.getDirectory() << ':' << Scope.getFilename() << ':';
-    OS << DLT.Line << ":" << DLT.Col;
+      OS << Scope.getFilename();
+    else
+      OS << "<unknown>";
+    OS << ':' << DLT.Line;
+    if (DLT.Col != 0)
+      OS << ':' << DLT.Col;
   }
 
   OS << "\n";
diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp
index db77d19..63f4f18 100644
--- a/lib/CodeGen/MachineLoopInfo.cpp
+++ b/lib/CodeGen/MachineLoopInfo.cpp
@@ -62,11 +62,11 @@ MachineBasicBlock *MachineLoop::getBottomBlock() {
   MachineBasicBlock *BotMBB = getHeader();
   MachineFunction::iterator End = BotMBB->getParent()->end();
   if (BotMBB != prior(End)) {
-    MachineBasicBlock *NextMBB = next(MachineFunction::iterator(BotMBB));
+    MachineBasicBlock *NextMBB = llvm::next(MachineFunction::iterator(BotMBB));
     while (contains(NextMBB)) {
       BotMBB = NextMBB;
-      if (BotMBB == next(MachineFunction::iterator(BotMBB))) break;
-      NextMBB = next(MachineFunction::iterator(BotMBB));
+      if (BotMBB == llvm::next(MachineFunction::iterator(BotMBB))) break;
+      NextMBB = llvm::next(MachineFunction::iterator(BotMBB));
     }
   }
   return BotMBB;
diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp
new file mode 100644
index 0000000..292096f
--- /dev/null
+++ b/lib/CodeGen/MachineSSAUpdater.cpp
@@ -0,0 +1,393 @@
+//===- MachineSSAUpdater.cpp - Unstructured SSA Update Tool ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MachineSSAUpdater class. It's based on SSAUpdater
+// class in lib/Transforms/Utils.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+typedef DenseMap<MachineBasicBlock*, unsigned> AvailableValsTy;
+typedef std::vector<std::pair<MachineBasicBlock*, unsigned> >
+                IncomingPredInfoTy;
+
+static AvailableValsTy &getAvailableVals(void *AV) {
+  return *static_cast<AvailableValsTy*>(AV);
+}
+
+static IncomingPredInfoTy &getIncomingPredInfo(void *IPI) {
+  return *static_cast<IncomingPredInfoTy*>(IPI);
+}
+
+
+MachineSSAUpdater::MachineSSAUpdater(MachineFunction &MF,
+                                     SmallVectorImpl<MachineInstr*> *NewPHI)
+  : AV(0), IPI(0), InsertedPHIs(NewPHI) {
+  TII = MF.getTarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
+}
+
+MachineSSAUpdater::~MachineSSAUpdater() {
+  delete &getAvailableVals(AV);
+  delete &getIncomingPredInfo(IPI);
+}
+
+/// Initialize - Reset this object to get ready for a new set of SSA
+/// updates.  ProtoValue is the value used to name PHI nodes.
+void MachineSSAUpdater::Initialize(unsigned V) {
+  if (AV == 0)
+    AV = new AvailableValsTy();
+  else
+    getAvailableVals(AV).clear();
+
+  if (IPI == 0)
+    IPI = new IncomingPredInfoTy();
+  else
+    getIncomingPredInfo(IPI).clear();
+
+  VR = V;
+  VRC = MRI->getRegClass(VR);
+}
+
+/// HasValueForBlock - Return true if the MachineSSAUpdater already has a value for
+/// the specified block.
+bool MachineSSAUpdater::HasValueForBlock(MachineBasicBlock *BB) const {
+  return getAvailableVals(AV).count(BB);
+}
+
+/// AddAvailableValue - Indicate that a rewritten value is available in the
+/// specified block with the specified value.
+void MachineSSAUpdater::AddAvailableValue(MachineBasicBlock *BB, unsigned V) {
+  getAvailableVals(AV)[BB] = V;
+}
+
+/// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is
+/// live at the end of the specified block.
+unsigned MachineSSAUpdater::GetValueAtEndOfBlock(MachineBasicBlock *BB) {
+  return GetValueAtEndOfBlockInternal(BB);
+}
+
+static
+unsigned LookForIdenticalPHI(MachineBasicBlock *BB,
+          SmallVector<std::pair<MachineBasicBlock*, unsigned>, 8> &PredValues) {
+  if (BB->empty())
+    return 0;
+
+  MachineBasicBlock::iterator I = BB->front();
+  if (I->getOpcode() != TargetInstrInfo::PHI)
+    return 0;
+
+  AvailableValsTy AVals;
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+    AVals[PredValues[i].first] = PredValues[i].second;
+  while (I != BB->end() && I->getOpcode() == TargetInstrInfo::PHI) {
+    bool Same = true;
+    for (unsigned i = 1, e = I->getNumOperands(); i != e; i += 2) {
+      unsigned SrcReg = I->getOperand(i).getReg();
+      MachineBasicBlock *SrcBB = I->getOperand(i+1).getMBB();
+      if (AVals[SrcBB] != SrcReg) {
+        Same = false;
+        break;
+      }
+    }
+    if (Same)
+      return I->getOperand(0).getReg();
+    ++I;
+  }
+  return 0;
+}
+
+/// InsertNewDef - Insert an empty PHI or IMPLICIT_DEF instruction which define
+/// a value of the given register class at the start of the specified basic
+/// block. It returns the virtual register defined by the instruction.
+static
+MachineInstr *InsertNewDef(unsigned Opcode,
+                           MachineBasicBlock *BB, MachineBasicBlock::iterator I,
+                           const TargetRegisterClass *RC,
+                           MachineRegisterInfo *MRI, const TargetInstrInfo *TII) {
+  unsigned NewVR = MRI->createVirtualRegister(RC);
+  return BuildMI(*BB, I, DebugLoc::getUnknownLoc(), TII->get(Opcode), NewVR);
+}
+                          
+/// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that
+/// is live in the middle of the specified block.
+///
+/// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one
+/// important case: if there is a definition of the rewritten value after the
+/// 'use' in BB.  Consider code like this:
+///
+///      X1 = ...
+///   SomeBB:
+///      use(X)
+///      X2 = ...
+///      br Cond, SomeBB, OutBB
+///
+/// In this case, there are two values (X1 and X2) added to the AvailableVals
+/// set by the client of the rewriter, and those values are both live out of
+/// their respective blocks.  However, the use of X happens in the *middle* of
+/// a block.  Because of this, we need to insert a new PHI node in SomeBB to
+/// merge the appropriate values, and this value isn't live out of the block.
+///
+unsigned MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) {
+  // If there is no definition of the renamed variable in this block, just use
+  // GetValueAtEndOfBlock to do our work.
+  if (!getAvailableVals(AV).count(BB))
+    return GetValueAtEndOfBlockInternal(BB);
+
+  // If there are no predecessors, just return undef.
+  if (BB->pred_empty()) {
+    // Insert an implicit_def to represent an undef value.
+    MachineInstr *NewDef = InsertNewDef(TargetInstrInfo::IMPLICIT_DEF,
+                                        BB, BB->getFirstTerminator(),
+                                        VRC, MRI, TII);
+    return NewDef->getOperand(0).getReg();
+  }
+
+  // Otherwise, we have the hard case.  Get the live-in values for each
+  // predecessor.
+  SmallVector<std::pair<MachineBasicBlock*, unsigned>, 8> PredValues;
+  unsigned SingularValue = 0;
+
+  bool isFirstPred = true;
+  for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+         E = BB->pred_end(); PI != E; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+    unsigned PredVal = GetValueAtEndOfBlockInternal(PredBB);
+    PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+    // Compute SingularValue.
+    if (isFirstPred) {
+      SingularValue = PredVal;
+      isFirstPred = false;
+    } else if (PredVal != SingularValue)
+      SingularValue = 0;
+  }
+
+  // Otherwise, if all the merged values are the same, just use it.
+  if (SingularValue != 0)
+    return SingularValue;
+
+  // If an identical PHI is already in BB, just reuse it.
+  unsigned DupPHI = LookForIdenticalPHI(BB, PredValues);
+  if (DupPHI)
+    return DupPHI;
+
+  // Otherwise, we do need a PHI: insert one now.
+  MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->front();
+  MachineInstr *InsertedPHI = InsertNewDef(TargetInstrInfo::PHI, BB,
+                                           Loc, VRC, MRI, TII);
+
+  // Fill in all the predecessors of the PHI.
+  MachineInstrBuilder MIB(InsertedPHI);
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+    MIB.addReg(PredValues[i].second).addMBB(PredValues[i].first);
+
+  // See if the PHI node can be merged to a single value.  This can happen in
+  // loop cases when we get a PHI of itself and one other value.
+  if (unsigned ConstVal = InsertedPHI->isConstantValuePHI()) {
+    InsertedPHI->eraseFromParent();
+    return ConstVal;
+  }
+
+  // If the client wants to know about all new instructions, tell it.
+  if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+
+  DEBUG(errs() << "  Inserted PHI: " << *InsertedPHI << "\n");
+  return InsertedPHI->getOperand(0).getReg();
+}
+
+static
+MachineBasicBlock *findCorrespondingPred(const MachineInstr *MI,
+                                         MachineOperand *U) {
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+    if (&MI->getOperand(i) == U)
+      return MI->getOperand(i+1).getMBB();
+  }
+
+  llvm_unreachable("MachineOperand::getParent() failure?");
+  return 0;
+}
+
+/// RewriteUse - Rewrite a use of the symbolic value.  This handles PHI nodes,
+/// which use their value in the corresponding predecessor.
+void MachineSSAUpdater::RewriteUse(MachineOperand &U) {
+  MachineInstr *UseMI = U.getParent();
+  unsigned NewVR = 0;
+  if (UseMI->getOpcode() == TargetInstrInfo::PHI) {
+    MachineBasicBlock *SourceBB = findCorrespondingPred(UseMI, &U);
+    NewVR = GetValueAtEndOfBlockInternal(SourceBB);
+  } else {
+    NewVR = GetValueInMiddleOfBlock(UseMI->getParent());
+  }
+
+  U.setReg(NewVR);
+}
+
+void MachineSSAUpdater::ReplaceRegWith(unsigned OldReg, unsigned NewReg) {
+  MRI->replaceRegWith(OldReg, NewReg);
+
+  AvailableValsTy &AvailableVals = getAvailableVals(AV);
+  for (DenseMap<MachineBasicBlock*, unsigned>::iterator
+         I = AvailableVals.begin(), E = AvailableVals.end(); I != E; ++I)
+    if (I->second == OldReg)
+      I->second = NewReg;
+}
+
+/// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry
+/// for the specified BB and if so, return it.  If not, construct SSA form by
+/// walking predecessors inserting PHI nodes as needed until we get to a block
+/// where the value is available.
+///
+unsigned MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB){
+  AvailableValsTy &AvailableVals = getAvailableVals(AV);
+
+  // Query AvailableVals by doing an insertion of null.
+  std::pair<AvailableValsTy::iterator, bool> InsertRes =
+    AvailableVals.insert(std::make_pair(BB, 0));
+
+  // Handle the case when the insertion fails because we have already seen BB.
+  if (!InsertRes.second) {
+    // If the insertion failed, there are two cases.  The first case is that the
+    // value is already available for the specified block.  If we get this, just
+    // return the value.
+    if (InsertRes.first->second != 0)
+      return InsertRes.first->second;
+
+    // Otherwise, if the value we find is null, then this is the value is not
+    // known but it is being computed elsewhere in our recursion.  This means
+    // that we have a cycle.  Handle this by inserting a PHI node and returning
+    // it.  When we get back to the first instance of the recursion we will fill
+    // in the PHI node.
+    MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->front();
+    MachineInstr *NewPHI = InsertNewDef(TargetInstrInfo::PHI, BB, Loc,
+                                        VRC, MRI,TII);
+    unsigned NewVR = NewPHI->getOperand(0).getReg();
+    InsertRes.first->second = NewVR;
+    return NewVR;
+  }
+
+  // If there are no predecessors, then we must have found an unreachable block
+  // just return 'undef'.  Since there are no predecessors, InsertRes must not
+  // be invalidated.
+  if (BB->pred_empty()) {
+    // Insert an implicit_def to represent an undef value.
+    MachineInstr *NewDef = InsertNewDef(TargetInstrInfo::IMPLICIT_DEF,
+                                        BB, BB->getFirstTerminator(),
+                                        VRC, MRI, TII);
+    return InsertRes.first->second = NewDef->getOperand(0).getReg();
+  }
+
+  // Okay, the value isn't in the map and we just inserted a null in the entry
+  // to indicate that we're processing the block.  Since we have no idea what
+  // value is in this block, we have to recurse through our predecessors.
+  //
+  // While we're walking our predecessors, we keep track of them in a vector,
+  // then insert a PHI node in the end if we actually need one.  We could use a
+  // smallvector here, but that would take a lot of stack space for every level
+  // of the recursion, just use IncomingPredInfo as an explicit stack.
+  IncomingPredInfoTy &IncomingPredInfo = getIncomingPredInfo(IPI);
+  unsigned FirstPredInfoEntry = IncomingPredInfo.size();
+
+  // As we're walking the predecessors, keep track of whether they are all
+  // producing the same value.  If so, this value will capture it, if not, it
+  // will get reset to null.  We distinguish the no-predecessor case explicitly
+  // below.
+  unsigned SingularValue = 0;
+  bool isFirstPred = true;
+  for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+         E = BB->pred_end(); PI != E; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+    unsigned PredVal = GetValueAtEndOfBlockInternal(PredBB);
+    IncomingPredInfo.push_back(std::make_pair(PredBB, PredVal));
+
+    // Compute SingularValue.
+    if (isFirstPred) {
+      SingularValue = PredVal;
+      isFirstPred = false;
+    } else if (PredVal != SingularValue)
+      SingularValue = 0;
+  }
+
+  /// Look up BB's entry in AvailableVals.  'InsertRes' may be invalidated.  If
+  /// this block is involved in a loop, a no-entry PHI node will have been
+  /// inserted as InsertedVal.  Otherwise, we'll still have the null we inserted
+  /// above.
+  unsigned &InsertedVal = AvailableVals[BB];
+
+  // If all the predecessor values are the same then we don't need to insert a
+  // PHI.  This is the simple and common case.
+  if (SingularValue) {
+    // If a PHI node got inserted, replace it with the singlar value and delete
+    // it.
+    if (InsertedVal) {
+      MachineInstr *OldVal = MRI->getVRegDef(InsertedVal);
+      // Be careful about dead loops.  These RAUW's also update InsertedVal.
+      assert(InsertedVal != SingularValue && "Dead loop?");
+      ReplaceRegWith(InsertedVal, SingularValue);
+      OldVal->eraseFromParent();
+    }
+
+    InsertedVal = SingularValue;
+
+    // Drop the entries we added in IncomingPredInfo to restore the stack.
+    IncomingPredInfo.erase(IncomingPredInfo.begin()+FirstPredInfoEntry,
+                           IncomingPredInfo.end());
+    return InsertedVal;
+  }
+
+
+  // Otherwise, we do need a PHI: insert one now if we don't already have one.
+  MachineInstr *InsertedPHI;
+  if (InsertedVal == 0) {
+    MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->front();
+    InsertedPHI = InsertNewDef(TargetInstrInfo::PHI, BB, Loc,
+                               VRC, MRI, TII);
+    InsertedVal = InsertedPHI->getOperand(0).getReg();
+  } else {
+    InsertedPHI = MRI->getVRegDef(InsertedVal);
+  }
+
+  // Fill in all the predecessors of the PHI.
+  MachineInstrBuilder MIB(InsertedPHI);
+  for (IncomingPredInfoTy::iterator I =
+         IncomingPredInfo.begin()+FirstPredInfoEntry,
+         E = IncomingPredInfo.end(); I != E; ++I)
+    MIB.addReg(I->second).addMBB(I->first);
+
+  // Drop the entries we added in IncomingPredInfo to restore the stack.
+  IncomingPredInfo.erase(IncomingPredInfo.begin()+FirstPredInfoEntry,
+                         IncomingPredInfo.end());
+
+  // See if the PHI node can be merged to a single value.  This can happen in
+  // loop cases when we get a PHI of itself and one other value.
+  if (unsigned ConstVal = InsertedPHI->isConstantValuePHI()) {
+    MRI->replaceRegWith(InsertedVal, ConstVal);
+    InsertedPHI->eraseFromParent();
+    InsertedVal = ConstVal;
+  } else {
+    DEBUG(errs() << "  Inserted PHI: " << *InsertedPHI << "\n");
+
+    // If the client wants to know about all new instructions, tell it.
+    if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+  }
+
+  return InsertedVal;
+}
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index d9f4c99..917d053 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -376,15 +376,6 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
         report("MBB doesn't fall through but is empty!", MBB);
       }
     }
-    if (TII->BlockHasNoFallThrough(*MBB)) {
-      if (MBB->empty()) {
-        report("TargetInstrInfo says the block has no fall through, but the "
-               "block is empty!", MBB);
-      } else if (!MBB->back().getDesc().isBarrier()) {
-        report("TargetInstrInfo says the block has no fall through, but the "
-               "block does not end in a barrier!", MBB);
-      }
-    }
   } else {
     // Block is last in function.
     if (MBB->empty()) {
diff --git a/lib/CodeGen/MaxStackAlignment.cpp b/lib/CodeGen/MaxStackAlignment.cpp
new file mode 100644
index 0000000..d327cfa
--- /dev/null
+++ b/lib/CodeGen/MaxStackAlignment.cpp
@@ -0,0 +1,70 @@
+//===-- MaxStackAlignment.cpp - Compute the required stack alignment -- ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for vector register usage and aligned local objects to
+// calculate the maximum required alignment for a function. This is used by
+// targets which support it to determine if dynamic stack realignment is
+// necessary.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+
+using namespace llvm;
+
+namespace {
+  struct MaximalStackAlignmentCalculator : public MachineFunctionPass {
+    static char ID;
+    MaximalStackAlignmentCalculator() : MachineFunctionPass(&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      MachineRegisterInfo &RI = MF.getRegInfo();
+
+      // Calculate max stack alignment of all already allocated stack objects.
+      FFI->calculateMaxStackAlignment();
+      unsigned MaxAlign = FFI->getMaxAlignment();
+
+      // Be over-conservative: scan over all vreg defs and find whether vector
+      // registers are used. If yes, there is probability that vector registers
+      // will be spilled and thus the stack needs to be aligned properly.
+      // FIXME: It would be better to only do this if a spill actually
+      // happens rather than conseratively aligning the stack regardless.
+      for (unsigned RegNum = TargetRegisterInfo::FirstVirtualRegister;
+           RegNum < RI.getLastVirtReg(); ++RegNum)
+        MaxAlign = std::max(MaxAlign, RI.getRegClass(RegNum)->getAlignment());
+
+      if (FFI->getMaxAlignment() == MaxAlign)
+        return false;
+
+      FFI->setMaxAlignment(MaxAlign);
+      return true;
+    }
+
+    virtual const char *getPassName() const {
+      return "Stack Alignment Requirements Auto-Detector";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+
+  char MaximalStackAlignmentCalculator::ID = 0;
+}
+
+FunctionPass*
+llvm::createMaxStackAlignmentCalculatorPass() {
+  return new MaximalStackAlignmentCalculator();
+}
+
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index 2e30cc6..c62d179 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -287,7 +287,7 @@ void llvm::PHIElimination::LowerAtomicPHINode(
 
     // Okay, if we now know that the value is not live out of the block, we can
     // add a kill marker in this block saying that it kills the incoming value!
-    if (!ValueIsUsed && !isLiveOut(SrcReg, opBlock, *LV)) {
+    if (!ValueIsUsed && !LV->isLiveOut(SrcReg, opBlock)) {
       // In our final twist, we have to decide which instruction kills the
       // register.  In most cases this is the copy, however, the first
       // terminator instruction at the end of the block may also use the value.
@@ -301,8 +301,8 @@ void llvm::PHIElimination::LowerAtomicPHINode(
 
         // Check that no other terminators use values.
 #ifndef NDEBUG
-        for (MachineBasicBlock::iterator TI = next(Term); TI != opBlock.end();
-             ++TI) {
+        for (MachineBasicBlock::iterator TI = llvm::next(Term);
+             TI != opBlock.end(); ++TI) {
           assert(!TI->readsRegister(SrcReg) &&
                  "Terminator instructions cannot use virtual registers unless"
                  "they are the first terminator in a block!");
@@ -353,59 +353,13 @@ bool llvm::PHIElimination::SplitPHIEdges(MachineFunction &MF,
       // We break edges when registers are live out from the predecessor block
       // (not considering PHI nodes). If the register is live in to this block
       // anyway, we would gain nothing from splitting.
-      if (!LV.isLiveIn(Reg, MBB) && isLiveOut(Reg, *PreMBB, LV))
+      if (!LV.isLiveIn(Reg, MBB) && LV.isLiveOut(Reg, *PreMBB))
         SplitCriticalEdge(PreMBB, &MBB);
     }
   }
   return true;
 }
 
-bool llvm::PHIElimination::isLiveOut(unsigned Reg, const MachineBasicBlock &MBB,
-                                     LiveVariables &LV) {
-  LiveVariables::VarInfo &VI = LV.getVarInfo(Reg);
-
-  // Loop over all of the successors of the basic block, checking to see if
-  // the value is either live in the block, or if it is killed in the block.
-  std::vector<MachineBasicBlock*> OpSuccBlocks;
-  for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
-         E = MBB.succ_end(); SI != E; ++SI) {
-    MachineBasicBlock *SuccMBB = *SI;
-
-    // Is it alive in this successor?
-    unsigned SuccIdx = SuccMBB->getNumber();
-    if (VI.AliveBlocks.test(SuccIdx))
-      return true;
-    OpSuccBlocks.push_back(SuccMBB);
-  }
-
-  // Check to see if this value is live because there is a use in a successor
-  // that kills it.
-  switch (OpSuccBlocks.size()) {
-  case 1: {
-    MachineBasicBlock *SuccMBB = OpSuccBlocks[0];
-    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
-      if (VI.Kills[i]->getParent() == SuccMBB)
-        return true;
-    break;
-  }
-  case 2: {
-    MachineBasicBlock *SuccMBB1 = OpSuccBlocks[0], *SuccMBB2 = OpSuccBlocks[1];
-    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
-      if (VI.Kills[i]->getParent() == SuccMBB1 ||
-          VI.Kills[i]->getParent() == SuccMBB2)
-        return true;
-    break;
-  }
-  default:
-    std::sort(OpSuccBlocks.begin(), OpSuccBlocks.end());
-    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
-      if (std::binary_search(OpSuccBlocks.begin(), OpSuccBlocks.end(),
-                             VI.Kills[i]->getParent()))
-        return true;
-  }
-  return false;
-}
-
 MachineBasicBlock *PHIElimination::SplitCriticalEdge(MachineBasicBlock *A,
                                                      MachineBasicBlock *B) {
   assert(A && B && "Missing MBB end point");
@@ -423,7 +377,7 @@ MachineBasicBlock *PHIElimination::SplitCriticalEdge(MachineBasicBlock *A,
   ++NumSplits;
 
   MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock();
-  MF->insert(next(MachineFunction::iterator(A)), NMBB);
+  MF->insert(llvm::next(MachineFunction::iterator(A)), NMBB);
   DEBUG(errs() << "PHIElimination splitting critical edge:"
         " BB#" << A->getNumber()
         << " -- BB#" << NMBB->getNumber()
diff --git a/lib/CodeGen/PHIElimination.h b/lib/CodeGen/PHIElimination.h
index f5872cb..b0b71ce 100644
--- a/lib/CodeGen/PHIElimination.h
+++ b/lib/CodeGen/PHIElimination.h
@@ -93,12 +93,6 @@ namespace llvm {
     bool SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB,
                        LiveVariables &LV);
 
-    /// isLiveOut - Determine if Reg is live out from MBB, when not
-    /// considering PHI nodes. This means that Reg is either killed by
-    /// a successor block or passed through one.
-    bool isLiveOut(unsigned Reg, const MachineBasicBlock &MBB,
-                   LiveVariables &LV);
-
     /// SplitCriticalEdge - Split a critical edge from A to B by
     /// inserting a new MBB. Update branches in A and PHI instructions
     /// in B. Return the new block.
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 9101fce2..79be295 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -373,7 +373,8 @@ void SchedulePostRATDList::FinishBlock() {
 ///
 void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) {
   // Initialize the indices to indicate that no registers are live.
-  std::fill(KillIndices, array_endof(KillIndices), ~0u);
+  for (unsigned i = 0; i < TRI->getNumRegs(); ++i)
+    KillIndices[i] = ~0u;
 
   // Determine the live-out physregs for this block.
   if (!BB->empty() && BB->back().getDesc().isReturn()) {
@@ -510,12 +511,9 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
       }
       
       if (MO.isKill() != kill) {
-        bool removed = ToggleKillFlag(MI, MO);
-        if (removed) {
-          DEBUG(errs() << "Fixed <removed> in ");
-        } else {
-          DEBUG(errs() << "Fixed " << MO << " in ");
-        }
+        DEBUG(errs() << "Fixing " << MO << " in ");
+        // Warning: ToggleKillFlag may invalidate MO.
+        ToggleKillFlag(MI, MO);
         DEBUG(MI->dump());
       }
       
diff --git a/lib/CodeGen/PreAllocSplitting.cpp b/lib/CodeGen/PreAllocSplitting.cpp
index 8f62345..b0d7a47 100644
--- a/lib/CodeGen/PreAllocSplitting.cpp
+++ b/lib/CodeGen/PreAllocSplitting.cpp
@@ -16,6 +16,7 @@
 
 #define DEBUG_TYPE "pre-alloc-split"
 #include "VirtRegMap.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -104,6 +105,7 @@ namespace {
       AU.addRequired<LiveStacks>();
       AU.addPreserved<LiveStacks>();
       AU.addPreserved<RegisterCoalescer>();
+      AU.addPreserved<CalculateSpillWeights>();
       if (StrongPHIElim)
         AU.addPreservedID(StrongPHIEliminationID);
       else
@@ -876,7 +878,7 @@ bool PreAllocSplitting::Rematerialize(unsigned VReg, VNInfo* ValNo,
   if (!ValNo->isDefAccurate() || DefMI->getParent() == BarrierMBB)
     KillPt = findSpillPoint(BarrierMBB, Barrier, NULL, RefsInMBB);
   else
-    KillPt = next(MachineBasicBlock::iterator(DefMI));
+    KillPt = llvm::next(MachineBasicBlock::iterator(DefMI));
   
   if (KillPt == DefMI->getParent()->end())
     return false;
@@ -1118,7 +1120,7 @@ bool PreAllocSplitting::SplitRegLiveInterval(LiveInterval *LI) {
           return false; // No gap to insert spill.
         }
       } else {
-        SpillPt = next(MachineBasicBlock::iterator(DefMI));
+        SpillPt = llvm::next(MachineBasicBlock::iterator(DefMI));
         if (SpillPt == DefMBB->end()) {
           DEBUG(errs() << "FAILED (could not find a suitable spill point).\n");
           return false; // No gap to insert spill.
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 8905f75..e94247f 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -136,9 +136,10 @@ void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
 /// pseudo instructions.
 void PEI::calculateCallsInformation(MachineFunction &Fn) {
   const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+  MachineFrameInfo *FFI = Fn.getFrameInfo();
 
   unsigned MaxCallFrameSize = 0;
-  bool HasCalls = false;
+  bool HasCalls = FFI->hasCalls();
 
   // Get the function call frame set-up and tear-down instruction opcode
   int FrameSetupOpcode   = RegInfo->getCallFrameSetupOpcode();
@@ -166,7 +167,6 @@ void PEI::calculateCallsInformation(MachineFunction &Fn) {
         HasCalls = true;
       }
 
-  MachineFrameInfo *FFI = Fn.getFrameInfo();
   FFI->setHasCalls(HasCalls);
   FFI->setMaxCallFrameSize(MaxCallFrameSize);
 
@@ -674,7 +674,7 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) {
         if (PrevI == BB->end())
           I = BB->begin();     // The replaced instr was the first in the block.
         else
-          I = next(PrevI);
+          I = llvm::next(PrevI);
         continue;
       }
 
diff --git a/lib/CodeGen/RegAllocLinearScan.cpp b/lib/CodeGen/RegAllocLinearScan.cpp
index 4ff5129..c02d47b 100644
--- a/lib/CodeGen/RegAllocLinearScan.cpp
+++ b/lib/CodeGen/RegAllocLinearScan.cpp
@@ -16,6 +16,7 @@
 #include "VirtRegRewriter.h"
 #include "Spiller.h"
 #include "llvm/Function.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -59,6 +60,11 @@ PreSplitIntervals("pre-alloc-split",
                   cl::desc("Pre-register allocation live interval splitting"),
                   cl::init(false), cl::Hidden);
 
+static cl::opt<bool>
+TrivCoalesceEnds("trivial-coalesce-ends",
+                  cl::desc("Attempt trivial coalescing of interval ends"),
+                  cl::init(false), cl::Hidden);
+
 static RegisterRegAlloc
 linearscanRegAlloc("linearscan", "linear scan register allocator",
                    createLinearScanRegisterAllocator);
@@ -182,6 +188,7 @@ namespace {
       // Make sure PassManager knows which analyses to make available
       // to coalescing and which analyses coalescing invalidates.
       AU.addRequiredTransitive<RegisterCoalescer>();
+      AU.addRequired<CalculateSpillWeights>();
       if (PreSplitIntervals)
         AU.addRequiredID(PreAllocSplittingID);
       AU.addRequired<LiveStacks>();
@@ -390,66 +397,71 @@ void RALinScan::ComputeRelatedRegClasses() {
         RelatedRegClasses.unionSets(I->second, OneClassForEachPhysReg[*AS]);
 }
 
-/// attemptTrivialCoalescing - If a simple interval is defined by a copy,
-/// try allocate the definition the same register as the source register
-/// if the register is not defined during live time of the interval. This
-/// eliminate a copy. This is used to coalesce copies which were not
-/// coalesced away before allocation either due to dest and src being in
-/// different register classes or because the coalescer was overly
-/// conservative.
+/// attemptTrivialCoalescing - If a simple interval is defined by a copy, try
+/// allocate the definition the same register as the source register if the
+/// register is not defined during live time of the interval. If the interval is
+/// killed by a copy, try to use the destination register. This eliminates a
+/// copy. This is used to coalesce copies which were not coalesced away before
+/// allocation either due to dest and src being in different register classes or
+/// because the coalescer was overly conservative.
 unsigned RALinScan::attemptTrivialCoalescing(LiveInterval &cur, unsigned Reg) {
   unsigned Preference = vrm_->getRegAllocPref(cur.reg);
   if ((Preference && Preference == Reg) || !cur.containsOneValue())
     return Reg;
 
-  VNInfo *vni = cur.begin()->valno;
-  if ((vni->def == SlotIndex()) ||
-      vni->isUnused() || !vni->isDefAccurate())
+  // We cannot handle complicated live ranges. Simple linear stuff only.
+  if (cur.ranges.size() != 1)
     return Reg;
-  MachineInstr *CopyMI = li_->getInstructionFromIndex(vni->def);
-  unsigned SrcReg, DstReg, SrcSubReg, DstSubReg, PhysReg;
-  if (!CopyMI ||
-      !tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg))
+
+  const LiveRange &range = cur.ranges.front();
+
+  VNInfo *vni = range.valno;
+  if (vni->isUnused())
     return Reg;
-  PhysReg = SrcReg;
-  if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
-    if (!vrm_->isAssignedReg(SrcReg))
+
+  unsigned CandReg;
+  {
+    MachineInstr *CopyMI;
+    unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+    if (vni->def != SlotIndex() && vni->isDefAccurate() &&
+        (CopyMI = li_->getInstructionFromIndex(vni->def)) &&
+        tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg))
+      // Defined by a copy, try to extend SrcReg forward
+      CandReg = SrcReg;
+    else if (TrivCoalesceEnds &&
+             (CopyMI =
+              li_->getInstructionFromIndex(range.end.getBaseIndex())) &&
+             tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg) &&
+             cur.reg == SrcReg)
+      // Only used by a copy, try to extend DstReg backwards
+      CandReg = DstReg;
+    else
+      return Reg;
+  }
+
+  if (TargetRegisterInfo::isVirtualRegister(CandReg)) {
+    if (!vrm_->isAssignedReg(CandReg))
       return Reg;
-    PhysReg = vrm_->getPhys(SrcReg);
+    CandReg = vrm_->getPhys(CandReg);
   }
-  if (Reg == PhysReg)
+  if (Reg == CandReg)
     return Reg;
 
   const TargetRegisterClass *RC = mri_->getRegClass(cur.reg);
-  if (!RC->contains(PhysReg))
+  if (!RC->contains(CandReg))
     return Reg;
 
-  // Try to coalesce.
-  if (!li_->conflictsWithPhysRegDef(cur, *vrm_, PhysReg)) {
-    DEBUG(errs() << "Coalescing: " << cur << " -> " << tri_->getName(PhysReg)
-                 << '\n');
-    vrm_->clearVirt(cur.reg);
-    vrm_->assignVirt2Phys(cur.reg, PhysReg);
-
-    // Remove unnecessary kills since a copy does not clobber the register.
-    if (li_->hasInterval(SrcReg)) {
-      LiveInterval &SrcLI = li_->getInterval(SrcReg);
-      for (MachineRegisterInfo::use_iterator I = mri_->use_begin(cur.reg),
-             E = mri_->use_end(); I != E; ++I) {
-        MachineOperand &O = I.getOperand();
-        if (!O.isKill())
-          continue;
-        MachineInstr *MI = &*I;
-        if (SrcLI.liveAt(li_->getInstructionIndex(MI).getDefIndex()))
-          O.setIsKill(false);
-      }
-    }
+  if (li_->conflictsWithPhysReg(cur, *vrm_, CandReg))
+    return Reg;
 
-    ++NumCoalesce;
-    return PhysReg;
-  }
+  // Try to coalesce.
+  DEBUG(errs() << "Coalescing: " << cur << " -> " << tri_->getName(CandReg)
+        << '\n');
+  vrm_->clearVirt(cur.reg);
+  vrm_->assignVirt2Phys(cur.reg, CandReg);
 
-  return Reg;
+  ++NumCoalesce;
+  return CandReg;
 }
 
 bool RALinScan::runOnMachineFunction(MachineFunction &fn) {
@@ -1261,9 +1273,9 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) {
 
   // The earliest start of a Spilled interval indicates up to where
   // in handled we need to roll back
+  assert(!spillIs.empty() && "No spill intervals?"); 
+  SlotIndex earliestStart = spillIs[0]->beginIndex();
   
-  LiveInterval *earliestStartInterval = cur;
-
   // Spill live intervals of virtual regs mapped to the physical register we
   // want to clear (and its aliases).  We only spill those that overlap with the
   // current interval as the rest do not affect its allocation. we also keep
@@ -1274,19 +1286,16 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) {
     LiveInterval *sli = spillIs.back();
     spillIs.pop_back();
     DEBUG(errs() << "\t\t\tspilling(a): " << *sli << '\n');
-    earliestStartInterval =
-      (earliestStartInterval->beginIndex() < sli->beginIndex()) ?
-         earliestStartInterval : sli;
+    if (sli->beginIndex() < earliestStart)
+      earliestStart = sli->beginIndex();
        
     std::vector<LiveInterval*> newIs;
-    newIs = spiller_->spill(sli, spillIs);
+    newIs = spiller_->spill(sli, spillIs, &earliestStart);
     addStackInterval(sli, ls_, li_, mri_, *vrm_);
     std::copy(newIs.begin(), newIs.end(), std::back_inserter(added));
     spilled.insert(sli->reg);
   }
 
-  SlotIndex earliestStart = earliestStartInterval->beginIndex();
-
   DEBUG(errs() << "\t\trolling back to: " << earliestStart << '\n');
 
   // Scan handled in reverse order up to the earliest start of a
@@ -1295,7 +1304,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) {
   while (!handled_.empty()) {
     LiveInterval* i = handled_.back();
     // If this interval starts before t we are done.
-    if (i->beginIndex() < earliestStart)
+    if (!i->empty() && i->beginIndex() < earliestStart)
       break;
     DEBUG(errs() << "\t\t\tundo changes for: " << *i << '\n');
     handled_.pop_back();
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index c677d34..c2014a7 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -36,6 +36,7 @@
 #include "PBQP/Heuristics/Briggs.h"
 #include "VirtRegMap.h"
 #include "VirtRegRewriter.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -90,6 +91,7 @@ namespace {
       au.addRequired<LiveIntervals>();
       //au.addRequiredID(SplitCriticalEdgesID);
       au.addRequired<RegisterCoalescer>();
+      au.addRequired<CalculateSpillWeights>();
       au.addRequired<LiveStacks>();
       au.addPreserved<LiveStacks>();
       au.addRequired<MachineLoopInfo>();
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 94680ed..67bf209 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -125,7 +125,7 @@ void RegScavenger::forward() {
     Tracking = true;
   } else {
     assert(MBBI != MBB->end() && "Already at the end of the basic block!");
-    MBBI = next(MBBI);
+    MBBI = llvm::next(MBBI);
   }
 
   MachineInstr *MI = MBBI;
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 06ffdd6..2b52187 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -119,7 +119,8 @@ namespace {
     /// it can be simplified or if things it uses can be simplified by bit
     /// propagation.  If so, return true.
     bool SimplifyDemandedBits(SDValue Op) {
-      APInt Demanded = APInt::getAllOnesValue(Op.getValueSizeInBits());
+      unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+      APInt Demanded = APInt::getAllOnesValue(BitWidth);
       return SimplifyDemandedBits(Op, Demanded);
     }
 
@@ -546,7 +547,8 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
         To[0].getNode()->dump(&DAG);
         errs() << " and " << NumTo-1 << " other values\n";
         for (unsigned i = 0, e = NumTo; i != e; ++i)
-          assert(N->getValueType(i) == To[i].getValueType() &&
+          assert((!To[i].getNode() ||
+                  N->getValueType(i) == To[i].getValueType()) &&
                  "Cannot combine value to value of different type!"));
   WorkListRemover DeadNodes(*this);
   DAG.ReplaceAllUsesWith(N, To, &DeadNodes);
@@ -1687,10 +1689,14 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
   // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
   // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
+  //
+  // do not sink logical op inside of a vector extend, since it may combine
+  // into a vsetcc.
   if ((N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND||
        N0.getOpcode() == ISD::SIGN_EXTEND ||
        (N0.getOpcode() == ISD::TRUNCATE &&
         !TLI.isTruncateFree(N0.getOperand(0).getValueType(), VT))) &&
+      !VT.isVector() &&
       N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
       (!LegalOperations ||
        TLI.isOperationLegal(N->getOpcode(), N0.getOperand(0).getValueType()))) {
@@ -1943,8 +1949,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
   }
 
   // fold (or x, undef) -> -1
-  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
-    return DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
+  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF) {
+    EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
+    return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT);
+  }
   // fold (or c1, c2) -> c1|c2
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::OR, VT, N0C, N1C);
@@ -2434,7 +2442,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
-  unsigned OpSizeInBits = VT.getSizeInBits();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
   // fold (shl c1, c2) -> c1<<c2
   if (N0C && N1C)
@@ -2450,7 +2458,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     return N0;
   // if (shl x, c) is known to be zero, return 0
   if (DAG.MaskedValueIsZero(SDValue(N, 0),
-                            APInt::getAllOnesValue(VT.getSizeInBits())))
+                            APInt::getAllOnesValue(OpSizeInBits)))
     return DAG.getConstant(0, VT);
   // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
   if (N1.getOpcode() == ISD::TRUNCATE &&
@@ -2526,6 +2534,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
   // fold (sra c1, c2) -> (sra c1, c2)
   if (N0C && N1C)
@@ -2537,7 +2546,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   if (N0C && N0C->isAllOnesValue())
     return N0;
   // fold (sra x, (setge c, size(x))) -> undef
-  if (N1C && N1C->getZExtValue() >= VT.getSizeInBits())
+  if (N1C && N1C->getZExtValue() >= OpSizeInBits)
     return DAG.getUNDEF(VT);
   // fold (sra x, 0) -> x
   if (N1C && N1C->isNullValue())
@@ -2545,7 +2554,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
   // sext_inreg.
   if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
-    unsigned LowBits = VT.getSizeInBits() - (unsigned)N1C->getZExtValue();
+    unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
     EVT EVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
     if ((!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, EVT)))
       return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
@@ -2556,7 +2565,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   if (N1C && N0.getOpcode() == ISD::SRA) {
     if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
       unsigned Sum = N1C->getZExtValue() + C1->getZExtValue();
-      if (Sum >= VT.getSizeInBits()) Sum = VT.getSizeInBits()-1;
+      if (Sum >= OpSizeInBits) Sum = OpSizeInBits-1;
       return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0.getOperand(0),
                          DAG.getConstant(Sum, N1C->getValueType(0)));
     }
@@ -2572,9 +2581,8 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
     const ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
     if (N01C && N1C) {
       // Determine what the truncate's result bitsize and type would be.
-      unsigned VTValSize = VT.getSizeInBits();
       EVT TruncVT =
-        EVT::getIntegerVT(*DAG.getContext(), VTValSize - N1C->getZExtValue());
+        EVT::getIntegerVT(*DAG.getContext(), OpSizeInBits - N1C->getZExtValue());
       // Determine the residual right-shift amount.
       signed ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
 
@@ -2607,7 +2615,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
       EVT TruncVT = N1.getValueType();
       SDValue N100 = N1.getOperand(0).getOperand(0);
       APInt TruncC = N101C->getAPIntValue();
-      TruncC.trunc(TruncVT.getSizeInBits());
+      TruncC.trunc(TruncVT.getScalarType().getSizeInBits());
       return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0,
                          DAG.getNode(ISD::AND, N->getDebugLoc(),
                                      TruncVT,
@@ -2636,7 +2644,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
-  unsigned OpSizeInBits = VT.getSizeInBits();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
   // fold (srl c1, c2) -> c1 >>u c2
   if (N0C && N1C)
@@ -3029,7 +3037,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       else if (Op.getValueType().bitsGT(VT))
         Op = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), VT, Op);
       return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, Op,
-                         DAG.getValueType(N0.getValueType()));
+                         DAG.getValueType(N0.getValueType().getScalarType()));
     }
   }
 
@@ -3170,7 +3178,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
     } else if (Op.getValueType().bitsGT(VT)) {
       Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
     }
-    return DAG.getZeroExtendInReg(Op, N->getDebugLoc(), N0.getValueType());
+    return DAG.getZeroExtendInReg(Op, N->getDebugLoc(),
+                                  N0.getValueType().getScalarType());
   }
 
   // Fold (zext (and (trunc x), cst)) -> (and x, cst),
@@ -3193,6 +3202,19 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
                        X, DAG.getConstant(Mask, VT));
   }
 
+  // Fold (zext (and x, cst)) -> (and (zext x), cst)
+  if (N0.getOpcode() == ISD::AND &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      N0.getOperand(0).getOpcode() != ISD::TRUNCATE &&
+      N0.getOperand(0).hasOneUse()) {
+    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+    Mask.zext(VT.getSizeInBits());
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                       DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT,
+                                   N0.getOperand(0)),
+                       DAG.getConstant(Mask, VT));
+  }
+
   // fold (zext (load x)) -> (zext (truncate (zextload x)))
   if (ISD::isNON_EXTLoad(N0.getNode()) &&
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
@@ -3269,6 +3291,26 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
     if (SCC.getNode()) return SCC;
   }
 
+  // (zext (shl (zext x), cst)) -> (shl (zext x), cst)
+  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
+      isa<ConstantSDNode>(N0.getOperand(1)) &&
+      N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
+      N0.hasOneUse()) {
+    if (N0.getOpcode() == ISD::SHL) {
+      // If the original shl may be shifting out bits, do not perform this
+      // transformation.
+      unsigned ShAmt = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+      unsigned KnownZeroBits = N0.getOperand(0).getValueType().getSizeInBits() -
+        N0.getOperand(0).getOperand(0).getValueType().getSizeInBits();
+      if (ShAmt > KnownZeroBits)
+        return SDValue();
+    }
+    DebugLoc dl = N->getDebugLoc();
+    return DAG.getNode(N0.getOpcode(), dl, VT,
+                       DAG.getNode(ISD::ZERO_EXTEND, dl, VT, N0.getOperand(0)),
+                       DAG.getNode(ISD::ZERO_EXTEND, dl, VT, N0.getOperand(1)));
+  }
+
   return SDValue();
 }
 
@@ -3529,7 +3571,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   EVT EVT = cast<VTSDNode>(N1)->getVT();
-  unsigned VTBits = VT.getSizeInBits();
+  unsigned VTBits = VT.getScalarType().getSizeInBits();
   unsigned EVTBits = EVT.getSizeInBits();
 
   // fold (sext_in_reg c1) -> c1
@@ -3537,7 +3579,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
     return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, N0, N1);
 
   // If the input is already sign extended, just drop the extension.
-  if (DAG.ComputeNumSignBits(N0) >= VT.getSizeInBits()-EVTBits+1)
+  if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1)
     return N0;
 
   // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
@@ -3552,7 +3594,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
   // if x is small enough.
   if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
     SDValue N00 = N0.getOperand(0);
-    if (N00.getValueType().getSizeInBits() < EVTBits)
+    if (N00.getValueType().getScalarType().getSizeInBits() < EVTBits)
       return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, N00, N1);
   }
 
@@ -3576,11 +3618,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
   // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
   if (N0.getOpcode() == ISD::SRL) {
     if (ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
-      if (ShAmt->getZExtValue()+EVTBits <= VT.getSizeInBits()) {
+      if (ShAmt->getZExtValue()+EVTBits <= VTBits) {
         // We can turn this into an SRA iff the input to the SRL is already sign
         // extended enough.
         unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
-        if (VT.getSizeInBits()-(ShAmt->getZExtValue()+EVTBits) < InSignBits)
+        if (VTBits-(ShAmt->getZExtValue()+EVTBits) < InSignBits)
           return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT,
                              N0.getOperand(0), N0.getOperand(1));
       }
@@ -3681,7 +3723,6 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
   if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse())
     return SDValue();
   EVT LD1VT = LD1->getValueType(0);
-  const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
 
   if (ISD::isNON_EXTLoad(LD2) &&
       LD2->hasOneUse() &&
@@ -3689,7 +3730,7 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
       // If one is volatile it might be ok, but play conservative and bail out.
       !LD1->isVolatile() &&
       !LD2->isVolatile() &&
-      TLI.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1, MFI)) {
+      DAG.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1)) {
     unsigned Align = LD1->getAlignment();
     unsigned NewAlign = TLI.getTargetData()->
       getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
@@ -4804,49 +4845,6 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
   return false;
 }
 
-/// InferAlignment - If we can infer some alignment information from this
-/// pointer, return it.
-static unsigned InferAlignment(SDValue Ptr, SelectionDAG &DAG) {
-  // If this is a direct reference to a stack slot, use information about the
-  // stack slot's alignment.
-  int FrameIdx = 1 << 31;
-  int64_t FrameOffset = 0;
-  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
-    FrameIdx = FI->getIndex();
-  } else if (Ptr.getOpcode() == ISD::ADD &&
-             isa<ConstantSDNode>(Ptr.getOperand(1)) &&
-             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
-    FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
-    FrameOffset = Ptr.getConstantOperandVal(1);
-  }
-
-  if (FrameIdx != (1 << 31)) {
-    // FIXME: Handle FI+CST.
-    const MachineFrameInfo &MFI = *DAG.getMachineFunction().getFrameInfo();
-    if (MFI.isFixedObjectIndex(FrameIdx)) {
-      int64_t ObjectOffset = MFI.getObjectOffset(FrameIdx) + FrameOffset;
-
-      // The alignment of the frame index can be determined from its offset from
-      // the incoming frame position.  If the frame object is at offset 32 and
-      // the stack is guaranteed to be 16-byte aligned, then we know that the
-      // object is 16-byte aligned.
-      unsigned StackAlign = DAG.getTarget().getFrameInfo()->getStackAlignment();
-      unsigned Align = MinAlign(ObjectOffset, StackAlign);
-
-      // Finally, the frame object itself may have a known alignment.  Factor
-      // the alignment + offset into a new alignment.  For example, if we know
-      // the  FI is 8 byte aligned, but the pointer is 4 off, we really have a
-      // 4-byte alignment of the resultant pointer.  Likewise align 4 + 4-byte
-      // offset = 4-byte alignment, align 4 + 1-byte offset = align 1, etc.
-      unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
-                                      FrameOffset);
-      return std::max(Align, FIInfoAlign);
-    }
-  }
-
-  return 0;
-}
-
 SDValue DAGCombiner::visitLOAD(SDNode *N) {
   LoadSDNode *LD  = cast<LoadSDNode>(N);
   SDValue Chain = LD->getChain();
@@ -4854,7 +4852,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
 
   // Try to infer better alignment information than the load already has.
   if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
-    if (unsigned Align = InferAlignment(Ptr, DAG)) {
+    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
       if (Align > LD->getAlignment())
         return DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(),
                               LD->getValueType(0),
@@ -5079,7 +5077,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 
   // Try to infer better alignment information than the store already has.
   if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
-    if (unsigned Align = InferAlignment(Ptr, DAG)) {
+    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
       if (Align > ST->getAlignment())
         return DAG.getTruncStore(Chain, N->getDebugLoc(), Value,
                                  Ptr, ST->getSrcValue(),
@@ -5231,7 +5229,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     // SimplifyDemandedBits, which only works if the value has a single use.
     if (SimplifyDemandedBits(Value,
                              APInt::getLowBitsSet(
-                               Value.getValueSizeInBits(),
+                               Value.getValueType().getScalarType().getSizeInBits(),
                                ST->getMemoryVT().getSizeInBits())))
       return SDValue(N, 0);
   }
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 5eb9ca1..4ead9c9 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -532,7 +532,15 @@ bool FastISel::SelectBitCast(User *I) {
 
 bool
 FastISel::SelectInstruction(Instruction *I) {
-  return SelectOperator(I, I->getOpcode());
+  // First, try doing target-independent selection.
+  if (SelectOperator(I, I->getOpcode()))
+    return true;
+
+  // Next, try calling the target to attempt to handle the instruction.
+  if (TargetSelectInstruction(I))
+    return true;
+
+  return false;
 }
 
 /// FastEmitBranch - Emit an unconditional branch to the given block,
@@ -541,7 +549,7 @@ FastISel::SelectInstruction(Instruction *I) {
 void
 FastISel::FastEmitBranch(MachineBasicBlock *MSucc) {
   MachineFunction::iterator NextMBB =
-     next(MachineFunction::iterator(MBB));
+     llvm::next(MachineFunction::iterator(MBB));
 
   if (MBB->isLayoutSuccessor(MSucc)) {
     // The unconditional fall-through case, which needs no instructions.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 273dbf0..f9c05d0 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -232,7 +232,7 @@ void SelectionDAGLegalize::LegalizeDAG() {
   // node is only legalized after all of its operands are legalized.
   DAG.AssignTopologicalOrder();
   for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
-       E = prior(DAG.allnodes_end()); I != next(E); ++I)
+       E = prior(DAG.allnodes_end()); I != llvm::next(E); ++I)
     LegalizeOp(SDValue(I, 0));
 
   // Finally, it's possible the root changed.  Get the new root.
@@ -2294,9 +2294,15 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
     // NOTE: we could fall back on load/store here too for targets without
     // SAR.  However, it is doubtful that any exist.
     EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
-    unsigned BitsDiff = Node->getValueType(0).getSizeInBits() -
+    EVT VT = Node->getValueType(0);
+    EVT ShiftAmountTy = TLI.getShiftAmountTy();
+    if (VT.isVector()) {
+      ShiftAmountTy = VT;
+      VT = VT.getVectorElementType();
+    }
+    unsigned BitsDiff = VT.getSizeInBits() -
                         ExtraVT.getSizeInBits();
-    SDValue ShiftCst = DAG.getConstant(BitsDiff, TLI.getShiftAmountTy());
+    SDValue ShiftCst = DAG.getConstant(BitsDiff, ShiftAmountTy);
     Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0),
                        Node->getOperand(0), ShiftCst);
     Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst);
@@ -3059,8 +3065,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node,
 
 // SelectionDAG::Legalize - This is the entry point for the file.
 //
-void SelectionDAG::Legalize(bool TypesNeedLegalizing,
-                            CodeGenOpt::Level OptLevel) {
+void SelectionDAG::Legalize(CodeGenOpt::Level OptLevel) {
   /// run - This is the main entry point to this class.
   ///
   SelectionDAGLegalize(*this, OptLevel).LegalizeDAG();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 8ac8063..2f4457e 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1167,55 +1167,62 @@ ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
   GetExpandedInteger(N->getOperand(0), InL, InH);
 
   SDValue NVBitsNode = DAG.getConstant(NVTBits, ShTy);
-  SDValue Amt2 = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt);
-  SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(ShTy),
-                             Amt, NVBitsNode, ISD::SETULT);
+  SDValue AmtExcess = DAG.getNode(ISD::SUB, dl, ShTy, Amt, NVBitsNode);
+  SDValue AmtLack = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt);
+  SDValue isShort = DAG.getSetCC(dl, TLI.getSetCCResultType(ShTy),
+                                 Amt, NVBitsNode, ISD::SETULT);
 
-  SDValue Lo1, Hi1, Lo2, Hi2;
+  SDValue LoS, HiS, LoL, HiL;
   switch (N->getOpcode()) {
   default: llvm_unreachable("Unknown shift");
   case ISD::SHL:
-    // ShAmt < NVTBits
-    Lo1 = DAG.getConstant(0, NVT);                  // Low part is zero.
-    Hi1 = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); // High part from Lo part.
-
-    // ShAmt >= NVTBits
-    Lo2 = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt);
-    Hi2 = DAG.getNode(ISD::OR, dl, NVT,
+    // Short: ShAmt < NVTBits
+    LoS = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt);
+    HiS = DAG.getNode(ISD::OR, dl, NVT,
                       DAG.getNode(ISD::SHL, dl, NVT, InH, Amt),
-                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt2));
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    LoL = DAG.getConstant(0, NVT);                        // Lo part is zero.
+    HiL = DAG.getNode(ISD::SHL, dl, NVT, InL, AmtExcess); // Hi from Lo part.
 
-    Lo = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Lo1, Lo2);
-    Hi = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Hi1, Hi2);
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
     return true;
   case ISD::SRL:
-    // ShAmt < NVTBits
-    Hi1 = DAG.getConstant(0, NVT);                  // Hi part is zero.
-    Lo1 = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); // Lo part from Hi part.
-
-    // ShAmt >= NVTBits
-    Hi2 = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt);
-    Lo2 = DAG.getNode(ISD::OR, dl, NVT,
-                     DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
-                     DAG.getNode(ISD::SHL, dl, NVT, InH, Amt2));
-
-    Lo = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Lo1, Lo2);
-    Hi = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Hi1, Hi2);
+    // Short: ShAmt < NVTBits
+    HiS = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt);
+    LoS = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    HiL = DAG.getConstant(0, NVT);                        // Hi part is zero.
+    LoL = DAG.getNode(ISD::SRL, dl, NVT, InH, AmtExcess); // Lo from Hi part.
+
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
     return true;
   case ISD::SRA:
-    // ShAmt < NVTBits
-    Hi1 = DAG.getNode(ISD::SRA, dl, NVT, InH,       // Sign extend high part.
-                       DAG.getConstant(NVTBits-1, ShTy));
-    Lo1 = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); // Lo part from Hi part.
-
-    // ShAmt >= NVTBits
-    Hi2 = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt);
-    Lo2 = DAG.getNode(ISD::OR, dl, NVT,
+    // Short: ShAmt < NVTBits
+    HiS = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt);
+    LoS = DAG.getNode(ISD::OR, dl, NVT,
                       DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
-                      DAG.getNode(ISD::SHL, dl, NVT, InH, Amt2));
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    HiL = DAG.getNode(ISD::SRA, dl, NVT, InH,             // Sign of Hi part.
+                      DAG.getConstant(NVTBits-1, ShTy));
+    LoL = DAG.getNode(ISD::SRA, dl, NVT, InH, AmtExcess); // Lo from Hi part.
 
-    Lo = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Lo1, Lo2);
-    Hi = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Hi1, Hi2);
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
     return true;
   }
 
@@ -1989,7 +1996,9 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
   case ISD::SRA:
   case ISD::SRL:
   case ISD::ROTL:
-  case ISD::ROTR: Res = ExpandIntOp_Shift(N); break;
+  case ISD::ROTR:              Res = ExpandIntOp_Shift(N); break;
+  case ISD::RETURNADDR:
+  case ISD::FRAMEADDR:         Res = ExpandIntOp_RETURNADDR(N); break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -2173,6 +2182,15 @@ SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) {
   return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), Lo);
 }
 
+SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) {
+  // The argument of RETURNADDR / FRAMEADDR builtin is 32 bit contant.  This
+  // surely makes pretty nice problems on 8/16 bit targets. Just truncate this
+  // constant to valid type.
+  SDValue Lo, Hi;
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  return DAG.UpdateNodeOperands(SDValue(N, 0), Lo);
+}
+
 SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
   SDValue Op = N->getOperand(0);
   EVT DstVT = N->getValueType(0);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 2ee9f8a..c35f7ad 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -362,6 +362,7 @@ private:
   SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo);
   SDValue ExpandIntOp_TRUNCATE(SDNode *N);
   SDValue ExpandIntOp_UINT_TO_FP(SDNode *N);
+  SDValue ExpandIntOp_RETURNADDR(SDNode *N);
 
   void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
                                   ISD::CondCode &CCCode, DebugLoc dl);
@@ -516,6 +517,7 @@ private:
   SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N);
   SDValue ScalarizeVecRes_LOAD(LoadSDNode *N);
   SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
+  SDValue ScalarizeVecRes_SIGN_EXTEND_INREG(SDNode *N);
   SDValue ScalarizeVecRes_SELECT(SDNode *N);
   SDValue ScalarizeVecRes_SELECT_CC(SDNode *N);
   SDValue ScalarizeVecRes_SETCC(SDNode *N);
@@ -559,6 +561,7 @@ private:
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
@@ -601,6 +604,7 @@ private:
   SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
   SDValue WidenVecRes_LOAD(SDNode* N);
   SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
+  SDValue WidenVecRes_SIGN_EXTEND_INREG(SDNode* N);
   SDValue WidenVecRes_SELECT(SDNode* N);
   SDValue WidenVecRes_SELECT_CC(SDNode* N);
   SDValue WidenVecRes_UNDEF(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 785c2ad..2625245 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -79,7 +79,7 @@ bool VectorLegalizer::Run() {
   // node is only legalized after all of its operands are legalized.
   DAG.AssignTopologicalOrder();
   for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
-       E = prior(DAG.allnodes_end()); I != next(E); ++I)
+       E = prior(DAG.allnodes_end()); I != llvm::next(E); ++I)
     LegalizeOp(SDValue(I, 0));
 
   // Finally, it's possible the root changed.  Get the new root.
@@ -179,6 +179,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FRINT:
   case ISD::FNEARBYINT:
   case ISD::FFLOOR:
+  case ISD::SIGN_EXTEND_INREG:
     QueryType = Node->getValueType(0);
     break;
   case ISD::SINT_TO_FP:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 023324b..cf67ab9 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -54,6 +54,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
   case ISD::LOAD:           R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
   case ISD::SCALAR_TO_VECTOR:  R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
+  case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_SIGN_EXTEND_INREG(N); break;
   case ISD::SELECT:            R = ScalarizeVecRes_SELECT(N); break;
   case ISD::SELECT_CC:         R = ScalarizeVecRes_SELECT_CC(N); break;
   case ISD::SETCC:             R = ScalarizeVecRes_SETCC(N); break;
@@ -195,6 +196,13 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
   return InOp;
 }
 
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SIGN_EXTEND_INREG(SDNode *N) {
+  EVT EltVT = N->getValueType(0).getVectorElementType();
+  SDValue LHS = GetScalarizedVector(N->getOperand(0));
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), EltVT,
+                     LHS, N->getOperand(1));
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
   SDValue LHS = GetScalarizedVector(N->getOperand(1));
   return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
@@ -401,6 +409,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FPOWI:             SplitVecRes_FPOWI(N, Lo, Hi); break;
   case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
   case ISD::SCALAR_TO_VECTOR:  SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break;
+  case ISD::SIGN_EXTEND_INREG: SplitVecRes_SIGN_EXTEND_INREG(N, Lo, Hi); break;
   case ISD::LOAD:
     SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
     break;
@@ -700,6 +709,18 @@ void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
   Hi = DAG.getUNDEF(HiVT);
 }
 
+void DAGTypeLegalizer::SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo,
+                                                     SDValue &Hi) {
+  SDValue LHSLo, LHSHi;
+  GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+  DebugLoc dl = N->getDebugLoc();
+
+  Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
+                   N->getOperand(1));
+  Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi,
+                   N->getOperand(1));
+}
+
 void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
                                         SDValue &Hi) {
   assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
@@ -1141,6 +1162,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
   case ISD::LOAD:              Res = WidenVecRes_LOAD(N); break;
   case ISD::SCALAR_TO_VECTOR:  Res = WidenVecRes_SCALAR_TO_VECTOR(N); break;
+  case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_SIGN_EXTEND_INREG(N); break;
   case ISD::SELECT:            Res = WidenVecRes_SELECT(N); break;
   case ISD::SELECT_CC:         Res = WidenVecRes_SELECT_CC(N); break;
   case ISD::UNDEF:             Res = WidenVecRes_UNDEF(N); break;
@@ -1691,6 +1713,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
                      WidenVT, N->getOperand(0));
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_SIGN_EXTEND_INREG(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue WidenLHS = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(),
+                     WidenVT, WidenLHS, N->getOperand(1));
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   unsigned WidenNumElts = WidenVT.getVectorNumElements();
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index d53de34..b2ee8bb 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -20,10 +20,16 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+cl::opt<bool>
+DisableInstScheduling("disable-inst-scheduling",
+                      cl::init(false),
+                      cl::desc("Disable instruction scheduling"));
+
 ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
   : ScheduleDAG(mf) {
 }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c38c79b..da55e6b 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -27,6 +27,7 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -47,6 +48,8 @@
 #include <cmath>
 using namespace llvm;
 
+extern cl::opt<bool> DisableInstScheduling;
+
 /// makeVTList - Return an instance of the SDVTList struct initialized with the
 /// specified members.
 static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) {
@@ -551,6 +554,9 @@ void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes,
     }
 
     DeallocateNode(N);
+
+    // Remove the ordering of this node.
+    if (Ordering) Ordering->remove(N);
   }
 }
 
@@ -576,6 +582,9 @@ void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
   N->DropOperands();
 
   DeallocateNode(N);
+
+  // Remove the ordering of this node.
+  if (Ordering) Ordering->remove(N);
 }
 
 void SelectionDAG::DeallocateNode(SDNode *N) {
@@ -587,6 +596,9 @@ void SelectionDAG::DeallocateNode(SDNode *N) {
   N->NodeType = ISD::DELETED_NODE;
 
   NodeAllocator.Deallocate(AllNodes.remove(N));
+
+  // Remove the ordering of this node.
+  if (Ordering) Ordering->remove(N);
 }
 
 /// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
@@ -690,7 +702,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 1);
   AddNodeIDCustom(ID, N);
-  return CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  if (Ordering) Ordering->remove(Node);
+  return Node;
 }
 
 /// FindModifiedNodeSlot - Find a slot for the specified node if its operands
@@ -707,7 +721,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 2);
   AddNodeIDCustom(ID, N);
-  return CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  if (Ordering) Ordering->remove(Node);
+  return Node;
 }
 
 
@@ -724,7 +740,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, NumOps);
   AddNodeIDCustom(ID, N);
-  return CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  if (Ordering) Ordering->remove(Node);
+  return Node;
 }
 
 /// VerifyNode - Sanity check the given node.  Aborts if it is invalid.
@@ -777,8 +795,13 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
 SelectionDAG::SelectionDAG(TargetLowering &tli, FunctionLoweringInfo &fli)
   : TLI(tli), FLI(fli), DW(0),
     EntryNode(ISD::EntryToken, DebugLoc::getUnknownLoc(),
-    getVTList(MVT::Other)), Root(getEntryNode()) {
+              getVTList(MVT::Other)),
+    Root(getEntryNode()), Ordering(0) {
   AllNodes.push_back(&EntryNode);
+  if (DisableInstScheduling) {
+    Ordering = new NodeOrdering();
+    Ordering->add(&EntryNode);
+  }
 }
 
 void SelectionDAG::init(MachineFunction &mf, MachineModuleInfo *mmi,
@@ -791,6 +814,7 @@ void SelectionDAG::init(MachineFunction &mf, MachineModuleInfo *mmi,
 
 SelectionDAG::~SelectionDAG() {
   allnodes_clear();
+  delete Ordering;
 }
 
 void SelectionDAG::allnodes_clear() {
@@ -816,6 +840,10 @@ void SelectionDAG::clear() {
   EntryNode.UseList = 0;
   AllNodes.push_back(&EntryNode);
   Root = getEntryNode();
+  if (DisableInstScheduling) {
+    Ordering = new NodeOrdering();
+    Ordering->add(&EntryNode);
+  }
 }
 
 SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
@@ -831,8 +859,12 @@ SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
 }
 
 SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, DebugLoc DL, EVT VT) {
+  assert(!VT.isVector() &&
+         "getZeroExtendInReg should use the vector element type instead of "
+         "the vector type!");
   if (Op.getValueType() == VT) return Op;
-  APInt Imm = APInt::getLowBitsSet(Op.getValueSizeInBits(),
+  unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+  APInt Imm = APInt::getLowBitsSet(BitWidth,
                                    VT.getSizeInBits());
   return getNode(ISD::AND, DL, Op.getValueType(), Op,
                  getConstant(Imm, Op.getValueType()));
@@ -872,14 +904,17 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT) {
   ID.AddPointer(&Val);
   void *IP = 0;
   SDNode *N = NULL;
-  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP)))
+  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP))) {
+    if (Ordering) Ordering->add(N);
     if (!VT.isVector())
       return SDValue(N, 0);
+  }
   if (!N) {
     N = NodeAllocator.Allocate<ConstantSDNode>();
     new (N) ConstantSDNode(isT, &Val, EltVT);
     CSEMap.InsertNode(N, IP);
     AllNodes.push_back(N);
+    if (Ordering) Ordering->add(N);
   }
 
   SDValue Result(N, 0);
@@ -916,14 +951,17 @@ SDValue SelectionDAG::getConstantFP(const ConstantFP& V, EVT VT, bool isTarget){
   ID.AddPointer(&V);
   void *IP = 0;
   SDNode *N = NULL;
-  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP)))
+  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP))) {
+    if (Ordering) Ordering->add(N);
     if (!VT.isVector())
       return SDValue(N, 0);
+  }
   if (!N) {
     N = NodeAllocator.Allocate<ConstantFPSDNode>();
     new (N) ConstantFPSDNode(isTarget, &V, EltVT);
     CSEMap.InsertNode(N, IP);
     AllNodes.push_back(N);
+    if (Ordering) Ordering->add(N);
   }
 
   SDValue Result(N, 0);
@@ -978,12 +1016,15 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV,
   ID.AddInteger(Offset);
   ID.AddInteger(TargetFlags);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<GlobalAddressSDNode>();
   new (N) GlobalAddressSDNode(Opc, GV, VT, Offset, TargetFlags);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -993,12 +1034,15 @@ SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
   AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
   ID.AddInteger(FI);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<FrameIndexSDNode>();
   new (N) FrameIndexSDNode(FI, VT, isTarget);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1012,12 +1056,15 @@ SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
   ID.AddInteger(JTI);
   ID.AddInteger(TargetFlags);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<JumpTableSDNode>();
   new (N) JumpTableSDNode(JTI, VT, isTarget, TargetFlags);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1037,12 +1084,15 @@ SDValue SelectionDAG::getConstantPool(Constant *C, EVT VT,
   ID.AddPointer(C);
   ID.AddInteger(TargetFlags);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<ConstantPoolSDNode>();
   new (N) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1063,12 +1113,15 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
   C->AddSelectionDAGCSEId(ID);
   ID.AddInteger(TargetFlags);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<ConstantPoolSDNode>();
   new (N) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1077,12 +1130,15 @@ SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
   AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), 0, 0);
   ID.AddPointer(MBB);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<BasicBlockSDNode>();
   new (N) BasicBlockSDNode(MBB);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1098,6 +1154,7 @@ SDValue SelectionDAG::getValueType(EVT VT) {
   N = NodeAllocator.Allocate<VTSDNode>();
   new (N) VTSDNode(VT);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1107,6 +1164,7 @@ SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
   N = NodeAllocator.Allocate<ExternalSymbolSDNode>();
   new (N) ExternalSymbolSDNode(false, Sym, 0, VT);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1119,6 +1177,7 @@ SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
   N = NodeAllocator.Allocate<ExternalSymbolSDNode>();
   new (N) ExternalSymbolSDNode(true, Sym, TargetFlags, VT);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1131,6 +1190,7 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
     new (N) CondCodeSDNode(Cond);
     CondCodeNodes[Cond] = N;
     AllNodes.push_back(N);
+    if (Ordering) Ordering->add(N);
   }
   return SDValue(CondCodeNodes[Cond], 0);
 }
@@ -1223,8 +1283,10 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, DebugLoc dl, SDValue N1,
     ID.AddInteger(MaskVec[i]);
 
   void* IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
 
   // Allocate the mask array for the node out of the BumpPtrAllocator, since
   // SDNode doesn't have access to it.  This memory will be "leaked" when
@@ -1236,6 +1298,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, DebugLoc dl, SDValue N1,
   new (N) ShuffleVectorSDNode(VT, dl, N1, N2, MaskAlloc);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1253,12 +1316,15 @@ SDValue SelectionDAG::getConvertRndSat(EVT VT, DebugLoc dl,
   SDValue Ops[] = { Val, DTy, STy, Rnd, Sat };
   AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), &Ops[0], 5);
   void* IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   CvtRndSatSDNode *N = NodeAllocator.Allocate<CvtRndSatSDNode>();
   new (N) CvtRndSatSDNode(VT, dl, Ops, 5, Code);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1267,12 +1333,15 @@ SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
   AddNodeIDNode(ID, ISD::Register, getVTList(VT), 0, 0);
   ID.AddInteger(RegNo);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<RegisterSDNode>();
   new (N) RegisterSDNode(RegNo, VT);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1284,12 +1353,15 @@ SDValue SelectionDAG::getLabel(unsigned Opcode, DebugLoc dl,
   AddNodeIDNode(ID, Opcode, getVTList(MVT::Other), &Ops[0], 1);
   ID.AddInteger(LabelID);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<LabelSDNode>();
   new (N) LabelSDNode(Opcode, dl, Root, LabelID);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1303,12 +1375,15 @@ SDValue SelectionDAG::getBlockAddress(BlockAddress *BA, EVT VT,
   ID.AddPointer(BA);
   ID.AddInteger(TargetFlags);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<BlockAddressSDNode>();
   new (N) BlockAddressSDNode(Opc, VT, BA, TargetFlags);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1321,13 +1396,16 @@ SDValue SelectionDAG::getSrcValue(const Value *V) {
   ID.AddPointer(V);
 
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
 
   SDNode *N = NodeAllocator.Allocate<SrcValueSDNode>();
   new (N) SrcValueSDNode(V);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1480,7 +1558,7 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
   if (Op.getValueType().isVector())
     return false;
 
-  unsigned BitWidth = Op.getValueSizeInBits();
+  unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
   return MaskedValueIsZero(Op, APInt::getSignBit(BitWidth), Depth);
 }
 
@@ -1503,7 +1581,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
                                      APInt &KnownZero, APInt &KnownOne,
                                      unsigned Depth) const {
   unsigned BitWidth = Mask.getBitWidth();
-  assert(BitWidth == Op.getValueType().getSizeInBits() &&
+  assert(BitWidth == Op.getValueType().getScalarType().getSizeInBits() &&
          "Mask size mismatches value type size!");
 
   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
@@ -1760,7 +1838,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
   }
   case ISD::ZERO_EXTEND: {
     EVT InVT = Op.getOperand(0).getValueType();
-    unsigned InBits = InVT.getSizeInBits();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
     APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits) & Mask;
     APInt InMask    = Mask;
     InMask.trunc(InBits);
@@ -1774,7 +1852,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
   }
   case ISD::SIGN_EXTEND: {
     EVT InVT = Op.getOperand(0).getValueType();
-    unsigned InBits = InVT.getSizeInBits();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
     APInt InSignBit = APInt::getSignBit(InBits);
     APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits) & Mask;
     APInt InMask = Mask;
@@ -1815,7 +1893,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
   }
   case ISD::ANY_EXTEND: {
     EVT InVT = Op.getOperand(0).getValueType();
-    unsigned InBits = InVT.getSizeInBits();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
     APInt InMask = Mask;
     InMask.trunc(InBits);
     KnownZero.trunc(InBits);
@@ -1827,7 +1905,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
   }
   case ISD::TRUNCATE: {
     EVT InVT = Op.getOperand(0).getValueType();
-    unsigned InBits = InVT.getSizeInBits();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
     APInt InMask = Mask;
     InMask.zext(InBits);
     KnownZero.zext(InBits);
@@ -1960,7 +2038,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
 unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
   EVT VT = Op.getValueType();
   assert(VT.isInteger() && "Invalid VT!");
-  unsigned VTBits = VT.getSizeInBits();
+  unsigned VTBits = VT.getScalarType().getSizeInBits();
   unsigned Tmp, Tmp2;
   unsigned FirstAnswer = 1;
 
@@ -1987,7 +2065,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
   }
 
   case ISD::SIGN_EXTEND:
-    Tmp = VTBits-Op.getOperand(0).getValueType().getSizeInBits();
+    Tmp = VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
     return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
 
   case ISD::SIGN_EXTEND_INREG:
@@ -2238,13 +2316,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT) {
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, Opcode, getVTList(VT), 0, 0);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<SDNode>();
   new (N) SDNode(Opcode, DL, getVTList(VT));
   CSEMap.InsertNode(N, IP);
 
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -2349,6 +2430,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     assert(VT.isFloatingPoint() &&
            Operand.getValueType().isFloatingPoint() && "Invalid FP cast!");
     if (Operand.getValueType() == VT) return Operand;  // noop conversion.
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
     if (Operand.getOpcode() == ISD::UNDEF)
       return getUNDEF(VT);
     break;
@@ -2356,8 +2441,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
            "Invalid SIGN_EXTEND!");
     if (Operand.getValueType() == VT) return Operand;   // noop extension
-    assert(Operand.getValueType().bitsLT(VT)
-           && "Invalid sext node, dst < src!");
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid sext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
     if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
       return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
     break;
@@ -2365,8 +2454,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
            "Invalid ZERO_EXTEND!");
     if (Operand.getValueType() == VT) return Operand;   // noop extension
-    assert(Operand.getValueType().bitsLT(VT)
-           && "Invalid zext node, dst < src!");
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid zext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
     if (OpOpcode == ISD::ZERO_EXTEND)   // (zext (zext x)) -> (zext x)
       return getNode(ISD::ZERO_EXTEND, DL, VT,
                      Operand.getNode()->getOperand(0));
@@ -2375,8 +2468,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
            "Invalid ANY_EXTEND!");
     if (Operand.getValueType() == VT) return Operand;   // noop extension
-    assert(Operand.getValueType().bitsLT(VT)
-           && "Invalid anyext node, dst < src!");
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid anyext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
     if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND)
       // (ext (zext x)) -> (zext x)  and  (ext (sext x)) -> (sext x)
       return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
@@ -2385,14 +2482,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
            "Invalid TRUNCATE!");
     if (Operand.getValueType() == VT) return Operand;   // noop truncate
-    assert(Operand.getValueType().bitsGT(VT)
-           && "Invalid truncate node, src < dst!");
+    assert(Operand.getValueType().getScalarType().bitsGT(VT.getScalarType()) &&
+           "Invalid truncate node, src < dst!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
     if (OpOpcode == ISD::TRUNCATE)
       return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
     else if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
              OpOpcode == ISD::ANY_EXTEND) {
       // If the source is smaller than the dest, we still need an extend.
-      if (Operand.getNode()->getOperand(0).getValueType().bitsLT(VT))
+      if (Operand.getNode()->getOperand(0).getValueType().getScalarType()
+            .bitsLT(VT.getScalarType()))
         return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
       else if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT))
         return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
@@ -2447,8 +2549,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     SDValue Ops[1] = { Operand };
     AddNodeIDNode(ID, Opcode, VTs, Ops, 1);
     void *IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
+    }
     N = NodeAllocator.Allocate<UnarySDNode>();
     new (N) UnarySDNode(Opcode, DL, VTs, Operand);
     CSEMap.InsertNode(N, IP);
@@ -2458,6 +2562,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
   }
 
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -2623,6 +2728,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     assert(VT == N1.getValueType() && "Not an inreg extend!");
     assert(VT.isInteger() && EVT.isInteger() &&
            "Cannot *_EXTEND_INREG FP types");
+    assert(!EVT.isVector() &&
+           "AssertSExt/AssertZExt type should be the vector element type "
+           "rather than the vector type!");
     assert(EVT.bitsLE(VT) && "Not extending!");
     if (VT == EVT) return N1; // noop assertion.
     break;
@@ -2632,12 +2740,15 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     assert(VT == N1.getValueType() && "Not an inreg extend!");
     assert(VT.isInteger() && EVT.isInteger() &&
            "Cannot *_EXTEND_INREG FP types");
-    assert(EVT.bitsLE(VT) && "Not extending!");
+    assert(!EVT.isVector() &&
+           "SIGN_EXTEND_INREG type should be the vector element type rather "
+           "than the vector type!");
+    assert(EVT.bitsLE(VT.getScalarType()) && "Not extending!");
     if (EVT == VT) return N1;  // Not actually extending
 
     if (N1C) {
       APInt Val = N1C->getAPIntValue();
-      unsigned FromBits = cast<VTSDNode>(N2)->getVT().getSizeInBits();
+      unsigned FromBits = EVT.getSizeInBits();
       Val <<= Val.getBitWidth()-FromBits;
       Val = Val.ashr(Val.getBitWidth()-FromBits);
       return getConstant(Val, VT);
@@ -2859,8 +2970,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTs, Ops, 2);
     void *IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
+    }
     N = NodeAllocator.Allocate<BinarySDNode>();
     new (N) BinarySDNode(Opcode, DL, VTs, N1, N2);
     CSEMap.InsertNode(N, IP);
@@ -2870,6 +2983,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
   }
 
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -2936,8 +3050,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
     void *IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
+    }
     N = NodeAllocator.Allocate<TernarySDNode>();
     new (N) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
     CSEMap.InsertNode(N, IP);
@@ -2945,7 +3061,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     N = NodeAllocator.Allocate<TernarySDNode>();
     new (N) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
   }
+
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -3541,12 +3659,14 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
   void* IP = 0;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
     cast<AtomicSDNode>(E)->refineAlignment(MMO);
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
   }
   SDNode* N = NodeAllocator.Allocate<AtomicSDNode>();
   new (N) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain, Ptr, Cmp, Swp, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3604,12 +3724,14 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
   void* IP = 0;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
     cast<AtomicSDNode>(E)->refineAlignment(MMO);
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
   }
   SDNode* N = NodeAllocator.Allocate<AtomicSDNode>();
   new (N) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain, Ptr, Val, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3682,6 +3804,7 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
     void *IP = 0;
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
       cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
     }
 
@@ -3693,6 +3816,7 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
     new (N) MemIntrinsicSDNode(Opcode, dl, VTList, Ops, NumOps, MemVT, MMO);
   }
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3732,16 +3856,15 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, DebugLoc dl,
     assert(VT == MemVT && "Non-extending load from different memory type!");
   } else {
     // Extending load.
-    if (VT.isVector())
-      assert(MemVT.getVectorNumElements() == VT.getVectorNumElements() &&
-             "Invalid vector extload!");
-    else
-      assert(MemVT.bitsLT(VT) &&
-             "Should only be an extending load, not truncating!");
-    assert((ExtType == ISD::EXTLOAD || VT.isInteger()) &&
-           "Cannot sign/zero extend a FP/Vector load!");
+    assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
+           "Should only be an extending load, not truncating!");
     assert(VT.isInteger() == MemVT.isInteger() &&
            "Cannot convert from FP to Int or Int -> FP!");
+    assert(VT.isVector() == MemVT.isVector() &&
+           "Cannot use trunc store to convert to or from a vector!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
+           "Cannot use trunc store to change the number of vector elements!");
   }
 
   bool Indexed = AM != ISD::UNINDEXED;
@@ -3758,12 +3881,14 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, DebugLoc dl,
   void *IP = 0;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
     cast<LoadSDNode>(E)->refineAlignment(MMO);
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
   }
   SDNode *N = NodeAllocator.Allocate<LoadSDNode>();
   new (N) LoadSDNode(Ops, dl, VTs, AM, ExtType, MemVT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3834,12 +3959,14 @@ SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
   void *IP = 0;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
     cast<StoreSDNode>(E)->refineAlignment(MMO);
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
   }
   SDNode *N = NodeAllocator.Allocate<StoreSDNode>();
   new (N) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED, false, VT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3874,10 +4001,15 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
   if (VT == SVT)
     return getStore(Chain, dl, Val, Ptr, MMO);
 
-  assert(VT.bitsGT(SVT) && "Not a truncation?");
+  assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
+         "Should only be a truncating store, not extending!");
   assert(VT.isInteger() == SVT.isInteger() &&
          "Can't do FP-INT conversion!");
-
+  assert(VT.isVector() == SVT.isVector() &&
+         "Cannot use trunc store to convert to or from a vector!");
+  assert((!VT.isVector() ||
+          VT.getVectorNumElements() == SVT.getVectorNumElements()) &&
+         "Cannot use trunc store to change the number of vector elements!");
 
   SDVTList VTs = getVTList(MVT::Other);
   SDValue Undef = getUNDEF(Ptr.getValueType());
@@ -3889,12 +4021,14 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
   void *IP = 0;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
     cast<StoreSDNode>(E)->refineAlignment(MMO);
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
   }
   SDNode *N = NodeAllocator.Allocate<StoreSDNode>();
   new (N) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED, true, SVT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3911,14 +4045,17 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base,
   ID.AddInteger(ST->getMemoryVT().getRawBits());
   ID.AddInteger(ST->getRawSubclassData());
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<StoreSDNode>();
   new (N) StoreSDNode(Ops, dl, VTs, AM,
                       ST->isTruncatingStore(), ST->getMemoryVT(),
                       ST->getMemOperand());
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3984,8 +4121,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     AddNodeIDNode(ID, Opcode, VTs, Ops, NumOps);
     void *IP = 0;
 
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
+    }
 
     N = NodeAllocator.Allocate<SDNode>();
     new (N) SDNode(Opcode, DL, VTs, Ops, NumOps);
@@ -3996,6 +4135,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
   }
 
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -4051,8 +4191,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
     void *IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
+    }
     if (NumOps == 1) {
       N = NodeAllocator.Allocate<UnarySDNode>();
       new (N) UnarySDNode(Opcode, DL, VTList, Ops[0]);
@@ -4083,6 +4225,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
     }
   }
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -4166,7 +4309,7 @@ SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
                           I->VTs[2] == VT3 && I->VTs[3] == VT4)
       return *I;
 
-  EVT *Array = Allocator.Allocate<EVT>(3);
+  EVT *Array = Allocator.Allocate<EVT>(4);
   Array[0] = VT1;
   Array[1] = VT2;
   Array[2] = VT3;
@@ -4545,8 +4688,10 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
   if (VTs.VTs[VTs.NumVTs-1] != MVT::Flag) {
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opc, VTs, Ops, NumOps);
-    if (SDNode *ON = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *ON = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(ON);
       return ON;
+    }
   }
 
   if (!RemoveNodeFromCSEMaps(N))
@@ -4610,6 +4755,7 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
 
   if (IP)
     CSEMap.InsertNode(N, IP);   // Memoize the new node.
+  if (Ordering) Ordering->add(N);
   return N;
 }
 
@@ -4748,8 +4894,10 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc DL, SDVTList VTs,
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, ~Opcode, VTs, Ops, NumOps);
     IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return cast<MachineSDNode>(E);
+    }
   }
 
   // Allocate a new MachineSDNode.
@@ -4771,6 +4919,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc DL, SDVTList VTs,
     CSEMap.InsertNode(N, IP);
 
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -4807,8 +4956,10 @@ SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
     void *IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return E;
+    }
   }
   return NULL;
 }
@@ -5867,6 +6018,99 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
                  &Scalars[0], Scalars.size());
 }
 
+
+/// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a 
+/// location that is 'Dist' units away from the location that the 'Base' load 
+/// is loading from.
+bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, 
+                                     unsigned Bytes, int Dist) const {
+  if (LD->getChain() != Base->getChain())
+    return false;
+  EVT VT = LD->getValueType(0);
+  if (VT.getSizeInBits() / 8 != Bytes)
+    return false;
+
+  SDValue Loc = LD->getOperand(1);
+  SDValue BaseLoc = Base->getOperand(1);
+  if (Loc.getOpcode() == ISD::FrameIndex) {
+    if (BaseLoc.getOpcode() != ISD::FrameIndex)
+      return false;
+    const MachineFrameInfo *MFI = getMachineFunction().getFrameInfo();
+    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
+    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+    int FS  = MFI->getObjectSize(FI);
+    int BFS = MFI->getObjectSize(BFI);
+    if (FS != BFS || FS != (int)Bytes) return false;
+    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
+  }
+  if (Loc.getOpcode() == ISD::ADD && Loc.getOperand(0) == BaseLoc) {
+    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Loc.getOperand(1));
+    if (V && (V->getSExtValue() == Dist*Bytes))
+      return true;
+  }
+
+  GlobalValue *GV1 = NULL;
+  GlobalValue *GV2 = NULL;
+  int64_t Offset1 = 0;
+  int64_t Offset2 = 0;
+  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+  if (isGA1 && isGA2 && GV1 == GV2)
+    return Offset1 == (Offset2 + Dist*Bytes);
+  return false;
+}
+
+
+/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
+/// it cannot be inferred.
+unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
+  // If this is a GlobalAddress + cst, return the alignment.
+  GlobalValue *GV;
+  int64_t GVOffset = 0;
+  if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset))
+    return MinAlign(GV->getAlignment(), GVOffset);
+
+  // If this is a direct reference to a stack slot, use information about the
+  // stack slot's alignment.
+  int FrameIdx = 1 << 31;
+  int64_t FrameOffset = 0;
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
+    FrameIdx = FI->getIndex();
+  } else if (Ptr.getOpcode() == ISD::ADD &&
+             isa<ConstantSDNode>(Ptr.getOperand(1)) &&
+             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+    FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+    FrameOffset = Ptr.getConstantOperandVal(1);
+  }
+
+  if (FrameIdx != (1 << 31)) {
+    // FIXME: Handle FI+CST.
+    const MachineFrameInfo &MFI = *getMachineFunction().getFrameInfo();
+    unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
+                                    FrameOffset);
+    if (MFI.isFixedObjectIndex(FrameIdx)) {
+      int64_t ObjectOffset = MFI.getObjectOffset(FrameIdx) + FrameOffset;
+
+      // The alignment of the frame index can be determined from its offset from
+      // the incoming frame position.  If the frame object is at offset 32 and
+      // the stack is guaranteed to be 16-byte aligned, then we know that the
+      // object is 16-byte aligned.
+      unsigned StackAlign = getTarget().getFrameInfo()->getStackAlignment();
+      unsigned Align = MinAlign(ObjectOffset, StackAlign);
+
+      // Finally, the frame object itself may have a known alignment.  Factor
+      // the alignment + offset into a new alignment.  For example, if we know
+      // the FI is 8 byte aligned, but the pointer is 4 off, we really have a
+      // 4-byte alignment of the resultant pointer.  Likewise align 4 + 4-byte
+      // offset = 4-byte alignment, align 4 + 1-byte offset = align 1, etc.
+      return std::max(Align, FIInfoAlign);
+    }
+    return FIInfoAlign;
+  }
+
+  return 0;
+}
+
 void SelectionDAG::dump() const {
   errs() << "SelectionDAG has " << AllNodes.size() << " nodes:";
 
@@ -5882,6 +6126,9 @@ void SelectionDAG::dump() const {
   errs() << "\n\n";
 }
 
+void SelectionDAG::NodeOrdering::dump() const {
+}
+
 void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const {
   print_types(OS, G);
   print_details(OS, G);
@@ -6022,4 +6269,3 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
       return false;
   return true;
 }
-
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 57d8903..7568384 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -583,6 +583,9 @@ void SelectionDAGBuilder::visit(Instruction &I) {
 }
 
 void SelectionDAGBuilder::visit(unsigned Opcode, User &I) {
+  // Tell the DAG that we're processing a new instruction.
+  DAG.NewInst();
+
   // Note: this doesn't use InstVisitor, because it has to work with
   // ConstantExpr's in addition to instructions.
   switch (Opcode) {
@@ -2108,7 +2111,7 @@ void SelectionDAGBuilder::visitSelect(User &I) {
 
     for (unsigned i = 0; i != NumValues; ++i)
       Values[i] = DAG.getNode(ISD::SELECT, getCurDebugLoc(),
-                              TrueVal.getValueType(), Cond,
+                              TrueVal.getNode()->getValueType(i), Cond,
                               SDValue(TrueVal.getNode(), TrueVal.getResNo() + i),
                               SDValue(FalseVal.getNode(), FalseVal.getResNo() + i));
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index c39437f..a640c7d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -60,8 +60,6 @@
 using namespace llvm;
 
 static cl::opt<bool>
-DisableLegalizeTypes("disable-legalize-types", cl::Hidden);
-static cl::opt<bool>
 EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
           cl::desc("Enable verbose messages in the \"fast\" "
                    "instruction selector"));
@@ -362,6 +360,39 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   return true;
 }
 
+/// SetDebugLoc - Update MF's and SDB's DebugLocs if debug information is
+/// attached with this instruction.
+static void SetDebugLoc(unsigned MDDbgKind,
+                        MetadataContext &TheMetadata,
+                        Instruction *I,
+                        SelectionDAGBuilder *SDB,
+                        FastISel *FastIS,
+                        MachineFunction *MF) {
+  if (!isa<DbgInfoIntrinsic>(I)) 
+    if (MDNode *Dbg = TheMetadata.getMD(MDDbgKind, I)) {
+      DILocation DILoc(Dbg);
+      DebugLoc Loc = ExtractDebugLocation(DILoc, MF->getDebugLocInfo());
+
+      SDB->setCurDebugLoc(Loc);
+
+      if (FastIS)
+        FastIS->setCurDebugLoc(Loc);
+
+      // If the function doesn't have a default debug location yet, set
+      // it. This is kind of a hack.
+      if (MF->getDefaultDebugLoc().isUnknown())
+        MF->setDefaultDebugLoc(Loc);
+    }
+}
+
+/// ResetDebugLoc - Set MF's and SDB's DebugLocs to Unknown.
+static void ResetDebugLoc(SelectionDAGBuilder *SDB,
+                          FastISel *FastIS) {
+  SDB->setCurDebugLoc(DebugLoc::getUnknownLoc());
+  if (FastIS)
+    FastIS->setCurDebugLoc(DebugLoc::getUnknownLoc());
+}
+
 void SelectionDAGISel::SelectBasicBlock(BasicBlock *LLVMBB,
                                         BasicBlock::iterator Begin,
                                         BasicBlock::iterator End,
@@ -373,20 +404,16 @@ void SelectionDAGISel::SelectBasicBlock(BasicBlock *LLVMBB,
   // Lower all of the non-terminator instructions. If a call is emitted
   // as a tail call, cease emitting nodes for this block.
   for (BasicBlock::iterator I = Begin; I != End && !SDB->HasTailCall; ++I) {
-    if (MDDbgKind) {
-      // Update DebugLoc if debug information is attached with this
-      // instruction.
-      if (!isa<DbgInfoIntrinsic>(I)) 
-        if (MDNode *Dbg = TheMetadata.getMD(MDDbgKind, I)) {
-          DILocation DILoc(Dbg);
-          DebugLoc Loc = ExtractDebugLocation(DILoc, MF->getDebugLocInfo());
-          SDB->setCurDebugLoc(Loc);
-          if (MF->getDefaultDebugLoc().isUnknown())
-            MF->setDefaultDebugLoc(Loc);
-        }
-    }
-    if (!isa<TerminatorInst>(I))
+    if (MDDbgKind)
+      SetDebugLoc(MDDbgKind, TheMetadata, I, SDB, 0, MF);
+
+    if (!isa<TerminatorInst>(I)) {
       SDB->visit(*I);
+
+      // Set the current debug location back to "unknown" so that it doesn't
+      // spuriously apply to subsequent instructions.
+      ResetDebugLoc(SDB, 0);
+    }
   }
 
   if (!SDB->HasTailCall) {
@@ -401,7 +428,9 @@ void SelectionDAGISel::SelectBasicBlock(BasicBlock *LLVMBB,
       HandlePHINodesInSuccessorBlocks(LLVMBB);
 
       // Lower the terminator after the copies are emitted.
+      SetDebugLoc(MDDbgKind, TheMetadata, LLVMBB->getTerminator(), SDB, 0, MF);
       SDB->visit(*LLVMBB->getTerminator());
+      ResetDebugLoc(SDB, 0);
     }
   }
 
@@ -498,75 +527,73 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
 
   // Second step, hack on the DAG until it only uses operations and types that
   // the target supports.
-  if (!DisableLegalizeTypes) {
-    if (ViewLegalizeTypesDAGs) CurDAG->viewGraph("legalize-types input for " +
-                                                 BlockName);
+  if (ViewLegalizeTypesDAGs) CurDAG->viewGraph("legalize-types input for " +
+                                               BlockName);
+
+  bool Changed;
+  if (TimePassesIsEnabled) {
+    NamedRegionTimer T("Type Legalization", GroupName);
+    Changed = CurDAG->LegalizeTypes();
+  } else {
+    Changed = CurDAG->LegalizeTypes();
+  }
+
+  DEBUG(errs() << "Type-legalized selection DAG:\n");
+  DEBUG(CurDAG->dump());
 
-    bool Changed;
+  if (Changed) {
+    if (ViewDAGCombineLT)
+      CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
+
+    // Run the DAG combiner in post-type-legalize mode.
     if (TimePassesIsEnabled) {
-      NamedRegionTimer T("Type Legalization", GroupName);
-      Changed = CurDAG->LegalizeTypes();
+      NamedRegionTimer T("DAG Combining after legalize types", GroupName);
+      CurDAG->Combine(NoIllegalTypes, *AA, OptLevel);
     } else {
-      Changed = CurDAG->LegalizeTypes();
+      CurDAG->Combine(NoIllegalTypes, *AA, OptLevel);
     }
 
-    DEBUG(errs() << "Type-legalized selection DAG:\n");
+    DEBUG(errs() << "Optimized type-legalized selection DAG:\n");
     DEBUG(CurDAG->dump());
+  }
 
-    if (Changed) {
-      if (ViewDAGCombineLT)
-        CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
-
-      // Run the DAG combiner in post-type-legalize mode.
-      if (TimePassesIsEnabled) {
-        NamedRegionTimer T("DAG Combining after legalize types", GroupName);
-        CurDAG->Combine(NoIllegalTypes, *AA, OptLevel);
-      } else {
-        CurDAG->Combine(NoIllegalTypes, *AA, OptLevel);
-      }
-
-      DEBUG(errs() << "Optimized type-legalized selection DAG:\n");
-      DEBUG(CurDAG->dump());
-    }
+  if (TimePassesIsEnabled) {
+    NamedRegionTimer T("Vector Legalization", GroupName);
+    Changed = CurDAG->LegalizeVectors();
+  } else {
+    Changed = CurDAG->LegalizeVectors();
+  }
 
+  if (Changed) {
     if (TimePassesIsEnabled) {
-      NamedRegionTimer T("Vector Legalization", GroupName);
-      Changed = CurDAG->LegalizeVectors();
+      NamedRegionTimer T("Type Legalization 2", GroupName);
+      Changed = CurDAG->LegalizeTypes();
     } else {
-      Changed = CurDAG->LegalizeVectors();
+      Changed = CurDAG->LegalizeTypes();
     }
 
-    if (Changed) {
-      if (TimePassesIsEnabled) {
-        NamedRegionTimer T("Type Legalization 2", GroupName);
-        Changed = CurDAG->LegalizeTypes();
-      } else {
-        Changed = CurDAG->LegalizeTypes();
-      }
-
-      if (ViewDAGCombineLT)
-        CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
+    if (ViewDAGCombineLT)
+      CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
 
-      // Run the DAG combiner in post-type-legalize mode.
-      if (TimePassesIsEnabled) {
-        NamedRegionTimer T("DAG Combining after legalize vectors", GroupName);
-        CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
-      } else {
-        CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
-      }
-
-      DEBUG(errs() << "Optimized vector-legalized selection DAG:\n");
-      DEBUG(CurDAG->dump());
+    // Run the DAG combiner in post-type-legalize mode.
+    if (TimePassesIsEnabled) {
+      NamedRegionTimer T("DAG Combining after legalize vectors", GroupName);
+      CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
+    } else {
+      CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
     }
+
+    DEBUG(errs() << "Optimized vector-legalized selection DAG:\n");
+    DEBUG(CurDAG->dump());
   }
 
   if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName);
 
   if (TimePassesIsEnabled) {
     NamedRegionTimer T("DAG Legalization", GroupName);
-    CurDAG->Legalize(DisableLegalizeTypes, OptLevel);
+    CurDAG->Legalize(OptLevel);
   } else {
-    CurDAG->Legalize(DisableLegalizeTypes, OptLevel);
+    CurDAG->Legalize(OptLevel);
   }
 
   DEBUG(errs() << "Legalized selection DAG:\n");
@@ -738,24 +765,11 @@ void SelectionDAGISel::SelectAllBasicBlocks(Function &Fn,
       FastIS->startNewBlock(BB);
       // Do FastISel on as many instructions as possible.
       for (; BI != End; ++BI) {
-        if (MDDbgKind) {
-          // Update DebugLoc if debug information is attached with this
-          // instruction.
-          if (!isa<DbgInfoIntrinsic>(BI)) 
-            if (MDNode *Dbg = TheMetadata.getMD(MDDbgKind, BI)) {
-              DILocation DILoc(Dbg);
-              DebugLoc Loc = ExtractDebugLocation(DILoc,
-                                                  MF.getDebugLocInfo());
-              FastIS->setCurDebugLoc(Loc);
-              if (MF.getDefaultDebugLoc().isUnknown())
-                MF.setDefaultDebugLoc(Loc);
-            }
-        }
-
         // Just before the terminator instruction, insert instructions to
         // feed PHI nodes in successor blocks.
         if (isa<TerminatorInst>(BI))
           if (!HandlePHINodesInSuccessorBlocksFast(LLVMBB, FastIS)) {
+            ResetDebugLoc(SDB, FastIS);
             if (EnableFastISelVerbose || EnableFastISelAbort) {
               errs() << "FastISel miss: ";
               BI->dump();
@@ -765,13 +779,18 @@ void SelectionDAGISel::SelectAllBasicBlocks(Function &Fn,
             break;
           }
 
+        if (MDDbgKind)
+          SetDebugLoc(MDDbgKind, TheMetadata, BI, SDB, FastIS, &MF);
+
         // First try normal tablegen-generated "fast" selection.
-        if (FastIS->SelectInstruction(BI))
+        if (FastIS->SelectInstruction(BI)) {
+          ResetDebugLoc(SDB, FastIS);
           continue;
+        }
 
-        // Next, try calling the target to attempt to handle the instruction.
-        if (FastIS->TargetSelectInstruction(BI))
-          continue;
+        // Clear out the debug location so that it doesn't carry over to
+        // unrelated instructions.
+        ResetDebugLoc(SDB, FastIS);
 
         // Then handle certain instructions as single-LLVM-Instruction blocks.
         if (isa<CallInst>(BI)) {
@@ -786,10 +805,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(Function &Fn,
               R = FuncInfo->CreateRegForValue(BI);
           }
 
-          SDB->setCurDebugLoc(FastIS->getCurDebugLoc());
-
           bool HadTailCall = false;
-          SelectBasicBlock(LLVMBB, BI, next(BI), HadTailCall);
+          SelectBasicBlock(LLVMBB, BI, llvm::next(BI), HadTailCall);
 
           // If the call was emitted as a tail call, we're done with the block.
           if (HadTailCall) {
@@ -823,9 +840,6 @@ void SelectionDAGISel::SelectAllBasicBlocks(Function &Fn,
     // not handled by FastISel. If FastISel is not run, this is the entire
     // block.
     if (BI != End) {
-      // If FastISel is run and it has known DebugLoc then use it.
-      if (FastIS && !FastIS->getCurDebugLoc().isUnknown())
-        SDB->setCurDebugLoc(FastIS->getCurDebugLoc());
       bool HadTailCall;
       SelectBasicBlock(LLVMBB, BI, End, HadTailCall);
     }
@@ -1313,14 +1327,6 @@ SDNode *SelectionDAGISel::Select_UNDEF(const SDValue &N) {
                               N.getValueType());
 }
 
-SDNode *SelectionDAGISel::Select_DBG_LABEL(const SDValue &N) {
-  SDValue Chain = N.getOperand(0);
-  unsigned C = cast<LabelSDNode>(N)->getLabelID();
-  SDValue Tmp = CurDAG->getTargetConstant(C, MVT::i32);
-  return CurDAG->SelectNodeTo(N.getNode(), TargetInstrInfo::DBG_LABEL,
-                              MVT::Other, Tmp, Chain);
-}
-
 SDNode *SelectionDAGISel::Select_EH_LABEL(const SDValue &N) {
   SDValue Chain = N.getOperand(0);
   unsigned C = cast<LabelSDNode>(N)->getLabelID();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
index c5adc50..83fa5a8 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -29,14 +29,14 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/config.h"
-#include <fstream>
 using namespace llvm;
 
 namespace llvm {
   template<>
   struct DOTGraphTraits<SelectionDAG*> : public DefaultDOTGraphTraits {
 
-    DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+    explicit DOTGraphTraits(bool isSimple=false) :
+      DefaultDOTGraphTraits(isSimple) {}
 
     static bool hasEdgeDestLabels() {
       return true;
@@ -50,6 +50,11 @@ namespace llvm {
       return ((const SDNode *) Node)->getValueType(i).getEVTString();
     }
 
+    template<typename EdgeIter>
+    static std::string getEdgeSourceLabel(const void *Node, EdgeIter I) {
+      return itostr(I - SDNodeIterator::begin((SDNode *) Node));
+    }
+
     /// edgeTargetsEdgeSource - This method returns true if this outgoing edge
     /// should actually target another edge source, not a node.  If this method
     /// is implemented, getEdgeTarget should be implemented.
@@ -76,12 +81,12 @@ namespace llvm {
     static bool renderGraphFromBottomUp() {
       return true;
     }
-    
+
     static bool hasNodeAddressLabel(const SDNode *Node,
                                     const SelectionDAG *Graph) {
       return true;
     }
-    
+
     /// If you want to override the dot attributes printed for a particular
     /// edge, override this method.
     template<typename EdgeIter>
@@ -94,7 +99,7 @@ namespace llvm {
         return "color=blue,style=dashed";
       return "";
     }
-    
+
 
     static std::string getSimpleNodeLabel(const SDNode *Node,
                                           const SelectionDAG *G) {
@@ -132,7 +137,7 @@ namespace llvm {
 
 std::string DOTGraphTraits<SelectionDAG*>::getNodeLabel(const SDNode *Node,
                                                         const SelectionDAG *G) {
-  return DOTGraphTraits<SelectionDAG*>::getSimpleNodeLabel (Node, G);
+  return DOTGraphTraits<SelectionDAG*>::getSimpleNodeLabel(Node, G);
 }
 
 
@@ -142,7 +147,7 @@ std::string DOTGraphTraits<SelectionDAG*>::getNodeLabel(const SDNode *Node,
 void SelectionDAG::viewGraph(const std::string &Title) {
 // This code is only for debugging!
 #ifndef NDEBUG
-  ViewGraph(this, "dag." + getMachineFunction().getFunction()->getNameStr(), 
+  ViewGraph(this, "dag." + getMachineFunction().getFunction()->getNameStr(),
             false, Title);
 #else
   errs() << "SelectionDAG::viewGraph is only available in debug builds on "
@@ -186,7 +191,7 @@ const std::string SelectionDAG::getGraphAttrs(const SDNode *N) const {
 #ifndef NDEBUG
   std::map<const SDNode *, std::string>::const_iterator I =
     NodeGraphAttrs.find(N);
-    
+
   if (I != NodeGraphAttrs.end())
     return I->second;
   else
@@ -252,8 +257,7 @@ void SelectionDAG::setSubgraphColor(SDNode *N, const char *Color) {
     // Visually mark that we hit the limit
     if (strcmp(Color, "red") == 0) {
       setSubgraphColorHelper(N, "blue", visited, 0, printed);
-    }
-    else if (strcmp(Color, "yellow") == 0) {
+    } else if (strcmp(Color, "yellow") == 0) {
       setSubgraphColorHelper(N, "green", visited, 0, printed);
     }
   }
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 68bc2d6..1026169 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -911,7 +911,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
                                           TargetLoweringOpt &TLO,
                                           unsigned Depth) const {
   unsigned BitWidth = DemandedMask.getBitWidth();
-  assert(Op.getValueSizeInBits() == BitWidth &&
+  assert(Op.getValueType().getScalarType().getSizeInBits() == BitWidth &&
          "Mask size mismatches value type size!");
   APInt NewMask = DemandedMask;
   DebugLoc dl = Op.getDebugLoc();
@@ -1240,7 +1240,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       // demand the input sign bit.
       APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
       if (HighBits.intersects(NewMask))
-        InDemandedMask |= APInt::getSignBit(VT.getSizeInBits());
+        InDemandedMask |= APInt::getSignBit(VT.getScalarType().getSizeInBits());
       
       if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask,
                                KnownZero, KnownOne, TLO, Depth+1))
@@ -2184,48 +2184,6 @@ bool TargetLowering::isGAPlusOffset(SDNode *N, GlobalValue* &GA,
 }
 
 
-/// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a 
-/// location that is 'Dist' units away from the location that the 'Base' load 
-/// is loading from.
-bool TargetLowering::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, 
-                                       unsigned Bytes, int Dist, 
-                                       const MachineFrameInfo *MFI) const {
-  if (LD->getChain() != Base->getChain())
-    return false;
-  EVT VT = LD->getValueType(0);
-  if (VT.getSizeInBits() / 8 != Bytes)
-    return false;
-
-  SDValue Loc = LD->getOperand(1);
-  SDValue BaseLoc = Base->getOperand(1);
-  if (Loc.getOpcode() == ISD::FrameIndex) {
-    if (BaseLoc.getOpcode() != ISD::FrameIndex)
-      return false;
-    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
-    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
-    int FS  = MFI->getObjectSize(FI);
-    int BFS = MFI->getObjectSize(BFI);
-    if (FS != BFS || FS != (int)Bytes) return false;
-    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
-  }
-  if (Loc.getOpcode() == ISD::ADD && Loc.getOperand(0) == BaseLoc) {
-    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Loc.getOperand(1));
-    if (V && (V->getSExtValue() == Dist*Bytes))
-      return true;
-  }
-
-  GlobalValue *GV1 = NULL;
-  GlobalValue *GV2 = NULL;
-  int64_t Offset1 = 0;
-  int64_t Offset2 = 0;
-  bool isGA1 = isGAPlusOffset(Loc.getNode(), GV1, Offset1);
-  bool isGA2 = isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
-  if (isGA1 && isGA2 && GV1 == GV2)
-    return Offset1 == (Offset2 + Dist*Bytes);
-  return false;
-}
-
-
 SDValue TargetLowering::
 PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
   // Default implementation: no optimization.
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp
index 5876371..ed407eb 100644
--- a/lib/CodeGen/SimpleRegisterCoalescing.cpp
+++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp
@@ -130,7 +130,8 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA,
   // See PR3149:
   // 172     %ECX<def> = MOV32rr %reg1039<kill>
   // 180     INLINEASM <es:subl $5,$1
-  //         sbbl $3,$0>, 10, %EAX<def>, 14, %ECX<earlyclobber,def>, 9, %EAX<kill>,
+  //         sbbl $3,$0>, 10, %EAX<def>, 14, %ECX<earlyclobber,def>, 9,
+  //         %EAX<kill>,
   // 36, <fi#0>, 1, %reg0, 0, 9, %ECX<kill>, 36, <fi#1>, 1, %reg0, 0
   // 188     %EAX<def> = MOV32rr %EAX<kill>
   // 196     %ECX<def> = MOV32rr %ECX<kill>
@@ -281,12 +282,12 @@ TransferImplicitOps(MachineInstr *MI, MachineInstr *NewMI) {
   }
 }
 
-/// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy with IntA
-/// being the source and IntB being the dest, thus this defines a value number
-/// in IntB.  If the source value number (in IntA) is defined by a commutable
-/// instruction and its other operand is coalesced to the copy dest register,
-/// see if we can transform the copy into a noop by commuting the definition. For
-/// example,
+/// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy with
+/// IntA being the source and IntB being the dest, thus this defines a value
+/// number in IntB.  If the source value number (in IntA) is defined by a
+/// commutable instruction and its other operand is coalesced to the copy dest
+/// register, see if we can transform the copy into a noop by commuting the
+/// definition. For example,
 ///
 ///  A3 = op A2 B0<kill>
 ///    ...
@@ -508,7 +509,8 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA,
     if (BHasSubRegs) {
       for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) {
         LiveInterval &SRLI = li_->getInterval(*SR);
-        SRLI.MergeInClobberRange(*li_, AI->start, End, li_->getVNInfoAllocator());
+        SRLI.MergeInClobberRange(*li_, AI->start, End,
+                                 li_->getVNInfoAllocator());
       }
     }
   }
@@ -708,7 +710,8 @@ bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt,
       checkForDeadDef = true;
     }
 
-  MachineBasicBlock::iterator MII = next(MachineBasicBlock::iterator(CopyMI));
+  MachineBasicBlock::iterator MII =
+    llvm::next(MachineBasicBlock::iterator(CopyMI));
   tii_->reMaterialize(*MBB, MII, DstReg, DstSubIdx, DefMI, tri_);
   MachineInstr *NewMI = prior(MII);
 
@@ -1314,7 +1317,13 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
                       "coalesced to another register.\n");
       return false;  // Not coalescable.
     }
-  } else if (!tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)){
+  } else if (tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) {
+    if (SrcSubIdx && DstSubIdx && SrcSubIdx != DstSubIdx) {
+      // e.g. %reg16404:1<def> = MOV8rr %reg16412:2<kill>
+      Again = true;
+      return false;  // Not coalescable.
+    }
+  } else {
     llvm_unreachable("Unrecognized copy instruction!");
   }
 
@@ -1611,9 +1620,9 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
           }
         }
       } else {
-        // If the virtual register live interval is long but it has low use desity,
-        // do not join them, instead mark the physical register as its allocation
-        // preference.
+        // If the virtual register live interval is long but it has low use
+        // density, do not join them, instead mark the physical register as its
+        // allocation preference.
         LiveInterval &JoinVInt = SrcIsPhys ? DstInt : SrcInt;
         unsigned JoinVReg = SrcIsPhys ? DstReg : SrcReg;
         unsigned JoinPReg = SrcIsPhys ? SrcReg : DstReg;
@@ -1938,6 +1947,10 @@ bool SimpleRegisterCoalescing::SimpleJoin(LiveInterval &LHS, LiveInterval &RHS){
     if (Overlaps) {
       // If we haven't already recorded that this value # is safe, check it.
       if (!InVector(LHSIt->valno, EliminatedLHSVals)) {
+        // If it's re-defined by an early clobber somewhere in the live range,
+        // then conservatively abort coalescing.
+        if (LHSIt->valno->hasRedefByEC())
+          return false;
         // Copy from the RHS?
         if (!RangeIsDefinedByCopyFromReg(LHS, LHSIt, RHS.reg))
           return false;    // Nope, bail out.
@@ -1977,6 +1990,10 @@ bool SimpleRegisterCoalescing::SimpleJoin(LiveInterval &LHS, LiveInterval &RHS){
           // if coalescing succeeds.  Just skip the liverange.
           if (++LHSIt == LHSEnd) break;
         } else {
+          // If it's re-defined by an early clobber somewhere in the live range,
+          // then conservatively abort coalescing.
+          if (LHSIt->valno->hasRedefByEC())
+            return false;
           // Otherwise, if this is a copy from the RHS, mark it as being merged
           // in.
           if (RangeIsDefinedByCopyFromReg(LHS, LHSIt, RHS.reg)) {
@@ -2316,6 +2333,10 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS,
       if (LHSValNoAssignments[I->valno->id] !=
           RHSValNoAssignments[J->valno->id])
         return false;
+      // If it's re-defined by an early clobber somewhere in the live range,
+      // then conservatively abort coalescing.
+      if (NewVNInfo[LHSValNoAssignments[I->valno->id]]->hasRedefByEC())
+        return false;
     }
 
     if (I->end < J->end) {
@@ -2401,9 +2422,15 @@ void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB,
 
     // If this isn't a copy nor a extract_subreg, we can't join intervals.
     unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+    bool isInsUndef = false;
     if (Inst->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
       DstReg = Inst->getOperand(0).getReg();
       SrcReg = Inst->getOperand(1).getReg();
+    } else if (Inst->getOpcode() == TargetInstrInfo::INSERT_SUBREG) {
+      DstReg = Inst->getOperand(0).getReg();
+      SrcReg = Inst->getOperand(2).getReg();
+      if (Inst->getOperand(1).isUndef())
+        isInsUndef = true;
     } else if (Inst->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
                Inst->getOpcode() == TargetInstrInfo::SUBREG_TO_REG) {
       DstReg = Inst->getOperand(0).getReg();
@@ -2413,7 +2440,8 @@ void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB,
 
     bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
     bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
-    if (li_->hasInterval(SrcReg) && li_->getInterval(SrcReg).empty())
+    if (isInsUndef ||
+        (li_->hasInterval(SrcReg) && li_->getInterval(SrcReg).empty()))
       ImpDefCopies.push_back(CopyRec(Inst, 0));
     else if (SrcIsPhys || DstIsPhys)
       PhysCopies.push_back(CopyRec(Inst, 0));
@@ -2421,9 +2449,9 @@ void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB,
       VirtCopies.push_back(CopyRec(Inst, 0));
   }
 
-  // Try coalescing implicit copies first, followed by copies to / from
-  // physical registers, then finally copies from virtual registers to
-  // virtual registers.
+  // Try coalescing implicit copies and insert_subreg <undef> first,
+  // followed by copies to / from physical registers, then finally copies
+  // from virtual registers to virtual registers.
   for (unsigned i = 0, e = ImpDefCopies.size(); i != e; ++i) {
     CopyRec &TheCopy = ImpDefCopies[i];
     bool Again = false;
@@ -2594,114 +2622,6 @@ void SimpleRegisterCoalescing::releaseMemory() {
   ReMatDefs.clear();
 }
 
-/// Returns true if the given live interval is zero length.
-static bool isZeroLengthInterval(LiveInterval *li, LiveIntervals *li_) {
-  for (LiveInterval::Ranges::const_iterator
-         i = li->ranges.begin(), e = li->ranges.end(); i != e; ++i)
-    if (i->end.getPrevIndex() > i->start)
-      return false;
-  return true;
-}
-
-
-void SimpleRegisterCoalescing::CalculateSpillWeights() {
-  SmallSet<unsigned, 4> Processed;
-  for (MachineFunction::iterator mbbi = mf_->begin(), mbbe = mf_->end();
-       mbbi != mbbe; ++mbbi) {
-    MachineBasicBlock* MBB = mbbi;
-    SlotIndex MBBEnd = li_->getMBBEndIdx(MBB);
-    MachineLoop* loop = loopInfo->getLoopFor(MBB);
-    unsigned loopDepth = loop ? loop->getLoopDepth() : 0;
-    bool isExiting = loop ? loop->isLoopExiting(MBB) : false;
-
-    for (MachineBasicBlock::const_iterator mii = MBB->begin(), mie = MBB->end();
-         mii != mie; ++mii) {
-      const MachineInstr *MI = mii;
-      if (tii_->isIdentityCopy(*MI))
-        continue;
-
-      if (MI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF)
-        continue;
-
-      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-        const MachineOperand &mopi = MI->getOperand(i);
-        if (!mopi.isReg() || mopi.getReg() == 0)
-          continue;
-        unsigned Reg = mopi.getReg();
-        if (!TargetRegisterInfo::isVirtualRegister(mopi.getReg()))
-          continue;
-        // Multiple uses of reg by the same instruction. It should not
-        // contribute to spill weight again.
-        if (!Processed.insert(Reg))
-          continue;
-
-        bool HasDef = mopi.isDef();
-        bool HasUse = !HasDef;
-        for (unsigned j = i+1; j != e; ++j) {
-          const MachineOperand &mopj = MI->getOperand(j);
-          if (!mopj.isReg() || mopj.getReg() != Reg)
-            continue;
-          HasDef |= mopj.isDef();
-          HasUse |= mopj.isUse();
-          if (HasDef && HasUse)
-            break;
-        }
-
-        LiveInterval &RegInt = li_->getInterval(Reg);
-        float Weight = li_->getSpillWeight(HasDef, HasUse, loopDepth);
-        if (HasDef && isExiting) {
-          // Looks like this is a loop count variable update.
-          SlotIndex DefIdx = li_->getInstructionIndex(MI).getDefIndex();
-          const LiveRange *DLR =
-            li_->getInterval(Reg).getLiveRangeContaining(DefIdx);
-          if (DLR->end > MBBEnd)
-            Weight *= 3.0F;
-        }
-        RegInt.weight += Weight;
-      }
-      Processed.clear();
-    }
-  }
-
-  for (LiveIntervals::iterator I = li_->begin(), E = li_->end(); I != E; ++I) {
-    LiveInterval &LI = *I->second;
-    if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
-      // If the live interval length is essentially zero, i.e. in every live
-      // range the use follows def immediately, it doesn't make sense to spill
-      // it and hope it will be easier to allocate for this li.
-      if (isZeroLengthInterval(&LI, li_)) {
-        LI.weight = HUGE_VALF;
-        continue;
-      }
-
-      bool isLoad = false;
-      SmallVector<LiveInterval*, 4> SpillIs;
-      if (li_->isReMaterializable(LI, SpillIs, isLoad)) {
-        // If all of the definitions of the interval are re-materializable,
-        // it is a preferred candidate for spilling. If non of the defs are
-        // loads, then it's potentially very cheap to re-materialize.
-        // FIXME: this gets much more complicated once we support non-trivial
-        // re-materialization.
-        if (isLoad)
-          LI.weight *= 0.9F;
-        else
-          LI.weight *= 0.5F;
-      }
-
-      // Slightly prefer live interval that has been assigned a preferred reg.
-      std::pair<unsigned, unsigned> Hint = mri_->getRegAllocationHint(LI.reg);
-      if (Hint.first || Hint.second)
-        LI.weight *= 1.01F;
-
-      // Divide the weight of the interval by its size.  This encourages
-      // spilling of intervals that are large and have few uses, and
-      // discourages spilling of small intervals with many uses.
-      LI.weight /= li_->getApproximateInstructionCount(LI) * InstrSlots::NUM;
-    }
-  }
-}
-
-
 bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
   mf_ = &fn;
   mri_ = &fn.getRegInfo();
@@ -2727,7 +2647,8 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
     joinIntervals();
     DEBUG({
         errs() << "********** INTERVALS POST JOINING **********\n";
-        for (LiveIntervals::iterator I = li_->begin(), E = li_->end(); I != E; ++I){
+        for (LiveIntervals::iterator I = li_->begin(), E = li_->end();
+             I != E; ++I){
           I->second->print(errs(), tri_);
           errs() << "\n";
         }
@@ -2768,7 +2689,7 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
           DoDelete = true;
         }
         if (!DoDelete)
-          mii = next(mii);
+          mii = llvm::next(mii);
         else {
           li_->RemoveMachineInstrFromMaps(MI);
           mii = mbbi->erase(mii);
@@ -2831,8 +2752,6 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
     }
   }
 
-  CalculateSpillWeights();
-
   DEBUG(dump());
   return true;
 }
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.h b/lib/CodeGen/SimpleRegisterCoalescing.h
index 78f8a9a..605a740 100644
--- a/lib/CodeGen/SimpleRegisterCoalescing.h
+++ b/lib/CodeGen/SimpleRegisterCoalescing.h
@@ -244,10 +244,6 @@ namespace llvm {
     MachineOperand *lastRegisterUse(SlotIndex Start, SlotIndex End,
                                     unsigned Reg, SlotIndex &LastUseIdx) const;
 
-    /// CalculateSpillWeights - Compute spill weights for all virtual register
-    /// live intervals.
-    void CalculateSpillWeights();
-
     void printRegName(unsigned reg) const;
   };
 
diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp
index 237d0b5..bc246c1 100644
--- a/lib/CodeGen/Spiller.cpp
+++ b/lib/CodeGen/Spiller.cpp
@@ -20,22 +20,25 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include <set>
 
 using namespace llvm;
 
 namespace {
-  enum SpillerName { trivial, standard };
+  enum SpillerName { trivial, standard, splitting };
 }
 
 static cl::opt<SpillerName>
 spillerOpt("spiller",
            cl::desc("Spiller to use: (default: standard)"),
            cl::Prefix,
-           cl::values(clEnumVal(trivial, "trivial spiller"),
-                      clEnumVal(standard, "default spiller"),
+           cl::values(clEnumVal(trivial,   "trivial spiller"),
+                      clEnumVal(standard,  "default spiller"),
+                      clEnumVal(splitting, "splitting spiller"),
                       clEnumValEnd),
            cl::init(standard));
 
+// Spiller virtual destructor implementation.
 Spiller::~Spiller() {}
 
 namespace {
@@ -140,9 +143,9 @@ protected:
 
       // Insert store if necessary.
       if (hasDef) {
-        tii->storeRegToStackSlot(*mi->getParent(), next(miItr), newVReg, true,
+        tii->storeRegToStackSlot(*mi->getParent(), llvm::next(miItr), newVReg, true,
                                  ss, trc);
-        MachineInstr *storeInstr(next(miItr));
+        MachineInstr *storeInstr(llvm::next(miItr));
         SlotIndex storeIndex =
           lis->InsertMachineInstrInMaps(storeInstr).getDefIndex();
         SlotIndex beginIndex = storeIndex.getPrevIndex();
@@ -170,7 +173,8 @@ public:
     : SpillerBase(mf, lis, vrm) {}
 
   std::vector<LiveInterval*> spill(LiveInterval *li,
-                                   SmallVectorImpl<LiveInterval*> &spillIs) {
+                                   SmallVectorImpl<LiveInterval*> &spillIs,
+                                   SlotIndex*) {
     // Ignore spillIs - we don't use it.
     return trivialSpillEverywhere(li);
   }
@@ -179,23 +183,336 @@ public:
 
 /// Falls back on LiveIntervals::addIntervalsForSpills.
 class StandardSpiller : public Spiller {
-private:
+protected:
   LiveIntervals *lis;
   const MachineLoopInfo *loopInfo;
   VirtRegMap *vrm;
 public:
-  StandardSpiller(MachineFunction *mf, LiveIntervals *lis,
-                  const MachineLoopInfo *loopInfo, VirtRegMap *vrm)
+  StandardSpiller(LiveIntervals *lis, const MachineLoopInfo *loopInfo,
+                  VirtRegMap *vrm)
     : lis(lis), loopInfo(loopInfo), vrm(vrm) {}
 
   /// Falls back on LiveIntervals::addIntervalsForSpills.
   std::vector<LiveInterval*> spill(LiveInterval *li,
-                                   SmallVectorImpl<LiveInterval*> &spillIs) {
+                                   SmallVectorImpl<LiveInterval*> &spillIs,
+                                   SlotIndex*) {
     return lis->addIntervalsForSpills(*li, spillIs, loopInfo, *vrm);
   }
 
 };
 
+/// When a call to spill is placed this spiller will first try to break the
+/// interval up into its component values (one new interval per value).
+/// If this fails, or if a call is placed to spill a previously split interval
+/// then the spiller falls back on the standard spilling mechanism. 
+class SplittingSpiller : public StandardSpiller {
+public:
+  SplittingSpiller(MachineFunction *mf, LiveIntervals *lis,
+                   const MachineLoopInfo *loopInfo, VirtRegMap *vrm)
+    : StandardSpiller(lis, loopInfo, vrm) {
+
+    mri = &mf->getRegInfo();
+    tii = mf->getTarget().getInstrInfo();
+    tri = mf->getTarget().getRegisterInfo();
+  }
+
+  std::vector<LiveInterval*> spill(LiveInterval *li,
+                                   SmallVectorImpl<LiveInterval*> &spillIs,
+                                   SlotIndex *earliestStart) {
+    
+    if (worthTryingToSplit(li)) {
+      return tryVNISplit(li, earliestStart);
+    }
+    // else
+    return StandardSpiller::spill(li, spillIs, earliestStart);
+  }
+
+private:
+
+  MachineRegisterInfo *mri;
+  const TargetInstrInfo *tii;
+  const TargetRegisterInfo *tri;  
+  DenseSet<LiveInterval*> alreadySplit;
+
+  bool worthTryingToSplit(LiveInterval *li) const {
+    return (!alreadySplit.count(li) && li->getNumValNums() > 1);
+  }
+
+  /// Try to break a LiveInterval into its component values.
+  std::vector<LiveInterval*> tryVNISplit(LiveInterval *li,
+                                         SlotIndex *earliestStart) {
+
+    DEBUG(errs() << "Trying VNI split of %reg" << *li << "\n");
+
+    std::vector<LiveInterval*> added;
+    SmallVector<VNInfo*, 4> vnis;
+
+    std::copy(li->vni_begin(), li->vni_end(), std::back_inserter(vnis));
+   
+    for (SmallVectorImpl<VNInfo*>::iterator vniItr = vnis.begin(),
+         vniEnd = vnis.end(); vniItr != vniEnd; ++vniItr) {
+      VNInfo *vni = *vniItr;
+      
+      // Skip unused VNIs, or VNIs with no kills.
+      if (vni->isUnused() || vni->kills.empty())
+        continue;
+
+      DEBUG(errs() << "  Extracted Val #" << vni->id << " as ");
+      LiveInterval *splitInterval = extractVNI(li, vni);
+      
+      if (splitInterval != 0) {
+        DEBUG(errs() << *splitInterval << "\n");
+        added.push_back(splitInterval);
+        alreadySplit.insert(splitInterval);
+        if (earliestStart != 0) {
+          if (splitInterval->beginIndex() < *earliestStart)
+            *earliestStart = splitInterval->beginIndex();
+        }
+      } else {
+        DEBUG(errs() << "0\n");
+      }
+    } 
+
+    DEBUG(errs() << "Original LI: " << *li << "\n");
+
+    // If there original interval still contains some live ranges
+    // add it to added and alreadySplit.    
+    if (!li->empty()) {
+      added.push_back(li);
+      alreadySplit.insert(li);
+      if (earliestStart != 0) {
+        if (li->beginIndex() < *earliestStart)
+          *earliestStart = li->beginIndex();
+      }
+    }
+
+    return added;
+  }
+
+  /// Extract the given value number from the interval.
+  LiveInterval* extractVNI(LiveInterval *li, VNInfo *vni) const {
+    assert(vni->isDefAccurate() || vni->isPHIDef());
+    assert(!vni->kills.empty());
+
+    // Create a new vreg and live interval, copy VNI kills & ranges over.                                                                                                                                                     
+    const TargetRegisterClass *trc = mri->getRegClass(li->reg);
+    unsigned newVReg = mri->createVirtualRegister(trc);
+    vrm->grow();
+    LiveInterval *newLI = &lis->getOrCreateInterval(newVReg);
+    VNInfo *newVNI = newLI->createValueCopy(vni, lis->getVNInfoAllocator());
+
+    // Start by copying all live ranges in the VN to the new interval.                                                                                                                                                        
+    for (LiveInterval::iterator rItr = li->begin(), rEnd = li->end();
+         rItr != rEnd; ++rItr) {
+      if (rItr->valno == vni) {
+        newLI->addRange(LiveRange(rItr->start, rItr->end, newVNI));
+      }
+    }
+
+    // Erase the old VNI & ranges.                                                                                                                                                                                            
+    li->removeValNo(vni);
+
+    // Collect all current uses of the register belonging to the given VNI.
+    // We'll use this to rename the register after we've dealt with the def.
+    std::set<MachineInstr*> uses;
+    for (MachineRegisterInfo::use_iterator
+         useItr = mri->use_begin(li->reg), useEnd = mri->use_end();
+         useItr != useEnd; ++useItr) {
+      uses.insert(&*useItr);
+    }
+
+    // Process the def instruction for this VNI.
+    if (newVNI->isPHIDef()) {
+      // Insert a copy at the start of the MBB. The range proceeding the
+      // copy will be attached to the original LiveInterval.
+      MachineBasicBlock *defMBB = lis->getMBBFromIndex(newVNI->def);
+      tii->copyRegToReg(*defMBB, defMBB->begin(), newVReg, li->reg, trc, trc);
+      MachineInstr *copyMI = defMBB->begin();
+      copyMI->addRegisterKilled(li->reg, tri);
+      SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+      VNInfo *phiDefVNI = li->getNextValue(lis->getMBBStartIdx(defMBB),
+                                           0, false, lis->getVNInfoAllocator());
+      phiDefVNI->setIsPHIDef(true);
+      phiDefVNI->addKill(copyIdx.getDefIndex());
+      li->addRange(LiveRange(phiDefVNI->def, copyIdx.getDefIndex(), phiDefVNI));
+      LiveRange *oldPHIDefRange =
+        newLI->getLiveRangeContaining(lis->getMBBStartIdx(defMBB));
+
+      // If the old phi def starts in the middle of the range chop it up.
+      if (oldPHIDefRange->start < lis->getMBBStartIdx(defMBB)) {
+        LiveRange oldPHIDefRange2(copyIdx.getDefIndex(), oldPHIDefRange->end,
+                                  oldPHIDefRange->valno);
+        oldPHIDefRange->end = lis->getMBBStartIdx(defMBB);
+        newLI->addRange(oldPHIDefRange2);
+      } else if (oldPHIDefRange->start == lis->getMBBStartIdx(defMBB)) {
+        // Otherwise if it's at the start of the range just trim it.
+        oldPHIDefRange->start = copyIdx.getDefIndex();
+      } else {
+        assert(false && "PHI def range doesn't cover PHI def?");
+      }
+
+      newVNI->def = copyIdx.getDefIndex();
+      newVNI->setCopy(copyMI);
+      newVNI->setIsPHIDef(false); // not a PHI def anymore.
+      newVNI->setIsDefAccurate(true);
+    } else {
+      // non-PHI def. Rename the def. If it's two-addr that means renaming the use
+      // and inserting a new copy too.
+      MachineInstr *defInst = lis->getInstructionFromIndex(newVNI->def);
+      // We'll rename this now, so we can remove it from uses.
+      uses.erase(defInst);
+      unsigned defOpIdx = defInst->findRegisterDefOperandIdx(li->reg);
+      bool isTwoAddr = defInst->isRegTiedToUseOperand(defOpIdx),
+        twoAddrUseIsUndef = false;
+
+      for (unsigned i = 0; i < defInst->getNumOperands(); ++i) {
+        MachineOperand &mo = defInst->getOperand(i);
+        if (mo.isReg() && (mo.isDef() || isTwoAddr) && (mo.getReg()==li->reg)) {
+          mo.setReg(newVReg);
+          if (isTwoAddr && mo.isUse() && mo.isUndef())
+            twoAddrUseIsUndef = true;
+        }
+      }
+    
+      SlotIndex defIdx = lis->getInstructionIndex(defInst);
+      newVNI->def = defIdx.getDefIndex();
+
+      if (isTwoAddr && !twoAddrUseIsUndef) {
+        MachineBasicBlock *defMBB = defInst->getParent();
+        tii->copyRegToReg(*defMBB, defInst, newVReg, li->reg, trc, trc);
+        MachineInstr *copyMI = prior(MachineBasicBlock::iterator(defInst));
+        SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+        copyMI->addRegisterKilled(li->reg, tri);
+        LiveRange *origUseRange =
+          li->getLiveRangeContaining(newVNI->def.getUseIndex());
+        VNInfo *origUseVNI = origUseRange->valno;
+        origUseRange->end = copyIdx.getDefIndex();
+        bool updatedKills = false;
+        for (unsigned k = 0; k < origUseVNI->kills.size(); ++k) {
+          if (origUseVNI->kills[k] == defIdx.getDefIndex()) {
+            origUseVNI->kills[k] = copyIdx.getDefIndex();
+            updatedKills = true;
+            break;
+          }
+        }
+        assert(updatedKills && "Failed to update VNI kill list.");
+        VNInfo *copyVNI = newLI->getNextValue(copyIdx.getDefIndex(), copyMI,
+                                              true, lis->getVNInfoAllocator());
+        copyVNI->addKill(defIdx.getDefIndex());
+        LiveRange copyRange(copyIdx.getDefIndex(),defIdx.getDefIndex(),copyVNI);
+        newLI->addRange(copyRange);
+      }    
+    }
+    
+    for (std::set<MachineInstr*>::iterator
+         usesItr = uses.begin(), usesEnd = uses.end();
+         usesItr != usesEnd; ++usesItr) {
+      MachineInstr *useInst = *usesItr;
+      SlotIndex useIdx = lis->getInstructionIndex(useInst);
+      LiveRange *useRange =
+        newLI->getLiveRangeContaining(useIdx.getUseIndex());
+
+      // If this use doesn't belong to the new interval skip it.
+      if (useRange == 0)
+        continue;
+
+      // This use doesn't belong to the VNI, skip it.
+      if (useRange->valno != newVNI)
+        continue;
+
+      // Check if this instr is two address.
+      unsigned useOpIdx = useInst->findRegisterUseOperandIdx(li->reg);
+      bool isTwoAddress = useInst->isRegTiedToDefOperand(useOpIdx);
+      
+      // Rename uses (and defs for two-address instrs).
+      for (unsigned i = 0; i < useInst->getNumOperands(); ++i) {
+        MachineOperand &mo = useInst->getOperand(i);
+        if (mo.isReg() && (mo.isUse() || isTwoAddress) &&
+            (mo.getReg() == li->reg)) {
+          mo.setReg(newVReg);
+        }
+      }
+
+      // If this is a two address instruction we've got some extra work to do.
+      if (isTwoAddress) {
+        // We modified the def operand, so we need to copy back to the original
+        // reg.
+        MachineBasicBlock *useMBB = useInst->getParent();
+        MachineBasicBlock::iterator useItr(useInst);
+        tii->copyRegToReg(*useMBB, next(useItr), li->reg, newVReg, trc, trc);
+        MachineInstr *copyMI = next(useItr);
+        copyMI->addRegisterKilled(newVReg, tri);
+        SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+
+        // Change the old two-address defined range & vni to start at
+        // (and be defined by) the copy.
+        LiveRange *origDefRange =
+          li->getLiveRangeContaining(useIdx.getDefIndex());
+        origDefRange->start = copyIdx.getDefIndex();
+        origDefRange->valno->def = copyIdx.getDefIndex();
+        origDefRange->valno->setCopy(copyMI);
+
+        // Insert a new range & vni for the two-address-to-copy value. This
+        // will be attached to the new live interval.
+        VNInfo *copyVNI =
+          newLI->getNextValue(useIdx.getDefIndex(), 0, true,
+                              lis->getVNInfoAllocator());
+        copyVNI->addKill(copyIdx.getDefIndex());
+        LiveRange copyRange(useIdx.getDefIndex(),copyIdx.getDefIndex(),copyVNI);
+        newLI->addRange(copyRange);
+      }
+    }
+    
+    // Iterate over any PHI kills - we'll need to insert new copies for them.
+    for (VNInfo::KillSet::iterator
+         killItr = newVNI->kills.begin(), killEnd = newVNI->kills.end();
+         killItr != killEnd; ++killItr) {
+      SlotIndex killIdx(*killItr);
+      if (killItr->isPHI()) {
+        MachineBasicBlock *killMBB = lis->getMBBFromIndex(killIdx);
+        LiveRange *oldKillRange =
+          newLI->getLiveRangeContaining(killIdx);
+
+        assert(oldKillRange != 0 && "No kill range?");
+
+        tii->copyRegToReg(*killMBB, killMBB->getFirstTerminator(),
+                          li->reg, newVReg, trc, trc);
+        MachineInstr *copyMI = prior(killMBB->getFirstTerminator());
+        copyMI->addRegisterKilled(newVReg, tri);
+        SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+
+        // Save the current end. We may need it to add a new range if the
+        // current range runs of the end of the MBB.
+        SlotIndex newKillRangeEnd = oldKillRange->end;
+        oldKillRange->end = copyIdx.getDefIndex();
+
+        if (newKillRangeEnd != lis->getMBBEndIdx(killMBB).getNextSlot()) {
+          assert(newKillRangeEnd > lis->getMBBEndIdx(killMBB).getNextSlot() &&
+                 "PHI kill range doesn't reach kill-block end. Not sane.");
+          newLI->addRange(LiveRange(lis->getMBBEndIdx(killMBB).getNextSlot(),
+                                    newKillRangeEnd, newVNI));
+        }
+
+        *killItr = oldKillRange->end;
+        VNInfo *newKillVNI = li->getNextValue(copyIdx.getDefIndex(),
+                                              copyMI, true,
+                                              lis->getVNInfoAllocator());
+        newKillVNI->addKill(lis->getMBBTerminatorGap(killMBB));
+        newKillVNI->setHasPHIKill(true);
+        li->addRange(LiveRange(copyIdx.getDefIndex(),
+                               lis->getMBBEndIdx(killMBB).getNextSlot(),
+                               newKillVNI));
+      }
+
+    }
+
+    newVNI->setHasPHIKill(false);
+
+    return newLI;
+  }
+
+};
+
 }
 
 llvm::Spiller* llvm::createSpiller(MachineFunction *mf, LiveIntervals *lis,
@@ -203,7 +520,8 @@ llvm::Spiller* llvm::createSpiller(MachineFunction *mf, LiveIntervals *lis,
                                    VirtRegMap *vrm) {
   switch (spillerOpt) {
     case trivial: return new TrivialSpiller(mf, lis, vrm); break;
-    case standard: return new StandardSpiller(mf, lis, loopInfo, vrm); break;
+    case standard: return new StandardSpiller(lis, loopInfo, vrm); break;
+    case splitting: return new SplittingSpiller(mf, lis, loopInfo, vrm); break;
     default: llvm_unreachable("Unreachable!"); break;
   }
 }
diff --git a/lib/CodeGen/Spiller.h b/lib/CodeGen/Spiller.h
index c6bd985..dda52e8 100644
--- a/lib/CodeGen/Spiller.h
+++ b/lib/CodeGen/Spiller.h
@@ -21,6 +21,7 @@ namespace llvm {
   class MachineFunction;
   class MachineInstr;
   class MachineLoopInfo;
+  class SlotIndex;
   class VirtRegMap;
   class VNInfo;
 
@@ -35,7 +36,8 @@ namespace llvm {
     /// Spill the given live range. The method used will depend on the Spiller
     /// implementation selected.
     virtual std::vector<LiveInterval*> spill(LiveInterval *li,
-                                   SmallVectorImpl<LiveInterval*> &spillIs) = 0;
+					     SmallVectorImpl<LiveInterval*> &spillIs,
+                                             SlotIndex *earliestIndex = 0) = 0;
 
   };
 
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index c299192..fd25a37 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -668,7 +668,7 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
     if (DCELimit != -1 && (int)NumDead >= DCELimit)
       break;
     
-    MachineBasicBlock::iterator NextMI = next(I);
+    MachineBasicBlock::iterator NextMI = llvm::next(I);
     if (NextMI == MBB->end()) continue;
     
     int FirstSS, SecondSS;
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 9c0b596..bf58902 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -17,15 +17,19 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
+STATISTIC(NumTails     , "Number of tails duplicated");
 STATISTIC(NumTailDups  , "Number of tail duplicated blocks");
 STATISTIC(NumInstrDups , "Additional instructions due to tail duplication");
 STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
@@ -36,34 +40,71 @@ TailDuplicateSize("tail-dup-size",
                   cl::desc("Maximum instructions to consider tail duplicating"),
                   cl::init(2), cl::Hidden);
 
+static cl::opt<bool>
+TailDupVerify("tail-dup-verify",
+              cl::desc("Verify sanity of PHI instructions during taildup"),
+              cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned>
+TailDupLimit("tail-dup-limit", cl::init(~0U), cl::Hidden);
+
+typedef std::vector<std::pair<MachineBasicBlock*,unsigned> > AvailableValsTy;
+
 namespace {
   /// TailDuplicatePass - Perform tail duplication.
   class TailDuplicatePass : public MachineFunctionPass {
+    bool PreRegAlloc;
     const TargetInstrInfo *TII;
     MachineModuleInfo *MMI;
+    MachineRegisterInfo *MRI;
+
+    // SSAUpdateVRs - A list of virtual registers for which to update SSA form.
+    SmallVector<unsigned, 16> SSAUpdateVRs;
+
+    // SSAUpdateVals - For each virtual register in SSAUpdateVals keep a list of
+    // source virtual registers.
+    DenseMap<unsigned, AvailableValsTy> SSAUpdateVals;
 
   public:
     static char ID;
-    explicit TailDuplicatePass() : MachineFunctionPass(&ID) {}
+    explicit TailDuplicatePass(bool PreRA) :
+      MachineFunctionPass(&ID), PreRegAlloc(PreRA) {}
 
     virtual bool runOnMachineFunction(MachineFunction &MF);
     virtual const char *getPassName() const { return "Tail Duplication"; }
 
   private:
+    void AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
+                           MachineBasicBlock *BB);
+    void ProcessPHI(MachineInstr *MI, MachineBasicBlock *TailBB,
+                    MachineBasicBlock *PredBB,
+                    DenseMap<unsigned, unsigned> &LocalVRMap,
+                    SmallVector<std::pair<unsigned,unsigned>, 4> &Copies);
+    void DuplicateInstruction(MachineInstr *MI,
+                              MachineBasicBlock *TailBB,
+                              MachineBasicBlock *PredBB,
+                              MachineFunction &MF,
+                              DenseMap<unsigned, unsigned> &LocalVRMap);
+    void UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
+                              SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                              SmallSetVector<MachineBasicBlock*, 8> &Succs);
     bool TailDuplicateBlocks(MachineFunction &MF);
-    bool TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF);
+    bool TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
+                       SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                       SmallVector<MachineInstr*, 16> &Copies);
     void RemoveDeadBlock(MachineBasicBlock *MBB);
   };
 
   char TailDuplicatePass::ID = 0;
 }
 
-FunctionPass *llvm::createTailDuplicatePass() {
-  return new TailDuplicatePass();
+FunctionPass *llvm::createTailDuplicatePass(bool PreRegAlloc) {
+  return new TailDuplicatePass(PreRegAlloc);
 }
 
 bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getTarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
   MMI = getAnalysisIfAvailable<MachineModuleInfo>();
 
   bool MadeChange = false;
@@ -77,36 +118,325 @@ bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
   return MadeChange;
 }
 
+static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
+  for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = I;
+    SmallSetVector<MachineBasicBlock*, 8> Preds(MBB->pred_begin(),
+                                                MBB->pred_end());
+    MachineBasicBlock::iterator MI = MBB->begin();
+    while (MI != MBB->end()) {
+      if (MI->getOpcode() != TargetInstrInfo::PHI)
+        break;
+      for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+             PE = Preds.end(); PI != PE; ++PI) {
+        MachineBasicBlock *PredBB = *PI;
+        bool Found = false;
+        for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+          MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
+          if (PHIBB == PredBB) {
+            Found = true;
+            break;
+          }
+        }
+        if (!Found) {
+          errs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+          errs() << "  missing input from predecessor BB#"
+                 << PredBB->getNumber() << '\n';
+          llvm_unreachable(0);
+        }
+      }
+
+      for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+        MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
+        if (CheckExtra && !Preds.count(PHIBB)) {
+          // This is not a hard error.
+          errs() << "Warning: malformed PHI in BB#" << MBB->getNumber()
+                 << ": " << *MI;
+          errs() << "  extra input from predecessor BB#"
+                 << PHIBB->getNumber() << '\n';
+        }
+        if (PHIBB->getNumber() < 0) {
+          errs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+          errs() << "  non-existing BB#" << PHIBB->getNumber() << '\n';
+          llvm_unreachable(0);
+        }
+      }
+      ++MI;
+    }
+  }
+}
+
 /// TailDuplicateBlocks - Look for small blocks that are unconditionally
 /// branched to and do not fall through. Tail-duplicate their instructions
 /// into their predecessors to eliminate (dynamic) branches.
 bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) {
   bool MadeChange = false;
 
+  if (PreRegAlloc && TailDupVerify) {
+    DEBUG(errs() << "\n*** Before tail-duplicating\n");
+    VerifyPHIs(MF, true);
+  }
+
+  SmallVector<MachineInstr*, 8> NewPHIs;
+  MachineSSAUpdater SSAUpdate(MF, &NewPHIs);
+
   for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) {
     MachineBasicBlock *MBB = I++;
 
+    if (NumTails == TailDupLimit)
+      break;
+
     // Only duplicate blocks that end with unconditional branches.
     if (MBB->canFallThrough())
       continue;
 
-    MadeChange |= TailDuplicate(MBB, MF);
-
-    // If it is dead, remove it.
-    if (MBB->pred_empty()) {
-      NumInstrDups -= MBB->size();
-      RemoveDeadBlock(MBB);
+    // Save the successors list.
+    SmallSetVector<MachineBasicBlock*, 8> Succs(MBB->succ_begin(),
+                                                MBB->succ_end());
+
+    SmallVector<MachineBasicBlock*, 8> TDBBs;
+    SmallVector<MachineInstr*, 16> Copies;
+    if (TailDuplicate(MBB, MF, TDBBs, Copies)) {
+      ++NumTails;
+
+      // TailBB's immediate successors are now successors of those predecessors
+      // which duplicated TailBB. Add the predecessors as sources to the PHI
+      // instructions.
+      bool isDead = MBB->pred_empty();
+      if (PreRegAlloc)
+        UpdateSuccessorsPHIs(MBB, isDead, TDBBs, Succs);
+
+      // If it is dead, remove it.
+      if (isDead) {
+        NumInstrDups -= MBB->size();
+        RemoveDeadBlock(MBB);
+        ++NumDeadBlocks;
+      }
+
+      // Update SSA form.
+      if (!SSAUpdateVRs.empty()) {
+        for (unsigned i = 0, e = SSAUpdateVRs.size(); i != e; ++i) {
+          unsigned VReg = SSAUpdateVRs[i];
+          SSAUpdate.Initialize(VReg);
+
+          // If the original definition is still around, add it as an available
+          // value.
+          MachineInstr *DefMI = MRI->getVRegDef(VReg);
+          MachineBasicBlock *DefBB = 0;
+          if (DefMI) {
+            DefBB = DefMI->getParent();
+            SSAUpdate.AddAvailableValue(DefBB, VReg);
+          }
+
+          // Add the new vregs as available values.
+          DenseMap<unsigned, AvailableValsTy>::iterator LI =
+            SSAUpdateVals.find(VReg);  
+          for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+            MachineBasicBlock *SrcBB = LI->second[j].first;
+            unsigned SrcReg = LI->second[j].second;
+            SSAUpdate.AddAvailableValue(SrcBB, SrcReg);
+          }
+
+          // Rewrite uses that are outside of the original def's block.
+          MachineRegisterInfo::use_iterator UI = MRI->use_begin(VReg);
+          while (UI != MRI->use_end()) {
+            MachineOperand &UseMO = UI.getOperand();
+            MachineInstr *UseMI = &*UI;
+            ++UI;
+            if (UseMI->getParent() == DefBB)
+              continue;
+            SSAUpdate.RewriteUse(UseMO);
+          }
+        }
+
+        SSAUpdateVRs.clear();
+        SSAUpdateVals.clear();
+      }
+
+      // Eliminate some of the copies inserted tail duplication to maintain
+      // SSA form.
+      for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
+        MachineInstr *Copy = Copies[i];
+        unsigned Src, Dst, SrcSR, DstSR;
+        if (TII->isMoveInstr(*Copy, Src, Dst, SrcSR, DstSR)) {
+          MachineRegisterInfo::use_iterator UI = MRI->use_begin(Src);
+          if (++UI == MRI->use_end()) {
+            // Copy is the only use. Do trivial copy propagation here.
+            MRI->replaceRegWith(Dst, Src);
+            Copy->eraseFromParent();
+          }
+        }
+      }
+
+      if (PreRegAlloc && TailDupVerify)
+        VerifyPHIs(MF, false);
       MadeChange = true;
-      ++NumDeadBlocks;
     }
   }
+
   return MadeChange;
 }
 
+static bool isDefLiveOut(unsigned Reg, MachineBasicBlock *BB,
+                         const MachineRegisterInfo *MRI) {
+  for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
+         UE = MRI->use_end(); UI != UE; ++UI) {
+    MachineInstr *UseMI = &*UI;
+    if (UseMI->getParent() != BB)
+      return true;
+  }
+  return false;
+}
+
+static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2)
+    if (MI->getOperand(i+1).getMBB() == SrcBB)
+      return i;
+  return 0;
+}
+
+/// AddSSAUpdateEntry - Add a definition and source virtual registers pair for
+/// SSA update.
+void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
+                                          MachineBasicBlock *BB) {
+  DenseMap<unsigned, AvailableValsTy>::iterator LI= SSAUpdateVals.find(OrigReg);
+  if (LI != SSAUpdateVals.end())
+    LI->second.push_back(std::make_pair(BB, NewReg));
+  else {
+    AvailableValsTy Vals;
+    Vals.push_back(std::make_pair(BB, NewReg));
+    SSAUpdateVals.insert(std::make_pair(OrigReg, Vals));
+    SSAUpdateVRs.push_back(OrigReg);
+  }
+}
+
+/// ProcessPHI - Process PHI node in TailBB by turning it into a copy in PredBB.
+/// Remember the source register that's contributed by PredBB and update SSA
+/// update map.
+void TailDuplicatePass::ProcessPHI(MachineInstr *MI,
+                                   MachineBasicBlock *TailBB,
+                                   MachineBasicBlock *PredBB,
+                                   DenseMap<unsigned, unsigned> &LocalVRMap,
+                         SmallVector<std::pair<unsigned,unsigned>, 4> &Copies) {
+  unsigned DefReg = MI->getOperand(0).getReg();
+  unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
+  assert(SrcOpIdx && "Unable to find matching PHI source?");
+  unsigned SrcReg = MI->getOperand(SrcOpIdx).getReg();
+  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+  LocalVRMap.insert(std::make_pair(DefReg, SrcReg));
+
+  // Insert a copy from source to the end of the block. The def register is the
+  // available value liveout of the block.
+  unsigned NewDef = MRI->createVirtualRegister(RC);
+  Copies.push_back(std::make_pair(NewDef, SrcReg));
+  if (isDefLiveOut(DefReg, TailBB, MRI))
+    AddSSAUpdateEntry(DefReg, NewDef, PredBB);
+
+  // Remove PredBB from the PHI node.
+  MI->RemoveOperand(SrcOpIdx+1);
+  MI->RemoveOperand(SrcOpIdx);
+  if (MI->getNumOperands() == 1)
+    MI->eraseFromParent();
+}
+
+/// DuplicateInstruction - Duplicate a TailBB instruction to PredBB and update
+/// the source operands due to earlier PHI translation.
+void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI,
+                                     MachineBasicBlock *TailBB,
+                                     MachineBasicBlock *PredBB,
+                                     MachineFunction &MF,
+                                     DenseMap<unsigned, unsigned> &LocalVRMap) {
+  MachineInstr *NewMI = MF.CloneMachineInstr(MI);
+  for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = NewMI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+    if (MO.isDef()) {
+      const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+      unsigned NewReg = MRI->createVirtualRegister(RC);
+      MO.setReg(NewReg);
+      LocalVRMap.insert(std::make_pair(Reg, NewReg));
+      if (isDefLiveOut(Reg, TailBB, MRI))
+        AddSSAUpdateEntry(Reg, NewReg, PredBB);
+    } else {
+      DenseMap<unsigned, unsigned>::iterator VI = LocalVRMap.find(Reg);
+      if (VI != LocalVRMap.end())
+        MO.setReg(VI->second);
+    }
+  }
+  PredBB->insert(PredBB->end(), NewMI);
+}
+
+/// UpdateSuccessorsPHIs - After FromBB is tail duplicated into its predecessor
+/// blocks, the successors have gained new predecessors. Update the PHI
+/// instructions in them accordingly.
+void
+TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
+                                  SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                  SmallSetVector<MachineBasicBlock*,8> &Succs) {
+  for (SmallSetVector<MachineBasicBlock*, 8>::iterator SI = Succs.begin(),
+         SE = Succs.end(); SI != SE; ++SI) {
+    MachineBasicBlock *SuccBB = *SI;
+    for (MachineBasicBlock::iterator II = SuccBB->begin(), EE = SuccBB->end();
+         II != EE; ++II) {
+      if (II->getOpcode() != TargetInstrInfo::PHI)
+        break;
+      unsigned Idx = 0;
+      for (unsigned i = 1, e = II->getNumOperands(); i != e; i += 2) {
+        MachineOperand &MO = II->getOperand(i+1);
+        if (MO.getMBB() == FromBB) {
+          Idx = i;
+          break;
+        }
+      }
+
+      assert(Idx != 0);
+      MachineOperand &MO0 = II->getOperand(Idx);
+      unsigned Reg = MO0.getReg();
+      if (isDead) {
+        // Folded into the previous BB.
+        // There could be duplicate phi source entries. FIXME: Should sdisel
+        // or earlier pass fixed this?
+        for (unsigned i = II->getNumOperands()-2; i != Idx; i -= 2) {
+          MachineOperand &MO = II->getOperand(i+1);
+          if (MO.getMBB() == FromBB) {
+            II->RemoveOperand(i+1);
+            II->RemoveOperand(i);
+          }
+        }
+        II->RemoveOperand(Idx+1);
+        II->RemoveOperand(Idx);
+      }
+      DenseMap<unsigned,AvailableValsTy>::iterator LI=SSAUpdateVals.find(Reg);
+      if (LI != SSAUpdateVals.end()) {
+        // This register is defined in the tail block.
+        for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+          MachineBasicBlock *SrcBB = LI->second[j].first;
+          unsigned SrcReg = LI->second[j].second;
+          II->addOperand(MachineOperand::CreateReg(SrcReg, false));
+          II->addOperand(MachineOperand::CreateMBB(SrcBB));
+        }
+      } else {
+        // Live in tail block, must also be live in predecessors.
+        for (unsigned j = 0, ee = TDBBs.size(); j != ee; ++j) {
+          MachineBasicBlock *SrcBB = TDBBs[j];
+          II->addOperand(MachineOperand::CreateReg(Reg, false));
+          II->addOperand(MachineOperand::CreateMBB(SrcBB));
+        }
+      }
+    }
+  }
+}
+
 /// TailDuplicate - If it is profitable, duplicate TailBB's contents in each
 /// of its predecessors.
-bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
-                                        MachineFunction &MF) {
+bool
+TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
+                                 SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                 SmallVector<MachineInstr*, 16> &Copies) {
   // Don't try to tail-duplicate single-block loops.
   if (TailBB->isSuccessor(TailBB))
     return false;
@@ -129,28 +459,36 @@ bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
 
   // Check the instructions in the block to determine whether tail-duplication
   // is invalid or unlikely to be profitable.
-  unsigned i = 0;
+  unsigned InstrCount = 0;
   bool HasCall = false;
   for (MachineBasicBlock::iterator I = TailBB->begin();
-       I != TailBB->end(); ++I, ++i) {
+       I != TailBB->end(); ++I) {
     // Non-duplicable things shouldn't be tail-duplicated.
     if (I->getDesc().isNotDuplicable()) return false;
+    // Do not duplicate 'return' instructions if this is a pre-regalloc run.
+    // A return may expand into a lot more instructions (e.g. reload of callee
+    // saved registers) after PEI.
+    if (PreRegAlloc && I->getDesc().isReturn()) return false;
     // Don't duplicate more than the threshold.
-    if (i == MaxDuplicateCount) return false;
+    if (InstrCount == MaxDuplicateCount) return false;
     // Remember if we saw a call.
     if (I->getDesc().isCall()) HasCall = true;
+    if (I->getOpcode() != TargetInstrInfo::PHI)
+      InstrCount += 1;
   }
   // Heuristically, don't tail-duplicate calls if it would expand code size,
   // as it's less likely to be worth the extra cost.
-  if (i > 1 && HasCall)
+  if (InstrCount > 1 && HasCall)
     return false;
 
+  DEBUG(errs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
+
   // Iterate through all the unique predecessors and tail-duplicate this
   // block into them, if possible. Copying the list ahead of time also
   // avoids trouble with the predecessor list reallocating.
   bool Changed = false;
-  SmallSetVector<MachineBasicBlock *, 8> Preds(TailBB->pred_begin(),
-                                               TailBB->pred_end());
+  SmallSetVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(),
+                                              TailBB->pred_end());
   for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
        PE = Preds.end(); PI != PE; ++PI) {
     MachineBasicBlock *PredBB = *PI;
@@ -175,13 +513,35 @@ bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
     DEBUG(errs() << "\nTail-duplicating into PredBB: " << *PredBB
                  << "From Succ: " << *TailBB);
 
+    TDBBs.push_back(PredBB);
+
     // Remove PredBB's unconditional branch.
     TII->RemoveBranch(*PredBB);
+
     // Clone the contents of TailBB into PredBB.
-    for (MachineBasicBlock::iterator I = TailBB->begin(), E = TailBB->end();
-         I != E; ++I) {
-      MachineInstr *NewMI = MF.CloneMachineInstr(I);
-      PredBB->insert(PredBB->end(), NewMI);
+    DenseMap<unsigned, unsigned> LocalVRMap;
+    SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
+    MachineBasicBlock::iterator I = TailBB->begin();
+    while (I != TailBB->end()) {
+      MachineInstr *MI = &*I;
+      ++I;
+      if (MI->getOpcode() == TargetInstrInfo::PHI) {
+        // Replace the uses of the def of the PHI with the register coming
+        // from PredBB.
+        ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos);
+      } else {
+        // Replace def of virtual registers with new registers, and update
+        // uses with PHI source register or the new registers.
+        DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap);
+      }
+    }
+    MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator();
+    for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
+      const TargetRegisterClass *RC = MRI->getRegClass(CopyInfos[i].first);
+      TII->copyRegToReg(*PredBB, Loc, CopyInfos[i].first,
+                        CopyInfos[i].second, RC,RC);
+      MachineInstr *CopyMI = prior(Loc);
+      Copies.push_back(CopyMI);
     }
     NumInstrDups += TailBB->size() - 1; // subtract one for removed branch
 
@@ -190,8 +550,8 @@ bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
     assert(PredBB->succ_empty() &&
            "TailDuplicate called on block with multiple successors!");
     for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(),
-         E = TailBB->succ_end(); I != E; ++I)
-       PredBB->addSuccessor(*I);
+           E = TailBB->succ_end(); I != E; ++I)
+      PredBB->addSuccessor(*I);
 
     Changed = true;
     ++NumTailDups;
@@ -200,22 +560,56 @@ bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
   // If TailBB was duplicated into all its predecessors except for the prior
   // block, which falls through unconditionally, move the contents of this
   // block into the prior block.
-  MachineBasicBlock &PrevBB = *prior(MachineFunction::iterator(TailBB));
+  MachineBasicBlock *PrevBB = prior(MachineFunction::iterator(TailBB));
   MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0;
   SmallVector<MachineOperand, 4> PriorCond;
   bool PriorUnAnalyzable =
-    TII->AnalyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
+    TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true);
   // This has to check PrevBB->succ_size() because EH edges are ignored by
   // AnalyzeBranch.
   if (!PriorUnAnalyzable && PriorCond.empty() && !PriorTBB &&
-      TailBB->pred_size() == 1 && PrevBB.succ_size() == 1 &&
+      TailBB->pred_size() == 1 && PrevBB->succ_size() == 1 &&
       !TailBB->hasAddressTaken()) {
-    DEBUG(errs() << "\nMerging into block: " << PrevBB
+    DEBUG(errs() << "\nMerging into block: " << *PrevBB
           << "From MBB: " << *TailBB);
-    PrevBB.splice(PrevBB.end(), TailBB, TailBB->begin(), TailBB->end());
-    PrevBB.removeSuccessor(PrevBB.succ_begin());;
-    assert(PrevBB.succ_empty());
-    PrevBB.transferSuccessors(TailBB);
+    if (PreRegAlloc) {
+      DenseMap<unsigned, unsigned> LocalVRMap;
+      SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
+      MachineBasicBlock::iterator I = TailBB->begin();
+      // Process PHI instructions first.
+      while (I != TailBB->end() && I->getOpcode() == TargetInstrInfo::PHI) {
+        // Replace the uses of the def of the PHI with the register coming
+        // from PredBB.
+        MachineInstr *MI = &*I++;
+        ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos);
+        if (MI->getParent())
+          MI->eraseFromParent();
+      }
+
+      // Now copy the non-PHI instructions.
+      while (I != TailBB->end()) {
+        // Replace def of virtual registers with new registers, and update
+        // uses with PHI source register or the new registers.
+        MachineInstr *MI = &*I++;
+        DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap);
+        MI->eraseFromParent();
+      }
+      MachineBasicBlock::iterator Loc = PrevBB->getFirstTerminator();
+      for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
+        const TargetRegisterClass *RC = MRI->getRegClass(CopyInfos[i].first);
+        TII->copyRegToReg(*PrevBB, Loc, CopyInfos[i].first,
+                          CopyInfos[i].second, RC, RC);
+        MachineInstr *CopyMI = prior(Loc);
+        Copies.push_back(CopyMI);
+      }
+    } else {
+      // No PHIs to worry about, just splice the instructions over.
+      PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end());
+    }
+    PrevBB->removeSuccessor(PrevBB->succ_begin());
+    assert(PrevBB->succ_empty());
+    PrevBB->transferSuccessors(TailBB);
+    TDBBs.push_back(PrevBB);
     Changed = true;
   }
 
diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp
index 102e2a3..393e315 100644
--- a/lib/CodeGen/TargetInstrInfoImpl.cpp
+++ b/lib/CodeGen/TargetInstrInfoImpl.cpp
@@ -329,7 +329,7 @@ TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(const MachineInstr *
       return false;
 
     // For the def, it should be the only def of that register.
-    if (MO.isDef() && (next(MRI.def_begin(Reg)) != MRI.def_end() ||
+    if (MO.isDef() && (llvm::next(MRI.def_begin(Reg)) != MRI.def_end() ||
                        MRI.isLiveIn(Reg)))
       return false;
 
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 5fa690b..98b95ac 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -211,7 +211,7 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
   ++KillPos;
 
   unsigned NumVisited = 0;
-  for (MachineBasicBlock::iterator I = next(OldPos); I != KillPos; ++I) {
+  for (MachineBasicBlock::iterator I = llvm::next(OldPos); I != KillPos; ++I) {
     MachineInstr *OtherMI = I;
     if (NumVisited > 30)  // FIXME: Arbitrary limit to reduce compile time cost.
       return false;
@@ -412,7 +412,7 @@ static bool isKilled(MachineInstr &MI, unsigned Reg,
     MachineRegisterInfo::def_iterator Begin = MRI->def_begin(Reg);
     // If there are multiple defs, we can't do a simple analysis, so just
     // go with what the kill flag says.
-    if (next(Begin) != MRI->def_end())
+    if (llvm::next(Begin) != MRI->def_end())
       return true;
     DefMI = &*Begin;
     bool IsSrcPhys, IsDstPhys;
@@ -643,7 +643,7 @@ TwoAddressInstructionPass::ConvertInstTo3Addr(MachineBasicBlock::iterator &mi,
     if (!Sunk) {
       DistanceMap.insert(std::make_pair(NewMI, Dist));
       mi = NewMI;
-      nmi = next(mi);
+      nmi = llvm::next(mi);
     }
     return true;
   }
@@ -923,7 +923,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
     Processed.clear();
     for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
          mi != me; ) {
-      MachineBasicBlock::iterator nmi = next(mi);
+      MachineBasicBlock::iterator nmi = llvm::next(mi);
       const TargetInstrDesc &TID = mi->getDesc();
       bool FirstTied = true;
 
diff --git a/lib/CodeGen/VirtRegRewriter.cpp b/lib/CodeGen/VirtRegRewriter.cpp
index 10c8066..054c3b6 100644
--- a/lib/CodeGen/VirtRegRewriter.cpp
+++ b/lib/CodeGen/VirtRegRewriter.cpp
@@ -754,7 +754,7 @@ void AvailableSpills::AddAvailableRegsToLiveIn(MachineBasicBlock &MBB,
     }
 
     // Skip over the same register.
-    std::multimap<unsigned, int>::iterator NI = next(I);
+    std::multimap<unsigned, int>::iterator NI = llvm::next(I);
     while (NI != E && NI->first == Reg) {
       ++I;
       ++NI;
@@ -1133,7 +1133,7 @@ private:
                          std::vector<MachineOperand*> &KillOps,
                          VirtRegMap &VRM) {
 
-    MachineBasicBlock::iterator NextMII = next(MII);
+    MachineBasicBlock::iterator NextMII = llvm::next(MII);
     if (NextMII == MBB.end())
       return false;
 
@@ -1186,7 +1186,7 @@ private:
     // Unfold next instructions that fold the same SS.
     do {
       MachineInstr &NextMI = *NextMII;
-      NextMII = next(NextMII);
+      NextMII = llvm::next(NextMII);
       NewMIs.clear();
       if (!TII->unfoldMemoryOperand(MF, &NextMI, VirtReg, false, false, NewMIs))
         llvm_unreachable("Unable unfold the load / store folding instruction!");
@@ -1463,8 +1463,8 @@ private:
                            std::vector<MachineOperand*> &KillOps,
                            VirtRegMap &VRM) {
 
-    MachineBasicBlock::iterator oldNextMII = next(MII);
-    TII->storeRegToStackSlot(MBB, next(MII), PhysReg, true, StackSlot, RC);
+    MachineBasicBlock::iterator oldNextMII = llvm::next(MII);
+    TII->storeRegToStackSlot(MBB, llvm::next(MII), PhysReg, true, StackSlot, RC);
     MachineInstr *StoreMI = prior(oldNextMII);
     VRM.addSpillSlotUse(StackSlot, StoreMI);
     DEBUG(errs() << "Store:\t" << *StoreMI);
@@ -1626,14 +1626,14 @@ private:
     DistanceMap.clear();
     for (MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
          MII != E; ) {
-      MachineBasicBlock::iterator NextMII = next(MII);
+      MachineBasicBlock::iterator NextMII = llvm::next(MII);
 
       VirtRegMap::MI2VirtMapTy::const_iterator I, End;
       bool Erased = false;
       bool BackTracked = false;
       if (OptimizeByUnfold(MBB, MII,
                            MaybeDeadStores, Spills, RegKills, KillOps, VRM))
-        NextMII = next(MII);
+        NextMII = llvm::next(MII);
 
       MachineInstr &MI = *MII;
 
@@ -1657,7 +1657,7 @@ private:
 
           // Back-schedule reloads and remats.
           MachineBasicBlock::iterator InsertLoc =
-            ComputeReloadLoc(next(MII), MBB.begin(), PhysReg, TRI, false,
+            ComputeReloadLoc(llvm::next(MII), MBB.begin(), PhysReg, TRI, false,
                              SS, TII, MF);
 
           TII->loadRegFromStackSlot(MBB, InsertLoc, PhysReg, SS, RC);
@@ -1667,7 +1667,7 @@ private:
           ++NumPSpills;
           DistanceMap.insert(std::make_pair(LoadMI, Dist++));
         }
-        NextMII = next(MII);
+        NextMII = llvm::next(MII);
       }
 
       // Insert restores here if asked to.
@@ -1785,14 +1785,14 @@ private:
           const TargetRegisterClass *RC = RegInfo->getRegClass(VirtReg);
           unsigned Phys = VRM.getPhys(VirtReg);
           int StackSlot = VRM.getStackSlot(VirtReg);
-          MachineBasicBlock::iterator oldNextMII = next(MII);
-          TII->storeRegToStackSlot(MBB, next(MII), Phys, isKill, StackSlot, RC);
+          MachineBasicBlock::iterator oldNextMII = llvm::next(MII);
+          TII->storeRegToStackSlot(MBB, llvm::next(MII), Phys, isKill, StackSlot, RC);
           MachineInstr *StoreMI = prior(oldNextMII);
           VRM.addSpillSlotUse(StackSlot, StoreMI);
           DEBUG(errs() << "Store:\t" << *StoreMI);
           VRM.virtFolded(VirtReg, StoreMI, VirtRegMap::isMod);
         }
-        NextMII = next(MII);
+        NextMII = llvm::next(MII);
       }
 
       /// ReusedOperands - Keep track of operand reuse in case we need to undo
@@ -2265,7 +2265,7 @@ private:
 
               if (CommuteToFoldReload(MBB, MII, VirtReg, SrcReg, StackSlot,
                                       Spills, RegKills, KillOps, TRI, VRM)) {
-                NextMII = next(MII);
+                NextMII = llvm::next(MII);
                 BackTracked = true;
                 goto ProcessNextInst;
               }
@@ -2381,7 +2381,7 @@ private:
           MachineInstr *&LastStore = MaybeDeadStores[StackSlot];
           SpillRegToStackSlot(MBB, MII, -1, PhysReg, StackSlot, RC, true,
                             LastStore, Spills, ReMatDefs, RegKills, KillOps, VRM);
-          NextMII = next(MII);
+          NextMII = llvm::next(MII);
 
           // Check to see if this is a noop copy.  If so, eliminate the
           // instruction before considering the dest reg to be changed.
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
index 6d781c7..26afa54 100644
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -208,7 +208,7 @@ ExecutionEngine *JIT::createJIT(ModuleProvider *MP,
                                 JITMemoryManager *JMM,
                                 CodeGenOpt::Level OptLevel,
                                 bool GVsWithCode,
-				CodeModel::Model CMM) {
+                                CodeModel::Model CMM) {
   // Make sure we can resolve symbols in the program as well. The zero arg
   // to the function tells DynamicLibrary to load the program, not a library.
   if (sys::DynamicLibrary::LoadLibraryPermanently(0, ErrorStr))
@@ -681,7 +681,7 @@ void *JIT::getOrEmitGlobalVariable(const GlobalVariable *GV) {
   if (Ptr) return Ptr;
 
   // If the global is external, just remember the address.
-  if (GV->isDeclaration()) {
+  if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage()) {
 #if HAVE___DSO_HANDLE
     if (GV->getName() == "__dso_handle")
       return (void*)&__dso_handle;
diff --git a/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp b/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
index f2b28ad..0193486 100644
--- a/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
+++ b/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
@@ -175,7 +175,6 @@ struct KeyInfo {
   static inline unsigned getTombstoneKey() { return -2U; }
   static unsigned getHashValue(const unsigned &Key) { return Key; }
   static bool isEqual(unsigned LHS, unsigned RHS) { return LHS == RHS; }
-  static bool isPod() { return true; }
 };
 
 /// ActionEntry - Structure describing an entry in the actions table.
diff --git a/lib/ExecutionEngine/JIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/JIT/OProfileJITEventListener.cpp
index b45c71f..52a8f71 100644
--- a/lib/ExecutionEngine/JIT/OProfileJITEventListener.cpp
+++ b/lib/ExecutionEngine/JIT/OProfileJITEventListener.cpp
@@ -69,24 +69,18 @@ OProfileJITEventListener::~OProfileJITEventListener() {
 }
 
 class FilenameCache {
-  // Holds the filename of each Scope, so that we can pass the
-  // pointer into oprofile.  These char*s are freed in the destructor.
-  DenseMap<MDNode*, char*> Filenames;
+  // Holds the filename of each Scope, so that we can pass a null-terminated
+  // string into oprofile.
+  DenseMap<MDNode*, std::string> Filenames;
 
  public:
   const char *getFilename(MDNode *Scope) {
-    char *&Filename = Filenames[Scope];
-    if (Filename == NULL) {
+    std::string &Filename = Filenames[Scope];
+    if (Filename.empty()) {
       DIScope S(Scope);
-      Filename = strdup(S.getFilename());
-    }
-    return Filename;
-  }
-  ~FilenameCache() {
-    for (DenseMap<MDNode*, char*>::iterator
-             I = Filenames.begin(), E = Filenames.end(); I != E; ++I) {
-      free(I->second);
+      Filename = S.getFilename();
     }
+    return Filename.c_str();
   }
 };
 
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index cd355ff..ac736dc 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(LLVMSupport
   CommandLine.cpp
   ConstantRange.cpp
   Debug.cpp
+  DeltaAlgorithm.cpp
   Dwarf.cpp
   ErrorHandling.cpp
   FileUtilities.cpp
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index 9cf9c89..b6c0e08 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -778,9 +778,10 @@ void cl::ParseCommandLineOptions(int argc, char **argv,
       free(*i);
   }
 
-  DEBUG(errs() << "\nArgs: ";
+  DEBUG(errs() << "Args: ";
         for (int i = 0; i < argc; ++i)
           errs() << argv[i] << ' ';
+        errs() << '\n';
        );
 
   // If we had an error processing our arguments, don't let the program execute
diff --git a/lib/Support/DeltaAlgorithm.cpp b/lib/Support/DeltaAlgorithm.cpp
new file mode 100644
index 0000000..d176548
--- /dev/null
+++ b/lib/Support/DeltaAlgorithm.cpp
@@ -0,0 +1,114 @@
+//===--- DeltaAlgorithm.cpp - A Set Minimization Algorithm -----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DeltaAlgorithm.h"
+#include <algorithm>
+#include <iterator>
+using namespace llvm;
+
+DeltaAlgorithm::~DeltaAlgorithm() {
+}
+
+bool DeltaAlgorithm::GetTestResult(const changeset_ty &Changes) {
+  if (FailedTestsCache.count(Changes))
+    return false;
+
+  bool Result = ExecuteOneTest(Changes);
+  if (!Result)
+    FailedTestsCache.insert(Changes);
+
+  return Result;
+}
+
+void DeltaAlgorithm::Split(const changeset_ty &S, changesetlist_ty &Res) {
+  // FIXME: Allow clients to provide heuristics for improved splitting.
+
+  // FIXME: This is really slow.
+  changeset_ty LHS, RHS;
+  unsigned idx = 0;
+  for (changeset_ty::const_iterator it = S.begin(),
+         ie = S.end(); it != ie; ++it, ++idx)
+    ((idx & 1) ? LHS : RHS).insert(*it);
+  if (!LHS.empty())
+    Res.push_back(LHS);
+  if (!RHS.empty())
+    Res.push_back(RHS);
+}
+
+DeltaAlgorithm::changeset_ty
+DeltaAlgorithm::Delta(const changeset_ty &Changes,
+                      const changesetlist_ty &Sets) {
+  // Invariant: union(Res) == Changes
+  UpdatedSearchState(Changes, Sets);
+
+  // If there is nothing left we can remove, we are done.
+  if (Sets.size() <= 1)
+    return Changes;
+
+  // Look for a passing subset.
+  changeset_ty Res;
+  if (Search(Changes, Sets, Res))
+    return Res;
+
+  // Otherwise, partition the sets if possible; if not we are done.
+  changesetlist_ty SplitSets;
+  for (changesetlist_ty::const_iterator it = Sets.begin(),
+         ie = Sets.end(); it != ie; ++it)
+    Split(*it, SplitSets);
+  if (SplitSets.size() == Sets.size())
+    return Changes;
+
+  return Delta(Changes, SplitSets);
+}
+
+bool DeltaAlgorithm::Search(const changeset_ty &Changes,
+                            const changesetlist_ty &Sets,
+                            changeset_ty &Res) {
+  // FIXME: Parallelize.
+  for (changesetlist_ty::const_iterator it = Sets.begin(),
+         ie = Sets.end(); it != ie; ++it) {
+    // If the test passes on this subset alone, recurse.
+    if (GetTestResult(*it)) {
+      changesetlist_ty Sets;
+      Split(*it, Sets);
+      Res = Delta(*it, Sets);
+      return true;
+    }
+
+    // Otherwise, if we have more than two sets, see if test passes on the
+    // complement.
+    if (Sets.size() > 2) {
+      // FIXME: This is really slow.
+      changeset_ty Complement;
+      std::set_difference(
+        Changes.begin(), Changes.end(), it->begin(), it->end(),
+        std::insert_iterator<changeset_ty>(Complement, Complement.begin()));
+      if (GetTestResult(Complement)) {
+        changesetlist_ty ComplementSets;
+        ComplementSets.insert(ComplementSets.end(), Sets.begin(), it);
+        ComplementSets.insert(ComplementSets.end(), it + 1, Sets.end());
+        Res = Delta(Complement, ComplementSets);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+DeltaAlgorithm::changeset_ty DeltaAlgorithm::Run(const changeset_ty &Changes) {
+  // Check empty set first to quickly find poor test functions.
+  if (GetTestResult(changeset_ty()))
+    return changeset_ty();
+
+  // Otherwise run the real delta algorithm.
+  changesetlist_ty Sets;
+  Split(Changes, Sets);
+
+  return Delta(Changes, Sets);
+}
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index b04864a..df1aa6a 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -176,7 +176,7 @@ MemoryBuffer *MemoryBuffer::getFile(StringRef Filename, std::string *ErrStr,
 #endif
   int FD = ::open(Filename.str().c_str(), O_RDONLY|OpenFlags);
   if (FD == -1) {
-    if (ErrStr) *ErrStr = "could not open file";
+    if (ErrStr) *ErrStr = strerror(errno);
     return 0;
   }
   
@@ -186,7 +186,7 @@ MemoryBuffer *MemoryBuffer::getFile(StringRef Filename, std::string *ErrStr,
     struct stat FileInfo;
     // TODO: This should use fstat64 when available.
     if (fstat(FD, &FileInfo) == -1) {
-      if (ErrStr) *ErrStr = "could not get file length";
+      if (ErrStr) *ErrStr = strerror(errno);
       ::close(FD);
       return 0;
     }
@@ -230,8 +230,8 @@ MemoryBuffer *MemoryBuffer::getFile(StringRef Filename, std::string *ErrStr,
       // try again
     } else {
       // error reading.
+      if (ErrStr) *ErrStr = strerror(errno);
       close(FD);
-      if (ErrStr) *ErrStr = "error reading file data";
       return 0;
     }
   }
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index 31451cc..0c90e77 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -209,8 +209,7 @@ raw_ostream &raw_ostream::operator<<(const void *P) {
 }
 
 raw_ostream &raw_ostream::operator<<(double N) {
-  this->operator<<(ftostr(N));
-  return *this;
+  return this->operator<<(ftostr(N));
 }
 
 
diff --git a/lib/System/Atomic.cpp b/lib/System/Atomic.cpp
index f9b55a1..7ba8b77 100644
--- a/lib/System/Atomic.cpp
+++ b/lib/System/Atomic.cpp
@@ -85,7 +85,7 @@ sys::cas_flag sys::AtomicAdd(volatile sys::cas_flag* ptr, sys::cas_flag val) {
 #elif defined(__GNUC__)
   return __sync_add_and_fetch(ptr, val);
 #elif defined(_MSC_VER)
-  return InterlockedAdd(ptr, val);
+  return InterlockedExchangeAdd(ptr, val) + val;
 #else
 #  error No atomic add implementation for your platform!
 #endif
diff --git a/lib/System/Host.cpp b/lib/System/Host.cpp
index e112698..79897e4 100644
--- a/lib/System/Host.cpp
+++ b/lib/System/Host.cpp
@@ -103,11 +103,8 @@ static void DetectX86FamilyModel(unsigned EAX, unsigned &Family, unsigned &Model
     Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19
   }
 }
-#endif
-
 
 std::string sys::getHostCPUName() {
-#if defined(__x86_64__) || defined(__i386__)
   unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
   if (GetX86CpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
     return "generic";
@@ -295,7 +292,10 @@ std::string sys::getHostCPUName() {
       return "generic";
     }
   }
-#endif
-
   return "generic";
 }
+#else
+std::string sys::getHostCPUName() {
+  return "generic";
+}
+#endif
diff --git a/lib/System/Unix/Path.inc b/lib/System/Unix/Path.inc
index ff1497a..33b26f7 100644
--- a/lib/System/Unix/Path.inc
+++ b/lib/System/Unix/Path.inc
@@ -77,7 +77,7 @@ inline bool lastIsSlash(const std::string& path) {
 namespace llvm {
 using namespace sys;
 
-extern const char sys::PathSeparator = ':';
+const char sys::PathSeparator = ':';
 
 Path::Path(const std::string& p)
   : path(p) {}
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index ff1980d..21445ad 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -109,7 +109,6 @@ FunctionPass *createNEONPreAllocPass();
 FunctionPass *createNEONMoveFixPass();
 FunctionPass *createThumb2ITBlockPass();
 FunctionPass *createThumb2SizeReductionPass();
-FunctionPass *createARMMaxStackAlignmentCalculatorPass();
 
 extern Target TheARMTarget, TheThumbTarget;
 
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index c95d4c8..1aae369 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -418,11 +418,13 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
   return true;
 }
 
-/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing
+/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing.
+DISABLE_INLINE
 static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
-                                unsigned JTI) DISABLE_INLINE;
+                                unsigned JTI);
 static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
                                 unsigned JTI) {
+  assert(JTI < JT.size());
   return JT[JTI].MBBs.size();
 }
 
@@ -467,6 +469,8 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
       return MI->getOperand(2).getImm();
     case ARM::Int_eh_sjlj_setjmp:
       return 24;
+    case ARM::tInt_eh_sjlj_setjmp:
+      return 22;
     case ARM::t2Int_eh_sjlj_setjmp:
       return 22;
     case ARM::BR_JTr:
@@ -755,7 +759,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     assert((RC == ARM::QPRRegisterClass ||
             RC == ARM::QPR_VFP2RegisterClass ||
             RC == ARM::QPR_8RegisterClass) && "Unknown regclass!");
-    // FIXME: Neon instructions should support predicates
     if (Align >= 16
         && (getRegisterInfo().needsStackRealignment(MF))) {
       AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 282e30c..78d9135 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -92,6 +92,8 @@ namespace ARMII {
     StMiscFrm     = 9  << FormShift,
     LdStMulFrm    = 10 << FormShift,
 
+    LdStExFrm     = 28 << FormShift,
+
     // Miscellaneous arithmetic instructions
     ArithMiscFrm  = 11 << FormShift,
 
@@ -190,9 +192,6 @@ public:
   // if there is not such an opcode.
   virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
 
-  // Return true if the block does not fall through.
-  virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const =0;
-
   virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
                                               MachineBasicBlock::iterator &MBBI,
                                               LiveVariables *LV) const;
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 653328d..9b5f79f 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -471,21 +471,6 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
   }
 }
 
-static unsigned calculateMaxStackAlignment(const MachineFrameInfo *FFI) {
-  unsigned MaxAlign = 0;
-
-  for (int i = FFI->getObjectIndexBegin(),
-         e = FFI->getObjectIndexEnd(); i != e; ++i) {
-    if (FFI->isDeadObjectIndex(i))
-      continue;
-
-    unsigned Align = FFI->getObjectAlignment(i);
-    MaxAlign = std::max(MaxAlign, Align);
-  }
-
-  return MaxAlign;
-}
-
 /// hasFP - Return true if the specified function should have a dedicated frame
 /// pointer register.  This is true if the function has variable sized allocas
 /// or if frame pointer elimination is disabled.
@@ -585,16 +570,21 @@ ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
   SmallVector<unsigned, 4> UnspilledCS2GPRs;
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
-  MachineFrameInfo *MFI = MF.getFrameInfo();
 
   // Calculate and set max stack object alignment early, so we can decide
   // whether we will need stack realignment (and thus FP).
   if (RealignStack) {
-    unsigned MaxAlign = std::max(MFI->getMaxAlignment(),
-                                 calculateMaxStackAlignment(MFI));
-    MFI->setMaxAlignment(MaxAlign);
+    MachineFrameInfo *MFI = MF.getFrameInfo();
+    MFI->calculateMaxStackAlignment();
   }
 
+  // Spill R4 if Thumb2 function requires stack realignment - it will be used as
+  // scratch register.
+  // FIXME: It will be better just to find spare register here.
+  if (needsStackRealignment(MF) &&
+      AFI->isThumb2Function())
+    MF.getRegInfo().setPhysRegUsed(ARM::R4);
+
   // Don't spill FP if the frame can be eliminated. This is determined
   // by scanning the callee-save registers to see if any is used.
   const unsigned *CSRegs = getCalleeSavedRegs();
@@ -1368,14 +1358,30 @@ emitPrologue(MachineFunction &MF) const {
 
   // If we need dynamic stack realignment, do it here.
   if (needsStackRealignment(MF)) {
-    unsigned Opc;
     unsigned MaxAlign = MFI->getMaxAlignment();
     assert (!AFI->isThumb1OnlyFunction());
-    Opc = AFI->isThumbFunction() ? ARM::t2BICri : ARM::BICri;
-
-    AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), ARM::SP)
+    if (!AFI->isThumbFunction()) {
+      // Emit bic sp, sp, MaxAlign
+      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
+                                          TII.get(ARM::BICri), ARM::SP)
                                   .addReg(ARM::SP, RegState::Kill)
                                   .addImm(MaxAlign-1)));
+    } else {
+      // We cannot use sp as source/dest register here, thus we're emitting the
+      // following sequence:
+      // mov r4, sp
+      // bic r4, r4, MaxAlign
+      // mov sp, r4
+      // FIXME: It will be better just to find spare register here.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::R4)
+        .addReg(ARM::SP, RegState::Kill);
+      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
+                                          TII.get(ARM::t2BICri), ARM::R4)
+                                  .addReg(ARM::R4, RegState::Kill)
+                                  .addImm(MaxAlign-1)));
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP)
+        .addReg(ARM::R4, RegState::Kill);
+    }
   }
 }
 
@@ -1479,48 +1485,4 @@ emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {
     emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize);
 }
 
-namespace {
-  struct MaximalStackAlignmentCalculator : public MachineFunctionPass {
-    static char ID;
-    MaximalStackAlignmentCalculator() : MachineFunctionPass(&ID) {}
-
-    virtual bool runOnMachineFunction(MachineFunction &MF) {
-      MachineFrameInfo *FFI = MF.getFrameInfo();
-      MachineRegisterInfo &RI = MF.getRegInfo();
-
-      // Calculate max stack alignment of all already allocated stack objects.
-      unsigned MaxAlign = calculateMaxStackAlignment(FFI);
-
-      // Be over-conservative: scan over all vreg defs and find, whether vector
-      // registers are used. If yes - there is probability, that vector register
-      // will be spilled and thus stack needs to be aligned properly.
-      for (unsigned RegNum = TargetRegisterInfo::FirstVirtualRegister;
-           RegNum < RI.getLastVirtReg(); ++RegNum)
-        MaxAlign = std::max(MaxAlign, RI.getRegClass(RegNum)->getAlignment());
-
-      if (FFI->getMaxAlignment() == MaxAlign)
-        return false;
-
-      FFI->setMaxAlignment(MaxAlign);
-      return true;
-    }
-
-    virtual const char *getPassName() const {
-      return "ARM Stack Required Alignment Auto-Detector";
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesCFG();
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-  };
-
-  char MaximalStackAlignmentCalculator::ID = 0;
-}
-
-FunctionPass*
-llvm::createARMMaxStackAlignmentCalculatorPass() {
-  return new MaximalStackAlignmentCalculator();
-}
-
 #include "ARMGenRegisterInfo.inc"
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index e59a315..acd30d2 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -418,10 +418,10 @@ void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF,
 static bool BBHasFallthrough(MachineBasicBlock *MBB) {
   // Get the next machine basic block in the function.
   MachineFunction::iterator MBBI = MBB;
-  if (next(MBBI) == MBB->getParent()->end())  // Can't fall off end of function.
+  if (llvm::next(MBBI) == MBB->getParent()->end())  // Can't fall off end of function.
     return false;
 
-  MachineBasicBlock *NextBB = next(MBBI);
+  MachineBasicBlock *NextBB = llvm::next(MBBI);
   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
        E = MBB->succ_end(); I != E; ++I)
     if (*I == NextBB)
@@ -760,7 +760,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
                      CompareMBBNumbers);
   MachineBasicBlock* WaterBB = *IP;
   if (WaterBB == OrigBB)
-    WaterList.insert(next(IP), NewBB);
+    WaterList.insert(llvm::next(IP), NewBB);
   else
     WaterList.insert(IP, OrigBB);
   NewWaterList.insert(OrigBB);
@@ -887,7 +887,7 @@ static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
 
 void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB,
                                               int delta) {
-  MachineFunction::iterator MBBI = BB; MBBI = next(MBBI);
+  MachineFunction::iterator MBBI = BB; MBBI = llvm::next(MBBI);
   for(unsigned i = BB->getNumber()+1, e = BB->getParent()->getNumBlockIDs();
       i < e; ++i) {
     BBOffsets[i] += delta;
@@ -929,7 +929,7 @@ void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB,
       if (delta==0)
         return;
     }
-    MBBI = next(MBBI);
+    MBBI = llvm::next(MBBI);
   }
 }
 
@@ -1096,7 +1096,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
     DEBUG(errs() << "Split at end of block\n");
     if (&UserMBB->back() == UserMI)
       assert(BBHasFallthrough(UserMBB) && "Expected a fallthrough BB!");
-    NewMBB = next(MachineFunction::iterator(UserMBB));
+    NewMBB = llvm::next(MachineFunction::iterator(UserMBB));
     // Add an unconditional branch from UserMBB to fallthrough block.
     // Record it for branch lengthening; this new branch will not get out of
     // range, but if the preceding conditional branch is out of range, the
@@ -1144,7 +1144,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
     for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
          Offset < BaseInsertOffset;
          Offset += TII->GetInstSizeInBytes(MI),
-            MI = next(MI)) {
+            MI = llvm::next(MI)) {
       if (CPUIndex < CPUsers.size() && CPUsers[CPUIndex].MI == MI) {
         CPUser &U = CPUsers[CPUIndex];
         if (!OffsetIsInRange(Offset, EndInsertOffset,
@@ -1204,7 +1204,7 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF,
       NewWaterList.insert(NewIsland);
     }
     // The new CPE goes before the following block (NewMBB).
-    NewMBB = next(MachineFunction::iterator(WaterBB));
+    NewMBB = llvm::next(MachineFunction::iterator(WaterBB));
 
   } else {
     // No water found.
@@ -1406,7 +1406,7 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) {
 
   NumCBrFixed++;
   if (BMI != MI) {
-    if (next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
+    if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
         BMI->getOpcode() == Br.UncondBr) {
       // Last MI in the BB is an unconditional branch. Can we simply invert the
       // condition and swap destinations:
@@ -1433,12 +1433,12 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) {
     // branch to the destination.
     int delta = TII->GetInstSizeInBytes(&MBB->back());
     BBSizes[MBB->getNumber()] -= delta;
-    MachineBasicBlock* SplitBB = next(MachineFunction::iterator(MBB));
+    MachineBasicBlock* SplitBB = llvm::next(MachineFunction::iterator(MBB));
     AdjustBBOffsetsAfter(SplitBB, -delta);
     MBB->back().eraseFromParent();
     // BBOffsets[SplitBB] is wrong temporarily, fixed below
   }
-  MachineBasicBlock *NextBB = next(MachineFunction::iterator(MBB));
+  MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
 
   DEBUG(errs() << "  Insert B to BB#" << DestBB->getNumber()
                << " also invert condition and change dest. to BB#"
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index c929c54..1b8727d 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -48,7 +48,7 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
   MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
   while (MBBI != E) {
     MachineInstr &MI = *MBBI;
-    MachineBasicBlock::iterator NMBBI = next(MBBI);
+    MachineBasicBlock::iterator NMBBI = llvm::next(MBBI);
 
     unsigned Opcode = MI.getOpcode();
     switch (Opcode) {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index c839fc6..655c762 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -42,6 +42,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
 #include <sstream>
 using namespace llvm;
 
@@ -377,7 +378,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
   else
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
-  setOperationAction(ISD::MEMBARRIER,         MVT::Other, Expand);
+  setOperationAction(ISD::MEMBARRIER,         MVT::Other, Custom);
 
   if (!Subtarget->hasV6Ops() && !Subtarget->isThumb2()) {
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
@@ -500,6 +501,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
 
   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
 
+  case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
+  case ARMISD::SYNCBARRIER:   return "ARMISD::SYNCBARRIER";
+
   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
   case ARMISD::VCGE:          return "ARMISD::VCGE";
   case ARMISD::VCGEU:         return "ARMISD::VCGEU";
@@ -1470,6 +1474,28 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   }
 }
 
+static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
+                          const ARMSubtarget *Subtarget) {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Op5 = Op.getOperand(5);
+  SDValue Res;
+  unsigned isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue();
+  if (isDeviceBarrier) {
+    if (Subtarget->hasV7Ops())
+      Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0));
+    else
+      Res = DAG.getNode(ARMISD::SYNCBARRIER, dl, MVT::Other, Op.getOperand(0),
+                        DAG.getConstant(0, MVT::i32));
+  } else {
+    if (Subtarget->hasV7Ops())
+      Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+    else
+      Res = DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
+                        DAG.getConstant(0, MVT::i32));
+  }
+  return Res;
+}
+
 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
                             unsigned VarArgsFrameIndex) {
   // vastart just stores the address of the VarArgsFrameIndex slot into the
@@ -2528,6 +2554,25 @@ static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT,
   return true;
 }
 
+/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+static bool isVTRN_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT,
+                                unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((unsigned) M[i] != i + WhichResult ||
+        (unsigned) M[i+1] != i + WhichResult)
+      return false;
+  }
+  return true;
+}
+
 static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
                        unsigned &WhichResult) {
   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
@@ -2548,6 +2593,33 @@ static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
   return true;
 }
 
+/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
+static bool isVUZP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT,
+                                unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned Half = VT.getVectorNumElements() / 2;
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned j = 0; j != 2; ++j) {
+    unsigned Idx = WhichResult;
+    for (unsigned i = 0; i != Half; ++i) {
+      if ((unsigned) M[i + j * Half] != Idx)
+        return false;
+      Idx += 2;
+    }
+  }
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
 static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT,
                        unsigned &WhichResult) {
   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
@@ -2571,6 +2643,33 @@ static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT,
   return true;
 }
 
+/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isVZIP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT,
+                                unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((unsigned) M[i] != Idx ||
+        (unsigned) M[i+1] != Idx)
+      return false;
+    Idx += 1;
+  }
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+
 static SDValue BuildSplat(SDValue Val, EVT VT, SelectionDAG &DAG, DebugLoc dl) {
   // Canonicalize all-zeros and all-ones vectors.
   ConstantSDNode *ConstVal = cast<ConstantSDNode>(Val.getNode());
@@ -2683,7 +2782,10 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
           isVEXTMask(M, VT, ReverseVEXT, Imm) ||
           isVTRNMask(M, VT, WhichResult) ||
           isVUZPMask(M, VT, WhichResult) ||
-          isVZIPMask(M, VT, WhichResult));
+          isVZIPMask(M, VT, WhichResult) ||
+          isVTRN_v_undef_Mask(M, VT, WhichResult) ||
+          isVUZP_v_undef_Mask(M, VT, WhichResult) ||
+          isVZIP_v_undef_Mask(M, VT, WhichResult));
 }
 
 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
@@ -2815,6 +2917,16 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
     return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
                        V1, V2).getValue(WhichResult);
 
+  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                       V1, V1).getValue(WhichResult);
+  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                       V1, V1).getValue(WhichResult);
+  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                       V1, V1).getValue(WhichResult);
+
   // If the shuffle is not directly supported and it has 4 elements, use
   // the PerfectShuffle-generated table to synthesize it from other shuffles.
   if (VT.getVectorNumElements() == 4 &&
@@ -2886,6 +2998,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::VASTART:       return LowerVASTART(Op, DAG, VarArgsFrameIndex);
+  case ISD::MEMBARRIER:    return LowerMEMBARRIER(Op, DAG, Subtarget);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
   case ISD::FP_TO_SINT:
@@ -2938,14 +3051,233 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
 //===----------------------------------------------------------------------===//
 
 MachineBasicBlock *
+ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
+                                     MachineBasicBlock *BB,
+                                     unsigned Size) const {
+  unsigned dest    = MI->getOperand(0).getReg();
+  unsigned ptr     = MI->getOperand(1).getReg();
+  unsigned oldval  = MI->getOperand(2).getReg();
+  unsigned newval  = MI->getOperand(3).getReg();
+  unsigned scratch = BB->getParent()->getRegInfo()
+    .createVirtualRegister(ARM::GPRRegisterClass);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  unsigned ldrOpc, strOpc;
+  switch (Size) {
+  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
+  case 1:
+    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
+    strOpc = isThumb2 ? ARM::t2LDREXB : ARM::STREXB;
+    break;
+  case 2:
+    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
+    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
+    break;
+  case 4:
+    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
+    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
+    break;
+  }
+
+  MachineFunction *MF = BB->getParent();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It; // insert the new blocks after the current block
+
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+  exitMBB->transferSuccessors(BB);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loop1MBB
+  BB->addSuccessor(loop1MBB);
+
+  // loop1MBB:
+  //   ldrex dest, [ptr]
+  //   cmp dest, oldval
+  //   bne exitMBB
+  BB = loop1MBB;
+  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+                 .addReg(dest).addReg(oldval));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  BB->addSuccessor(loop2MBB);
+  BB->addSuccessor(exitMBB);
+
+  // loop2MBB:
+  //   strex scratch, newval, [ptr]
+  //   cmp scratch, #0
+  //   bne loop1MBB
+  BB = loop2MBB;
+  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval)
+                 .addReg(ptr));
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(scratch).addImm(0));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loop1MBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  BB->addSuccessor(loop1MBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+  return BB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                    unsigned Size, unsigned BinOpcode) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *F = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptr = MI->getOperand(1).getReg();
+  unsigned incr = MI->getOperand(2).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+  unsigned ldrOpc, strOpc;
+  switch (Size) {
+  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
+  case 1:
+    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
+    strOpc = isThumb2 ? ARM::t2LDREXB : ARM::STREXB;
+    break;
+  case 2:
+    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
+    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
+    break;
+  case 4:
+    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
+    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
+    break;
+  }
+
+  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, loopMBB);
+  F->insert(It, exitMBB);
+  exitMBB->transferSuccessors(BB);
+
+  MachineRegisterInfo &RegInfo = F->getRegInfo();
+  unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+  unsigned scratch2 = (!BinOpcode) ? incr :
+    RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldrex dest, ptr
+  //   <binop> scratch2, dest, incr
+  //   strex scratch, scratch2, ptr
+  //   cmp scratch, #0
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+  if (BinOpcode) {
+    // operand order needs to go the other way for NAND
+    if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
+                     addReg(incr).addReg(dest)).addReg(0);
+    else
+      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
+                     addReg(dest).addReg(incr)).addReg(0);
+  }
+
+  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2)
+                 .addReg(ptr));
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(scratch).addImm(0));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+  return BB;
+}
+
+MachineBasicBlock *
 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB,
                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
   switch (MI->getOpcode()) {
   default:
+    MI->dump();
     llvm_unreachable("Unexpected instr type to insert");
+
+  case ARM::ATOMIC_LOAD_ADD_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
+  case ARM::ATOMIC_LOAD_ADD_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
+  case ARM::ATOMIC_LOAD_ADD_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
+
+  case ARM::ATOMIC_LOAD_AND_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+  case ARM::ATOMIC_LOAD_AND_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+  case ARM::ATOMIC_LOAD_AND_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+
+  case ARM::ATOMIC_LOAD_OR_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+  case ARM::ATOMIC_LOAD_OR_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+  case ARM::ATOMIC_LOAD_OR_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+
+  case ARM::ATOMIC_LOAD_XOR_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+  case ARM::ATOMIC_LOAD_XOR_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+  case ARM::ATOMIC_LOAD_XOR_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+
+  case ARM::ATOMIC_LOAD_NAND_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
+  case ARM::ATOMIC_LOAD_NAND_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
+  case ARM::ATOMIC_LOAD_NAND_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
+
+  case ARM::ATOMIC_LOAD_SUB_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
+  case ARM::ATOMIC_LOAD_SUB_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
+  case ARM::ATOMIC_LOAD_SUB_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
+
+  case ARM::ATOMIC_SWAP_I8:  return EmitAtomicBinary(MI, BB, 1, 0);
+  case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0);
+  case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0);
+
+  case ARM::ATOMIC_CMP_SWAP_I8:  return EmitAtomicCmpSwap(MI, BB, 1);
+  case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
+  case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
+
   case ARM::tMOVCCr_pseudo: {
     // To "insert" a SELECT_CC instruction, we actually have to insert the
     // diamond control-flow pattern.  The incoming instruction knows the
@@ -3935,6 +4267,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
         return std::make_pair(0U, ARM::SPRRegisterClass);
       if (VT == MVT::f64)
         return std::make_pair(0U, ARM::DPRRegisterClass);
+      if (VT.getSizeInBits() == 128)
+        return std::make_pair(0U, ARM::QPRRegisterClass);
       break;
     }
   }
@@ -3973,6 +4307,9 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                    ARM::D4, ARM::D5, ARM::D6, ARM::D7,
                                    ARM::D8, ARM::D9, ARM::D10,ARM::D11,
                                    ARM::D12,ARM::D13,ARM::D14,ARM::D15, 0);
+    if (VT.getSizeInBits() == 128)
+      return make_vector<unsigned>(ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3,
+                                   ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7, 0);
       break;
   }
 
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 4f31f8a..e1b3348 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -72,6 +72,9 @@ namespace llvm {
 
       DYN_ALLOC,    // Dynamic allocation on the stack.
 
+      MEMBARRIER,   // Memory barrier
+      SYNCBARRIER,  // Memory sync barrier
+
       VCEQ,         // Vector compare equal.
       VCGE,         // Vector compare greater than or equal.
       VCGEU,        // Vector compare unsigned greater than or equal.
@@ -328,6 +331,15 @@ namespace llvm {
 
     SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                       SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl);
+
+    MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI,
+                                         MachineBasicBlock *BB,
+                                         unsigned Size) const;
+    MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
+                                        MachineBasicBlock *BB,
+                                        unsigned Size,
+                                        unsigned BinOpcode) const;
+
   };
 }
 
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index e76e93c..cf0edff 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -33,6 +33,8 @@ def LdMiscFrm     : Format<8>;
 def StMiscFrm     : Format<9>;
 def LdStMulFrm    : Format<10>;
 
+def LdStExFrm     : Format<28>;
+
 def ArithMiscFrm  : Format<11>;
 def ExtFrm        : Format<12>;
 
@@ -199,6 +201,19 @@ class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
   let Pattern = pattern;
   list<Predicate> Predicates = [IsARM];
 }
+// A few are not predicable
+class InoP<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+        IndexMode im, Format f, InstrItinClass itin, 
+        string opc, string asm, string cstr,
+        list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString   = !strconcat(opc, asm);
+  let Pattern = pattern;
+  let isPredicable = 0;
+  list<Predicate> Predicates = [IsARM];
+}
 
 // Same as I except it can optionally modify CPSR. Note it's modeled as
 // an input operand since by default it's a zero register. It will
@@ -239,6 +254,10 @@ class AXI<dag oops, dag iops, Format f, InstrItinClass itin,
           string asm, list<dag> pattern>
   : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
        asm, "", pattern>;
+class AInoP<dag oops, dag iops, Format f, InstrItinClass itin,
+         string opc, string asm, list<dag> pattern>
+  : InoP<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern>;
 
 // Ctrl flow instructions
 class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
@@ -264,6 +283,28 @@ class JTI<dag oops, dag iops, InstrItinClass itin,
   : XI<oops, iops, AddrModeNone, SizeSpecial, IndexModeNone, BrMiscFrm, itin,
        asm, "", pattern>;
 
+
+// Atomic load/store instructions
+
+class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, LdStExFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{27-23} = 0b00011;
+  let Inst{22-21} = opcod;
+  let Inst{20} = 1;
+  let Inst{11-0}  = 0b111110011111;
+}
+class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, LdStExFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{27-23} = 0b00011;
+  let Inst{22-21} = opcod;
+  let Inst{20} = 0;
+  let Inst{11-4}  = 0b11111001;
+}
+
 // addrmode1 instructions
 class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
           string opc, string asm, list<dag> pattern>
@@ -967,6 +1008,17 @@ class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
   list<Predicate> Predicates = [IsThumb2];
 }
 
+class ThumbXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+               InstrItinClass itin,
+               string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString   = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb1Only];
+}
+
 class T2I<dag oops, dag iops, InstrItinClass itin,
           string opc, string asm, list<dag> pattern>
   : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>;
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 87bb12b..85f6b40 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -60,25 +60,6 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
   return 0;
 }
 
-bool ARMInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
-  if (MBB.empty()) return false;
-
-  switch (MBB.back().getOpcode()) {
-  case ARM::BX_RET:   // Return.
-  case ARM::LDM_RET:
-  case ARM::B:
-  case ARM::BRIND:
-  case ARM::BR_JTr:   // Jumptable branch.
-  case ARM::BR_JTm:   // Jumptable branch through mem.
-  case ARM::BR_JTadd: // Jumptable branch add to pc.
-    return true;
-  default:
-    break;
-  }
-
-  return false;
-}
-
 void ARMInstrInfo::
 reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
               unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig,
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
index 4319577..d4199d1 100644
--- a/lib/Target/ARM/ARMInstrInfo.h
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -32,9 +32,6 @@ public:
   // if there is not such an opcode.
   unsigned getUnindexedOpcode(unsigned Opc) const;
 
-  // Return true if the block does not fall through.
-  bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
-
   void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                      unsigned DestReg, unsigned SubIdx,
                      const MachineInstr *Orig,
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 7516d3c..e14696a 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -46,6 +46,11 @@ def SDT_ARMPICAdd  : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
 def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
 def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
 
+def SDT_ARMMEMBARRIERV7  : SDTypeProfile<0, 0, []>;
+def SDT_ARMSYNCBARRIERV7 : SDTypeProfile<0, 0, []>;
+def SDT_ARMMEMBARRIERV6  : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_ARMSYNCBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
 // Node definitions.
 def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
 def ARMWrapperJT     : SDNode<"ARMISD::WrapperJT",   SDTIntBinOp>;
@@ -93,6 +98,15 @@ def ARMrrx           : SDNode<"ARMISD::RRX"     , SDTIntUnaryOp, [SDNPInFlag ]>;
 def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
 def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp>;
 
+def ARMMemBarrierV7  : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIERV7,
+                              [SDNPHasChain]>;
+def ARMSyncBarrierV7 : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERV7,
+                              [SDNPHasChain]>;
+def ARMMemBarrierV6  : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIERV6,
+                              [SDNPHasChain]>;
+def ARMSyncBarrierV6 : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERV6,
+                              [SDNPHasChain]>;
+
 //===----------------------------------------------------------------------===//
 // ARM Instruction Predicate Definitions.
 //
@@ -772,6 +786,7 @@ let isBranch = 1, isTerminator = 1 in {
   def BR_JTr : JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id),
                     IIC_Br, "mov\tpc, $target \n$jt",
                     [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]> {
+    let Inst{11-4}  = 0b00000000;
     let Inst{15-12} = 0b1111;
     let Inst{20}    = 0; // S Bit
     let Inst{24-21} = 0b1101;
@@ -1561,6 +1576,189 @@ def MOVCCi : AI1<0b1101, (outs GPR:$dst),
   let Inst{25} = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+// memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+def Int_MemBarrierV7 : AInoP<(outs), (ins),
+                        Pseudo, NoItinerary,
+                        "dmb", "",
+                        [(ARMMemBarrierV7)]>,
+                        Requires<[IsARM, HasV7]> {
+  let Inst{31-4} = 0xf57ff05;
+  // FIXME: add support for options other than a full system DMB
+  let Inst{3-0} = 0b1111;
+}
+
+def Int_SyncBarrierV7 : AInoP<(outs), (ins),
+                        Pseudo, NoItinerary,
+                        "dsb", "",
+                        [(ARMSyncBarrierV7)]>,
+                        Requires<[IsARM, HasV7]> {
+  let Inst{31-4} = 0xf57ff04;
+  // FIXME: add support for options other than a full system DSB
+  let Inst{3-0} = 0b1111;
+}
+
+def Int_MemBarrierV6 : AInoP<(outs), (ins GPR:$zero),
+                       Pseudo, NoItinerary,
+                       "mcr", "\tp15, 0, $zero, c7, c10, 5",
+                       [(ARMMemBarrierV6 GPR:$zero)]>,
+                       Requires<[IsARM, HasV6]> {
+  // FIXME: add support for options other than a full system DMB
+  // FIXME: add encoding
+}
+
+def Int_SyncBarrierV6 : AInoP<(outs), (ins GPR:$zero),
+                        Pseudo, NoItinerary,
+                        "mcr", "\tp15, 0, $zero, c7, c10, 4",
+                        [(ARMSyncBarrierV6 GPR:$zero)]>,
+                        Requires<[IsARM, HasV6]> {
+  // FIXME: add support for options other than a full system DSB
+  // FIXME: add encoding
+}
+}
+
+let usesCustomInserter = 1 in {
+  let Uses = [CPSR] in {
+    def ATOMIC_LOAD_ADD_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_ADD_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_SUB_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_SUB_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_sub_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_AND_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_AND_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_and_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_OR_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_OR_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_or_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_XOR_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_XOR_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_xor_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_NAND_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_NAND_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_ADD_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_ADD_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_SUB_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_SUB_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_sub_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_AND_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_AND_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_and_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_OR_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_OR_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_or_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_XOR_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_XOR_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_xor_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_NAND_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_NAND_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_ADD_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_ADD_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_SUB_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_SUB_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_AND_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_AND_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_OR_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_OR_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_XOR_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_XOR_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_NAND_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_NAND_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>;
+
+    def ATOMIC_SWAP_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_SWAP_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_swap_8 GPR:$ptr, GPR:$new))]>;
+    def ATOMIC_SWAP_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_SWAP_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_swap_16 GPR:$ptr, GPR:$new))]>;
+    def ATOMIC_SWAP_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_SWAP_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$new))]>;
+
+    def ATOMIC_CMP_SWAP_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_CMP_SWAP_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_cmp_swap_8 GPR:$ptr, GPR:$old, GPR:$new))]>;
+    def ATOMIC_CMP_SWAP_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_CMP_SWAP_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_cmp_swap_16 GPR:$ptr, GPR:$old, GPR:$new))]>;
+    def ATOMIC_CMP_SWAP_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_CMP_SWAP_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$old, GPR:$new))]>;
+}
+}
+
+let mayLoad = 1 in {
+def LDREXB : AIldrex<0b10, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary,
+                    "ldrexb", "\t$dest, [$ptr]",
+                    []>;
+def LDREXH : AIldrex<0b11, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary,
+                    "ldrexh", "\t$dest, [$ptr]",
+                    []>;
+def LDREX  : AIldrex<0b00, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary,
+                    "ldrex", "\t$dest, [$ptr]",
+                    []>;
+def LDREXD : AIldrex<0b01, (outs GPR:$dest, GPR:$dest2), (ins GPR:$ptr),
+                    NoItinerary,
+                    "ldrexd", "\t$dest, $dest2, [$ptr]",
+                    []>;
+}
+
+let mayStore = 1 in {
+def STREXB : AIstrex<0b10, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                    NoItinerary,
+                    "strexb", "\t$success, $src, [$ptr]",
+                    []>;
+def STREXH : AIstrex<0b11, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                    NoItinerary,
+                    "strexh", "\t$success, $src, [$ptr]",
+                    []>;
+def STREX  : AIstrex<0b00, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                    NoItinerary,
+                    "strex", "\t$success, $src, [$ptr]",
+                    []>;
+def STREXD : AIstrex<0b01, (outs GPR:$success),
+                    (ins GPR:$src, GPR:$src2, GPR:$ptr),
+                    NoItinerary,
+                    "strexd", "\t$success, $src, $src2, [$ptr]",
+                    []>;
+}
 
 //===----------------------------------------------------------------------===//
 // TLS Instructions
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 3166931..61b7705 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -152,7 +152,7 @@ def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr),
   let Inst{24}    = 0; // P bit
   let Inst{23}    = 1; // U bit
   let Inst{20}    = 1;
-  let Inst{11-9}  = 0b101;
+  let Inst{11-8}  = 0b1011;
 }
 
 // Use vstmia to store a Q register as a D register pair.
@@ -164,7 +164,7 @@ def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr),
   let Inst{24}    = 0; // P bit
   let Inst{23}    = 1; // U bit
   let Inst{20}    = 0;
-  let Inst{11-9}  = 0b101;
+  let Inst{11-8}  = 0b1011;
 }
 
 //   VLD1     : Vector Load (multiple single elements)
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index b5956a3..9306bdb 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -669,6 +669,35 @@ let isCall = 1,
                [(set R0, ARMthread_pointer)]>;
 }
 
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everthing out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved resgisters, which is exactly what we want.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12 ] in {
+  def tInt_eh_sjlj_setjmp : ThumbXI<(outs), (ins GPR:$src),
+                              AddrModeNone, SizeSpecial, NoItinerary,
+                              "mov\tr12, r1\t@ begin eh.setjmp\n"
+                              "\tmov\tr1, sp\n"
+                              "\tstr\tr1, [$src, #8]\n"
+                              "\tadr\tr1, 0f\n"
+                              "\tadds\tr1, #1\n"
+                              "\tstr\tr1, [$src, #4]\n"
+                              "\tmov\tr1, r12\n"
+                              "\tmovs\tr0, #0\n"
+                              "\tb\t1f\n"
+                              ".align 2\n"
+                              "0:\tmovs\tr0, #1\t@ end eh.setjmp\n"
+                              "1:", "",
+                              [(set R0, (ARMeh_sjlj_setjmp GPR:$src))]>;
+}
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 9489815..949ce73 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -1065,6 +1065,68 @@ def t2MOVCCror : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true, i32imm:$rhs),
                    RegConstraint<"$false = $dst">;
 
 //===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+// memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+def t2Int_MemBarrierV7 : AInoP<(outs), (ins),
+                        Pseudo, NoItinerary,
+                        "dmb", "",
+                        [(ARMMemBarrierV7)]>,
+                        Requires<[IsThumb2]> {
+  // FIXME: add support for options other than a full system DMB
+}
+
+def t2Int_SyncBarrierV7 : AInoP<(outs), (ins),
+                        Pseudo, NoItinerary,
+                        "dsb", "",
+                        [(ARMSyncBarrierV7)]>,
+                        Requires<[IsThumb2]> {
+  // FIXME: add support for options other than a full system DSB
+}
+}
+
+let mayLoad = 1 in {
+def t2LDREXB : Thumb2I<(outs GPR:$dest), (ins GPR:$ptr), AddrModeNone,
+                      Size4Bytes, NoItinerary,
+                      "ldrexb", "\t$dest, [$ptr]", "",
+                      []>;
+def t2LDREXH : Thumb2I<(outs GPR:$dest), (ins GPR:$ptr), AddrModeNone,
+                      Size4Bytes, NoItinerary,
+                      "ldrexh", "\t$dest, [$ptr]", "",
+                      []>;
+def t2LDREX  : Thumb2I<(outs GPR:$dest), (ins GPR:$ptr), AddrModeNone,
+                      Size4Bytes, NoItinerary,
+                      "ldrex", "\t$dest, [$ptr]", "",
+                      []>;
+def t2LDREXD : Thumb2I<(outs GPR:$dest, GPR:$dest2), (ins GPR:$ptr),
+                      AddrModeNone, Size4Bytes, NoItinerary,
+                      "ldrexd", "\t$dest, $dest2, [$ptr]", "",
+                      []>;
+}
+
+let mayStore = 1 in {
+def t2STREXB : Thumb2I<(outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                      AddrModeNone, Size4Bytes, NoItinerary,
+                      "strexb", "\t$success, $src, [$ptr]", "",
+                      []>;
+def t2STREXH : Thumb2I<(outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                      AddrModeNone, Size4Bytes, NoItinerary,
+                      "strexh", "\t$success, $src, [$ptr]", "",
+                      []>;
+def t2STREX  : Thumb2I<(outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                      AddrModeNone, Size4Bytes, NoItinerary,
+                      "strex", "\t$success, $src, [$ptr]", "",
+                      []>;
+def t2STREXD : Thumb2I<(outs GPR:$success),
+                      (ins GPR:$src, GPR:$src2, GPR:$ptr),
+                      AddrModeNone, Size4Bytes, NoItinerary,
+                      "strexd", "\t$success, $src, $src2, [$ptr]", "",
+                      []>;
+}
+
+//===----------------------------------------------------------------------===//
 // TLS Instructions
 //
 
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 304d0ef..22bd80e 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -449,7 +449,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
     }
 
     if (MBBI != MBB.end()) {
-      MachineBasicBlock::iterator NextMBBI = next(MBBI);
+      MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
       if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
           isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
         MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
@@ -494,7 +494,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
     }
 
     if (MBBI != MBB.end()) {
-      MachineBasicBlock::iterator NextMBBI = next(MBBI);
+      MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
       if (Mode == ARM_AM::ia &&
           isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
         MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
@@ -604,7 +604,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
   }
 
   if (!DoMerge && MBBI != MBB.end()) {
-    MachineBasicBlock::iterator NextMBBI = next(MBBI);
+    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
     if (!isAM5 &&
         isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
       DoMerge = true;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 2564ed9..1c6fca7 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -95,7 +95,7 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM,
 
   // Calculate and set max stack object alignment early, so we can decide
   // whether we will need stack realignment (and thus FP).
-  PM.add(createARMMaxStackAlignmentCalculatorPass());
+  PM.add(createMaxStackAlignmentCalculatorPass());
 
   // FIXME: temporarily disabling load / store optimization pass for Thumb1.
   if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
index 692bb19..362bbf1 100644
--- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
@@ -1045,6 +1045,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
       printNoHashImmediate(MI, OpNum);
       return false;
     case 'P': // Print a VFP double precision register.
+    case 'q': // Print a NEON quad precision register.
       printOperand(MI, OpNum);
       return false;
     case 'Q':
diff --git a/lib/Target/ARM/NEONMoveFix.cpp b/lib/Target/ARM/NEONMoveFix.cpp
index 50abcf4..3c0414d 100644
--- a/lib/Target/ARM/NEONMoveFix.cpp
+++ b/lib/Target/ARM/NEONMoveFix.cpp
@@ -51,7 +51,7 @@ bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) {
   MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
   MachineBasicBlock::iterator NextMII;
   for (; MII != E; MII = NextMII) {
-    NextMII = next(MII);
+    NextMII = llvm::next(MII);
     MachineInstr *MI = &*MII;
 
     if (MI->getOpcode() == ARM::VMOVD &&
diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp
index 206677b..d9942c8 100644
--- a/lib/Target/ARM/NEONPreAllocPass.cpp
+++ b/lib/Target/ARM/NEONPreAllocPass.cpp
@@ -338,7 +338,7 @@ bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) {
     if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride))
       continue;
 
-    MachineBasicBlock::iterator NextI = next(MBBI);
+    MachineBasicBlock::iterator NextI = llvm::next(MBBI);
     for (unsigned R = 0; R < NumRegs; ++R) {
       MachineOperand &MO = MI->getOperand(FirstOpnd + R);
       assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand");
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 7602b6d..66d3b83 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -32,25 +32,6 @@ unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
   return 0;
 }
 
-bool
-Thumb1InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
-  if (MBB.empty()) return false;
-
-  switch (MBB.back().getOpcode()) {
-  case ARM::tBX_RET:
-  case ARM::tBX_RET_vararg:
-  case ARM::tPOP_RET:
-  case ARM::tB:
-  case ARM::tBRIND:
-  case ARM::tBR_JTr:
-    return true;
-  default:
-    break;
-  }
-
-  return false;
-}
-
 bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    unsigned DestReg, unsigned SrcReg,
diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h
index b28229d..516ddf1 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/lib/Target/ARM/Thumb1InstrInfo.h
@@ -31,9 +31,6 @@ public:
   // if there is not such an opcode.
   unsigned getUnindexedOpcode(unsigned Opc) const;
 
-  // Return true if the block does not fall through.
-  bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
-
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 37adf37..9f3816a 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -528,7 +528,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
         MI.getOperand(i+1).ChangeToImmediate(Mask);
       }
       Offset = (Offset - Mask * Scale);
-      MachineBasicBlock::iterator NII = next(II);
+      MachineBasicBlock::iterator NII = llvm::next(II);
       emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII,
                                 *this, dl);
     } else {
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 16c1e6f..f4a8c27 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -36,30 +36,6 @@ unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const {
 }
 
 bool
-Thumb2InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
-  if (MBB.empty()) return false;
-
-  switch (MBB.back().getOpcode()) {
-  case ARM::t2LDM_RET:
-  case ARM::t2B:        // Uncond branch.
-  case ARM::t2BR_JT:    // Jumptable branch.
-  case ARM::t2TBB:      // Table branch byte.
-  case ARM::t2TBH:      // Table branch halfword.
-  case ARM::tBR_JTr:    // Jumptable branch (16-bit version).
-  case ARM::tBX_RET:
-  case ARM::tBX_RET_vararg:
-  case ARM::tPOP_RET:
-  case ARM::tB:
-  case ARM::tBRIND:
-    return true;
-  default:
-    break;
-  }
-
-  return false;
-}
-
-bool
 Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               unsigned DestReg, unsigned SrcReg,
diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
index 663a60b..a0f89a6 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/lib/Target/ARM/Thumb2InstrInfo.h
@@ -31,9 +31,6 @@ public:
   // if there is not such an opcode.
   unsigned getUnindexedOpcode(unsigned Opc) const;
 
-  // Return true if the block does not fall through.
-  bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
-
   bool copyRegToReg(MachineBasicBlock &MBB,
                     MachineBasicBlock::iterator I,
                     unsigned DestReg, unsigned SrcReg,
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index b2fd7b3..35359aa 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -649,7 +649,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
   MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
   MachineBasicBlock::iterator NextMII;
   for (; MII != E; MII = NextMII) {
-    NextMII = next(MII);
+    NextMII = llvm::next(MII);
 
     MachineInstr *MI = &*MII;
     LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp
index 86173ff..39f0749 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.cpp
+++ b/lib/Target/Alpha/AlphaInstrInfo.cpp
@@ -392,18 +392,6 @@ void AlphaInstrInfo::insertNoop(MachineBasicBlock &MBB,
     .addReg(Alpha::R31);
 }
 
-bool AlphaInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
-  if (MBB.empty()) return false;
-  
-  switch (MBB.back().getOpcode()) {
-  case Alpha::RETDAG: // Return.
-  case Alpha::RETDAGp:
-  case Alpha::BR:     // Uncond branch.
-  case Alpha::JMP:  // Indirect branch.
-    return true;
-  default: return false;
-  }
-}
 bool AlphaInstrInfo::
 ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   assert(Cond.size() == 2 && "Invalid Alpha branch opcode!");
diff --git a/lib/Target/Alpha/AlphaInstrInfo.h b/lib/Target/Alpha/AlphaInstrInfo.h
index 274f452..c3b6044 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.h
+++ b/lib/Target/Alpha/AlphaInstrInfo.h
@@ -78,7 +78,6 @@ public:
   unsigned RemoveBranch(MachineBasicBlock &MBB) const;
   void insertNoop(MachineBasicBlock &MBB, 
                   MachineBasicBlock::iterator MI) const;
-  bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
 
   /// getGlobalBaseReg - Return a virtual register initialized with the
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
index 9e4fe27..a52ca79 100644
--- a/lib/Target/CBackend/CBackend.cpp
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -2573,7 +2573,7 @@ void CWriter::visitSwitchInst(SwitchInst &SI) {
     BasicBlock *Succ = cast<BasicBlock>(SI.getOperand(i+1));
     printPHICopiesForSuccessor (SI.getParent(), Succ, 2);
     printBranchToBlock(SI.getParent(), Succ, 2);
-    if (Function::iterator(Succ) == next(Function::iterator(SI.getParent())))
+    if (Function::iterator(Succ) == llvm::next(Function::iterator(SI.getParent())))
       Out << "    break;\n";
   }
   Out << "  }\n";
@@ -2593,7 +2593,7 @@ bool CWriter::isGotoCodeNecessary(BasicBlock *From, BasicBlock *To) {
   /// FIXME: This should be reenabled, but loop reordering safe!!
   return true;
 
-  if (next(Function::iterator(From)) != Function::iterator(To))
+  if (llvm::next(Function::iterator(From)) != Function::iterator(To))
     return true;  // Not the direct successor, we need a goto.
 
   //isa<SwitchInst>(From->getTerminator())
diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp
index ecce8e3..2306665 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.cpp
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@@ -580,10 +580,6 @@ SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
   }
 }
 
-bool
-SPUInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
-  return (!MBB.empty() && isUncondBranch(&MBB.back()));
-}
 //! Reverses a branch's condition, returning false on success.
 bool
 SPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
diff --git a/lib/Target/CellSPU/SPUInstrInfo.h b/lib/Target/CellSPU/SPUInstrInfo.h
index c644a11..42677fc 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.h
+++ b/lib/Target/CellSPU/SPUInstrInfo.h
@@ -79,9 +79,6 @@ namespace llvm {
     bool canFoldMemoryOperand(const MachineInstr *MI,
                               const SmallVectorImpl<unsigned> &Ops) const;
 
-    //! Return true if the specified block does not fall through
-    virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
-
     //! Reverses a branch's condition, returning false on success.
     virtual
     bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 4bae6c7..a872fbd 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -976,21 +976,20 @@ namespace {
     nl(Out);
     printType(GV->getType());
     if (GV->hasInitializer()) {
-      Constant* Init = GV->getInitializer();
+      Constant *Init = GV->getInitializer();
       printType(Init->getType());
-      if (Function* F = dyn_cast<Function>(Init)) {
+      if (Function *F = dyn_cast<Function>(Init)) {
         nl(Out)<< "/ Function Declarations"; nl(Out);
         printFunctionHead(F);
       } else if (GlobalVariable* gv = dyn_cast<GlobalVariable>(Init)) {
         nl(Out) << "// Global Variable Declarations"; nl(Out);
         printVariableHead(gv);
-      } else  {
-        nl(Out) << "// Constant Definitions"; nl(Out);
-        printConstant(gv);
-      }
-      if (GlobalVariable* gv = dyn_cast<GlobalVariable>(Init)) {
+        
         nl(Out) << "// Global Variable Definitions"; nl(Out);
         printVariableBody(gv);
+      } else  {
+        nl(Out) << "// Constant Definitions"; nl(Out);
+        printConstant(Init);
       }
     }
   }
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index beccb2c..4edf422 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -86,10 +86,10 @@ namespace {
 
     void dump() {
       errs() << "MSP430ISelAddressMode " << this << '\n';
-      if (Base.Reg.getNode() != 0) {
+      if (BaseType == RegBase && Base.Reg.getNode() != 0) {
         errs() << "Base.Reg ";
         Base.Reg.getNode()->dump();
-      } else {
+      } else if (BaseType == FrameIndexBase) {
         errs() << " Base.FrameIndex " << Base.FrameIndex << '\n';
       }
       errs() << " Disp " << Disp << '\n';
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 29cc370..5fe9b20 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -15,6 +15,7 @@
 
 #include "MSP430ISelLowering.h"
 #include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
 #include "MSP430TargetMachine.h"
 #include "MSP430Subtarget.h"
 #include "llvm/DerivedTypes.h"
@@ -32,16 +33,38 @@
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/VectorExtras.h"
 using namespace llvm;
 
+typedef enum {
+  NoHWMult,
+  HWMultIntr,
+  HWMultNoIntr
+} HWMultUseMode;
+
+static cl::opt<HWMultUseMode>
+HWMultMode("msp430-hwmult-mode",
+           cl::desc("Hardware multiplier use mode"),
+           cl::init(HWMultNoIntr),
+           cl::values(
+             clEnumValN(NoHWMult, "no",
+                "Do not use hardware multiplier"),
+             clEnumValN(HWMultIntr, "interrupts",
+                "Assume hardware multiplier can be used inside interrupts"),
+             clEnumValN(HWMultNoIntr, "use",
+                "Assume hardware multiplier cannot be used inside interrupts"),
+             clEnumValEnd));
+
 MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
   TargetLowering(tm, new TargetLoweringObjectFileELF()),
   Subtarget(*tm.getSubtargetImpl()), TM(tm) {
 
+  TD = getTargetData();
+
   // Set up the register classes.
   addRegisterClass(MVT::i8,  MSP430::GR8RegisterClass);
   addRegisterClass(MVT::i16, MSP430::GR16RegisterClass);
@@ -92,8 +115,8 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
   setOperationAction(ISD::BR_CC,            MVT::i8,    Custom);
   setOperationAction(ISD::BR_CC,            MVT::i16,   Custom);
   setOperationAction(ISD::BRCOND,           MVT::Other, Expand);
-  setOperationAction(ISD::SETCC,            MVT::i8,    Expand);
-  setOperationAction(ISD::SETCC,            MVT::i16,   Expand);
+  setOperationAction(ISD::SETCC,            MVT::i8,    Custom);
+  setOperationAction(ISD::SETCC,            MVT::i16,   Custom);
   setOperationAction(ISD::SELECT,           MVT::i8,    Expand);
   setOperationAction(ISD::SELECT,           MVT::i16,   Expand);
   setOperationAction(ISD::SELECT_CC,        MVT::i8,    Custom);
@@ -142,6 +165,15 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
   setOperationAction(ISD::SDIV,             MVT::i16,   Expand);
   setOperationAction(ISD::SDIVREM,          MVT::i16,   Expand);
   setOperationAction(ISD::SREM,             MVT::i16,   Expand);
+
+  // Libcalls names.
+  if (HWMultMode == HWMultIntr) {
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw");
+  } else if (HWMultMode == HWMultNoIntr) {
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw_noint");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
+  }
 }
 
 SDValue MSP430TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
@@ -151,9 +183,12 @@ SDValue MSP430TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
   case ISD::SRA:              return LowerShifts(Op, DAG);
   case ISD::GlobalAddress:    return LowerGlobalAddress(Op, DAG);
   case ISD::ExternalSymbol:   return LowerExternalSymbol(Op, DAG);
+  case ISD::SETCC:            return LowerSETCC(Op, DAG);
   case ISD::BR_CC:            return LowerBR_CC(Op, DAG);
   case ISD::SELECT_CC:        return LowerSELECT_CC(Op, DAG);
   case ISD::SIGN_EXTEND:      return LowerSIGN_EXTEND(Op, DAG);
+  case ISD::RETURNADDR:       return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:        return LowerFRAMEADDR(Op, DAG);
   default:
     llvm_unreachable("unimplemented operand");
     return SDValue();
@@ -225,6 +260,13 @@ MSP430TargetLowering::LowerFormalArguments(SDValue Chain,
   case CallingConv::C:
   case CallingConv::Fast:
     return LowerCCCArguments(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals);
+  case CallingConv::MSP430_INTR:
+   if (Ins.empty())
+     return Chain;
+   else {
+    llvm_report_error("ISRs cannot have arguments");
+    return SDValue();
+   }
   }
 }
 
@@ -244,6 +286,9 @@ MSP430TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   case CallingConv::C:
     return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
                           Outs, Ins, dl, DAG, InVals);
+  case CallingConv::MSP430_INTR:
+    llvm_report_error("ISRs cannot be called directly");
+    return SDValue();
   }
 }
 
@@ -340,6 +385,12 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
   // CCValAssign - represent the assignment of the return value to a location
   SmallVector<CCValAssign, 16> RVLocs;
 
+  // ISRs cannot return any value.
+  if (CallConv == CallingConv::MSP430_INTR && !Outs.empty()) {
+    llvm_report_error("ISRs cannot return any value");
+    return SDValue();
+  }
+
   // CCState - Info about the registers and stack slot.
   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                  RVLocs, *DAG.getContext());
@@ -370,11 +421,14 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
     Flag = Chain.getValue(1);
   }
 
+  unsigned Opc = (CallConv == CallingConv::MSP430_INTR ?
+                  MSP430ISD::RETI_FLAG : MSP430ISD::RET_FLAG);
+
   if (Flag.getNode())
-    return DAG.getNode(MSP430ISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+    return DAG.getNode(Opc, dl, MVT::Other, Chain, Flag);
 
   // Return Void
-  return DAG.getNode(MSP430ISD::RET_FLAG, dl, MVT::Other, Chain);
+  return DAG.getNode(Opc, dl, MVT::Other, Chain);
 }
 
 /// LowerCCCCallTo - functions arguments are copied from virtual regs to
@@ -538,9 +592,21 @@ SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
   EVT VT = Op.getValueType();
   DebugLoc dl = N->getDebugLoc();
 
-  // We currently only lower shifts of constant argument.
+  // Expand non-constant shifts to loops:
   if (!isa<ConstantSDNode>(N->getOperand(1)))
-    return SDValue();
+    switch (Opc) {
+    default:
+      assert(0 && "Invalid shift opcode!");
+    case ISD::SHL:
+      return DAG.getNode(MSP430ISD::SHL, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    case ISD::SRA:
+      return DAG.getNode(MSP430ISD::SRA, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    case ISD::SRL:
+      return DAG.getNode(MSP430ISD::SRL, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    }
 
   uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
 
@@ -648,6 +714,88 @@ SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
                      Chain, Dest, TargetCC, Flag);
 }
 
+
+SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
+  SDValue LHS   = Op.getOperand(0);
+  SDValue RHS   = Op.getOperand(1);
+  DebugLoc dl   = Op.getDebugLoc();
+
+  // If we are doing an AND and testing against zero, then the CMP
+  // will not be generated.  The AND (or BIT) will generate the condition codes,
+  // but they are different from CMP.
+  bool andCC = false;
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+    if (RHSC->isNullValue() && LHS.hasOneUse() &&
+        (LHS.getOpcode() == ISD::AND ||
+         (LHS.getOpcode() == ISD::TRUNCATE &&
+          LHS.getOperand(0).getOpcode() == ISD::AND))) {
+      andCC = true;
+    }
+  }
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  // Get the condition codes directly from the status register, if its easy.
+  // Otherwise a branch will be generated.  Note that the AND and BIT
+  // instructions generate different flags than CMP, the carry bit can be used
+  // for NE/EQ.
+  bool Invert = false;
+  bool Shift = false;
+  bool Convert = true;
+  switch (cast<ConstantSDNode>(TargetCC)->getZExtValue()) {
+   default:
+    Convert = false;
+    break;
+   case MSP430CC::COND_HS:
+     // Res = SRW & 1, no processing is required
+     break;
+    case MSP430CC::COND_LO:
+     // Res = ~(SRW & 1)
+     Invert = true;
+     break;
+    case MSP430CC::COND_NE:
+     if (andCC) {
+       // C = ~Z, thus Res = SRW & 1, no processing is required
+     } else {
+       // Res = (SRW >> 1) & 1
+       Shift = true;
+     }
+     break;
+    case MSP430CC::COND_E:
+     if (andCC) {
+       // C = ~Z, thus Res = ~(SRW & 1)
+     } else {
+       // Res = ~((SRW >> 1) & 1)
+       Shift = true;
+     }
+     Invert = true;
+     break;
+  }
+  EVT VT = Op.getValueType();
+  SDValue One  = DAG.getConstant(1, VT);
+  if (Convert) {
+    SDValue SR = DAG.getCopyFromReg(DAG.getEntryNode(), dl, MSP430::SRW,
+                                     MVT::i16, Flag);
+    if (Shift)
+      // FIXME: somewhere this is turned into a SRL, lower it MSP specific?
+      SR = DAG.getNode(ISD::SRA, dl, MVT::i16, SR, One);
+    SR = DAG.getNode(ISD::AND, dl, MVT::i16, SR, One);
+    if (Invert)
+      SR = DAG.getNode(ISD::XOR, dl, MVT::i16, SR, One);
+    return SR;
+  } else {
+    SDValue Zero = DAG.getConstant(0, VT);
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
+    SmallVector<SDValue, 4> Ops;
+    Ops.push_back(One);
+    Ops.push_back(Zero);
+    Ops.push_back(TargetCC);
+    Ops.push_back(Flag);
+    return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, &Ops[0], Ops.size());
+  }
+}
+
 SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
   SDValue LHS    = Op.getOperand(0);
   SDValue RHS    = Op.getOperand(1);
@@ -682,6 +830,55 @@ SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
                      DAG.getValueType(Val.getValueType()));
 }
 
+SDValue MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+  int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+  if (ReturnAddrIndex == 0) {
+    // Set up a frame object for the return address.
+    uint64_t SlotSize = TD->getPointerSize();
+    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
+                                                           true, false);
+    FuncInfo->setRAIndex(ReturnAddrIndex);
+  }
+
+  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (Depth > 0) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset =
+      DAG.getConstant(TD->getPointerSize(), MVT::i16);
+    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                   FrameAddr, Offset),
+                       NULL, 0);
+  }
+
+  // Just load the return address.
+  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                     RetAddrFI, NULL, 0);
+}
+
+SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                                         MSP430::FPW, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
+  return FrameAddr;
+}
+
 /// getPostIndexedAddressParts - returns true by value, base pointer and
 /// offset pointer and addressing mode by reference if this node can be
 /// combined with a load / store to form a post-indexed load / store.
@@ -722,6 +919,7 @@ const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
   default: return NULL;
   case MSP430ISD::RET_FLAG:           return "MSP430ISD::RET_FLAG";
+  case MSP430ISD::RETI_FLAG:          return "MSP430ISD::RETI_FLAG";
   case MSP430ISD::RRA:                return "MSP430ISD::RRA";
   case MSP430ISD::RLA:                return "MSP430ISD::RLA";
   case MSP430ISD::RRC:                return "MSP430ISD::RRC";
@@ -730,6 +928,8 @@ const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MSP430ISD::BR_CC:              return "MSP430ISD::BR_CC";
   case MSP430ISD::CMP:                return "MSP430ISD::CMP";
   case MSP430ISD::SELECT_CC:          return "MSP430ISD::SELECT_CC";
+  case MSP430ISD::SHL:                return "MSP430ISD::SHL";
+  case MSP430ISD::SRA:                return "MSP430ISD::SRA";
   }
 }
 
@@ -738,13 +938,131 @@ const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
 //===----------------------------------------------------------------------===//
 
 MachineBasicBlock*
+MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
+                                     MachineBasicBlock *BB,
+                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  MachineFunction *F = BB->getParent();
+  MachineRegisterInfo &RI = F->getRegInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+
+  unsigned Opc;
+  const TargetRegisterClass * RC;
+  switch (MI->getOpcode()) {
+  default:
+    assert(0 && "Invalid shift opcode!");
+  case MSP430::Shl8:
+   Opc = MSP430::SHL8r1;
+   RC = MSP430::GR8RegisterClass;
+   break;
+  case MSP430::Shl16:
+   Opc = MSP430::SHL16r1;
+   RC = MSP430::GR16RegisterClass;
+   break;
+  case MSP430::Sra8:
+   Opc = MSP430::SAR8r1;
+   RC = MSP430::GR8RegisterClass;
+   break;
+  case MSP430::Sra16:
+   Opc = MSP430::SAR16r1;
+   RC = MSP430::GR16RegisterClass;
+   break;
+  case MSP430::Srl8:
+   Opc = MSP430::SAR8r1c;
+   RC = MSP430::GR8RegisterClass;
+   break;
+  case MSP430::Srl16:
+   Opc = MSP430::SAR16r1c;
+   RC = MSP430::GR16RegisterClass;
+   break;
+  }
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = BB;
+  ++I;
+
+  // Create loop block
+  MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *RemBB  = F->CreateMachineBasicBlock(LLVM_BB);
+
+  F->insert(I, LoopBB);
+  F->insert(I, RemBB);
+
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the block containing instructions after shift.
+  RemBB->transferSuccessors(BB);
+
+  // Inform sdisel of the edge changes.
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+         SE = BB->succ_end(); SI != SE; ++SI)
+    EM->insert(std::make_pair(*SI, RemBB));
+
+  // Add adges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
+  BB->addSuccessor(LoopBB);
+  BB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(LoopBB);
+
+  unsigned ShiftAmtReg = RI.createVirtualRegister(MSP430::GR8RegisterClass);
+  unsigned ShiftAmtReg2 = RI.createVirtualRegister(MSP430::GR8RegisterClass);
+  unsigned ShiftReg = RI.createVirtualRegister(RC);
+  unsigned ShiftReg2 = RI.createVirtualRegister(RC);
+  unsigned ShiftAmtSrcReg = MI->getOperand(2).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  unsigned DstReg = MI->getOperand(0).getReg();
+
+  // BB:
+  // cmp 0, N
+  // je RemBB
+  BuildMI(BB, dl, TII.get(MSP430::CMP8ir))
+    .addImm(0).addReg(ShiftAmtSrcReg);
+  BuildMI(BB, dl, TII.get(MSP430::JCC))
+    .addMBB(RemBB)
+    .addImm(MSP430CC::COND_E);
+
+  // LoopBB:
+  // ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
+  // ShiftAmt = phi [%N, BB],      [%ShiftAmt2, LoopBB]
+  // ShiftReg2 = shift ShiftReg
+  // ShiftAmt2 = ShiftAmt - 1;
+  BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftReg)
+    .addReg(SrcReg).addMBB(BB)
+    .addReg(ShiftReg2).addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftAmtReg)
+    .addReg(ShiftAmtSrcReg).addMBB(BB)
+    .addReg(ShiftAmtReg2).addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
+    .addReg(ShiftReg);
+  BuildMI(LoopBB, dl, TII.get(MSP430::SUB8ri), ShiftAmtReg2)
+    .addReg(ShiftAmtReg).addImm(1);
+  BuildMI(LoopBB, dl, TII.get(MSP430::JCC))
+    .addMBB(LoopBB)
+    .addImm(MSP430CC::COND_NE);
+
+  // RemBB:
+  // DestReg = phi [%SrcReg, BB], [%ShiftReg, LoopBB]
+  BuildMI(RemBB, dl, TII.get(MSP430::PHI), DstReg)
+    .addReg(SrcReg).addMBB(BB)
+    .addReg(ShiftReg2).addMBB(LoopBB);
+
+  return RemBB;
+}
+
+MachineBasicBlock*
 MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                   MachineBasicBlock *BB,
                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  unsigned Opc = MI->getOpcode();
+
+  if (Opc == MSP430::Shl8 || Opc == MSP430::Shl16 ||
+      Opc == MSP430::Sra8 || Opc == MSP430::Sra16 ||
+      Opc == MSP430::Srl8 || Opc == MSP430::Srl16)
+    return EmitShiftInstr(MI, BB, EM);
+
   const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
-  assert((MI->getOpcode() == MSP430::Select16 ||
-          MI->getOpcode() == MSP430::Select8) &&
+
+  assert((Opc == MSP430::Select16 || Opc == MSP430::Select8) &&
          "Unexpected instr type to insert");
 
   // To "insert" a SELECT instruction, we actually have to insert the diamond
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index d413ccb..4921500 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -27,6 +27,9 @@ namespace llvm {
       /// Return with a flag operand. Operand 0 is the chain operand.
       RET_FLAG,
 
+      /// Same as RET_FLAG, but used for returning from ISRs.
+      RETI_FLAG,
+
       /// Y = R{R,L}A X, rotate right (left) arithmetically
       RRA, RLA,
 
@@ -44,7 +47,7 @@ namespace llvm {
       /// CMP - Compare instruction.
       CMP,
 
-      /// SetCC. Operand 0 is condition code, and operand 1 is the flag
+      /// SetCC - Operand 0 is condition code, and operand 1 is the flag
       /// operand produced by a CMP instruction.
       SETCC,
 
@@ -54,9 +57,12 @@ namespace llvm {
       /// instruction.
       BR_CC,
 
-      /// SELECT_CC. Operand 0 and operand 1 are selection variable, operand 3
+      /// SELECT_CC - Operand 0 and operand 1 are selection variable, operand 3
       /// is condition code and operand 4 is flag operand.
-      SELECT_CC
+      SELECT_CC,
+
+      /// SHL, SRA, SRL - Non-constant shifts.
+      SHL, SRA, SRL
     };
   }
 
@@ -81,8 +87,12 @@ namespace llvm {
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
     SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG);
     SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG);
     SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
     SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG);
 
     TargetLowering::ConstraintType
     getConstraintType(const std::string &Constraint) const;
@@ -92,6 +102,9 @@ namespace llvm {
     MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
                                                    MachineBasicBlock *BB,
                     DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+    MachineBasicBlock* EmitShiftInstr(MachineInstr *MI,
+                                      MachineBasicBlock *BB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
 
   private:
     SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
@@ -144,6 +157,7 @@ namespace llvm {
 
     const MSP430Subtarget &Subtarget;
     const MSP430TargetMachine &TM;
+    const TargetData *TD;
   };
 } // namespace llvm
 
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index b2f09c7..2ae6759 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -219,17 +219,6 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   return false;
 }
 
-bool MSP430InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB)const{
-  if (MBB.empty()) return false;
-
-  switch (MBB.back().getOpcode()) {
-  case MSP430::RET:   // Return.
-  case MSP430::JMP:   // Uncond branch.
-    return true;
-  default: return false;
-  }
-}
-
 bool MSP430InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
   const TargetInstrDesc &TID = MI->getDesc();
   if (!TID.isTerminator()) return false;
@@ -270,8 +259,8 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       }
 
       // If the block has any instructions after a JMP, delete them.
-      while (next(I) != MBB.end())
-        next(I)->eraseFromParent();
+      while (llvm::next(I) != MBB.end())
+        llvm::next(I)->eraseFromParent();
       Cond.clear();
       FBB = 0;
 
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index 35e35db..e4ceeb9 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -61,7 +61,6 @@ public:
 
   // Branch folding goodness
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
-  bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
   bool isUnpredicatedTerminator(const MachineInstr *MI) const;
   bool AnalyzeBranch(MachineBasicBlock &MBB,
                      MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 7a26f6c..d67ba90 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -31,12 +31,15 @@ def SDT_MSP430BrCC         : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
                                                   SDTCisVT<1, i8>]>;
 def SDT_MSP430SelectCC     : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, 
                                                   SDTCisVT<3, i8>]>;
+def SDT_MSP430Shift        : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisI8<2>]>;
 
 //===----------------------------------------------------------------------===//
 // MSP430 Specific Node Definitions.
 //===----------------------------------------------------------------------===//
-def MSP430retflag : SDNode<"MSP430ISD::RET_FLAG", SDTNone,
-                     [SDNPHasChain, SDNPOptInFlag]>;
+def MSP430retflag  : SDNode<"MSP430ISD::RET_FLAG", SDTNone,
+                       [SDNPHasChain, SDNPOptInFlag]>;
+def MSP430retiflag : SDNode<"MSP430ISD::RETI_FLAG", SDTNone,
+                       [SDNPHasChain, SDNPOptInFlag]>;
 
 def MSP430rra     : SDNode<"MSP430ISD::RRA", SDTIntUnaryOp, []>;
 def MSP430rla     : SDNode<"MSP430ISD::RLA", SDTIntUnaryOp, []>;
@@ -54,6 +57,9 @@ def MSP430Wrapper : SDNode<"MSP430ISD::Wrapper", SDT_MSP430Wrapper>;
 def MSP430cmp     : SDNode<"MSP430ISD::CMP", SDT_MSP430Cmp, [SDNPOutFlag]>;
 def MSP430brcc    : SDNode<"MSP430ISD::BR_CC", SDT_MSP430BrCC, [SDNPHasChain, SDNPInFlag]>;
 def MSP430selectcc: SDNode<"MSP430ISD::SELECT_CC", SDT_MSP430SelectCC, [SDNPInFlag]>;
+def MSP430shl     : SDNode<"MSP430ISD::SHL", SDT_MSP430Shift, []>;
+def MSP430sra     : SDNode<"MSP430ISD::SRA", SDT_MSP430Shift, []>;
+def MSP430srl     : SDNode<"MSP430ISD::SRL", SDT_MSP430Shift, []>;
 
 //===----------------------------------------------------------------------===//
 // MSP430 Operand Definitions.
@@ -90,7 +96,9 @@ def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], []>;
 // Pattern Fragments
 def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
 def  extloadi16i8 : PatFrag<(ops node:$ptr), (i16 ( extloadi8 node:$ptr))>;
-
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
 //===----------------------------------------------------------------------===//
 // Instruction list..
 
@@ -117,6 +125,27 @@ let usesCustomInserter = 1 in {
                         "# Select16 PSEUDO",
                         [(set GR16:$dst,
                           (MSP430selectcc GR16:$src1, GR16:$src2, imm:$cc))]>;
+  let Defs = [SRW] in {
+  def Shl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Shl8 PSEUDO",
+                        [(set GR8:$dst, (MSP430shl GR8:$src, GR8:$cnt))]>;
+  def Shl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Shl16 PSEUDO",
+                        [(set GR16:$dst, (MSP430shl GR16:$src, GR8:$cnt))]>;
+  def Sra8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Sra8 PSEUDO",
+                        [(set GR8:$dst, (MSP430sra GR8:$src, GR8:$cnt))]>;
+  def Sra16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Sra16 PSEUDO",
+                        [(set GR16:$dst, (MSP430sra GR16:$src, GR8:$cnt))]>;
+  def Srl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Srl8 PSEUDO",
+                        [(set GR8:$dst, (MSP430srl GR8:$src, GR8:$cnt))]>;
+  def Srl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Srl16 PSEUDO",
+                        [(set GR16:$dst, (MSP430srl GR16:$src, GR8:$cnt))]>;
+
+  }
 }
 
 let neverHasSideEffects = 1 in
@@ -128,7 +157,8 @@ def NOP : Pseudo<(outs), (ins), "nop", []>;
 
 // FIXME: Provide proper encoding!
 let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
-  def RET : Pseudo<(outs), (ins), "ret", [(MSP430retflag)]>;
+  def RET  : Pseudo<(outs), (ins), "ret",  [(MSP430retflag)]>;
+  def RETI : Pseudo<(outs), (ins), "reti", [(MSP430retiflag)]>;
 }
 
 let isBranch = 1, isTerminator = 1 in {
@@ -823,6 +853,65 @@ def CMP16mr : Pseudo<(outs), (ins memsrc:$src1, GR16:$src2),
                 "cmp.w\t{$src1, $src2}",
                 [(MSP430cmp (load addr:$src1), GR16:$src2), (implicit SRW)]>;
 
+
+// BIT TESTS, just sets condition codes
+// Note that the C condition is set differently than when using CMP.
+let isCommutable = 1 in {
+def BIT8rr  : Pseudo<(outs), (ins GR8:$src1, GR8:$src2),
+                     "bit.b\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su GR8:$src1, GR8:$src2)),
+                      (implicit SRW)]>;
+def BIT16rr : Pseudo<(outs), (ins GR16:$src1, GR16:$src2),
+                     "bit.w\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su GR16:$src1, GR16:$src2)),
+                      (implicit SRW)]>;
+}
+def BIT8ri  : Pseudo<(outs), (ins GR8:$src1, i8imm:$src2),
+                     "bit.b\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su GR8:$src1, imm:$src2)),
+                      (implicit SRW)]>;
+def BIT16ri : Pseudo<(outs), (ins GR16:$src1, i16imm:$src2),
+                     "bit.w\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su GR16:$src1, imm:$src2)),
+                      (implicit SRW)]>;
+
+def BIT8rm  : Pseudo<(outs), (ins GR8:$src1, memdst:$src2),
+                     "bit.b\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su GR8:$src1,  (load addr:$src2))),
+                      (implicit SRW)]>;
+def BIT16rm : Pseudo<(outs), (ins GR16:$src1, memdst:$src2),
+                     "bit.w\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su GR16:$src1,  (load addr:$src2))),
+                      (implicit SRW)]>;
+
+def BIT8mr  : Pseudo<(outs), (ins memsrc:$src1, GR8:$src2),
+                     "bit.b\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su (load addr:$src1), GR8:$src2)),
+                      (implicit SRW)]>;
+def BIT16mr : Pseudo<(outs), (ins memsrc:$src1, GR16:$src2),
+                     "bit.w\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su (load addr:$src1), GR16:$src2)),
+                      (implicit SRW)]>;
+
+def BIT8mi  : Pseudo<(outs), (ins memsrc:$src1, i8imm:$src2),
+                     "bit.b\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su (load addr:$src1), (i8 imm:$src2))),
+                      (implicit SRW)]>;
+def BIT16mi : Pseudo<(outs), (ins memsrc:$src1, i16imm:$src2),
+                     "bit.w\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su (load addr:$src1), (i16 imm:$src2))),
+                      (implicit SRW)]>;
+
+def BIT8mm  : Pseudo<(outs), (ins memsrc:$src1, memsrc:$src2),
+                     "bit.b\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su (i8 (load addr:$src1)), 
+                                            (load addr:$src2))),
+                      (implicit SRW)]>;
+def BIT16mm : Pseudo<(outs), (ins memsrc:$src1, memsrc:$src2),
+                     "bit.w\t{$src2, $src1}",
+                     [(MSP430cmp 0, (and_su (i16 (load addr:$src1)),
+                                            (load addr:$src2))),
+                      (implicit SRW)]>;
 } // Defs = [SRW]
 
 //===----------------------------------------------------------------------===//
@@ -905,3 +994,6 @@ def : Pat<(store (subc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
 
 // peephole patterns
 def : Pat<(and GR16:$src, 255), (ZEXT16r GR16:$src)>;
+def : Pat<(MSP430cmp 0, (trunc (and_su GR16:$src1, GR16:$src2))),
+          (BIT8rr (EXTRACT_SUBREG GR16:$src1, subreg_8bit),
+                  (EXTRACT_SUBREG GR16:$src2, subreg_8bit))>;
diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
index 1d26ae3..383fd2e 100644
--- a/lib/Target/MSP430/MSP430MachineFunctionInfo.h
+++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -25,14 +25,20 @@ class MSP430MachineFunctionInfo : public MachineFunctionInfo {
   /// stack frame in bytes.
   unsigned CalleeSavedFrameSize;
 
+  /// ReturnAddrIndex - FrameIndex for return slot.
+  int ReturnAddrIndex;
+
 public:
   MSP430MachineFunctionInfo() : CalleeSavedFrameSize(0) {}
 
   explicit MSP430MachineFunctionInfo(MachineFunction &MF)
-    : CalleeSavedFrameSize(0) {}
+    : CalleeSavedFrameSize(0), ReturnAddrIndex(0) {}
 
   unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
   void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  int getRAIndex() const { return ReturnAddrIndex; }
+  void setRAIndex(int Index) { ReturnAddrIndex = Index; }
 };
 
 } // End llvm namespace
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 92baad9..e85c7a2 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -17,6 +17,7 @@
 #include "MSP430MachineFunctionInfo.h"
 #include "MSP430RegisterInfo.h"
 #include "MSP430TargetMachine.h"
+#include "llvm/Function.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -37,17 +38,26 @@ MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm,
 
 const unsigned*
 MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const Function* F = MF->getFunction();
   static const unsigned CalleeSavedRegs[] = {
     MSP430::FPW, MSP430::R5W, MSP430::R6W, MSP430::R7W,
     MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
     0
   };
+  static const unsigned CalleeSavedRegsIntr[] = {
+    MSP430::FPW,  MSP430::R5W,  MSP430::R6W,  MSP430::R7W,
+    MSP430::R8W,  MSP430::R9W,  MSP430::R10W, MSP430::R11W,
+    MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W,
+    0
+  };
 
-  return CalleeSavedRegs;
+  return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+          CalleeSavedRegsIntr : CalleeSavedRegs);
 }
 
 const TargetRegisterClass *const *
 MSP430RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  const Function* F = MF->getFunction();
   static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
     &MSP430::GR16RegClass, &MSP430::GR16RegClass,
     &MSP430::GR16RegClass, &MSP430::GR16RegClass,
@@ -55,8 +65,18 @@ MSP430RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
     &MSP430::GR16RegClass, &MSP430::GR16RegClass,
     0
   };
+  static const TargetRegisterClass * const CalleeSavedRegClassesIntr[] = {
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    0
+  };
 
-  return CalleeSavedRegClasses;
+  return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+          CalleeSavedRegClassesIntr : CalleeSavedRegClasses);
 }
 
 BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -193,10 +213,10 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     // We need to materialize the offset via add instruction.
     unsigned DstReg = MI.getOperand(0).getReg();
     if (Offset < 0)
-      BuildMI(MBB, next(II), dl, TII.get(MSP430::SUB16ri), DstReg)
+      BuildMI(MBB, llvm::next(II), dl, TII.get(MSP430::SUB16ri), DstReg)
         .addReg(DstReg).addImm(-Offset);
     else
-      BuildMI(MBB, next(II), dl, TII.get(MSP430::ADD16ri), DstReg)
+      BuildMI(MBB, llvm::next(II), dl, TII.get(MSP430::ADD16ri), DstReg)
         .addReg(DstReg).addImm(Offset);
 
     return 0;
@@ -251,7 +271,7 @@ void MSP430RegisterInfo::emitPrologue(MachineFunction &MF) const {
       .addReg(MSP430::SPW);
 
     // Mark the FramePtr as live-in in every block except the entry.
-    for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
+    for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
          I != E; ++I)
       I->addLiveIn(MSP430::FPW);
 
@@ -292,7 +312,8 @@ void MSP430RegisterInfo::emitEpilogue(MachineFunction &MF,
   DebugLoc DL = MBBI->getDebugLoc();
 
   switch (RetOpcode) {
-  case MSP430::RET: break;  // These are ok
+  case MSP430::RET:
+  case MSP430::RETI: break;  // These are ok
   default:
     llvm_unreachable("Can only insert epilog into returning blocks");
   }
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 6d8e160..48b9bdf 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -590,22 +590,6 @@ RemoveBranch(MachineBasicBlock &MBB) const
   return 2;
 }
 
-/// BlockHasNoFallThrough - Analyze if MachineBasicBlock does not
-/// fall-through into its successor block.
-bool MipsInstrInfo::
-BlockHasNoFallThrough(const MachineBasicBlock &MBB) const 
-{
-  if (MBB.empty()) return false;
-  
-  switch (MBB.back().getOpcode()) {
-  case Mips::RET:     // Return.
-  case Mips::JR:      // Indirect branch.
-  case Mips::J:       // Uncond branch.
-    return true;
-  default: return false;
-  }
-}
-
 /// ReverseBranchCondition - Return the inverse opcode of the 
 /// specified Branch instruction.
 bool MipsInstrInfo::
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 249d3de..ab8dc59 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -232,7 +232,6 @@ public:
     return 0;
   }
   
-  virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
   virtual
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
 
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 0083598..af7d812 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -740,18 +740,6 @@ bool PPCInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
 }
 
 
-bool PPCInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
-  if (MBB.empty()) return false;
-  
-  switch (MBB.back().getOpcode()) {
-  case PPC::BLR:   // Return.
-  case PPC::B:     // Uncond branch.
-  case PPC::BCTR:  // Indirect branch.
-    return true;
-  default: return false;
-  }
-}
-
 bool PPCInstrInfo::
 ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index bb0dc15a..57facac 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -143,7 +143,6 @@ public:
   virtual bool canFoldMemoryOperand(const MachineInstr *MI,
                                     const SmallVectorImpl<unsigned> &Ops) const;
   
-  virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
   virtual
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
   
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 2d8a687..e1772c2 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -801,8 +801,21 @@ void bar(unsigned n) {
     true();
 }
 
-I think this basically amounts to a dag combine to simplify comparisons against
-multiply hi's into a comparison against the mullo.
+This is equivalent to the following, where 2863311531 is the multiplicative
+inverse of 3, and 1431655766 is ((2^32)-1)/3+1:
+void bar(unsigned n) {
+  if (n * 2863311531U < 1431655766U)
+    true();
+}
+
+The same transformation can work with an even modulo with the addition of a
+rotate: rotate the result of the multiply to the right by the number of bits
+which need to be zero for the condition to be true, and shrink the compare RHS
+by the same amount.  Unless the target supports rotates, though, that
+transformation probably isn't worthwhile.
+
+The transformation can also easily be made to work with non-zero equality
+comparisons: just transform, for example, "n % 3 == 1" to "(n-1) % 3 == 0".
 
 //===---------------------------------------------------------------------===//
 
@@ -823,20 +836,6 @@ int main() {
 
 //===---------------------------------------------------------------------===//
 
-Instcombine will merge comparisons like (x >= 10) && (x < 20) by producing (x -
-10) u< 10, but only when the comparisons have matching sign.
-
-This could be converted with a similiar technique. (PR1941)
-
-define i1 @test(i8 %x) {
-  %A = icmp uge i8 %x, 5
-  %B = icmp slt i8 %x, 20
-  %C = and i1 %A, %B
-  ret i1 %C
-}
-
-//===---------------------------------------------------------------------===//
-
 These functions perform the same computation, but produce different assembly.
 
 define i8 @select(i8 %x) readnone nounwind {
@@ -884,18 +883,6 @@ The expression should optimize to something like
 
 //===---------------------------------------------------------------------===//
 
-From GCC Bug 15241:
-unsigned int
-foo (unsigned int a, unsigned int b)
-{
- if (a <= 7 && b <= 7)
-   baz ();
-}
-Should combine to "(a|b) <= 7".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
 From GCC Bug 3756:
 int
 pn (int n)
@@ -907,19 +894,6 @@ Should combine to (n >> 31) | 1.  Currently not optimized with "clang
 
 //===---------------------------------------------------------------------===//
 
-From GCC Bug 28685:
-int test(int a, int b)
-{
- int lt = a < b;
- int eq = a == b;
-
- return (lt || eq);
-}
-Should combine to "a <= b".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts | llc".
-
-//===---------------------------------------------------------------------===//
-
 void a(int variable)
 {
  if (variable == 4 || variable == 6)
@@ -993,12 +967,6 @@ Should combine to 0.  Currently not optimized with "clang
 
 //===---------------------------------------------------------------------===//
 
-int a(unsigned char* b) {return *b > 99;}
-There's an unnecessary zext in the generated code with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
 int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
 Should be combined to  "((b >> 1) | b) & 1".  Currently not optimized
 with "clang -emit-llvm-bc | opt -std-compile-opts".
@@ -1011,12 +979,6 @@ Should combine to "x | (y & 3)".  Currently not optimized with "clang
 
 //===---------------------------------------------------------------------===//
 
-unsigned a(unsigned a) {return ((a | 1) & 3) | (a & -4);}
-Should combine to "a | 1".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
 int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
 Should fold to "(~a & c) | (a & b)".  Currently not optimized with
 "clang -emit-llvm-bc | opt -std-compile-opts".
@@ -1704,3 +1666,50 @@ need all but the bottom two bits from %A, and if we gave that mask to SDB it
 would delete the or instruction for us.
 
 //===---------------------------------------------------------------------===//
+
+FunctionAttrs is not marking this function as readnone (just readonly):
+$ clang t.c -emit-llvm -S -o - -O0 | opt -mem2reg -S -functionattrs
+
+int t(int a, int b, int c) {
+ int *p;
+ if (a)
+   p = &a;
+ else
+   p = &c;
+ return *p;
+}
+
+This is because we codegen this to:
+
+define i32 @t(i32 %a, i32 %b, i32 %c) nounwind readonly ssp {
+entry:
+  %a.addr = alloca i32                            ; <i32*> [#uses=3]
+  %c.addr = alloca i32                            ; <i32*> [#uses=2]
+...
+
+if.end:
+  %p.0 = phi i32* [ %a.addr, %if.then ], [ %c.addr, %if.else ]
+  %tmp2 = load i32* %p.0                          ; <i32> [#uses=1]
+  ret i32 %tmp2
+}
+
+And functionattrs doesn't realize that the p.0 load points to function local
+memory.
+
+Also, functionattrs doesn't know about memcpy/memset.  This function should be
+marked readnone, since it only twiddles local memory, but functionattrs doesn't
+handle memset/memcpy/memmove aggressively:
+
+struct X { int *p; int *q; };
+int foo() {
+ int i = 0, j = 1;
+ struct X x, y;
+ int **p;
+ y.p = &i;
+ x.q = &j;
+ p = __builtin_memcpy (&x, &y, sizeof (int *));
+ return **p;
+}
+
+//===---------------------------------------------------------------------===//
+
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index d82d928..5fa7e8c 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -402,18 +402,6 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   return false;
 }
 
-bool SystemZInstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB)const{
-  if (MBB.empty()) return false;
-
-  switch (MBB.back().getOpcode()) {
-  case SystemZ::RET:   // Return.
-  case SystemZ::JMP:   // Uncond branch.
-  case SystemZ::JMPr:  // Indirect branch.
-    return true;
-  default: return false;
-  }
-}
-
 bool SystemZInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
   const TargetInstrDesc &TID = MI->getDesc();
   if (!TID.isTerminator()) return false;
@@ -454,8 +442,8 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       }
 
       // If the block has any instructions after a JMP, delete them.
-      while (next(I) != MBB.end())
-        next(I)->eraseFromParent();
+      while (llvm::next(I) != MBB.end())
+        llvm::next(I)->eraseFromParent();
       Cond.clear();
       FBB = 0;
 
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index e16d704..ef3b39e 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -89,7 +89,6 @@ public:
                                  const std::vector<CalleeSavedInfo> &CSI) const;
 
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
-  virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
   virtual bool isUnpredicatedTerminator(const MachineInstr *MI) const;
   virtual bool AnalyzeBranch(MachineBasicBlock &MBB,
                              MachineBasicBlock *&TBB,
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 4d1c01f..1318195 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -247,7 +247,7 @@ void SystemZRegisterInfo::emitPrologue(MachineFunction &MF) const {
       .addReg(SystemZ::R15D);
 
     // Mark the FramePtr as live-in in every block except the entry.
-    for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
+    for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
          I != E; ++I)
       I->addLiveIn(SystemZ::R11D);
 
diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp
index fc71bc3..9434a19 100644
--- a/lib/Target/TargetData.cpp
+++ b/lib/Target/TargetData.cpp
@@ -117,14 +117,6 @@ TargetAlignElem::operator==(const TargetAlignElem &rhs) const {
           && TypeBitWidth == rhs.TypeBitWidth);
 }
 
-std::ostream &
-TargetAlignElem::dump(std::ostream &os) const {
-  return os << AlignType
-            << TypeBitWidth
-            << ":" << (int) (ABIAlign * 8)
-            << ":" << (int) (PrefAlign * 8);
-}
-
 const TargetAlignElem TargetData::InvalidAlignmentElem =
                 TargetAlignElem::get((AlignTypeEnum) -1, 0, 0, 0);
 
@@ -323,11 +315,10 @@ unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType,
                  : Alignments[BestMatchIdx].PrefAlign;
 }
 
-typedef DenseMap<const StructType*, StructLayout*> LayoutInfoTy;
-
-namespace llvm {
+namespace {
 
 class StructLayoutMap : public AbstractTypeUser {
+  typedef DenseMap<const StructType*, StructLayout*> LayoutInfoTy;
   LayoutInfoTy LayoutInfo;
 
   /// refineAbstractType - The callback method invoked when an abstract type is
@@ -336,19 +327,11 @@ class StructLayoutMap : public AbstractTypeUser {
   ///
   virtual void refineAbstractType(const DerivedType *OldTy,
                                   const Type *) {
-    const StructType *STy = dyn_cast<const StructType>(OldTy);
-    if (!STy) {
-      OldTy->removeAbstractTypeUser(this);
-      return;
-    }
-
-    StructLayout *SL = LayoutInfo[STy];
-    if (SL) {
-      SL->~StructLayout();
-      free(SL);
-      LayoutInfo[STy] = NULL;
-    }
-
+    const StructType *STy = cast<const StructType>(OldTy);
+    LayoutInfoTy::iterator Iter = LayoutInfo.find(STy);
+    Iter->second->~StructLayout();
+    free(Iter->second);
+    LayoutInfo.erase(Iter);
     OldTy->removeAbstractTypeUser(this);
   }
 
@@ -358,70 +341,43 @@ class StructLayoutMap : public AbstractTypeUser {
   /// This method notifies ATU's when this occurs for a type.
   ///
   virtual void typeBecameConcrete(const DerivedType *AbsTy) {
-    const StructType *STy = dyn_cast<const StructType>(AbsTy);
-    if (!STy) {
-      AbsTy->removeAbstractTypeUser(this);
-      return;
-    }
-
-    StructLayout *SL = LayoutInfo[STy];
-    if (SL) {
-      SL->~StructLayout();
-      free(SL);
-      LayoutInfo[STy] = NULL;
-    }
-
+    const StructType *STy = cast<const StructType>(AbsTy);
+    LayoutInfoTy::iterator Iter = LayoutInfo.find(STy);
+    Iter->second->~StructLayout();
+    free(Iter->second);
+    LayoutInfo.erase(Iter);
     AbsTy->removeAbstractTypeUser(this);
   }
 
-  bool insert(const Type *Ty) {
-    if (Ty->isAbstract())
-      Ty->addAbstractTypeUser(this);
-    return true;
-  }
-
 public:
   virtual ~StructLayoutMap() {
     // Remove any layouts.
     for (LayoutInfoTy::iterator
-           I = LayoutInfo.begin(), E = LayoutInfo.end(); I != E; ++I)
-      if (StructLayout *SL = I->second) {
-        SL->~StructLayout();
-        free(SL);
-      }
-  }
+           I = LayoutInfo.begin(), E = LayoutInfo.end(); I != E; ++I) {
+      const Type *Key = I->first;
+      StructLayout *Value = I->second;
 
-  inline LayoutInfoTy::iterator begin() {
-    return LayoutInfo.begin();
-  }
-  inline LayoutInfoTy::iterator end() {
-    return LayoutInfo.end();
-  }
-  inline LayoutInfoTy::const_iterator begin() const {
-    return LayoutInfo.begin();
-  }
-  inline LayoutInfoTy::const_iterator end() const {
-    return LayoutInfo.end();
-  }
+      if (Key->isAbstract())
+        Key->removeAbstractTypeUser(this);
 
-  LayoutInfoTy::iterator find(const StructType *&Val) {
-    return LayoutInfo.find(Val);
-  }
-  LayoutInfoTy::const_iterator find(const StructType *&Val) const {
-    return LayoutInfo.find(Val);
+      Value->~StructLayout();
+      free(Value);
+    }
   }
 
-  bool erase(const StructType *&Val) {
-    return LayoutInfo.erase(Val);
-  }
-  bool erase(LayoutInfoTy::iterator I) {
-    return LayoutInfo.erase(I);
+  void InvalidateEntry(const StructType *Ty) {
+    LayoutInfoTy::iterator I = LayoutInfo.find(Ty);
+    if (I == LayoutInfo.end()) return;
+
+    I->second->~StructLayout();
+    free(I->second);
+    LayoutInfo.erase(I);
+
+    if (Ty->isAbstract())
+      Ty->removeAbstractTypeUser(this);
   }
 
-  StructLayout *&operator[](const Type *Key) {
-    const StructType *STy = dyn_cast<const StructType>(Key);
-    assert(STy && "Trying to access the struct layout map with a non-struct!");
-    insert(STy);
+  StructLayout *&operator[](const StructType *STy) {
     return LayoutInfo[STy];
   }
 
@@ -429,17 +385,18 @@ public:
   virtual void dump() const {}
 };
 
-} // end namespace llvm
+} // end anonymous namespace
 
 TargetData::~TargetData() {
-  delete LayoutMap;
+  delete static_cast<StructLayoutMap*>(LayoutMap);
 }
 
 const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
   if (!LayoutMap)
     LayoutMap = new StructLayoutMap();
   
-  StructLayout *&SL = (*LayoutMap)[Ty];
+  StructLayoutMap *STM = static_cast<StructLayoutMap*>(LayoutMap);
+  StructLayout *&SL = (*STM)[Ty];
   if (SL) return SL;
 
   // Otherwise, create the struct layout.  Because it is variable length, we 
@@ -453,6 +410,10 @@ const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
   SL = L;
   
   new (L) StructLayout(Ty, *this);
+
+  if (Ty->isAbstract())
+    Ty->addAbstractTypeUser(STM);
+
   return L;
 }
 
@@ -463,15 +424,10 @@ const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
 void TargetData::InvalidateStructLayoutInfo(const StructType *Ty) const {
   if (!LayoutMap) return;  // No cache.
   
-  DenseMap<const StructType*, StructLayout*>::iterator I = LayoutMap->find(Ty);
-  if (I == LayoutMap->end()) return;
-  
-  I->second->~StructLayout();
-  free(I->second);
-  LayoutMap->erase(I);
+  StructLayoutMap *STM = static_cast<StructLayoutMap*>(LayoutMap);
+  STM->InvalidateEntry(Ty);
 }
 
-
 std::string TargetData::getStringRepresentation() const {
   std::string Result;
   raw_string_ostream OS(Result);
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index a167118..684c61f 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -62,11 +62,6 @@ MCCodeEmitter *createX86MCCodeEmitter(const Target &, TargetMachine &TM);
 ///
 FunctionPass *createEmitX86CodeToMemory();
 
-/// createX86MaxStackAlignmentCalculatorPass - This function returns a pass
-/// which calculates maximal stack alignment required for function
-///
-FunctionPass *createX86MaxStackAlignmentCalculatorPass();
-
 extern Target TheX86_32Target, TheX86_64Target;
 
 } // End llvm namespace
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h
index afd5525..5017af2 100644
--- a/lib/Target/X86/X86COFFMachineModuleInfo.h
+++ b/lib/Target/X86/X86COFFMachineModuleInfo.h
@@ -16,6 +16,7 @@
 
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/ADT/StringSet.h"
+#include "X86MachineFunctionInfo.h"
 
 namespace llvm {
   class X86MachineFunctionInfo;
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index d77f039..12d3d04 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -64,11 +64,18 @@ def RetCC_X86_32_C : CallingConv<[
 // X86-32 FastCC return-value convention.
 def RetCC_X86_32_Fast : CallingConv<[
   // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
-  // SSE2, otherwise it is the the C calling conventions.
+  // SSE2.
   // This can happen when a float, 2 x float, or 3 x float vector is split by
   // target lowering, and is returned in 1-3 sse regs.
   CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
   CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+
+  // For integers, ECX can be used as an extra return register
+  CCIfType<[i8],  CCAssignToReg<[AL, DL, CL]>>,
+  CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+
+  // Otherwise, it is the same as the common X86 calling convention.
   CCDelegateTo<RetCC_X86Common>
 ]>;
 
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index a2fe9b0..044bd4b 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -289,7 +289,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
         while (Start != BB.begin() && prior(Start) != PrevI) --Start;
         errs() << "Inserted instructions:\n\t";
         Start->print(errs(), &MF.getTarget());
-        while (++Start != next(I)) {}
+        while (++Start != llvm::next(I)) {}
       }
       dumpStack();
     );
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d80b8ec..0517b56 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -595,6 +595,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
+    setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
+    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+      setTruncStoreAction((MVT::SimpleValueType)VT,
+                          (MVT::SimpleValueType)InnerVT, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
   }
 
   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
@@ -671,8 +683,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
 
-    setTruncStoreAction(MVT::v8i16,             MVT::v8i8, Expand);
-    setOperationAction(ISD::TRUNCATE,           MVT::v8i8, Expand);
     setOperationAction(ISD::SELECT,             MVT::v8i8, Promote);
     setOperationAction(ISD::SELECT,             MVT::v4i16, Promote);
     setOperationAction(ISD::SELECT,             MVT::v2i32, Promote);
@@ -3344,6 +3354,82 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
 }
 
 SDValue
+X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                          SelectionDAG &DAG) {
+  
+  // Check if the scalar load can be widened into a vector load. And if
+  // the address is "base + cst" see if the cst can be "absorbed" into
+  // the shuffle mask.
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+    SDValue Ptr = LD->getBasePtr();
+    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+      return SDValue();
+    EVT PVT = LD->getValueType(0);
+    if (PVT != MVT::i32 && PVT != MVT::f32)
+      return SDValue();
+
+    int FI = -1;
+    int64_t Offset = 0;
+    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+      FI = FINode->getIndex();
+      Offset = 0;
+    } else if (Ptr.getOpcode() == ISD::ADD &&
+               isa<ConstantSDNode>(Ptr.getOperand(1)) &&
+               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+      Offset = Ptr.getConstantOperandVal(1);
+      Ptr = Ptr.getOperand(0);
+    } else {
+      return SDValue();
+    }
+
+    SDValue Chain = LD->getChain();
+    // Make sure the stack object alignment is at least 16.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    if (DAG.InferPtrAlignment(Ptr) < 16) {
+      if (MFI->isFixedObjectIndex(FI)) {
+        // Can't change the alignment. Reference stack + offset explicitly
+        // if stack pointer is at least 16-byte aligned.
+        unsigned StackAlign = Subtarget->getStackAlignment();
+        if (StackAlign < 16)
+          return SDValue();
+        Offset = MFI->getObjectOffset(FI) + Offset;
+        SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
+                                              getPointerTy());
+        Ptr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
+                          DAG.getConstant(Offset & ~15, getPointerTy()));
+        Offset %= 16;
+      } else {
+        MFI->setObjectAlignment(FI, 16);
+      }
+    }
+
+    // (Offset % 16) must be multiple of 4. Then address is then
+    // Ptr + (Offset & ~15).
+    if (Offset < 0)
+      return SDValue();
+    if ((Offset % 16) & 3)
+      return SDValue();
+    int64_t StartOffset = Offset & ~15;
+    if (StartOffset)
+      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
+
+    int EltNo = (Offset - StartOffset) >> 2;
+    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
+    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
+    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0);
+    // Canonicalize it to a v4i32 shuffle.
+    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
+                                            DAG.getUNDEF(MVT::v4i32), &Mask[0]));
+  }
+
+  return SDValue();
+}
+
+SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
   // All zero's are handled with pxor, all one's are handled with pcmpeqd.
@@ -3486,8 +3572,19 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   }
 
   // Splat is obviously ok. Let legalizer expand it to a shuffle.
-  if (Values.size() == 1)
+  if (Values.size() == 1) {
+    if (EVTBits == 32) {
+      // Instead of a shuffle like this:
+      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+      // Check if it's possible to issue this instead.
+      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDValue Item = Op.getOperand(Idx);
+      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+    }
     return SDValue();
+  }
 
   // A vector full of immediates; various special cases are already
   // handled, so this is best done with a single constant-pool load.
@@ -4278,7 +4375,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
   unsigned ShAmt = 0;
   SDValue ShVal;
   bool isShift = getSubtarget()->hasSSE2() &&
-  isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
   if (isShift && ShVal.hasOneUse()) {
     // If the shifted value has multiple uses, it may be cheaper to use
     // v_set0 + movlhps or movhlps, etc.
@@ -4815,6 +4912,7 @@ static SDValue
 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
            unsigned char OperandFlags) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
   DebugLoc dl = GA->getDebugLoc();
   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
@@ -4828,6 +4926,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
     SDValue Ops[]  = { Chain, TGA };
     Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
   }
+
+  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
+  MFI->setHasCalls(true);
+
   SDValue Flag = Chain.getValue(1);
   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
 }
@@ -5648,6 +5750,17 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
     return SDValue();
 
   SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
+
+  // Use sbb x, x to materialize carry bit into a GPR.
+  // FIXME: Temporarily disabled since it breaks self-hosting. It's apparently
+  // miscompiling ARMISelDAGToDAG.cpp.
+  if (0 && !isFP && X86CC == X86::COND_B) {
+    return DAG.getNode(ISD::AND, dl, MVT::i8,
+                       DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
+                                   DAG.getConstant(X86CC, MVT::i8), Cond),
+                       DAG.getConstant(1, MVT::i8));
+  }
+
   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                      DAG.getConstant(X86CC, MVT::i8), Cond);
 }
@@ -5800,9 +5913,18 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
       Cond = NewCond;
   }
 
+  // Look pass (and (setcc_carry (cmp ...)), 1).
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+    if (C && C->getAPIntValue() == 1) 
+      Cond = Cond.getOperand(0);
+  }
+
   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   // setting operand in place of the X86ISD::SETCC.
-  if (Cond.getOpcode() == X86ISD::SETCC) {
+  if (Cond.getOpcode() == X86ISD::SETCC ||
+      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
     CC = Cond.getOperand(0);
 
     SDValue Cmp = Cond.getOperand(1);
@@ -5885,9 +6007,18 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
     Cond = LowerXALUO(Cond, DAG);
 #endif
 
+  // Look pass (and (setcc_carry (cmp ...)), 1).
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+    if (C && C->getAPIntValue() == 1) 
+      Cond = Cond.getOperand(0);
+  }
+
   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   // setting operand in place of the X86ISD::SETCC.
-  if (Cond.getOpcode() == X86ISD::SETCC) {
+  if (Cond.getOpcode() == X86ISD::SETCC ||
+      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
     CC = Cond.getOperand(0);
 
     SDValue Cmp = Cond.getOperand(1);
@@ -7274,6 +7405,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::COMI:               return "X86ISD::COMI";
   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
   case X86ISD::SETCC:              return "X86ISD::SETCC";
+  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
   case X86ISD::CMOV:               return "X86ISD::CMOV";
   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
@@ -8327,16 +8459,6 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
   return TargetLowering::isGAPlusOffset(N, GA, Offset);
 }
 
-static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
-                               const TargetLowering &TLI) {
-  GlobalValue *GV;
-  int64_t Offset = 0;
-  if (TLI.isGAPlusOffset(Base, GV, Offset))
-    return (GV->getAlignment() >= N && (Offset % N) == 0);
-  // DAG combine handles the stack object case.
-  return false;
-}
-
 static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
                                      EVT EltVT, LoadSDNode *&LDBase,
                                      unsigned &LastLoadedElt,
@@ -8366,7 +8488,7 @@ static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
       continue;
 
     LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    if (!TLI.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i, MFI))
+    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
       return false;
     LastLoadedElt = i;
   }
@@ -8399,7 +8521,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   if (LastLoadedElt == NumElems - 1) {
-    if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI))
+    if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16)
       return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
                          LD->getSrcValue(), LD->getSrcValueOffset(),
                          LD->isVolatile());
@@ -8858,11 +8980,42 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
+  // since the result of setcc_c is all zero's or all ones.
+  if (N1C && N0.getOpcode() == ISD::AND &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
+        ((N00.getOpcode() == ISD::ANY_EXTEND ||
+          N00.getOpcode() == ISD::ZERO_EXTEND) &&
+         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
+      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+      APInt ShAmt = N1C->getAPIntValue();
+      Mask = Mask.shl(ShAmt);
+      if (Mask != 0)
+        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                           N00, DAG.getConstant(Mask, VT));
+    }
+  }
+
+  return SDValue();
+}
 
 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
 ///                       when possible.
 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() && VT.isInteger() &&
+      N->getOpcode() == ISD::SHL)
+    return PerformSHLCombine(N, DAG);
+
   // On X86 with SSE2 support, we can transform this to a vector shift if
   // all elements are shifted by the same amount.  We can't do this in legalize
   // because the a constant vector is typically transformed to a constant pool
@@ -8870,7 +9023,6 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
   if (!Subtarget->hasSSE2())
     return SDValue();
 
-  EVT VT = N->getValueType(0);
   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
     return SDValue();
 
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 7b4ab62..64bc70c 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -118,6 +118,10 @@ namespace llvm {
       /// operand produced by a CMP instruction.
       SETCC,
 
+      // Same as SETCC except it's materialized with a sbb and the value is all
+      // one's or all zero's.
+      SETCC_CARRY,
+
       /// X86 conditional moves. Operand 0 and operand 1 are the two values
       /// to select from. Operand 2 is the condition code, and operand 3 is the
       /// flag operand produced by a CMP or TEST instruction. It also writes a
@@ -626,7 +630,9 @@ namespace llvm {
 
     std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                                bool isSigned);
-    
+
+    SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                   SelectionDAG &DAG);
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG);
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td
index b5fa862..b6a2c05 100644
--- a/lib/Target/X86/X86Instr64bit.td
+++ b/lib/Target/X86/X86Instr64bit.td
@@ -1333,6 +1333,15 @@ def CMOVNO64rm : RI<0x41, MRMSrcMem,       // if !overflow, GR64 = [mem64]
                                      X86_COND_NO, EFLAGS))]>, TB;
 } // isTwoAddress
 
+// Use sbb to materialize carry flag into a GPR.
+let Defs = [EFLAGS], Uses = [EFLAGS], isCodeGenOnly = 1 in
+def SETB_C64r : RI<0x19, MRMInitReg, (outs GR64:$dst), (ins),
+                  "sbb{q}\t$dst, $dst",
+                 [(set GR64:$dst, (zext (X86setcc_c X86_COND_B, EFLAGS)))]>;
+
+def : Pat<(i64 (anyext (X86setcc_c X86_COND_B, EFLAGS))),
+          (SETB_C64r)>;
+
 //===----------------------------------------------------------------------===//
 //  Conversion Instructions...
 //
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index a37013d..1947d35 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -783,12 +783,14 @@ unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
     if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
       return Reg;
     // Check for post-frame index elimination operations
-    return hasLoadFromStackSlot(MI, FrameIndex);
+    const MachineMemOperand *Dummy;
+    return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
   }
   return 0;
 }
 
 bool X86InstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
+                                        const MachineMemOperand *&MMO,
                                         int &FrameIndex) const {
   for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
          oe = MI->memoperands_end();
@@ -798,6 +800,7 @@ bool X86InstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
       if (const FixedStackPseudoSourceValue *Value =
           dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
         FrameIndex = Value->getFrameIndex();
+        MMO = *o;
         return true;
       }
   }
@@ -819,12 +822,14 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
     if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
       return Reg;
     // Check for post-frame index elimination operations
-    return hasStoreToStackSlot(MI, FrameIndex);
+    const MachineMemOperand *Dummy;
+    return hasStoreToStackSlot(MI, Dummy, FrameIndex);
   }
   return 0;
 }
 
 bool X86InstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
+                                       const MachineMemOperand *&MMO,
                                        int &FrameIndex) const {
   for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
          oe = MI->memoperands_end();
@@ -834,6 +839,7 @@ bool X86InstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
       if (const FixedStackPseudoSourceValue *Value =
           dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
         FrameIndex = Value->getFrameIndex();
+        MMO = *o;
         return true;
       }
   }
@@ -1052,6 +1058,112 @@ static bool hasLiveCondCodeDef(MachineInstr *MI) {
   return false;
 }
 
+/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
+/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting
+/// to a 32-bit superregister and then truncating back down to a 16-bit
+/// subregister.
+MachineInstr *
+X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
+                                           MachineFunction::iterator &MFI,
+                                           MachineBasicBlock::iterator &MBBI,
+                                           LiveVariables *LV) const {
+  MachineInstr *MI = MBBI;
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Src = MI->getOperand(1).getReg();
+  bool isDead = MI->getOperand(0).isDead();
+  bool isKill = MI->getOperand(1).isKill();
+
+  unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
+    ? X86::LEA64_32r : X86::LEA32r;
+  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+  unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+  unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+            
+  // Build and insert into an implicit UNDEF value. This is OK because
+  // well be shifting and then extracting the lower 16-bits. 
+  // This has the potential to cause partial register stall. e.g.
+  //   movw    (%rbp,%rcx,2), %dx
+  //   leal    -65(%rdx), %esi
+  // But testing has shown this *does* help performance in 64-bit mode (at
+  // least on modern x86 machines).
+  BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+  MachineInstr *InsMI =
+    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg)
+    .addReg(leaInReg)
+    .addReg(Src, getKillRegState(isKill))
+    .addImm(X86::SUBREG_16BIT);
+
+  MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
+                                    get(Opc), leaOutReg);
+  switch (MIOpc) {
+  default:
+    llvm_unreachable(0);
+    break;
+  case X86::SHL16ri: {
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    MIB.addReg(0).addImm(1 << ShAmt)
+       .addReg(leaInReg, RegState::Kill).addImm(0);
+    break;
+  }
+  case X86::INC16r:
+  case X86::INC64_16r:
+    addLeaRegOffset(MIB, leaInReg, true, 1);
+    break;
+  case X86::DEC16r:
+  case X86::DEC64_16r:
+    addLeaRegOffset(MIB, leaInReg, true, -1);
+    break;
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+    addLeaRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());    
+    break;
+  case X86::ADD16rr: {
+    unsigned Src2 = MI->getOperand(2).getReg();
+    bool isKill2 = MI->getOperand(2).isKill();
+    unsigned leaInReg2 = 0;
+    MachineInstr *InsMI2 = 0;
+    if (Src == Src2) {
+      // ADD16rr %reg1028<kill>, %reg1028
+      // just a single insert_subreg.
+      addRegReg(MIB, leaInReg, true, leaInReg, false);
+    } else {
+      leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+      // Build and insert into an implicit UNDEF value. This is OK because
+      // well be shifting and then extracting the lower 16-bits. 
+      BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2);
+      InsMI2 =
+        BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg2)
+        .addReg(leaInReg2)
+        .addReg(Src2, getKillRegState(isKill2))
+        .addImm(X86::SUBREG_16BIT);
+      addRegReg(MIB, leaInReg, true, leaInReg2, true);
+    }
+    if (LV && isKill2 && InsMI2)
+      LV->replaceKillInstruction(Src2, MI, InsMI2);
+    break;
+  }
+  }
+
+  MachineInstr *NewMI = MIB;
+  MachineInstr *ExtMI =
+    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::EXTRACT_SUBREG))
+    .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+    .addReg(leaOutReg, RegState::Kill)
+    .addImm(X86::SUBREG_16BIT);
+
+  if (LV) {
+    // Update live variables
+    LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+    LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+    if (isKill)
+      LV->replaceKillInstruction(Src, MI, InsMI);
+    if (isDead)
+      LV->replaceKillInstruction(Dest, MI, ExtMI);
+  }
+
+  return ExtMI;
+}
+
 /// convertToThreeAddress - This method must be implemented by targets that
 /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
 /// may be able to convert a two-address instruction into a true
@@ -1077,7 +1189,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   MachineInstr *NewMI = NULL;
   // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
   // we have better subtarget support, enable the 16-bit LEA generation here.
+  // 16-bit LEA is also slow on Core2.
   bool DisableLEA16 = true;
+  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
 
   unsigned MIOpc = MI->getOpcode();
   switch (MIOpc) {
@@ -1116,8 +1230,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     unsigned ShAmt = MI->getOperand(2).getImm();
     if (ShAmt == 0 || ShAmt >= 4) return 0;
 
-    unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit() ?
-      X86::LEA64_32r : X86::LEA32r;
+    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
       .addReg(Dest, RegState::Define | getDeadRegState(isDead))
       .addReg(0).addImm(1 << ShAmt)
@@ -1131,51 +1244,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     unsigned ShAmt = MI->getOperand(2).getImm();
     if (ShAmt == 0 || ShAmt >= 4) return 0;
 
-    if (DisableLEA16) {
-      // If 16-bit LEA is disabled, use 32-bit LEA via subregisters.
-      MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
-      unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
-        ? X86::LEA64_32r : X86::LEA32r;
-      unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
-      unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
-            
-      // Build and insert into an implicit UNDEF value. This is OK because
-      // well be shifting and then extracting the lower 16-bits. 
-      BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
-      MachineInstr *InsMI =
-        BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg)
-        .addReg(leaInReg)
-        .addReg(Src, getKillRegState(isKill))
-        .addImm(X86::SUBREG_16BIT);
-      
-      NewMI = BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(Opc), leaOutReg)
-        .addReg(0).addImm(1 << ShAmt)
-        .addReg(leaInReg, RegState::Kill)
-        .addImm(0);
-      
-      MachineInstr *ExtMI =
-        BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::EXTRACT_SUBREG))
-        .addReg(Dest, RegState::Define | getDeadRegState(isDead))
-        .addReg(leaOutReg, RegState::Kill)
-        .addImm(X86::SUBREG_16BIT);
-
-      if (LV) {
-        // Update live variables
-        LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
-        LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
-        if (isKill)
-          LV->replaceKillInstruction(Src, MI, InsMI);
-        if (isDead)
-          LV->replaceKillInstruction(Dest, MI, ExtMI);
-      }
-      return ExtMI;
-    } else {
-      NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-        .addReg(Dest, RegState::Define | getDeadRegState(isDead))
-        .addReg(0).addImm(1 << ShAmt)
-        .addReg(Src, getKillRegState(isKill))
-        .addImm(0);
-    }
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+      .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+      .addReg(0).addImm(1 << ShAmt)
+      .addReg(Src, getKillRegState(isKill))
+      .addImm(0);
     break;
   }
   default: {
@@ -1185,7 +1260,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     if (hasLiveCondCodeDef(MI))
       return 0;
 
-    bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
     switch (MIOpc) {
     default: return 0;
     case X86::INC64r:
@@ -1202,7 +1276,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     }
     case X86::INC16r:
     case X86::INC64_16r:
-      if (DisableLEA16) return 0;
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
       assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
       NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                            .addReg(Dest, RegState::Define |
@@ -1223,7 +1298,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     }
     case X86::DEC16r:
     case X86::DEC64_16r:
-      if (DisableLEA16) return 0;
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
       assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
       NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                            .addReg(Dest, RegState::Define |
@@ -1246,7 +1322,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       break;
     }
     case X86::ADD16rr: {
-      if (DisableLEA16) return 0;
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
       unsigned Src2 = MI->getOperand(2).getReg();
       bool isKill2 = MI->getOperand(2).isKill();
@@ -1261,56 +1338,32 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     case X86::ADD64ri32:
     case X86::ADD64ri8:
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      if (MI->getOperand(2).isImm())
-        NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
-                                .addReg(Dest, RegState::Define |
-                                        getDeadRegState(isDead)),
-                                Src, isKill, MI->getOperand(2).getImm());
+      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+                              .addReg(Dest, RegState::Define |
+                                      getDeadRegState(isDead)),
+                              Src, isKill, MI->getOperand(2).getImm());
       break;
     case X86::ADD32ri:
-    case X86::ADD32ri8:
+    case X86::ADD32ri8: {
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      if (MI->getOperand(2).isImm()) {
-        unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
-        NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                                .addReg(Dest, RegState::Define |
-                                        getDeadRegState(isDead)),
+      unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                              .addReg(Dest, RegState::Define |
+                                      getDeadRegState(isDead)),
                                 Src, isKill, MI->getOperand(2).getImm());
-      }
       break;
+    }
     case X86::ADD16ri:
     case X86::ADD16ri8:
-      if (DisableLEA16) return 0;
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
-      if (MI->getOperand(2).isImm())
-        NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
-                             .addReg(Dest, RegState::Define |
-                                     getDeadRegState(isDead)),
-                             Src, isKill, MI->getOperand(2).getImm());
-      break;
-    case X86::SHL16ri:
-      if (DisableLEA16) return 0;
-    case X86::SHL32ri:
-    case X86::SHL64ri: {
-      assert(MI->getNumOperands() >= 3 && MI->getOperand(2).isImm() &&
-             "Unknown shl instruction!");
-      unsigned ShAmt = MI->getOperand(2).getImm();
-      if (ShAmt == 1 || ShAmt == 2 || ShAmt == 3) {
-        X86AddressMode AM;
-        AM.Scale = 1 << ShAmt;
-        AM.IndexReg = Src;
-        unsigned Opc = MIOpc == X86::SHL64ri ? X86::LEA64r
-          : (MIOpc == X86::SHL32ri
-             ? (is64Bit ? X86::LEA64_32r : X86::LEA32r) : X86::LEA16r);
-        NewMI = addFullAddress(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                               .addReg(Dest, RegState::Define |
-                                       getDeadRegState(isDead)), AM);
-        if (isKill)
-          NewMI->getOperand(3).setIsKill(true);
-      }
+      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                              .addReg(Dest, RegState::Define |
+                                      getDeadRegState(isDead)),
+                              Src, isKill, MI->getOperand(2).getImm());
       break;
     }
-    }
   }
   }
 
@@ -1571,14 +1624,17 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
   MachineBasicBlock::iterator I = MBB.end();
   while (I != MBB.begin()) {
     --I;
-    // Working from the bottom, when we see a non-terminator
-    // instruction, we're done.
+
+    // Working from the bottom, when we see a non-terminator instruction, we're
+    // done.
     if (!isBrAnalysisUnpredicatedTerminator(I, *this))
       break;
-    // A terminator that isn't a branch can't easily be handled
-    // by this analysis.
+
+    // A terminator that isn't a branch can't easily be handled by this
+    // analysis.
     if (!I->getDesc().isBranch())
       return true;
+
     // Handle unconditional branches.
     if (I->getOpcode() == X86::JMP) {
       if (!AllowModify) {
@@ -1587,10 +1643,12 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       }
 
       // If the block has any instructions after a JMP, delete them.
-      while (next(I) != MBB.end())
-        next(I)->eraseFromParent();
+      while (llvm::next(I) != MBB.end())
+        llvm::next(I)->eraseFromParent();
+
       Cond.clear();
       FBB = 0;
+
       // Delete the JMP if it's equivalent to a fall-through.
       if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
         TBB = 0;
@@ -1598,14 +1656,17 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
         I = MBB.end();
         continue;
       }
+
       // TBB is used to indicate the unconditinal destination.
       TBB = I->getOperand(0).getMBB();
       continue;
     }
+
     // Handle conditional branches.
     X86::CondCode BranchCode = GetCondFromBranchOpc(I->getOpcode());
     if (BranchCode == X86::COND_INVALID)
       return true;  // Can't handle indirect branch.
+
     // Working from the bottom, handle the first conditional branch.
     if (Cond.empty()) {
       FBB = TBB;
@@ -1613,24 +1674,26 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       Cond.push_back(MachineOperand::CreateImm(BranchCode));
       continue;
     }
-    // Handle subsequent conditional branches. Only handle the case
-    // where all conditional branches branch to the same destination
-    // and their condition opcodes fit one of the special
-    // multi-branch idioms.
+
+    // Handle subsequent conditional branches. Only handle the case where all
+    // conditional branches branch to the same destination and their condition
+    // opcodes fit one of the special multi-branch idioms.
     assert(Cond.size() == 1);
     assert(TBB);
-    // Only handle the case where all conditional branches branch to
-    // the same destination.
+
+    // Only handle the case where all conditional branches branch to the same
+    // destination.
     if (TBB != I->getOperand(0).getMBB())
       return true;
-    X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+
     // If the conditions are the same, we can leave them alone.
+    X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
     if (OldBranchCode == BranchCode)
       continue;
-    // If they differ, see if they fit one of the known patterns.
-    // Theoretically we could handle more patterns here, but
-    // we shouldn't expect to see them if instruction selection
-    // has done a reasonable job.
+
+    // If they differ, see if they fit one of the known patterns. Theoretically,
+    // we could handle more patterns here, but we shouldn't expect to see them
+    // if instruction selection has done a reasonable job.
     if ((OldBranchCode == X86::COND_NP &&
          BranchCode == X86::COND_E) ||
         (OldBranchCode == X86::COND_E &&
@@ -1643,6 +1706,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       BranchCode = X86::COND_NE_OR_P;
     else
       return true;
+
     // Update the MachineOperand.
     Cond[0].setImm(BranchCode);
   }
@@ -2713,27 +2777,6 @@ unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
   return I->second.first;
 }
 
-bool X86InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB) const {
-  if (MBB.empty()) return false;
-  
-  switch (MBB.back().getOpcode()) {
-  case X86::TCRETURNri:
-  case X86::TCRETURNdi:
-  case X86::RET:     // Return.
-  case X86::RETI:
-  case X86::TAILJMPd:
-  case X86::TAILJMPr:
-  case X86::TAILJMPm:
-  case X86::JMP:     // Uncond branch.
-  case X86::JMP32r:  // Indirect branch.
-  case X86::JMP64r:  // Indirect branch (64-bit).
-  case X86::JMP32m:  // Indirect branch through mem.
-  case X86::JMP64m:  // Indirect branch through mem (64-bit).
-    return true;
-  default: return false;
-  }
-}
-
 bool X86InstrInfo::
 ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   assert(Cond.size() == 1 && "Invalid X86 branch condition!");
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index c6daa25..b83441d 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -457,11 +457,14 @@ public:
 
   /// hasLoadFromStackSlot - If the specified machine instruction has
   /// a load from a stack slot, return true along with the FrameIndex
-  /// of the loaded stack slot.  If not, return false.  Unlike
+  /// of the loaded stack slot and the machine mem operand containing
+  /// the reference.  If not, return false.  Unlike
   /// isLoadFromStackSlot, this returns true for any instructions that
   /// loads from the stack.  This is a hint only and may not catch all
   /// cases.
-  bool hasLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+  bool hasLoadFromStackSlot(const MachineInstr *MI,
+                            const MachineMemOperand *&MMO,
+                            int &FrameIndex) const;
 
   unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
   /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
@@ -472,11 +475,13 @@ public:
 
   /// hasStoreToStackSlot - If the specified machine instruction has a
   /// store to a stack slot, return true along with the FrameIndex of
-  /// the loaded stack slot.  If not, return false.  Unlike
-  /// isStoreToStackSlot, this returns true for any instructions that
-  /// loads from the stack.  This is a hint only and may not catch all
-  /// cases.
-  bool hasStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+  /// the loaded stack slot and the machine mem operand containing the
+  /// reference.  If not, return false.  Unlike isStoreToStackSlot,
+  /// this returns true for any instructions that loads from the
+  /// stack.  This is a hint only and may not catch all cases.
+  bool hasStoreToStackSlot(const MachineInstr *MI,
+                           const MachineMemOperand *&MMO,
+                           int &FrameIndex) const;
 
   bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                          AliasAnalysis *AA) const;
@@ -595,7 +600,6 @@ public:
                                       bool UnfoldLoad, bool UnfoldStore,
                                       unsigned *LoadRegIndex = 0) const;
   
-  virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
   virtual
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
 
@@ -633,6 +637,11 @@ public:
   unsigned getGlobalBaseReg(MachineFunction *MF) const;
 
 private:
+  MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
+                                              MachineFunction::iterator &MFI,
+                                              MachineBasicBlock::iterator &MBBI,
+                                              LiveVariables *LV) const;
+
   MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                      MachineInstr* MI,
                                      unsigned OpNum,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 90ef1f4..3cc1853 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -87,6 +87,7 @@ def X86cmov    : SDNode<"X86ISD::CMOV",     SDTX86Cmov>;
 def X86brcond  : SDNode<"X86ISD::BRCOND",   SDTX86BrCond,
                         [SDNPHasChain]>;
 def X86setcc   : SDNode<"X86ISD::SETCC",    SDTX86SetCC>;
+def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC>;
 
 def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
                         [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
@@ -816,7 +817,7 @@ def BSR32rm  : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
 
 let neverHasSideEffects = 1 in
 def LEA16r   : I<0x8D, MRMSrcMem,
-                 (outs GR16:$dst), (ins i32mem:$src),
+                 (outs GR16:$dst), (ins lea32mem:$src),
                  "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize;
 let isReMaterializable = 1 in
 def LEA32r   : I<0x8D, MRMSrcMem,
@@ -3059,6 +3060,21 @@ let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
 def LAHF     : I<0x9F, RawFrm, (outs),  (ins), "lahf", []>;  // AH = flags
 
 let Uses = [EFLAGS] in {
+// Use sbb to materialize carry bit.
+
+let Defs = [EFLAGS], isCodeGenOnly = 1 in {
+def SETB_C8r : I<0x18, MRMInitReg, (outs GR8:$dst), (ins),
+                 "sbb{b}\t$dst, $dst",
+                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0x19, MRMInitReg, (outs GR16:$dst), (ins),
+                  "sbb{w}\t$dst, $dst",
+                 [(set GR16:$dst, (zext (X86setcc_c X86_COND_B, EFLAGS)))]>,
+                OpSize;
+def SETB_C32r : I<0x19, MRMInitReg, (outs GR32:$dst), (ins),
+                  "sbb{l}\t$dst, $dst",
+                 [(set GR32:$dst, (zext (X86setcc_c X86_COND_B, EFLAGS)))]>;
+} // isCodeGenOnly
+
 def SETEr    : I<0x94, MRM0r, 
                  (outs GR8   :$dst), (ins),
                  "sete\t$dst",
@@ -4169,6 +4185,12 @@ def : Pat<(store (shld (loadi16 addr:$dst), (i8 imm:$amt1),
                        GR16:$src2, (i8 imm:$amt2)), addr:$dst),
           (SHLD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>;
 
+// (anyext (setcc_carry)) -> (zext (setcc_carry))
+def : Pat<(i16 (anyext (X86setcc_c X86_COND_B, EFLAGS))),
+          (SETB_C16r)>;
+def : Pat<(i32 (anyext (X86setcc_c X86_COND_B, EFLAGS))),
+          (SETB_C32r)>;
+
 //===----------------------------------------------------------------------===//
 // EFLAGS-defining Patterns
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index dfdd4ce..62841f8 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2083,7 +2083,7 @@ def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
                      (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
                      "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst, (v4i32 (pshufd:$src2
-                                             (bc_v4i32(memopv2i64 addr:$src1)),
+                                             (bc_v4i32 (memopv2i64 addr:$src1)),
                                              (undef))))]>;
 }
 
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 33852bd..d96aafd 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -423,21 +423,6 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
 // Stack Frame Processing methods
 //===----------------------------------------------------------------------===//
 
-static unsigned calculateMaxStackAlignment(const MachineFrameInfo *FFI) {
-  unsigned MaxAlign = 0;
-
-  for (int i = FFI->getObjectIndexBegin(),
-         e = FFI->getObjectIndexEnd(); i != e; ++i) {
-    if (FFI->isDeadObjectIndex(i))
-      continue;
-
-    unsigned Align = FFI->getObjectAlignment(i);
-    MaxAlign = std::max(MaxAlign, Align);
-  }
-
-  return MaxAlign;
-}
-
 /// hasFP - Return true if the specified function should have a dedicated frame
 /// pointer register.  This is true if the function has variable sized allocas
 /// or if frame pointer elimination is disabled.
@@ -638,10 +623,7 @@ X86RegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
 
   // Calculate and set max stack object alignment early, so we can decide
   // whether we will need stack realignment (and thus FP).
-  unsigned MaxAlign = std::max(MFI->getMaxAlignment(),
-                               calculateMaxStackAlignment(MFI));
-
-  MFI->setMaxAlignment(MaxAlign);
+  MFI->calculateMaxStackAlignment();
 
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
@@ -741,7 +723,7 @@ void mergeSPUpdatesDown(MachineBasicBlock &MBB,
 
   if (MBBI == MBB.end()) return;
 
-  MachineBasicBlock::iterator NI = next(MBBI);
+  MachineBasicBlock::iterator NI = llvm::next(MBBI);
   if (NI == MBB.end()) return;
 
   unsigned Opc = NI->getOpcode();
@@ -775,7 +757,7 @@ static int mergeSPUpdates(MachineBasicBlock &MBB,
     return 0;
 
   MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
-  MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : next(MBBI);
+  MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : llvm::next(MBBI);
   unsigned Opc = PI->getOpcode();
   int Offset = 0;
 
@@ -1001,7 +983,7 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
     }
 
     // Mark the FramePtr as live-in in every block except the entry.
-    for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
+    for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
          I != E; ++I)
       I->addLiveIn(FramePtr);
 
@@ -1482,45 +1464,3 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) {
 }
 
 #include "X86GenRegisterInfo.inc"
-
-namespace {
-  struct MSAC : public MachineFunctionPass {
-    static char ID;
-    MSAC() : MachineFunctionPass(&ID) {}
-
-    virtual bool runOnMachineFunction(MachineFunction &MF) {
-      MachineFrameInfo *FFI = MF.getFrameInfo();
-      MachineRegisterInfo &RI = MF.getRegInfo();
-
-      // Calculate max stack alignment of all already allocated stack objects.
-      unsigned MaxAlign = calculateMaxStackAlignment(FFI);
-
-      // Be over-conservative: scan over all vreg defs and find, whether vector
-      // registers are used. If yes - there is probability, that vector register
-      // will be spilled and thus stack needs to be aligned properly.
-      for (unsigned RegNum = TargetRegisterInfo::FirstVirtualRegister;
-           RegNum < RI.getLastVirtReg(); ++RegNum)
-        MaxAlign = std::max(MaxAlign, RI.getRegClass(RegNum)->getAlignment());
-
-      if (FFI->getMaxAlignment() == MaxAlign)
-        return false;
-
-      FFI->setMaxAlignment(MaxAlign);
-      return true;
-    }
-
-    virtual const char *getPassName() const {
-      return "X86 Maximal Stack Alignment Calculator";
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesCFG();
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-  };
-
-  char MSAC::ID = 0;
-}
-
-FunctionPass*
-llvm::createX86MaxStackAlignmentCalculatorPass() { return new MSAC(); }
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 661f560..75cdbad 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -367,5 +367,5 @@ bool X86Subtarget::enablePostRAScheduler(
             RegClassVector& CriticalPathRCs) const {
   Mode = TargetSubtarget::ANTIDEP_CRITICAL;
   CriticalPathRCs.clear();
-  return OptLevel >= CodeGenOpt::Default;
+  return OptLevel >= CodeGenOpt::Aggressive;
 }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 0cda8bc..0152121 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -163,7 +163,7 @@ bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM,
                                       CodeGenOpt::Level OptLevel) {
   // Calculate and set max stack object alignment early, so we can decide
   // whether we will need stack realignment (and thus FP).
-  PM.add(createX86MaxStackAlignmentCalculatorPass());
+  PM.add(createMaxStackAlignmentCalculatorPass());
   return false;  // -print-machineinstr shouldn't print after this.
 }
 
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index e616fe6..5a54844 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -453,26 +453,6 @@ bool XCoreInstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
   return true;
 }
 
-/// BlockHasNoFallThrough - Analyse if MachineBasicBlock does not
-/// fall-through into its successor block.
-bool XCoreInstrInfo::
-BlockHasNoFallThrough(const MachineBasicBlock &MBB) const 
-{
-  if (MBB.empty()) return false;
-  
-  switch (MBB.back().getOpcode()) {
-  case XCore::RETSP_u6:     // Return.
-  case XCore::RETSP_lu6:
-  case XCore::BAU_1r:       // Indirect branch.
-  case XCore::BRFU_u6:      // Uncond branch.
-  case XCore::BRFU_lu6:
-  case XCore::BRBU_u6:
-  case XCore::BRBU_lu6:
-    return true;
-  default: return false;
-  }
-}
-
 /// ReverseBranchCondition - Return the inverse opcode of the 
 /// specified Branch instruction.
 bool XCoreInstrInfo::
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index 24230ac..3e0a765 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -87,8 +87,6 @@ public:
                                          MachineBasicBlock::iterator MI,
                                const std::vector<CalleeSavedInfo> &CSI) const;
 
-  virtual bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
-
   virtual bool ReverseBranchCondition(
                             SmallVectorImpl<MachineOperand> &Cond) const;
 };
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 4635d0e..1793bbf 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -2493,29 +2493,28 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) {
       Changed = true;
     }
 
-    // If the aliasee has internal linkage, give it the name and linkage
-    // of the alias, and delete the alias.  This turns:
-    //   define internal ... @f(...)
-    //   @a = alias ... @f
-    // into:
-    //   define ... @a(...)
-    if (!Target->hasLocalLinkage())
-      continue;
-
-    // The transform is only useful if the alias does not have internal linkage.
-    if (J->hasLocalLinkage())
-      continue;
+    // If the alias is externally visible, we may still be able to simplify it.
+    if (!J->hasLocalLinkage()) {
+      // If the aliasee has internal linkage, give it the name and linkage
+      // of the alias, and delete the alias.  This turns:
+      //   define internal ... @f(...)
+      //   @a = alias ... @f
+      // into:
+      //   define ... @a(...)
+      if (!Target->hasLocalLinkage())
+        continue;
 
-    // Do not perform the transform if multiple aliases potentially target the
-    // aliasee.  This check also ensures that it is safe to replace the section
-    // and other attributes of the aliasee with those of the alias.
-    if (!hasOneUse)
-      continue;
+      // Do not perform the transform if multiple aliases potentially target the
+      // aliasee.  This check also ensures that it is safe to replace the section
+      // and other attributes of the aliasee with those of the alias.
+      if (!hasOneUse)
+        continue;
 
-    // Give the aliasee the name, linkage and other attributes of the alias.
-    Target->takeName(J);
-    Target->setLinkage(J->getLinkage());
-    Target->GlobalValue::copyAttributesFrom(J);
+      // Give the aliasee the name, linkage and other attributes of the alias.
+      Target->takeName(J);
+      Target->setLinkage(J->getLinkage());
+      Target->GlobalValue::copyAttributesFrom(J);
+    }
 
     // Delete the alias.
     M.getAliasList().erase(J);
diff --git a/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/lib/Transforms/Instrumentation/MaximumSpanningTree.h
index 2951dbc..829da6b 100644
--- a/lib/Transforms/Instrumentation/MaximumSpanningTree.h
+++ b/lib/Transforms/Instrumentation/MaximumSpanningTree.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
 #define LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
 
+#include "llvm/BasicBlock.h"
 #include "llvm/ADT/EquivalenceClasses.h"
 #include <vector>
 #include <algorithm>
@@ -33,6 +34,18 @@ namespace llvm {
                       typename MaximumSpanningTree<CT>::EdgeWeight Y) const {
         if (X.second > Y.second) return true;
         if (X.second < Y.second) return false;
+        if (const BasicBlock *BBX = dyn_cast<BasicBlock>(X.first.first)) {
+          if (const BasicBlock *BBY = dyn_cast<BasicBlock>(Y.first.first)) {
+            if (BBX->size() > BBY->size()) return true;
+            if (BBX->size() < BBY->size()) return false;
+          }
+        }
+        if (const BasicBlock *BBX = dyn_cast<BasicBlock>(X.first.second)) {
+          if (const BasicBlock *BBY = dyn_cast<BasicBlock>(Y.first.second)) {
+            if (BBX->size() > BBY->size()) return true;
+            if (BBX->size() < BBY->size()) return false;
+          }
+        }
         return false;
       }
     };
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 9ca90c3..e4c4ae5 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -21,7 +21,6 @@
 #include "llvm/InlineAsm.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
 #include "llvm/Pass.h"
 #include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/Target/TargetData.h"
@@ -563,7 +562,7 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
   return false;
 }
 
-/// OptimizeMemoryInst - Load and Store Instructions have often have
+/// OptimizeMemoryInst - Load and Store Instructions often have
 /// addressing modes that can do significant amounts of computation.  As such,
 /// instruction selection will try to get the load or store to do as much
 /// computation as possible for the program.  The problem is that isel can only
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index b0988b5..1cfde8f 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -85,9 +85,14 @@ static bool doesClobberMemory(Instruction *I) {
     return true;
   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
     switch (II->getIntrinsicID()) {
-    default: return false;
-    case Intrinsic::memset: case Intrinsic::memmove: case Intrinsic::memcpy:
-    case Intrinsic::init_trampoline: case Intrinsic::lifetime_end: return true;
+    default:
+      return false;
+    case Intrinsic::memset:
+    case Intrinsic::memmove:
+    case Intrinsic::memcpy:
+    case Intrinsic::init_trampoline:
+    case Intrinsic::lifetime_end:
+      return true;
     }
   }
   return false;
@@ -111,14 +116,13 @@ static Value *getPointerOperand(Instruction *I) {
     return SI->getPointerOperand();
   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
     return MI->getOperand(1);
-  IntrinsicInst *II = cast<IntrinsicInst>(I);
-  switch (II->getIntrinsicID()) {
-    default:
-      assert(false && "Unexpected intrinsic!");
-    case Intrinsic::init_trampoline:
-      return II->getOperand(1);
-    case Intrinsic::lifetime_end:
-      return II->getOperand(2);
+  
+  switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
+  default: assert(false && "Unexpected intrinsic!");
+  case Intrinsic::init_trampoline:
+    return I->getOperand(1);
+  case Intrinsic::lifetime_end:
+    return I->getOperand(2);
   }
 }
 
@@ -135,15 +139,13 @@ static unsigned getStoreSize(Instruction *I, const TargetData *TD) {
   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
     Len = MI->getLength();
   } else {
-    IntrinsicInst *II = cast<IntrinsicInst>(I);
-    switch (II->getIntrinsicID()) {
-      default:
-        assert(false && "Unexpected intrinsic!");
-      case Intrinsic::init_trampoline:
-        return -1u;
-      case Intrinsic::lifetime_end:
-        Len = II->getOperand(1);
-        break;
+    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
+    default: assert(false && "Unexpected intrinsic!");
+    case Intrinsic::init_trampoline:
+      return -1u;
+    case Intrinsic::lifetime_end:
+      Len = I->getOperand(1);
+      break;
     }
   }
   if (ConstantInt *LenCI = dyn_cast<ConstantInt>(Len))
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 6f1c32c..222792b 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -31,15 +31,18 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/PHITransAddr.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/IRBuilder.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -187,8 +190,11 @@ template <> struct DenseMapInfo<Expression> {
   static bool isEqual(const Expression &LHS, const Expression &RHS) {
     return LHS == RHS;
   }
-  static bool isPod() { return true; }
 };
+  
+template <>
+struct isPodLike<Expression> { static const bool value = true; };
+
 }
 
 //===----------------------------------------------------------------------===//
@@ -488,21 +494,21 @@ uint32_t ValueTable::lookup_or_add_call(CallInst* C) {
     // Check to see if we have a single dominating call instruction that is
     // identical to C.
     for (unsigned i = 0, e = deps.size(); i != e; ++i) {
-      const MemoryDependenceAnalysis::NonLocalDepEntry *I = &deps[i];
+      const NonLocalDepEntry *I = &deps[i];
       // Ignore non-local dependencies.
-      if (I->second.isNonLocal())
+      if (I->getResult().isNonLocal())
         continue;
 
       // We don't handle non-depedencies.  If we already have a call, reject
       // instruction dependencies.
-      if (I->second.isClobber() || cdep != 0) {
+      if (I->getResult().isClobber() || cdep != 0) {
         cdep = 0;
         break;
       }
 
-      CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->second.getInst());
+      CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst());
       // FIXME: All duplicated with non-local case.
-      if (NonLocalDepCall && DT->properlyDominates(I->first, C->getParent())){
+      if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())){
         cdep = NonLocalDepCall;
         continue;
       }
@@ -987,27 +993,27 @@ static Value *GetBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
 }
 
 
-/// AnalyzeLoadFromClobberingStore - This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store.  This means
-/// that the store *may* provide bits used by the load but we can't be sure
-/// because the pointers don't mustalias.  Check this case to see if there is
-/// anything more we can do before we give up.  This returns -1 if we have to
-/// give up, or a byte number in the stored value of the piece that feeds the
-/// load.
-static int AnalyzeLoadFromClobberingStore(LoadInst *L, StoreInst *DepSI,
+/// AnalyzeLoadFromClobberingWrite - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering memory write (store,
+/// memset, memcpy, memmove).  This means that the write *may* provide bits used
+/// by the load but we can't be sure because the pointers don't mustalias.
+///
+/// Check this case to see if there is anything more we can do before we give
+/// up.  This returns -1 if we have to give up, or a byte number in the stored
+/// value of the piece that feeds the load.
+static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr,
+                                          Value *WritePtr,
+                                          uint64_t WriteSizeInBits,
                                           const TargetData &TD) {
   // If the loaded or stored value is an first class array or struct, don't try
   // to transform them.  We need to be able to bitcast to integer.
-  if (isa<StructType>(L->getType()) || isa<ArrayType>(L->getType()) ||
-      isa<StructType>(DepSI->getOperand(0)->getType()) ||
-      isa<ArrayType>(DepSI->getOperand(0)->getType()))
+  if (isa<StructType>(LoadTy) || isa<ArrayType>(LoadTy))
     return -1;
   
   int64_t StoreOffset = 0, LoadOffset = 0;
-  Value *StoreBase = 
-    GetBaseWithConstantOffset(DepSI->getPointerOperand(), StoreOffset, TD);
+  Value *StoreBase = GetBaseWithConstantOffset(WritePtr, StoreOffset, TD);
   Value *LoadBase = 
-    GetBaseWithConstantOffset(L->getPointerOperand(), LoadOffset, TD);
+    GetBaseWithConstantOffset(LoadPtr, LoadOffset, TD);
   if (StoreBase != LoadBase)
     return -1;
   
@@ -1018,12 +1024,10 @@ static int AnalyzeLoadFromClobberingStore(LoadInst *L, StoreInst *DepSI,
 #if 0
     errs() << "STORE/LOAD DEP WITH COMMON POINTER MISSED:\n"
     << "Base       = " << *StoreBase << "\n"
-    << "Store Ptr  = " << *DepSI->getPointerOperand() << "\n"
-    << "Store Offs = " << StoreOffset << " - " << *DepSI << "\n"
-    << "Load Ptr   = " << *L->getPointerOperand() << "\n"
-    << "Load Offs  = " << LoadOffset << " - " << *L << "\n\n";
-    errs() << "'" << L->getParent()->getParent()->getName() << "'"
-    << *L->getParent();
+    << "Store Ptr  = " << *WritePtr << "\n"
+    << "Store Offs = " << StoreOffset << "\n"
+    << "Load Ptr   = " << *LoadPtr << "\n";
+    abort();
 #endif
     return -1;
   }
@@ -1033,12 +1037,11 @@ static int AnalyzeLoadFromClobberingStore(LoadInst *L, StoreInst *DepSI,
   // must have gotten confused.
   // FIXME: Investigate cases where this bails out, e.g. rdar://7238614. Then
   // remove this check, as it is duplicated with what we have below.
-  uint64_t StoreSize = TD.getTypeSizeInBits(DepSI->getOperand(0)->getType());
-  uint64_t LoadSize = TD.getTypeSizeInBits(L->getType());
+  uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy);
   
-  if ((StoreSize & 7) | (LoadSize & 7))
+  if ((WriteSizeInBits & 7) | (LoadSize & 7))
     return -1;
-  StoreSize >>= 3;  // Convert to bytes.
+  uint64_t StoreSize = WriteSizeInBits >> 3;  // Convert to bytes.
   LoadSize >>= 3;
   
   
@@ -1052,12 +1055,10 @@ static int AnalyzeLoadFromClobberingStore(LoadInst *L, StoreInst *DepSI,
 #if 0
     errs() << "STORE LOAD DEP WITH COMMON BASE:\n"
     << "Base       = " << *StoreBase << "\n"
-    << "Store Ptr  = " << *DepSI->getPointerOperand() << "\n"
-    << "Store Offs = " << StoreOffset << " - " << *DepSI << "\n"
-    << "Load Ptr   = " << *L->getPointerOperand() << "\n"
-    << "Load Offs  = " << LoadOffset << " - " << *L << "\n\n";
-    errs() << "'" << L->getParent()->getParent()->getName() << "'"
-    << *L->getParent();
+    << "Store Ptr  = " << *WritePtr << "\n"
+    << "Store Offs = " << StoreOffset << "\n"
+    << "Load Ptr   = " << *LoadPtr << "\n";
+    abort();
 #endif
     return -1;
   }
@@ -1075,6 +1076,66 @@ static int AnalyzeLoadFromClobberingStore(LoadInst *L, StoreInst *DepSI,
   return LoadOffset-StoreOffset;
 }  
 
+/// AnalyzeLoadFromClobberingStore - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store.
+static int AnalyzeLoadFromClobberingStore(const Type *LoadTy, Value *LoadPtr,
+                                          StoreInst *DepSI,
+                                          const TargetData &TD) {
+  // Cannot handle reading from store of first-class aggregate yet.
+  if (isa<StructType>(DepSI->getOperand(0)->getType()) ||
+      isa<ArrayType>(DepSI->getOperand(0)->getType()))
+    return -1;
+
+  Value *StorePtr = DepSI->getPointerOperand();
+  uint64_t StoreSize = TD.getTypeSizeInBits(DepSI->getOperand(0)->getType());
+  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
+                                        StorePtr, StoreSize, TD);
+}
+
+static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr,
+                                            MemIntrinsic *MI,
+                                            const TargetData &TD) {
+  // If the mem operation is a non-constant size, we can't handle it.
+  ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
+  if (SizeCst == 0) return -1;
+  uint64_t MemSizeInBits = SizeCst->getZExtValue()*8;
+
+  // If this is memset, we just need to see if the offset is valid in the size
+  // of the memset..
+  if (MI->getIntrinsicID() == Intrinsic::memset)
+    return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+                                          MemSizeInBits, TD);
+  
+  // If we have a memcpy/memmove, the only case we can handle is if this is a
+  // copy from constant memory.  In that case, we can read directly from the
+  // constant memory.
+  MemTransferInst *MTI = cast<MemTransferInst>(MI);
+  
+  Constant *Src = dyn_cast<Constant>(MTI->getSource());
+  if (Src == 0) return -1;
+  
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(Src->getUnderlyingObject());
+  if (GV == 0 || !GV->isConstant()) return -1;
+  
+  // See if the access is within the bounds of the transfer.
+  int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
+                                              MI->getDest(), MemSizeInBits, TD);
+  if (Offset == -1)
+    return Offset;
+  
+  // Otherwise, see if we can constant fold a load from the constant with the
+  // offset applied as appropriate.
+  Src = ConstantExpr::getBitCast(Src,
+                                 llvm::Type::getInt8PtrTy(Src->getContext()));
+  Constant *OffsetCst = 
+    ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+  Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1);
+  Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+  if (ConstantFoldLoadFromConstPtr(Src, &TD))
+    return Offset;
+  return -1;
+}
+                                            
 
 /// GetStoreValueForLoad - This function is called when we have a
 /// memdep query of a load that ends up being a clobbering store.  This means
@@ -1089,50 +1150,134 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
   uint64_t StoreSize = TD.getTypeSizeInBits(SrcVal->getType())/8;
   uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8;
   
+  IRBuilder<> Builder(InsertPt->getParent(), InsertPt);
   
   // Compute which bits of the stored value are being used by the load.  Convert
   // to an integer type to start with.
   if (isa<PointerType>(SrcVal->getType()))
-    SrcVal = new PtrToIntInst(SrcVal, TD.getIntPtrType(Ctx), "tmp", InsertPt);
+    SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx), "tmp");
   if (!isa<IntegerType>(SrcVal->getType()))
-    SrcVal = new BitCastInst(SrcVal, IntegerType::get(Ctx, StoreSize*8),
-                             "tmp", InsertPt);
+    SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8),
+                                   "tmp");
   
   // Shift the bits to the least significant depending on endianness.
   unsigned ShiftAmt;
-  if (TD.isLittleEndian()) {
+  if (TD.isLittleEndian())
     ShiftAmt = Offset*8;
-  } else {
+  else
     ShiftAmt = (StoreSize-LoadSize-Offset)*8;
-  }
   
   if (ShiftAmt)
-    SrcVal = BinaryOperator::CreateLShr(SrcVal,
-                ConstantInt::get(SrcVal->getType(), ShiftAmt), "tmp", InsertPt);
+    SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt, "tmp");
   
   if (LoadSize != StoreSize)
-    SrcVal = new TruncInst(SrcVal, IntegerType::get(Ctx, LoadSize*8),
-                           "tmp", InsertPt);
+    SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8),
+                                 "tmp");
   
   return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD);
 }
 
+/// GetMemInstValueForLoad - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering mem intrinsic.
+static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+                                     const Type *LoadTy, Instruction *InsertPt,
+                                     const TargetData &TD){
+  LLVMContext &Ctx = LoadTy->getContext();
+  uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8;
+
+  IRBuilder<> Builder(InsertPt->getParent(), InsertPt);
+  
+  // We know that this method is only called when the mem transfer fully
+  // provides the bits for the load.
+  if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
+    // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
+    // independently of what the offset is.
+    Value *Val = MSI->getValue();
+    if (LoadSize != 1)
+      Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8));
+    
+    Value *OneElt = Val;
+    
+    // Splat the value out to the right number of bits.
+    for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) {
+      // If we can double the number of bytes set, do it.
+      if (NumBytesSet*2 <= LoadSize) {
+        Value *ShVal = Builder.CreateShl(Val, NumBytesSet*8);
+        Val = Builder.CreateOr(Val, ShVal);
+        NumBytesSet <<= 1;
+        continue;
+      }
+      
+      // Otherwise insert one byte at a time.
+      Value *ShVal = Builder.CreateShl(Val, 1*8);
+      Val = Builder.CreateOr(OneElt, ShVal);
+      ++NumBytesSet;
+    }
+    
+    return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, TD);
+  }
+ 
+  // Otherwise, this is a memcpy/memmove from a constant global.
+  MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
+  Constant *Src = cast<Constant>(MTI->getSource());
+
+  // Otherwise, see if we can constant fold a load from the constant with the
+  // offset applied as appropriate.
+  Src = ConstantExpr::getBitCast(Src,
+                                 llvm::Type::getInt8PtrTy(Src->getContext()));
+  Constant *OffsetCst = 
+  ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+  Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1);
+  Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+  return ConstantFoldLoadFromConstPtr(Src, &TD);
+}
+
+
+
 struct AvailableValueInBlock {
   /// BB - The basic block in question.
   BasicBlock *BB;
+  enum ValType {
+    SimpleVal,  // A simple offsetted value that is accessed.
+    MemIntrin   // A memory intrinsic which is loaded from.
+  };
+  
   /// V - The value that is live out of the block.
-  Value *V;
-  /// Offset - The byte offset in V that is interesting for the load query.
+  PointerIntPair<Value *, 1, ValType> Val;
+  
+  /// Offset - The byte offset in Val that is interesting for the load query.
   unsigned Offset;
   
   static AvailableValueInBlock get(BasicBlock *BB, Value *V,
                                    unsigned Offset = 0) {
     AvailableValueInBlock Res;
     Res.BB = BB;
-    Res.V = V;
+    Res.Val.setPointer(V);
+    Res.Val.setInt(SimpleVal);
+    Res.Offset = Offset;
+    return Res;
+  }
+
+  static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI,
+                                     unsigned Offset = 0) {
+    AvailableValueInBlock Res;
+    Res.BB = BB;
+    Res.Val.setPointer(MI);
+    Res.Val.setInt(MemIntrin);
     Res.Offset = Offset;
     return Res;
   }
+  
+  bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
+  Value *getSimpleValue() const {
+    assert(isSimpleValue() && "Wrong accessor");
+    return Val.getPointer();
+  }
+  
+  MemIntrinsic *getMemIntrinValue() const {
+    assert(!isSimpleValue() && "Wrong accessor");
+    return cast<MemIntrinsic>(Val.getPointer());
+  }
 };
 
 /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock,
@@ -1149,30 +1294,33 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
   const Type *LoadTy = LI->getType();
   
   for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) {
-    BasicBlock *BB = ValuesPerBlock[i].BB;
-    Value *AvailableVal = ValuesPerBlock[i].V;
-    unsigned Offset = ValuesPerBlock[i].Offset;
+    const AvailableValueInBlock &AV = ValuesPerBlock[i];
+    BasicBlock *BB = AV.BB;
     
     if (SSAUpdate.HasValueForBlock(BB))
       continue;
-    
-    if (AvailableVal->getType() != LoadTy) {
-      assert(TD && "Need target data to handle type mismatch case");
-      AvailableVal = GetStoreValueForLoad(AvailableVal, Offset, LoadTy,
-                                          BB->getTerminator(), *TD);
-      
-      if (Offset) {
-        DEBUG(errs() << "GVN COERCED NONLOCAL VAL:\n"
-              << *ValuesPerBlock[i].V << '\n'
+
+    unsigned Offset = AV.Offset;
+
+    Value *AvailableVal;
+    if (AV.isSimpleValue()) {
+      AvailableVal = AV.getSimpleValue();
+      if (AvailableVal->getType() != LoadTy) {
+        assert(TD && "Need target data to handle type mismatch case");
+        AvailableVal = GetStoreValueForLoad(AvailableVal, Offset, LoadTy,
+                                            BB->getTerminator(), *TD);
+        
+        DEBUG(errs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << "  "
+              << *AV.getSimpleValue() << '\n'
               << *AvailableVal << '\n' << "\n\n\n");
       }
-      
-      
-      DEBUG(errs() << "GVN COERCED NONLOCAL VAL:\n"
-            << *ValuesPerBlock[i].V << '\n'
+    } else {
+      AvailableVal = GetMemInstValueForLoad(AV.getMemIntrinValue(), Offset,
+                                            LoadTy, BB->getTerminator(), *TD);
+      DEBUG(errs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+            << "  " << *AV.getMemIntrinValue() << '\n'
             << *AvailableVal << '\n' << "\n\n\n");
     }
-    
     SSAUpdate.AddAvailableValue(BB, AvailableVal);
   }
   
@@ -1187,12 +1335,18 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
   return V;
 }
 
+static bool isLifetimeStart(Instruction *Inst) {
+  if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst))
+    return II->getIntrinsicID() == Intrinsic::lifetime_start;
+  return false;
+}
+
 /// processNonLocalLoad - Attempt to eliminate a load whose dependencies are
 /// non-local by performing PHI construction.
 bool GVN::processNonLocalLoad(LoadInst *LI,
                               SmallVectorImpl<Instruction*> &toErase) {
   // Find the non-local dependencies of the load.
-  SmallVector<MemoryDependenceAnalysis::NonLocalDepEntry, 64> Deps;
+  SmallVector<NonLocalDepEntry, 64> Deps;
   MD->getNonLocalPointerDependency(LI->getOperand(0), true, LI->getParent(),
                                    Deps);
   //DEBUG(errs() << "INVESTIGATING NONLOCAL LOAD: "
@@ -1206,11 +1360,11 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
 
   // If we had a phi translation failure, we'll have a single entry which is a
   // clobber in the current block.  Reject this early.
-  if (Deps.size() == 1 && Deps[0].second.isClobber()) {
+  if (Deps.size() == 1 && Deps[0].getResult().isClobber()) {
     DEBUG(
       errs() << "GVN: non-local load ";
       WriteAsOperand(errs(), LI);
-      errs() << " is clobbered by " << *Deps[0].second.getInst() << '\n';
+      errs() << " is clobbered by " << *Deps[0].getResult().getInst() << '\n';
     );
     return false;
   }
@@ -1225,18 +1379,24 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
   const TargetData *TD = 0;
   
   for (unsigned i = 0, e = Deps.size(); i != e; ++i) {
-    BasicBlock *DepBB = Deps[i].first;
-    MemDepResult DepInfo = Deps[i].second;
+    BasicBlock *DepBB = Deps[i].getBB();
+    MemDepResult DepInfo = Deps[i].getResult();
 
     if (DepInfo.isClobber()) {
+      // The address being loaded in this non-local block may not be the same as
+      // the pointer operand of the load if PHI translation occurs.  Make sure
+      // to consider the right address.
+      Value *Address = Deps[i].getAddress();
+      
       // If the dependence is to a store that writes to a superset of the bits
       // read by the load, we can extract the bits we need for the load from the
       // stored value.
       if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
         if (TD == 0)
           TD = getAnalysisIfAvailable<TargetData>();
-        if (TD) {
-          int Offset = AnalyzeLoadFromClobberingStore(LI, DepSI, *TD);
+        if (TD && Address) {
+          int Offset = AnalyzeLoadFromClobberingStore(LI->getType(), Address,
+                                                      DepSI, *TD);
           if (Offset != -1) {
             ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
                                                            DepSI->getOperand(0),
@@ -1245,8 +1405,23 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
           }
         }
       }
+
+      // If the clobbering value is a memset/memcpy/memmove, see if we can
+      // forward a value on from it.
+      if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
+        if (TD == 0)
+          TD = getAnalysisIfAvailable<TargetData>();
+        if (TD && Address) {
+          int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
+                                                        DepMI, *TD);
+          if (Offset != -1) {
+            ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI,
+                                                                  Offset));
+            continue;
+          }            
+        }
+      }
       
-      // FIXME: Handle memset/memcpy.
       UnavailableBlocks.push_back(DepBB);
       continue;
     }
@@ -1254,21 +1429,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
     Instruction *DepInst = DepInfo.getInst();
 
     // Loading the allocation -> undef.
-    if (isa<AllocaInst>(DepInst) || isMalloc(DepInst)) {
+    if (isa<AllocaInst>(DepInst) || isMalloc(DepInst) ||
+        // Loading immediately after lifetime begin -> undef.
+        isLifetimeStart(DepInst)) {
       ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
                                              UndefValue::get(LI->getType())));
       continue;
     }
     
-    // Loading immediately after lifetime begin or end -> undef.
-    if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(DepInst)) {
-      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
-          II->getIntrinsicID() == Intrinsic::lifetime_end) {
-        ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
-                                             UndefValue::get(LI->getType())));
-      }
-    }
-
     if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
       // Reject loads and stores that are to the same address but are of
       // different types if we have to.
@@ -1378,19 +1546,25 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
   // to eliminate LI even if we insert uses in the other predecessors, we will
   // end up increasing code size.  Reject this by scanning for LI.
   for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i)
-    if (ValuesPerBlock[i].V == LI)
+    if (ValuesPerBlock[i].isSimpleValue() &&
+        ValuesPerBlock[i].getSimpleValue() == LI)
       return false;
 
+  // FIXME: It is extremely unclear what this loop is doing, other than
+  // artificially restricting loadpre.
   if (isSinglePred) {
     bool isHot = false;
-    for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i)
-      if (Instruction *I = dyn_cast<Instruction>(ValuesPerBlock[i].V))
+    for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) {
+      const AvailableValueInBlock &AV = ValuesPerBlock[i];
+      if (AV.isSimpleValue())
         // "Hot" Instruction is in some loop (because it dominates its dep.
         // instruction).
-        if (DT->dominates(LI, I)) {
-          isHot = true;
-          break;
-        }
+        if (Instruction *I = dyn_cast<Instruction>(AV.getSimpleValue()))
+          if (DT->dominates(LI, I)) {
+            isHot = true;
+            break;
+          }
+    }
 
     // We are interested only in "hot" instructions. We don't want to do any
     // mis-optimizations here.
@@ -1435,29 +1609,43 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
   // Do PHI translation to get its value in the predecessor if necessary.  The
   // returned pointer (if non-null) is guaranteed to dominate UnavailablePred.
   //
-  // FIXME: This may insert a computation, but we don't tell scalar GVN
-  // optimization stuff about it.  How do we do this?
   SmallVector<Instruction*, 8> NewInsts;
-  Value *LoadPtr = 0;
   
   // If all preds have a single successor, then we know it is safe to insert the
   // load on the pred (?!?), so we can insert code to materialize the pointer if
   // it is not available.
+  PHITransAddr Address(LI->getOperand(0), TD);
+  Value *LoadPtr = 0;
   if (allSingleSucc) {
-    LoadPtr = MD->InsertPHITranslatedPointer(LI->getOperand(0), LoadBB,
-                                             UnavailablePred, TD, *DT,NewInsts);
+    LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
+                                                *DT, NewInsts);
   } else {
-    LoadPtr = MD->GetAvailablePHITranslatedValue(LI->getOperand(0), LoadBB,
-                                                 UnavailablePred, TD, *DT);
-  }
+    Address.PHITranslateValue(LoadBB, UnavailablePred);
+    LoadPtr = Address.getAddr();
     
+    // Make sure the value is live in the predecessor.
+    if (Instruction *Inst = dyn_cast_or_null<Instruction>(LoadPtr))
+      if (!DT->dominates(Inst->getParent(), UnavailablePred))
+        LoadPtr = 0;
+  }
+
   // If we couldn't find or insert a computation of this phi translated value,
   // we fail PRE.
   if (LoadPtr == 0) {
+    assert(NewInsts.empty() && "Shouldn't insert insts on failure");
     DEBUG(errs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
                  << *LI->getOperand(0) << "\n");
     return false;
   }
+
+  // Assign value numbers to these new instructions.
+  for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) {
+    // FIXME: We really _ought_ to insert these value numbers into their 
+    // parent's availability map.  However, in doing so, we risk getting into
+    // ordering issues.  If a block hasn't been processed yet, we would be
+    // marking a value as AVAIL-IN, which isn't what we intend.
+    VN.lookup_or_add(NewInsts[i]);
+  }
   
   // Make sure it is valid to move this load here.  We have to watch out for:
   //  @1 = getelementptr (i8* p, ...
@@ -1517,11 +1705,6 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
 
   // If the value isn't available, don't do anything!
   if (Dep.isClobber()) {
-    // FIXME: We should handle memset/memcpy/memmove as dependent instructions
-    // to forward the value if available.
-    //if (isa<MemIntrinsic>(Dep.getInst()))
-    //errs() << "LOAD DEPENDS ON MEM: " << *L << "\n" << *Dep.getInst()<<"\n\n";
-    
     // Check to see if we have something like this:
     //   store i32 123, i32* %P
     //   %A = bitcast i32* %P to i8*
@@ -1532,25 +1715,42 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
     // a common base + constant offset, and if the previous store (or memset)
     // completely covers this load.  This sort of thing can happen in bitfield
     // access code.
+    Value *AvailVal = 0;
     if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst()))
       if (const TargetData *TD = getAnalysisIfAvailable<TargetData>()) {
-        int Offset = AnalyzeLoadFromClobberingStore(L, DepSI, *TD);
-        if (Offset != -1) {
-          Value *AvailVal = GetStoreValueForLoad(DepSI->getOperand(0), Offset,
-                                                 L->getType(), L, *TD);
-          DEBUG(errs() << "GVN COERCED STORE BITS:\n" << *DepSI << '\n'
-                       << *AvailVal << '\n' << *L << "\n\n\n");
-    
-          // Replace the load!
-          L->replaceAllUsesWith(AvailVal);
-          if (isa<PointerType>(AvailVal->getType()))
-            MD->invalidateCachedPointerInfo(AvailVal);
-          toErase.push_back(L);
-          NumGVNLoad++;
-          return true;
-        }
+        int Offset = AnalyzeLoadFromClobberingStore(L->getType(),
+                                                    L->getPointerOperand(),
+                                                    DepSI, *TD);
+        if (Offset != -1)
+          AvailVal = GetStoreValueForLoad(DepSI->getOperand(0), Offset,
+                                          L->getType(), L, *TD);
       }
     
+    // If the clobbering value is a memset/memcpy/memmove, see if we can forward
+    // a value on from it.
+    if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) {
+      if (const TargetData *TD = getAnalysisIfAvailable<TargetData>()) {
+        int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(),
+                                                      L->getPointerOperand(),
+                                                      DepMI, *TD);
+        if (Offset != -1)
+          AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L,*TD);
+      }
+    }
+        
+    if (AvailVal) {
+      DEBUG(errs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n'
+            << *AvailVal << '\n' << *L << "\n\n\n");
+      
+      // Replace the load!
+      L->replaceAllUsesWith(AvailVal);
+      if (isa<PointerType>(AvailVal->getType()))
+        MD->invalidateCachedPointerInfo(AvailVal);
+      toErase.push_back(L);
+      NumGVNLoad++;
+      return true;
+    }
+        
     DEBUG(
       // fast print dep, using operator<< on instruction would be too slow
       errs() << "GVN: load ";
@@ -1635,11 +1835,10 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
     return true;
   }
   
-  // If this load occurs either right after a lifetime begin or a lifetime end,
+  // If this load occurs either right after a lifetime begin,
   // then the loaded value is undefined.
   if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(DepInst)) {
-    if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
-        II->getIntrinsicID() == Intrinsic::lifetime_end) {
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
       L->replaceAllUsesWith(UndefValue::get(L->getType()));
       toErase.push_back(L);
       NumGVNLoad++;
diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp
index d12ad81..b9c536f 100644
--- a/lib/Transforms/Scalar/InstructionCombining.cpp
+++ b/lib/Transforms/Scalar/InstructionCombining.cpp
@@ -8585,25 +8585,36 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
   if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) {
     if (const IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) {
       uint32_t BitWidth = ITy->getBitWidth();
-      if (BitWidth > 1) {
-        Value *LHS = ICI->getOperand(0);
-        Value *RHS = ICI->getOperand(1);
-
-        APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0);
-        APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0);
-        APInt TypeMask(APInt::getHighBitsSet(BitWidth, BitWidth-1));
-        ComputeMaskedBits(LHS, TypeMask, KnownZeroLHS, KnownOneLHS);
-        ComputeMaskedBits(RHS, TypeMask, KnownZeroRHS, KnownOneRHS);
-
-        if (KnownZeroLHS.countLeadingOnes() == BitWidth-1 &&
-            KnownZeroRHS.countLeadingOnes() == BitWidth-1) {
+      Value *LHS = ICI->getOperand(0);
+      Value *RHS = ICI->getOperand(1);
+
+      APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0);
+      APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0);
+      APInt TypeMask(APInt::getAllOnesValue(BitWidth));
+      ComputeMaskedBits(LHS, TypeMask, KnownZeroLHS, KnownOneLHS);
+      ComputeMaskedBits(RHS, TypeMask, KnownZeroRHS, KnownOneRHS);
+
+      if (KnownZeroLHS == KnownZeroRHS && KnownOneLHS == KnownOneRHS) {
+        APInt KnownBits = KnownZeroLHS | KnownOneLHS;
+        APInt UnknownBit = ~KnownBits;
+        if (UnknownBit.countPopulation() == 1) {
           if (!DoXform) return ICI;
 
-          Value *Xor = Builder->CreateXor(LHS, RHS);
+          Value *Result = Builder->CreateXor(LHS, RHS);
+
+          // Mask off any bits that are set and won't be shifted away.
+          if (KnownOneLHS.uge(UnknownBit))
+            Result = Builder->CreateAnd(Result,
+                                        ConstantInt::get(ITy, UnknownBit));
+
+          // Shift the bit we're testing down to the lsb.
+          Result = Builder->CreateLShr(
+               Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros()));
+
           if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
-            Xor = Builder->CreateXor(Xor, ConstantInt::get(ITy, 1));
-          Xor->takeName(ICI);
-          return ReplaceInstUsesWith(CI, Xor);
+            Result = Builder->CreateXor(Result, ConstantInt::get(ITy, 1));
+          Result->takeName(ICI);
+          return ReplaceInstUsesWith(CI, Result);
         }
       }
     }
@@ -11189,8 +11200,9 @@ namespace llvm {
       return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift &&
              LHS.Width == RHS.Width;
     }
-    static bool isPod() { return true; }
   };
+  template <>
+  struct isPodLike<LoweredPHIRecord> { static const bool value = true; };
 }
 
 
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 1b93f34..d58b9c9 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -718,6 +718,11 @@ bool JumpThreading::ProcessSwitchOnDuplicateCond(BasicBlock *PredBB,
       if (PredSI->getSuccessor(PredCase) != DestBB &&
           DestSI->getSuccessor(i) != DestBB)
         continue;
+      
+      // Do not forward this if it already goes to this destination, this would
+      // be an infinite loop.
+      if (PredSI->getSuccessor(PredCase) == DestSucc)
+        continue;
 
       // Otherwise, we're safe to make the change.  Make sure that the edge from
       // DestSI to DestSucc is not critical and has no PHI nodes.
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 5511387..42a8fdc 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -160,16 +160,17 @@ namespace {
 
       // Because the exit block is not in the loop, we know we have to get _at
       // least_ its immediate dominator.
-      do {
-        // Get next Immediate Dominator.
-        IDom = IDom->getIDom();
-
+      IDom = IDom->getIDom();
+      
+      while (IDom && IDom != BlockInLoopNode) {
         // If we have got to the header of the loop, then the instructions block
         // did not dominate the exit node, so we can't hoist it.
         if (IDom->getBlock() == LoopHeader)
           return false;
 
-      } while (IDom != BlockInLoopNode);
+        // Get next Immediate Dominator.
+        IDom = IDom->getIDom();
+      };
 
       return true;
     }
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 564c7ac..85cc712 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -24,18 +24,14 @@
 #include "llvm/Constants.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
-#include "llvm/Type.h"
 #include "llvm/DerivedTypes.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/IVUsers.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Utils/AddrModeMatcher.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ValueHandle.h"
@@ -85,8 +81,6 @@ namespace {
 
   class LoopStrengthReduce : public LoopPass {
     IVUsers *IU;
-    LoopInfo *LI;
-    DominatorTree *DT;
     ScalarEvolution *SE;
     bool Changed;
 
@@ -94,10 +88,6 @@ namespace {
     /// particular stride.
     std::map<const SCEV *, IVsOfOneStride> IVsByStride;
 
-    /// StrideNoReuse - Keep track of all the strides whose ivs cannot be
-    /// reused (nor should they be rewritten to reuse other strides).
-    SmallSet<const SCEV *, 4> StrideNoReuse;
-
     /// DeadInsts - Keep track of instructions we may have made dead, so that
     /// we can remove them after we are done working.
     SmallVector<WeakVH, 16> DeadInsts;
@@ -109,8 +99,7 @@ namespace {
   public:
     static char ID; // Pass ID, replacement for typeid
     explicit LoopStrengthReduce(const TargetLowering *tli = NULL) :
-      LoopPass(&ID), TLI(tli) {
-    }
+      LoopPass(&ID), TLI(tli) {}
 
     bool runOnLoop(Loop *L, LPPassManager &LPM);
 
@@ -118,13 +107,11 @@ namespace {
       // We split critical edges, so we change the CFG.  However, we do update
       // many analyses if they are around.
       AU.addPreservedID(LoopSimplifyID);
-      AU.addPreserved<LoopInfo>();
-      AU.addPreserved<DominanceFrontier>();
-      AU.addPreserved<DominatorTree>();
+      AU.addPreserved("loops");
+      AU.addPreserved("domfrontier");
+      AU.addPreserved("domtree");
 
       AU.addRequiredID(LoopSimplifyID);
-      AU.addRequired<LoopInfo>();
-      AU.addRequired<DominatorTree>();
       AU.addRequired<ScalarEvolution>();
       AU.addPreserved<ScalarEvolution>();
       AU.addRequired<IVUsers>();
@@ -228,19 +215,17 @@ void LoopStrengthReduce::DeleteTriviallyDeadInstructions() {
   if (DeadInsts.empty()) return;
 
   while (!DeadInsts.empty()) {
-    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.back());
-    DeadInsts.pop_back();
+    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
 
     if (I == 0 || !isInstructionTriviallyDead(I))
       continue;
 
-    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) {
+    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
         *OI = 0;
         if (U->use_empty())
           DeadInsts.push_back(U);
       }
-    }
 
     I->eraseFromParent();
     Changed = true;
@@ -265,7 +250,7 @@ static bool containsAddRecFromDifferentLoop(const SCEV *S, Loop *L) {
       if (newLoop == L)
         return false;
       // if newLoop is an outer loop of L, this is OK.
-      if (!LoopInfo::isNotAlreadyContainedIn(L, newLoop))
+      if (newLoop->contains(L->getHeader()))
         return false;
     }
     return true;
@@ -338,9 +323,6 @@ namespace {
   /// BasedUser - For a particular base value, keep information about how we've
   /// partitioned the expression so far.
   struct BasedUser {
-    /// SE - The current ScalarEvolution object.
-    ScalarEvolution *SE;
-
     /// Base - The Base value for the PHI node that needs to be inserted for
     /// this use.  As the use is processed, information gets moved from this
     /// field to the Imm field (below).  BasedUser values are sorted by this
@@ -372,9 +354,9 @@ namespace {
     bool isUseOfPostIncrementedValue;
 
     BasedUser(IVStrideUse &IVSU, ScalarEvolution *se)
-      : SE(se), Base(IVSU.getOffset()), Inst(IVSU.getUser()),
+      : Base(IVSU.getOffset()), Inst(IVSU.getUser()),
         OperandValToReplace(IVSU.getOperandValToReplace()),
-        Imm(SE->getIntegerSCEV(0, Base->getType())),
+        Imm(se->getIntegerSCEV(0, Base->getType())),
         isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue()) {}
 
     // Once we rewrite the code to insert the new IVs we want, update the
@@ -383,14 +365,14 @@ namespace {
     void RewriteInstructionToUseNewBase(const SCEV *const &NewBase,
                                         Instruction *InsertPt,
                                        SCEVExpander &Rewriter, Loop *L, Pass *P,
-                                        LoopInfo &LI,
-                                        SmallVectorImpl<WeakVH> &DeadInsts);
+                                        SmallVectorImpl<WeakVH> &DeadInsts,
+                                        ScalarEvolution *SE);
 
     Value *InsertCodeForBaseAtPosition(const SCEV *const &NewBase,
                                        const Type *Ty,
                                        SCEVExpander &Rewriter,
-                                       Instruction *IP, Loop *L,
-                                       LoopInfo &LI);
+                                       Instruction *IP,
+                                       ScalarEvolution *SE);
     void dump() const;
   };
 }
@@ -404,27 +386,12 @@ void BasedUser::dump() const {
 Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *const &NewBase,
                                               const Type *Ty,
                                               SCEVExpander &Rewriter,
-                                              Instruction *IP, Loop *L,
-                                              LoopInfo &LI) {
-  // Figure out where we *really* want to insert this code.  In particular, if
-  // the user is inside of a loop that is nested inside of L, we really don't
-  // want to insert this expression before the user, we'd rather pull it out as
-  // many loops as possible.
-  Instruction *BaseInsertPt = IP;
-
-  // Figure out the most-nested loop that IP is in.
-  Loop *InsertLoop = LI.getLoopFor(IP->getParent());
-
-  // If InsertLoop is not L, and InsertLoop is nested inside of L, figure out
-  // the preheader of the outer-most loop where NewBase is not loop invariant.
-  if (L->contains(IP->getParent()))
-    while (InsertLoop && NewBase->isLoopInvariant(InsertLoop)) {
-      BaseInsertPt = InsertLoop->getLoopPreheader()->getTerminator();
-      InsertLoop = InsertLoop->getParentLoop();
-    }
-
-  Value *Base = Rewriter.expandCodeFor(NewBase, 0, BaseInsertPt);
+                                              Instruction *IP,
+                                              ScalarEvolution *SE) {
+  Value *Base = Rewriter.expandCodeFor(NewBase, 0, IP);
 
+  // Wrap the base in a SCEVUnknown so that ScalarEvolution doesn't try to
+  // re-analyze it.
   const SCEV *NewValSCEV = SE->getUnknown(Base);
 
   // Always emit the immediate into the same block as the user.
@@ -443,8 +410,8 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *const &NewBase,
 void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase,
                                                Instruction *NewBasePt,
                                       SCEVExpander &Rewriter, Loop *L, Pass *P,
-                                      LoopInfo &LI,
-                                      SmallVectorImpl<WeakVH> &DeadInsts) {
+                                      SmallVectorImpl<WeakVH> &DeadInsts,
+                                      ScalarEvolution *SE) {
   if (!isa<PHINode>(Inst)) {
     // By default, insert code at the user instruction.
     BasicBlock::iterator InsertPt = Inst;
@@ -473,7 +440,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase,
     }
     Value *NewVal = InsertCodeForBaseAtPosition(NewBase,
                                                 OperandValToReplace->getType(),
-                                                Rewriter, InsertPt, L, LI);
+                                                Rewriter, InsertPt, SE);
     // Replace the use of the operand Value with the new Phi we just created.
     Inst->replaceUsesOfWith(OperandValToReplace, NewVal);
 
@@ -535,7 +502,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEV *const &NewBase,
                                 PHIPred->getTerminator() :
                                 OldLoc->getParent()->getTerminator();
         Code = InsertCodeForBaseAtPosition(NewBase, PN->getType(),
-                                           Rewriter, InsertPt, L, LI);
+                                           Rewriter, InsertPt, SE);
 
         DEBUG(errs() << "      Changing PHI use to ");
         DEBUG(WriteAsOperand(errs(), Code, /*PrintType=*/false));
@@ -1011,17 +978,13 @@ const SCEV *LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
                                 const SCEV *const &Stride,
                                 IVExpr &IV, const Type *Ty,
                                 const std::vector<BasedUser>& UsersToProcess) {
-  if (StrideNoReuse.count(Stride))
-    return SE->getIntegerSCEV(0, Stride->getType());
-
   if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Stride)) {
     int64_t SInt = SC->getValue()->getSExtValue();
     for (unsigned NewStride = 0, e = IU->StrideOrder.size();
          NewStride != e; ++NewStride) {
       std::map<const SCEV *, IVsOfOneStride>::iterator SI =
                 IVsByStride.find(IU->StrideOrder[NewStride]);
-      if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first) ||
-          StrideNoReuse.count(SI->first))
+      if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first))
         continue;
       // The other stride has no uses, don't reuse it.
       std::map<const SCEV *, IVUsersOfOneStride *>::iterator UI =
@@ -1780,8 +1743,8 @@ LoopStrengthReduce::StrengthReduceIVUsersOfStride(const SCEV *const &Stride,
         RewriteExpr = SE->getAddExpr(RewriteExpr, SE->getUnknown(BaseV));
 
       User.RewriteInstructionToUseNewBase(RewriteExpr, NewBasePt,
-                                          Rewriter, L, this, *LI,
-                                          DeadInsts);
+                                          Rewriter, L, this,
+                                          DeadInsts, SE);
 
       // Mark old value we replaced as possibly dead, so that it is eliminated
       // if we just replaced the last use of that value.
@@ -2745,8 +2708,6 @@ bool LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) {
 
 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
   IU = &getAnalysis<IVUsers>();
-  LI = &getAnalysis<LoopInfo>();
-  DT = &getAnalysis<DominatorTree>();
   SE = &getAnalysis<ScalarEvolution>();
   Changed = false;
 
@@ -2792,15 +2753,14 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
     // After all sharing is done, see if we can adjust the loop to test against
     // zero instead of counting up to a maximum.  This is usually faster.
     OptimizeLoopCountIV(L);
-  }
 
-  // We're done analyzing this loop; release all the state we built up for it.
-  IVsByStride.clear();
-  StrideNoReuse.clear();
+    // We're done analyzing this loop; release all the state we built up for it.
+    IVsByStride.clear();
 
-  // Clean up after ourselves
-  if (!DeadInsts.empty())
-    DeleteTriviallyDeadInstructions();
+    // Clean up after ourselves
+    if (!DeadInsts.empty())
+      DeleteTriviallyDeadInstructions();
+  }
 
   // At this point, it is worth checking to see if any recurrence PHIs are also
   // dead, so that we can remove them as well.
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 38d267a..b7adfdc 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -404,12 +404,13 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val,
 bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val){
 
   initLoopData();
-  Function *F = loopHeader->getParent();
 
   // If LoopSimplify was unable to form a preheader, don't do any unswitching.
   if (!loopPreheader)
     return false;
 
+  Function *F = loopHeader->getParent();
+
   // If the condition is trivial, always unswitch.  There is no code growth for
   // this case.
   if (!IsTrivialUnswitchCondition(LoopCond)) {
diff --git a/lib/Transforms/Scalar/SCCVN.cpp b/lib/Transforms/Scalar/SCCVN.cpp
index 001267a..dbc82e1 100644
--- a/lib/Transforms/Scalar/SCCVN.cpp
+++ b/lib/Transforms/Scalar/SCCVN.cpp
@@ -19,7 +19,6 @@
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
-#include "llvm/LLVMContext.h"
 #include "llvm/Operator.h"
 #include "llvm/Value.h"
 #include "llvm/ADT/DenseMap.h"
@@ -155,8 +154,10 @@ template <> struct DenseMapInfo<Expression> {
   static bool isEqual(const Expression &LHS, const Expression &RHS) {
     return LHS == RHS;
   }
-  static bool isPod() { return true; }
 };
+template <>
+struct isPodLike<Expression> { static const bool value = true; };
+
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index ae6ad74..b040a27 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -105,7 +105,7 @@ namespace {
     void isSafeUseOfAllocation(Instruction *User, AllocaInst *AI,
                                AllocaInfo &Info);
     void isSafeElementUse(Value *Ptr, bool isFirstElt, AllocaInst *AI,
-                         AllocaInfo &Info);
+                          AllocaInfo &Info);
     void isSafeMemIntrinsicOnAllocation(MemIntrinsic *MI, AllocaInst *AI,
                                         unsigned OpNo, AllocaInfo &Info);
     void isSafeUseOfBitCastedAllocation(BitCastInst *User, AllocaInst *AI,
@@ -362,7 +362,6 @@ void SROA::DoScalarReplacement(AllocaInst *AI,
 
   // Now that we have created the alloca instructions that we want to use,
   // expand the getelementptr instructions to use them.
-  //
   while (!AI->use_empty()) {
     Instruction *User = cast<Instruction>(AI->use_back());
     if (BitCastInst *BCInst = dyn_cast<BitCastInst>(User)) {
@@ -450,11 +449,9 @@ void SROA::DoScalarReplacement(AllocaInst *AI,
   NumReplaced++;
 }
 
-
 /// isSafeElementUse - Check to see if this use is an allowed use for a
 /// getelementptr instruction of an array aggregate allocation.  isFirstElt
 /// indicates whether Ptr is known to the start of the aggregate.
-///
 void SROA::isSafeElementUse(Value *Ptr, bool isFirstElt, AllocaInst *AI,
                             AllocaInfo &Info) {
   for (Value::use_iterator I = Ptr->use_begin(), E = Ptr->use_end();
@@ -503,7 +500,6 @@ void SROA::isSafeElementUse(Value *Ptr, bool isFirstElt, AllocaInst *AI,
         }
       }
       
-      
       isSafeElementUse(GEP, AreAllZeroIndices, AI, Info);
       if (Info.isUnsafe) return;
       break;
@@ -543,9 +539,8 @@ static bool AllUsersAreLoads(Value *Ptr) {
   return true;
 }
 
-/// isSafeUseOfAllocation - Check to see if this user is an allowed use for an
+/// isSafeUseOfAllocation - Check if this user is an allowed use for an
 /// aggregate allocation.
-///
 void SROA::isSafeUseOfAllocation(Instruction *User, AllocaInst *AI,
                                  AllocaInfo &Info) {
   if (BitCastInst *C = dyn_cast<BitCastInst>(User))
@@ -614,7 +609,7 @@ void SROA::isSafeUseOfAllocation(Instruction *User, AllocaInst *AI,
       // integer. Specifically, consider A[0][i]. We cannot know that the user
       // isn't doing invalid things like allowing i to index an out-of-range
       // subscript that accesses A[1].  Because of this, we have to reject SROA
-      // of any accesses into structs where any of the components are variables. 
+      // of any accesses into structs where any of the components are variables.
       if (IdxVal->getZExtValue() >= AT->getNumElements())
         return MarkUnsafe(Info);
     } else if (const VectorType *VT = dyn_cast<VectorType>(*I)) {
@@ -628,7 +623,7 @@ void SROA::isSafeUseOfAllocation(Instruction *User, AllocaInst *AI,
   return isSafeElementUse(GEPI, IsAllZeroIndices, AI, Info);
 }
 
-/// isSafeMemIntrinsicOnAllocation - Return true if the specified memory
+/// isSafeMemIntrinsicOnAllocation - Check if the specified memory
 /// intrinsic can be promoted by SROA.  At this point, we know that the operand
 /// of the memintrinsic is a pointer to the beginning of the allocation.
 void SROA::isSafeMemIntrinsicOnAllocation(MemIntrinsic *MI, AllocaInst *AI,
@@ -656,8 +651,8 @@ void SROA::isSafeMemIntrinsicOnAllocation(MemIntrinsic *MI, AllocaInst *AI,
   }
 }
 
-/// isSafeUseOfBitCastedAllocation - Return true if all users of this bitcast
-/// are 
+/// isSafeUseOfBitCastedAllocation - Check if all users of this bitcast
+/// from an alloca are safe for SROA of that alloca.
 void SROA::isSafeUseOfBitCastedAllocation(BitCastInst *BC, AllocaInst *AI,
                                           AllocaInfo &Info) {
   for (Value::use_iterator UI = BC->use_begin(), E = BC->use_end();
@@ -773,6 +768,10 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst,
       OtherPtr = MTI->getRawDest();
     }
   }
+
+  // Keep track of the other intrinsic argument, so it can be removed if it
+  // is dead when the intrinsic is replaced.
+  Value *PossiblyDead = OtherPtr;
   
   // If there is an other pointer, we want to convert it to the same pointer
   // type as AI has, so we can GEP through it safely.
@@ -926,9 +925,11 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *BCInst,
     }
   }
   MI->eraseFromParent();
+  if (PossiblyDead)
+    RecursivelyDeleteTriviallyDeadInstructions(PossiblyDead);
 }
 
-/// RewriteStoreUserOfWholeAlloca - We found an store of an integer that
+/// RewriteStoreUserOfWholeAlloca - We found a store of an integer that
 /// overwrites the entire allocation.  Extract out the pieces of the stored
 /// integer and store them individually.
 void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
@@ -1052,7 +1053,7 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
   SI->eraseFromParent();
 }
 
-/// RewriteLoadUserOfWholeAlloca - We found an load of the entire allocation to
+/// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to
 /// an integer.  Load the individual pieces to form the aggregate value.
 void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
                                         SmallVector<AllocaInst*, 32> &NewElts) {
@@ -1186,7 +1187,6 @@ static bool HasPadding(const Type *Ty, const TargetData &TD) {
 /// isSafeStructAllocaToScalarRepl - Check to see if the specified allocation of
 /// an aggregate can be broken down into elements.  Return 0 if not, 3 if safe,
 /// or 1 if safe after canonicalization has been performed.
-///
 int SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
   // Loop over the use list of the alloca.  We can only transform it if all of
   // the users are safe to transform.
@@ -1215,7 +1215,7 @@ int SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
   return Info.needsCleanup ? 1 : 3;
 }
 
-/// CleanupGEP - GEP is used by an Alloca, which can be prompted after the GEP
+/// CleanupGEP - GEP is used by an Alloca, which can be promoted after the GEP
 /// is canonicalized here.
 void SROA::CleanupGEP(GetElementPtrInst *GEPI) {
   gep_type_iterator I = gep_type_begin(GEPI);
@@ -1347,7 +1347,7 @@ static void MergeInType(const Type *In, uint64_t Offset, const Type *&VecTy,
 }
 
 /// CanConvertToScalar - V is a pointer.  If we can convert the pointee and all
-/// its accesses to use a to single vector type, return true, and set VecTy to
+/// its accesses to a single vector type, return true and set VecTy to
 /// the new type.  If we could convert the alloca into a single promotable
 /// integer, return true but set VecTy to VoidTy.  Further, if the use is not a
 /// completely trivial use that mem2reg could promote, set IsNotTrivial.  Offset
@@ -1355,7 +1355,6 @@ static void MergeInType(const Type *In, uint64_t Offset, const Type *&VecTy,
 ///
 /// If we see at least one access to the value that is as a vector type, set the
 /// SawVec flag.
-///
 bool SROA::CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&VecTy,
                               bool &SawVec, uint64_t Offset,
                               unsigned AllocaSize) {
@@ -1438,7 +1437,6 @@ bool SROA::CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&VecTy,
   return true;
 }
 
-
 /// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca
 /// directly.  This happens when we are converting an "integer union" to a
 /// single integer scalar, or when we are converting a "vector union" to a
@@ -1481,7 +1479,8 @@ void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset) {
     if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
       assert(SI->getOperand(0) != Ptr && "Consistency error!");
       // FIXME: Remove once builder has Twine API.
-      Value *Old = Builder.CreateLoad(NewAI, (NewAI->getName()+".in").str().c_str());
+      Value *Old = Builder.CreateLoad(NewAI,
+                                      (NewAI->getName()+".in").str().c_str());
       Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
                                              Builder);
       Builder.CreateStore(New, NewAI);
@@ -1506,7 +1505,8 @@ void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset) {
             APVal |= APVal << 8;
         
         // FIXME: Remove once builder has Twine API.
-        Value *Old = Builder.CreateLoad(NewAI, (NewAI->getName()+".in").str().c_str());
+        Value *Old = Builder.CreateLoad(NewAI,
+                                        (NewAI->getName()+".in").str().c_str());
         Value *New = ConvertScalar_InsertValue(
                                     ConstantInt::get(User->getContext(), APVal),
                                                Old, Offset, Builder);
@@ -1679,7 +1679,6 @@ Value *SROA::ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
   return FromVal;
 }
 
-
 /// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer
 /// or vector value "Old" at the offset specified by Offset.
 ///
diff --git a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp
index 13077fe..5acd6aa 100644
--- a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp
@@ -88,7 +88,7 @@ InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs,
     if (!isa<ReturnInst>(Body->getTerminator()))
       break;
 
-    Instruction *NextInst = next(BasicBlock::iterator(Call));
+    Instruction *NextInst = llvm::next(BasicBlock::iterator(Call));
 
     // Inline the call, taking care of what code ends up where.
     NewBlock = SplitBlock(NextInst->getParent(), NextInst, this);
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index f9b929c..6fd884b 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -128,8 +128,7 @@ public:
 
 /// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
 Value *LibCallOptimization::CastToCStr(Value *V, IRBuilder<> &B) {
-  return
-        B.CreateBitCast(V, Type::getInt8PtrTy(*Context), "cstr");
+  return B.CreateBitCast(V, Type::getInt8PtrTy(*Context), "cstr");
 }
 
 /// EmitStrLen - Emit a call to the strlen function to the builder, for the
@@ -157,27 +156,25 @@ Value *LibCallOptimization::EmitStrLen(Value *Ptr, IRBuilder<> &B) {
 Value *LibCallOptimization::EmitMemCpy(Value *Dst, Value *Src, Value *Len,
                                        unsigned Align, IRBuilder<> &B) {
   Module *M = Caller->getParent();
-  Intrinsic::ID IID = Intrinsic::memcpy;
-  const Type *Tys[1];
-  Tys[0] = Len->getType();
-  Value *MemCpy = Intrinsic::getDeclaration(M, IID, Tys, 1);
-  return B.CreateCall4(MemCpy, CastToCStr(Dst, B), CastToCStr(Src, B), Len,
+  const Type *Ty = Len->getType();
+  Value *MemCpy = Intrinsic::getDeclaration(M, Intrinsic::memcpy, &Ty, 1);
+  Dst = CastToCStr(Dst, B);
+  Src = CastToCStr(Src, B);
+  return B.CreateCall4(MemCpy, Dst, Src, Len,
                        ConstantInt::get(Type::getInt32Ty(*Context), Align));
 }
 
-/// EmitMemMOve - Emit a call to the memmove function to the builder.  This
+/// EmitMemMove - Emit a call to the memmove function to the builder.  This
 /// always expects that the size has type 'intptr_t' and Dst/Src are pointers.
 Value *LibCallOptimization::EmitMemMove(Value *Dst, Value *Src, Value *Len,
 					unsigned Align, IRBuilder<> &B) {
   Module *M = Caller->getParent();
-  Intrinsic::ID IID = Intrinsic::memmove;
-  const Type *Tys[1];
-  Tys[0] = TD->getIntPtrType(*Context);
-  Value *MemMove = Intrinsic::getDeclaration(M, IID, Tys, 1);
-  Value *D = CastToCStr(Dst, B);
-  Value *S = CastToCStr(Src, B);
+  const Type *Ty = TD->getIntPtrType(*Context);
+  Value *MemMove = Intrinsic::getDeclaration(M, Intrinsic::memmove, &Ty, 1);
+  Dst = CastToCStr(Dst, B);
+  Src = CastToCStr(Src, B);
   Value *A = ConstantInt::get(Type::getInt32Ty(*Context), Align);
-  return B.CreateCall4(MemMove, D, S, Len, A);
+  return B.CreateCall4(MemMove, Dst, Src, Len, A);
 }
 
 /// EmitMemChr - Emit a call to the memchr function.  This assumes that Ptr is
@@ -2647,10 +2644,11 @@ bool SimplifyLibCalls::doInitialization(Module &M) {
 //   * strcspn("",a) -> 0
 //   * strcspn(s,"") -> strlen(a)
 //
-// strstr:
+// strstr: (PR5783)
 //   * strstr(x,x)  -> x
-//   * strstr(s1,s2) -> offset_of_s2_in(s1)
-//       (if s1 and s2 are constant strings)
+//   * strstr(x, "") -> x
+//   * strstr(x, "a") -> strchr(x, 'a')
+//   * strstr(s1,s2) -> result   (if s1 and s2 are constant strings)
 //
 // tan, tanf, tanl:
 //   * tan(atan(x)) -> x
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 2974592..2962e84 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -16,7 +16,6 @@
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
 #include "llvm/Constant.h"
 #include "llvm/Type.h"
 #include "llvm/Analysis/AliasAnalysis.h"
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index aef0f5f..7a37aa3 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -603,3 +603,65 @@ bool llvm::OnlyUsedByDbgInfoIntrinsics(Instruction *I,
   return true;
 }
 
+/// EliminateDuplicatePHINodes - Check for and eliminate duplicate PHI
+/// nodes in this block. This doesn't try to be clever about PHI nodes
+/// which differ only in the order of the incoming values, but instcombine
+/// orders them so it usually won't matter.
+///
+bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
+  bool Changed = false;
+
+  // This implementation doesn't currently consider undef operands
+  // specially. Theroetically, two phis which are identical except for
+  // one having an undef where the other doesn't could be collapsed.
+
+  // Map from PHI hash values to PHI nodes. If multiple PHIs have
+  // the same hash value, the element is the first PHI in the
+  // linked list in CollisionMap.
+  DenseMap<uintptr_t, PHINode *> HashMap;
+
+  // Maintain linked lists of PHI nodes with common hash values.
+  DenseMap<PHINode *, PHINode *> CollisionMap;
+
+  // Examine each PHI.
+  for (BasicBlock::iterator I = BB->begin();
+       PHINode *PN = dyn_cast<PHINode>(I++); ) {
+    // Compute a hash value on the operands. Instcombine will likely have sorted
+    // them, which helps expose duplicates, but we have to check all the
+    // operands to be safe in case instcombine hasn't run.
+    uintptr_t Hash = 0;
+    for (User::op_iterator I = PN->op_begin(), E = PN->op_end(); I != E; ++I) {
+      // This hash algorithm is quite weak as hash functions go, but it seems
+      // to do a good enough job for this particular purpose, and is very quick.
+      Hash ^= reinterpret_cast<uintptr_t>(static_cast<Value *>(*I));
+      Hash = (Hash << 7) | (Hash >> (sizeof(uintptr_t) * CHAR_BIT - 7));
+    }
+    // If we've never seen this hash value before, it's a unique PHI.
+    std::pair<DenseMap<uintptr_t, PHINode *>::iterator, bool> Pair =
+      HashMap.insert(std::make_pair(Hash, PN));
+    if (Pair.second) continue;
+    // Otherwise it's either a duplicate or a hash collision.
+    for (PHINode *OtherPN = Pair.first->second; ; ) {
+      if (OtherPN->isIdenticalTo(PN)) {
+        // A duplicate. Replace this PHI with its duplicate.
+        PN->replaceAllUsesWith(OtherPN);
+        PN->eraseFromParent();
+        Changed = true;
+        break;
+      }
+      // A non-duplicate hash collision.
+      DenseMap<PHINode *, PHINode *>::iterator I = CollisionMap.find(OtherPN);
+      if (I == CollisionMap.end()) {
+        // Set this PHI to be the head of the linked list of colliding PHIs.
+        PHINode *Old = Pair.first->second;
+        Pair.first->second = PN;
+        CollisionMap[PN] = Old;
+        break;
+      }
+      // Procede to the next PHI in the list.
+      OtherPN = I->second;
+    }
+  }
+
+  return Changed;
+}
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 8c18b59..743bb6e 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -244,7 +244,7 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
 
   // Merge case into clusters
   if (Cases.size()>=2)
-    for (CaseItr I=Cases.begin(), J=next(Cases.begin()); J!=Cases.end(); ) {
+    for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) {
       int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue();
       int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue();
       BasicBlock* nextBB = J->BB;
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index e25f9e2..846e432 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -55,7 +55,6 @@ struct DenseMapInfo<std::pair<BasicBlock*, unsigned> > {
   static bool isEqual(const EltTy &LHS, const EltTy &RHS) {
     return LHS == RHS;
   }
-  static bool isPod() { return true; }
 };
 }
 
@@ -102,7 +101,7 @@ namespace {
   public:
     typedef std::vector<Value *> ValVector;
     
-    RenamePassData() {}
+    RenamePassData() : BB(NULL), Pred(NULL), Values() {}
     RenamePassData(BasicBlock *B, BasicBlock *P,
                    const ValVector &V) : BB(B), Pred(P), Values(V) {}
     BasicBlock *BB;
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index 8a07c35..ba41bf9 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -295,10 +295,14 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
       InsertedVal = SingularValue;
     }
 
+    // Either path through the 'if' should have set insertedVal -> SingularVal.
+    assert((InsertedVal == SingularValue || isa<UndefValue>(InsertedVal)) &&
+           "RAUW didn't change InsertedVal to be SingularVal");
+
     // Drop the entries we added in IncomingPredInfo to restore the stack.
     IncomingPredInfo.erase(IncomingPredInfo.begin()+FirstPredInfoEntry,
                            IncomingPredInfo.end());
-    return InsertedVal;
+    return SingularValue;
   }
 
   // Otherwise, we do need a PHI: insert one now if we don't already have one.
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 89b0bd9..d7ca45e 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1589,69 +1589,6 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
   return true;
 }
 
-/// EliminateDuplicatePHINodes - Check for and eliminate duplicate PHI
-/// nodes in this block. This doesn't try to be clever about PHI nodes
-/// which differ only in the order of the incoming values, but instcombine
-/// orders them so it usually won't matter.
-///
-bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
-  bool Changed = false;
-  
-  // This implementation doesn't currently consider undef operands
-  // specially. Theroetically, two phis which are identical except for
-  // one having an undef where the other doesn't could be collapsed.
-
-  // Map from PHI hash values to PHI nodes. If multiple PHIs have
-  // the same hash value, the element is the first PHI in the
-  // linked list in CollisionMap.
-  DenseMap<uintptr_t, PHINode *> HashMap;
-
-  // Maintain linked lists of PHI nodes with common hash values.
-  DenseMap<PHINode *, PHINode *> CollisionMap;
-
-  // Examine each PHI.
-  for (BasicBlock::iterator I = BB->begin();
-       PHINode *PN = dyn_cast<PHINode>(I++); ) {
-    // Compute a hash value on the operands. Instcombine will likely have sorted
-    // them, which helps expose duplicates, but we have to check all the
-    // operands to be safe in case instcombine hasn't run.
-    uintptr_t Hash = 0;
-    for (User::op_iterator I = PN->op_begin(), E = PN->op_end(); I != E; ++I) {
-      // This hash algorithm is quite weak as hash functions go, but it seems
-      // to do a good enough job for this particular purpose, and is very quick.
-      Hash ^= reinterpret_cast<uintptr_t>(static_cast<Value *>(*I));
-      Hash = (Hash << 7) | (Hash >> (sizeof(uintptr_t) * CHAR_BIT - 7));
-    }
-    // If we've never seen this hash value before, it's a unique PHI.
-    std::pair<DenseMap<uintptr_t, PHINode *>::iterator, bool> Pair =
-      HashMap.insert(std::make_pair(Hash, PN));
-    if (Pair.second) continue;
-    // Otherwise it's either a duplicate or a hash collision.
-    for (PHINode *OtherPN = Pair.first->second; ; ) {
-      if (OtherPN->isIdenticalTo(PN)) {
-        // A duplicate. Replace this PHI with its duplicate.
-        PN->replaceAllUsesWith(OtherPN);
-        PN->eraseFromParent();
-        Changed = true;
-        break;
-      }
-      // A non-duplicate hash collision.
-      DenseMap<PHINode *, PHINode *>::iterator I = CollisionMap.find(OtherPN);
-      if (I == CollisionMap.end()) {
-        // Set this PHI to be the head of the linked list of colliding PHIs.
-        PHINode *Old = Pair.first->second;
-        Pair.first->second = PN;
-        CollisionMap[PN] = Old;
-        break;
-      }
-      // Procede to the next PHI in the list.
-      OtherPN = I->second;
-    }
-  }
-
-  return Changed;
-}
-
 /// SimplifyCFG - This function is used to do simplification of a CFG.  For
 /// example, it adjusts branches to branches to eliminate the extra hop, it
 /// eliminates unreachable basic blocks, and does other "peephole" optimization
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index 82d7914..c765d96 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -813,6 +813,11 @@ void SlotTracker::CreateFunctionSlot(const Value *V) {
 void SlotTracker::CreateMetadataSlot(const MDNode *N) {
   assert(N && "Can't insert a null Value into SlotTracker!");
 
+  // Don't insert if N contains an instruction.
+  for (unsigned i = 0, e = N->getNumElements(); i != e; ++i)
+    if (N->getElement(i) && isa<Instruction>(N->getElement(i)))
+      return;
+
   ValueMap::iterator I = mdnMap.find(N);
   if (I != mdnMap.end())
     return;
@@ -1227,6 +1232,25 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
   }
 
   if (const MDNode *N = dyn_cast<MDNode>(V)) {
+    if (Machine->getMetadataSlot(N) == -1) {
+      // Print metadata inline, not via slot reference number.
+      Out << "!{";
+      for (unsigned mi = 0, me = N->getNumElements(); mi != me; ++mi) {
+        const Value *Val = N->getElement(mi);
+        if (!Val)
+          Out << "null";
+        else {
+          TypePrinter->print(N->getElement(0)->getType(), Out);
+          Out << ' ';
+          WriteAsOperandInternal(Out, N->getElement(0), TypePrinter, Machine);
+        }
+        if (mi + 1 != me)
+          Out << ", ";
+      }
+      Out << '}';
+      return;
+    }
+  
     Out << '!' << Machine->getMetadataSlot(N);
     return;
   }
@@ -1636,6 +1660,7 @@ void AssemblyWriter::printFunction(const Function *F) {
   case CallingConv::ARM_APCS:     Out << "arm_apcscc "; break;
   case CallingConv::ARM_AAPCS:    Out << "arm_aapcscc "; break;
   case CallingConv::ARM_AAPCS_VFP:Out << "arm_aapcs_vfpcc "; break;
+  case CallingConv::MSP430_INTR:  Out << "msp430_intrcc "; break;
   default: Out << "cc" << F->getCallingConv() << " "; break;
   }
 
@@ -1903,6 +1928,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
     case CallingConv::ARM_APCS:     Out << " arm_apcscc "; break;
     case CallingConv::ARM_AAPCS:    Out << " arm_aapcscc "; break;
     case CallingConv::ARM_AAPCS_VFP:Out << " arm_aapcs_vfpcc "; break;
+    case CallingConv::MSP430_INTR:  Out << " msp430_intrcc "; break;
     default: Out << " cc" << CI->getCallingConv(); break;
     }
 
@@ -1953,6 +1979,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
     case CallingConv::ARM_APCS:     Out << " arm_apcscc "; break;
     case CallingConv::ARM_AAPCS:    Out << " arm_aapcscc "; break;
     case CallingConv::ARM_AAPCS_VFP:Out << " arm_aapcs_vfpcc "; break;
+    case CallingConv::MSP430_INTR:  Out << " msp430_intrcc "; break;
     default: Out << " cc" << II->getCallingConv(); break;
     }
 
diff --git a/lib/VMCore/BasicBlock.cpp b/lib/VMCore/BasicBlock.cpp
index 23d0557..c7f7f53 100644
--- a/lib/VMCore/BasicBlock.cpp
+++ b/lib/VMCore/BasicBlock.cpp
@@ -262,7 +262,7 @@ BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) {
   assert(I != InstList.end() &&
          "Trying to get me to create degenerate basic block!");
 
-  BasicBlock *InsertBefore = next(Function::iterator(this))
+  BasicBlock *InsertBefore = llvm::next(Function::iterator(this))
                                .getNodePtrUnchecked();
   BasicBlock *New = BasicBlock::Create(getContext(), BBName,
                                        getParent(), InsertBefore);
diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp
index c622558..a62f75b 100644
--- a/lib/VMCore/Constants.cpp
+++ b/lib/VMCore/Constants.cpp
@@ -1560,7 +1560,7 @@ Constant *ConstantExpr::getGetElementPtrTy(const Type *ReqTy, Constant *C,
 
 Constant *ConstantExpr::getInBoundsGetElementPtrTy(const Type *ReqTy,
                                                    Constant *C,
-                                                   Value* const *Idxs,
+                                                   Value *const *Idxs,
                                                    unsigned NumIdx) {
   assert(GetElementPtrInst::getIndexedType(C->getType(), Idxs,
                                            Idxs+NumIdx) ==
diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp
index 6cf2c81..88e1fe8 100644
--- a/lib/VMCore/Function.cpp
+++ b/lib/VMCore/Function.cpp
@@ -77,6 +77,13 @@ bool Argument::hasByValAttr() const {
   return getParent()->paramHasAttr(getArgNo()+1, Attribute::ByVal);
 }
 
+/// hasNestAttr - Return true if this argument has the nest attribute on
+/// it in its containing function.
+bool Argument::hasNestAttr() const {
+  if (!isa<PointerType>(getType())) return false;
+  return getParent()->paramHasAttr(getArgNo()+1, Attribute::Nest);
+}
+
 /// hasNoAliasAttr - Return true if this argument has the noalias attribute on
 /// it in its containing function.
 bool Argument::hasNoAliasAttr() const {
diff --git a/lib/VMCore/LLVMContextImpl.h b/lib/VMCore/LLVMContextImpl.h
index 1c3244b..8a2378e 100644
--- a/lib/VMCore/LLVMContextImpl.h
+++ b/lib/VMCore/LLVMContextImpl.h
@@ -62,7 +62,6 @@ struct DenseMapAPIntKeyInfo {
   static bool isEqual(const KeyTy &LHS, const KeyTy &RHS) {
     return LHS == RHS;
   }
-  static bool isPod() { return false; }
 };
 
 struct DenseMapAPFloatKeyInfo {
@@ -89,7 +88,6 @@ struct DenseMapAPFloatKeyInfo {
   static bool isEqual(const KeyTy &LHS, const KeyTy &RHS) {
     return LHS == RHS;
   }
-  static bool isPod() { return false; }
 };
 
 class LLVMContextImpl {
diff --git a/lib/VMCore/Pass.cpp b/lib/VMCore/Pass.cpp
index 1232fe2..6bea7a8 100644
--- a/lib/VMCore/Pass.cpp
+++ b/lib/VMCore/Pass.cpp
@@ -41,6 +41,10 @@ Pass::~Pass() {
 // Force out-of-line virtual method.
 ModulePass::~ModulePass() { }
 
+PassManagerType ModulePass::getPotentialPassManagerType() const {
+  return PMT_ModulePassManager;
+}
+
 bool Pass::mustPreserveAnalysisID(const PassInfo *AnalysisID) const {
   return Resolver->getAnalysisIfAvailable(AnalysisID, true) != 0;
 }
@@ -60,6 +64,27 @@ const char *Pass::getPassName() const {
   return "Unnamed pass: implement Pass::getPassName()";
 }
 
+void Pass::preparePassManager(PMStack &) {
+  // By default, don't do anything.
+}
+
+PassManagerType Pass::getPotentialPassManagerType() const {
+  // Default implementation.
+  return PMT_Unknown; 
+}
+
+void Pass::getAnalysisUsage(AnalysisUsage &) const {
+  // By default, no analysis results are used, all are invalidated.
+}
+
+void Pass::releaseMemory() {
+  // By default, don't do anything.
+}
+
+void Pass::verifyAnalysis() const {
+  // By default, don't do anything.
+}
+
 // print - Print out the internal state of the pass.  This is called by Analyze
 // to print out the contents of an analysis.  Otherwise it is not necessary to
 // implement this method.
@@ -79,6 +104,10 @@ void Pass::dump() const {
 // Force out-of-line virtual method.
 ImmutablePass::~ImmutablePass() { }
 
+void ImmutablePass::initializePass() {
+  // By default, don't do anything.
+}
+
 //===----------------------------------------------------------------------===//
 // FunctionPass Implementation
 //
@@ -107,6 +136,20 @@ bool FunctionPass::run(Function &F) {
   return Changed | doFinalization(*F.getParent());
 }
 
+bool FunctionPass::doInitialization(Module &) {
+  // By default, don't do anything.
+  return false;
+}
+
+bool FunctionPass::doFinalization(Module &) {
+  // By default, don't do anything.
+  return false;
+}
+
+PassManagerType FunctionPass::getPotentialPassManagerType() const {
+  return PMT_FunctionPassManager;
+}
+
 //===----------------------------------------------------------------------===//
 // BasicBlockPass Implementation
 //
@@ -121,6 +164,30 @@ bool BasicBlockPass::runOnFunction(Function &F) {
   return Changed | doFinalization(F);
 }
 
+bool BasicBlockPass::doInitialization(Module &) {
+  // By default, don't do anything.
+  return false;
+}
+
+bool BasicBlockPass::doInitialization(Function &) {
+  // By default, don't do anything.
+  return false;
+}
+
+bool BasicBlockPass::doFinalization(Function &) {
+  // By default, don't do anything.
+  return false;
+}
+
+bool BasicBlockPass::doFinalization(Module &) {
+  // By default, don't do anything.
+  return false;
+}
+
+PassManagerType BasicBlockPass::getPotentialPassManagerType() const {
+  return PMT_BasicBlockPassManager; 
+}
+
 //===----------------------------------------------------------------------===//
 // Pass Registration mechanism
 //
diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp
index ae418a0..52e8a82 100644
--- a/lib/VMCore/PassManager.cpp
+++ b/lib/VMCore/PassManager.cpp
@@ -738,9 +738,15 @@ void PMDataManager::removeNotPreservedAnalysis(Pass *P) {
       std::map<AnalysisID, Pass *>::iterator Info = I++;
       if (!dynamic_cast<ImmutablePass*>(Info->second) &&
           std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) == 
-             PreservedSet.end())
+             PreservedSet.end()) {
         // Remove this analysis
+        if (PassDebugging >= Details) {
+          Pass *S = Info->second;
+          errs() << " -- '" <<  P->getPassName() << "' is not preserving '";
+          errs() << S->getPassName() << "'\n";
+        }
         InheritedAnalysis[Index]->erase(Info);
+      }
     }
   }
 }
@@ -1391,8 +1397,7 @@ MPPassManager::runOnModule(Module &M) {
   for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
     ModulePass *MP = getContainedPass(Index);
 
-    dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG,
-                 M.getModuleIdentifier().c_str());
+    dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG, M.getModuleIdentifier());
     dumpRequiredSet(MP);
 
     initializeAnalysisImpl(MP);
@@ -1406,13 +1411,13 @@ MPPassManager::runOnModule(Module &M) {
 
     if (Changed) 
       dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG,
-                   M.getModuleIdentifier().c_str());
+                   M.getModuleIdentifier());
     dumpPreservedSet(MP);
     
     verifyPreservedAnalysis(MP);
     removeNotPreservedAnalysis(MP);
     recordAvailableAnalysis(MP);
-    removeDeadPasses(MP, M.getModuleIdentifier().c_str(), ON_MODULE_MSG);
+    removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG);
   }
 
   // Finalize on-the-fly passes