55 files changed, 2604 insertions, 1019 deletions
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 8e3f8e7..bb61682 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -38,16 +38,19 @@ DebugMod("agg-antidep-debugmod",
                       cl::desc("Debug control for aggressive anti-dep breaker"),
                       cl::init(0), cl::Hidden);
 
-AggressiveAntiDepState::AggressiveAntiDepState(MachineBasicBlock *BB) :
-  GroupNodes(TargetRegisterInfo::FirstVirtualRegister, 0) {
-  // Initialize all registers to be in their own group. Initially we
-  // assign the register to the same-indexed GroupNode.
-  for (unsigned i = 0; i < TargetRegisterInfo::FirstVirtualRegister; ++i)
+AggressiveAntiDepState::AggressiveAntiDepState(const unsigned TargetRegs,
+                                               MachineBasicBlock *BB) :
+  NumTargetRegs(TargetRegs), GroupNodes(TargetRegs, 0) {
+
+  const unsigned BBSize = BB->size();
+  for (unsigned i = 0; i < NumTargetRegs; ++i) {
+    // Initialize all registers to be in their own group. Initially we
+    // assign the register to the same-indexed GroupNode.
     GroupNodeIndices[i] = i;
-
-  // Initialize the indices to indicate that no registers are live.
-  std::fill(KillIndices, array_endof(KillIndices), ~0u);
-  std::fill(DefIndices, array_endof(DefIndices), BB->size());
+    // Initialize the indices to indicate that no registers are live.
+    KillIndices[i] = ~0u;
+    DefIndices[i] = BBSize;
+  }
 }
 
 unsigned AggressiveAntiDepState::GetGroup(unsigned Reg)
@@ -64,7 +67,7 @@ void AggressiveAntiDepState::GetGroupRegs(
   std::vector<unsigned> &Regs,
   std::multimap<unsigned, AggressiveAntiDepState::RegisterReference> *RegRefs)
 {
-  for (unsigned Reg = 0; Reg != TargetRegisterInfo::FirstVirtualRegister; ++Reg) {
+  for (unsigned Reg = 0; Reg != NumTargetRegs; ++Reg) {
     if ((GetGroup(Reg) == Group) && (RegRefs->count(Reg) > 0))
       Regs.push_back(Reg);
   }
@@ -137,7 +140,7 @@ AggressiveAntiDepBreaker::~AggressiveAntiDepBreaker() {
 
 void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
   assert(State == NULL);
-  State = new AggressiveAntiDepState(BB);
+  State = new AggressiveAntiDepState(TRI->getNumRegs(), BB);
 
   bool IsReturnBlock = (!BB->empty() && BB->back().getDesc().isReturn());
   unsigned *KillIndices = State->GetKillIndices();
@@ -220,7 +223,7 @@ void AggressiveAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
   DEBUG(errs() << "\tRegs:");
 
   unsigned *DefIndices = State->GetDefIndices();
-  for (unsigned Reg = 0; Reg != TargetRegisterInfo::FirstVirtualRegister; ++Reg) {
+  for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg) {
     // If Reg is current live, then mark that it can't be renamed as
     // we don't know the extent of its live-range anymore (now that it
     // has been scheduled). If it is not live but was defined in the
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h
index 8154d2d..d385a21 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.h
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -44,6 +44,10 @@ namespace llvm {
     } RegisterReference;
 
   private:
+    /// NumTargetRegs - Number of non-virtual target registers
+    /// (i.e. TRI->getNumRegs()).
+    const unsigned NumTargetRegs;
+
     /// GroupNodes - Implements a disjoint-union data structure to
     /// form register groups. A node is represented by an index into
     /// the vector. A node can "point to" itself to indicate that it
@@ -69,7 +73,7 @@ namespace llvm {
     unsigned DefIndices[TargetRegisterInfo::FirstVirtualRegister];
 
   public:
-    AggressiveAntiDepState(MachineBasicBlock *BB);
+    AggressiveAntiDepState(const unsigned TargetRegs, MachineBasicBlock *BB);
     
     /// GetKillIndices - Return the kill indices.
     unsigned *GetKillIndices() { return KillIndices; }
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 993cdbf..44fd176 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1374,6 +1374,7 @@ void AsmPrinter::processDebugLoc(const MachineInstr *MI,
       unsigned L = DW->RecordSourceLine(CurDLT.Line, CurDLT.Col,
                                         CurDLT.Scope);
       printLabel(L);
+      O << '\n';
       DW->BeginScope(MI, L);
       PrevDLT = CurDLT;
     }
@@ -1837,15 +1838,16 @@ void AsmPrinter::EmitComments(const MachineInstr &MI) const {
 
     // Print source line info.
     O.PadToColumn(MAI->getCommentColumn());
-    O << MAI->getCommentString() << " SrcLine ";
-    if (DLT.Scope) {
-      DICompileUnit CU(DLT.Scope);
-      if (!CU.isNull())
-        O << CU.getFilename() << " ";
-    }
-    O << DLT.Line;
+    O << MAI->getCommentString() << ' ';
+    DIScope Scope(DLT.Scope);
+    // Omit the directory, because it's likely to be long and uninteresting.
+    if (!Scope.isNull())
+      O << Scope.getFilename();
+    else
+      O << "<unknown>";
+    O << ':' << DLT.Line;
     if (DLT.Col != 0)
-      O << ":" << DLT.Col;
+      O << ':' << DLT.Col;
     Newline = true;
   }
 
@@ -1857,35 +1859,40 @@ void AsmPrinter::EmitComments(const MachineInstr &MI) const {
 
   // We assume a single instruction only has a spill or reload, not
   // both.
+  const MachineMemOperand *MMO;
   if (TM.getInstrInfo()->isLoadFromStackSlotPostFE(&MI, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
+      MMO = *MI.memoperands_begin();
       if (Newline) O << '\n';
       O.PadToColumn(MAI->getCommentColumn());
-      O << MAI->getCommentString() << " Reload";
+      O << MAI->getCommentString() << ' ' << MMO->getSize() << "-byte Reload";
       Newline = true;
     }
   }
-  else if (TM.getInstrInfo()->hasLoadFromStackSlot(&MI, FI)) {
+  else if (TM.getInstrInfo()->hasLoadFromStackSlot(&MI, MMO, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
       if (Newline) O << '\n';
       O.PadToColumn(MAI->getCommentColumn());
-      O << MAI->getCommentString() << " Folded Reload";
+      O << MAI->getCommentString() << ' '
+        << MMO->getSize() << "-byte Folded Reload";
       Newline = true;
     }
   }
   else if (TM.getInstrInfo()->isStoreToStackSlotPostFE(&MI, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
+      MMO = *MI.memoperands_begin();
       if (Newline) O << '\n';
       O.PadToColumn(MAI->getCommentColumn());
-      O << MAI->getCommentString() << " Spill";
+      O << MAI->getCommentString() << ' ' << MMO->getSize() << "-byte Spill";
       Newline = true;
     }
   }
-  else if (TM.getInstrInfo()->hasStoreToStackSlot(&MI, FI)) {
+  else if (TM.getInstrInfo()->hasStoreToStackSlot(&MI, MMO, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
       if (Newline) O << '\n';
       O.PadToColumn(MAI->getCommentColumn());
-      O << MAI->getCommentString() << " Folded Spill";
+      O << MAI->getCommentString() << ' '
+        << MMO->getSize() << "-byte Folded Spill";
       Newline = true;
     }
   }
diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h
index dc6a70a..cad8b89 100644
--- a/lib/CodeGen/AsmPrinter/DIE.h
+++ b/lib/CodeGen/AsmPrinter/DIE.h
@@ -274,7 +274,7 @@ namespace llvm {
   };
 
   //===--------------------------------------------------------------------===//
-  /// DIEString - A string value DIE.
+  /// DIEString - A string value DIE. This DIE keeps string reference only.
   ///
   class DIEString : public DIEValue {
     const StringRef Str;
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index c2e1e05..c200a46 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -330,8 +330,8 @@ void DwarfDebug::addSInt(DIE *Die, unsigned Attribute,
   Die->addValue(Attribute, Form, Value);
 }
 
-/// addString - Add a string attribute data and value.
-///
+/// addString - Add a string attribute data and value. DIEString only
+/// keeps string reference. 
 void DwarfDebug::addString(DIE *Die, unsigned Attribute, unsigned Form,
                            const StringRef String) {
   DIEValue *Value = new DIEString(String);
@@ -393,7 +393,7 @@ void DwarfDebug::addSourceLine(DIE *Die, const DIVariable *V) {
     return;
 
   unsigned Line = V->getLineNumber();
-  unsigned FileID = findCompileUnit(V->getCompileUnit()).getID();
+  unsigned FileID = findCompileUnit(V->getCompileUnit())->getID();
   assert(FileID && "Invalid file id");
   addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
   addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
@@ -407,7 +407,7 @@ void DwarfDebug::addSourceLine(DIE *Die, const DIGlobal *G) {
     return;
 
   unsigned Line = G->getLineNumber();
-  unsigned FileID = findCompileUnit(G->getCompileUnit()).getID();
+  unsigned FileID = findCompileUnit(G->getCompileUnit())->getID();
   assert(FileID && "Invalid file id");
   addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
   addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
@@ -425,7 +425,7 @@ void DwarfDebug::addSourceLine(DIE *Die, const DISubprogram *SP) {
 
 
   unsigned Line = SP->getLineNumber();
-  unsigned FileID = findCompileUnit(SP->getCompileUnit()).getID();
+  unsigned FileID = findCompileUnit(SP->getCompileUnit())->getID();
   assert(FileID && "Invalid file id");
   addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
   addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
@@ -440,7 +440,7 @@ void DwarfDebug::addSourceLine(DIE *Die, const DIType *Ty) {
     return;
 
   unsigned Line = Ty->getLineNumber();
-  unsigned FileID = findCompileUnit(CU).getID();
+  unsigned FileID = findCompileUnit(CU)->getID();
   assert(FileID && "Invalid file id");
   addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
   addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
@@ -738,13 +738,49 @@ void DwarfDebug::addAddress(DIE *Die, unsigned Attribute,
   addBlock(Die, Attribute, 0, Block);
 }
 
+/// addToContextOwner - Add Die into the list of its context owner's children.
+void DwarfDebug::addToContextOwner(DIE *Die, DIDescriptor Context) {
+  if (Context.isNull())
+    ModuleCU->addDie(Die);
+  else if (Context.isType()) {
+    DIE *ContextDIE = getOrCreateTypeDIE(DIType(Context.getNode()));
+    ContextDIE->addChild(Die);
+  } else if (DIE *ContextDIE = ModuleCU->getDIE(Context.getNode()))
+    ContextDIE->addChild(Die);
+  else 
+    ModuleCU->addDie(Die);
+}
+
+/// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
+/// given DIType.
+DIE *DwarfDebug::getOrCreateTypeDIE(DIType Ty) {
+  DIE *TyDIE = ModuleCU->getDIE(Ty.getNode());
+  if (TyDIE)
+    return TyDIE;
+
+  // Create new type.
+  TyDIE = new DIE(dwarf::DW_TAG_base_type);
+  ModuleCU->insertDIE(Ty.getNode(), TyDIE);
+  if (Ty.isBasicType())
+    constructTypeDIE(*TyDIE, DIBasicType(Ty.getNode()));
+  else if (Ty.isCompositeType())
+    constructTypeDIE(*TyDIE, DICompositeType(Ty.getNode()));
+  else {
+    assert(Ty.isDerivedType() && "Unknown kind of DIType");
+    constructTypeDIE(*TyDIE, DIDerivedType(Ty.getNode()));
+  }
+
+  addToContextOwner(TyDIE, Ty.getContext());
+  return TyDIE;
+}
+
 /// addType - Add a new type attribute to the specified entity.
-void DwarfDebug::addType(CompileUnit *DW_Unit, DIE *Entity, DIType Ty) {
+void DwarfDebug::addType(DIE *Entity, DIType Ty) {
   if (Ty.isNull())
     return;
 
   // Check for pre-existence.
-  DIEEntry *Entry = DW_Unit->getDIEEntry(Ty.getNode());
+  DIEEntry *Entry = ModuleCU->getDIEEntry(Ty.getNode());
 
   // If it exists then use the existing value.
   if (Entry) {
@@ -754,36 +790,17 @@ void DwarfDebug::addType(CompileUnit *DW_Unit, DIE *Entity, DIType Ty) {
 
   // Set up proxy.
   Entry = createDIEEntry();
-  DW_Unit->insertDIEEntry(Ty.getNode(), Entry);
+  ModuleCU->insertDIEEntry(Ty.getNode(), Entry);
 
   // Construct type.
-  DIE *Buffer = new DIE(dwarf::DW_TAG_base_type);
-  if (Ty.isBasicType())
-    constructTypeDIE(DW_Unit, *Buffer, DIBasicType(Ty.getNode()));
-  else if (Ty.isCompositeType())
-    constructTypeDIE(DW_Unit, *Buffer, DICompositeType(Ty.getNode()));
-  else {
-    assert(Ty.isDerivedType() && "Unknown kind of DIType");
-    constructTypeDIE(DW_Unit, *Buffer, DIDerivedType(Ty.getNode()));
-  }
+  DIE *Buffer = getOrCreateTypeDIE(Ty);
 
-  // Add debug information entry to entity and appropriate context.
-  DIE *Die = NULL;
-  DIDescriptor Context = Ty.getContext();
-  if (!Context.isNull())
-    Die = DW_Unit->getDIE(Context.getNode());
-
-  if (Die)
-    Die->addChild(Buffer);
-  else
-    DW_Unit->addDie(Buffer);
   Entry->setEntry(Buffer);
   Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry);
 }
 
 /// constructTypeDIE - Construct basic type die from DIBasicType.
-void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
-                                  DIBasicType BTy) {
+void DwarfDebug::constructTypeDIE(DIE &Buffer, DIBasicType BTy) {
   // Get core information.
   StringRef Name = BTy.getName();
   Buffer.setTag(dwarf::DW_TAG_base_type);
@@ -798,8 +815,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
 }
 
 /// constructTypeDIE - Construct derived type die from DIDerivedType.
-void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
-                                  DIDerivedType DTy) {
+void DwarfDebug::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
   // Get core information.
   StringRef Name = DTy.getName();
   uint64_t Size = DTy.getSizeInBits() >> 3;
@@ -812,7 +828,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
 
   // Map to main type, void will not have a type.
   DIType FromTy = DTy.getTypeDerivedFrom();
-  addType(DW_Unit, &Buffer, FromTy);
+  addType(&Buffer, FromTy);
 
   // Add name if not anonymous or intermediate type.
   if (!Name.empty())
@@ -828,8 +844,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
 }
 
 /// constructTypeDIE - Construct type DIE from DICompositeType.
-void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
-                                  DICompositeType CTy) {
+void DwarfDebug::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
   // Get core information.
   StringRef Name = CTy.getName();
 
@@ -840,7 +855,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
   switch (Tag) {
   case dwarf::DW_TAG_vector_type:
   case dwarf::DW_TAG_array_type:
-    constructArrayTypeDIE(DW_Unit, Buffer, &CTy);
+    constructArrayTypeDIE(Buffer, &CTy);
     break;
   case dwarf::DW_TAG_enumeration_type: {
     DIArray Elements = CTy.getTypeArray();
@@ -850,7 +865,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
       DIE *ElemDie = NULL;
       DIEnumerator Enum(Elements.getElement(i).getNode());
       if (!Enum.isNull()) {
-        ElemDie = constructEnumTypeDIE(DW_Unit, &Enum);
+        ElemDie = constructEnumTypeDIE(&Enum);
         Buffer.addChild(ElemDie);
       }
     }
@@ -860,7 +875,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
     // Add return type.
     DIArray Elements = CTy.getTypeArray();
     DIDescriptor RTy = Elements.getElement(0);
-    addType(DW_Unit, &Buffer, DIType(RTy.getNode()));
+    addType(&Buffer, DIType(RTy.getNode()));
 
     // Add prototype flag.
     addUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
@@ -869,7 +884,7 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
     for (unsigned i = 1, N = Elements.getNumElements(); i < N; ++i) {
       DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
       DIDescriptor Ty = Elements.getElement(i);
-      addType(DW_Unit, Arg, DIType(Ty.getNode()));
+      addType(Arg, DIType(Ty.getNode()));
       Buffer.addChild(Arg);
     }
   }
@@ -891,11 +906,9 @@ void DwarfDebug::constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
         continue;
       DIE *ElemDie = NULL;
       if (Element.getTag() == dwarf::DW_TAG_subprogram)
-        ElemDie = createSubprogramDIE(DW_Unit,
-                                      DISubprogram(Element.getNode()));
+        ElemDie = createSubprogramDIE(DISubprogram(Element.getNode()));
       else
-        ElemDie = createMemberDIE(DW_Unit,
-                                  DIDerivedType(Element.getNode()));
+        ElemDie = createMemberDIE(DIDerivedType(Element.getNode()));
       Buffer.addChild(ElemDie);
     }
 
@@ -944,33 +957,32 @@ void DwarfDebug::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){
   addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy);
   if (L)
     addSInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, L);
-  if (H)
-    addSInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, H);
+  addSInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, H);
 
   Buffer.addChild(DW_Subrange);
 }
 
 /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-void DwarfDebug::constructArrayTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+void DwarfDebug::constructArrayTypeDIE(DIE &Buffer,
                                        DICompositeType *CTy) {
   Buffer.setTag(dwarf::DW_TAG_array_type);
   if (CTy->getTag() == dwarf::DW_TAG_vector_type)
     addUInt(&Buffer, dwarf::DW_AT_GNU_vector, dwarf::DW_FORM_flag, 1);
 
   // Emit derived type.
-  addType(DW_Unit, &Buffer, CTy->getTypeDerivedFrom());
+  addType(&Buffer, CTy->getTypeDerivedFrom());
   DIArray Elements = CTy->getTypeArray();
 
   // Get an anonymous type for index type.
-  DIE *IdxTy = DW_Unit->getIndexTyDie();
+  DIE *IdxTy = ModuleCU->getIndexTyDie();
   if (!IdxTy) {
     // Construct an anonymous type for index type.
     IdxTy = new DIE(dwarf::DW_TAG_base_type);
     addUInt(IdxTy, dwarf::DW_AT_byte_size, 0, sizeof(int32_t));
     addUInt(IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
             dwarf::DW_ATE_signed);
-    DW_Unit->addDie(IdxTy);
-    DW_Unit->setIndexTyDie(IdxTy);
+    ModuleCU->addDie(IdxTy);
+    ModuleCU->setIndexTyDie(IdxTy);
   }
 
   // Add subranges to array type.
@@ -982,7 +994,7 @@ void DwarfDebug::constructArrayTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
 }
 
 /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-DIE *DwarfDebug::constructEnumTypeDIE(CompileUnit *DW_Unit, DIEnumerator *ETy) {
+DIE *DwarfDebug::constructEnumTypeDIE(DIEnumerator *ETy) {
   DIE *Enumerator = new DIE(dwarf::DW_TAG_enumerator);
   StringRef Name = ETy->getName();
   addString(Enumerator, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
@@ -992,8 +1004,7 @@ DIE *DwarfDebug::constructEnumTypeDIE(CompileUnit *DW_Unit, DIEnumerator *ETy) {
 }
 
 /// createGlobalVariableDIE - Create new DIE using GV.
-DIE *DwarfDebug::createGlobalVariableDIE(CompileUnit *DW_Unit,
-                                         const DIGlobalVariable &GV) {
+DIE *DwarfDebug::createGlobalVariableDIE(const DIGlobalVariable &GV) {
   // If the global variable was optmized out then no need to create debug info
   // entry.
   if (!GV.getGlobal()) return NULL;
@@ -1014,7 +1025,7 @@ DIE *DwarfDebug::createGlobalVariableDIE(CompileUnit *DW_Unit,
     addString(GVDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
               LinkageName);
   }
-  addType(DW_Unit, GVDie, GV.getType());
+  addType(GVDie, GV.getType());
   if (!GV.isLocalToUnit())
     addUInt(GVDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
   addSourceLine(GVDie, &GV);
@@ -1030,13 +1041,13 @@ DIE *DwarfDebug::createGlobalVariableDIE(CompileUnit *DW_Unit,
 }
 
 /// createMemberDIE - Create new member DIE.
-DIE *DwarfDebug::createMemberDIE(CompileUnit *DW_Unit, const DIDerivedType &DT){
+DIE *DwarfDebug::createMemberDIE(const DIDerivedType &DT) {
   DIE *MemberDie = new DIE(DT.getTag());
   StringRef Name = DT.getName();
   if (!Name.empty())
     addString(MemberDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
   
-  addType(DW_Unit, MemberDie, DT.getTypeDerivedFrom());
+  addType(MemberDie, DT.getTypeDerivedFrom());
 
   addSourceLine(MemberDie, &DT);
 
@@ -1073,21 +1084,27 @@ DIE *DwarfDebug::createMemberDIE(CompileUnit *DW_Unit, const DIDerivedType &DT){
   addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, MemLocationDie);
 
   if (DT.isProtected())
-    addUInt(MemberDie, dwarf::DW_AT_accessibility, 0,
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
             dwarf::DW_ACCESS_protected);
   else if (DT.isPrivate())
-    addUInt(MemberDie, dwarf::DW_AT_accessibility, 0,
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
             dwarf::DW_ACCESS_private);
-
+  else if (DT.getTag() == dwarf::DW_TAG_inheritance)
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
+            dwarf::DW_ACCESS_public);
+  if (DT.isVirtual())
+    addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag,
+            dwarf::DW_VIRTUALITY_virtual);
   return MemberDie;
 }
 
 /// createSubprogramDIE - Create new DIE using SP.
-DIE *DwarfDebug::createSubprogramDIE(CompileUnit *DW_Unit,
-                                     const DISubprogram &SP,
-                                     bool IsConstructor,
-                                     bool IsInlined) {
-  DIE *SPDie = new DIE(dwarf::DW_TAG_subprogram);
+DIE *DwarfDebug::createSubprogramDIE(const DISubprogram &SP, bool MakeDecl) {
+  DIE *SPDie = ModuleCU->getDIE(SP.getNode());
+  if (SPDie)
+    return SPDie;
+
+  SPDie = new DIE(dwarf::DW_TAG_subprogram);
   addString(SPDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, SP.getName());
 
   StringRef LinkageName = SP.getLinkageName();
@@ -1103,9 +1120,6 @@ DIE *DwarfDebug::createSubprogramDIE(CompileUnit *DW_Unit,
   }
   addSourceLine(SPDie, &SP);
 
-  DICompositeType SPTy = SP.getType();
-  DIArray Args = SPTy.getTypeArray();
-
   // Add prototyped tag, if C or ObjC.
   unsigned Lang = SP.getCompileUnit().getLanguage();
   if (Lang == dwarf::DW_LANG_C99 || Lang == dwarf::DW_LANG_C89 ||
@@ -1113,98 +1127,56 @@ DIE *DwarfDebug::createSubprogramDIE(CompileUnit *DW_Unit,
     addUInt(SPDie, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
 
   // Add Return Type.
+  DICompositeType SPTy = SP.getType();
+  DIArray Args = SPTy.getTypeArray();
   unsigned SPTag = SPTy.getTag();
-  if (!IsConstructor) {
-    if (Args.isNull() || SPTag != dwarf::DW_TAG_subroutine_type)
-      addType(DW_Unit, SPDie, SPTy);
-    else
-      addType(DW_Unit, SPDie, DIType(Args.getElement(0).getNode()));
+
+  if (Args.isNull() || SPTag != dwarf::DW_TAG_subroutine_type)
+    addType(SPDie, SPTy);
+  else
+    addType(SPDie, DIType(Args.getElement(0).getNode()));
+
+  unsigned VK = SP.getVirtuality();
+  if (VK) {
+    addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag, VK);
+    DIEBlock *Block = new DIEBlock();
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addUInt(Block, 0, dwarf::DW_FORM_data1, SP.getVirtualIndex());
+    addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block);
+    ContainingTypeMap.insert(std::make_pair(SPDie, WeakVH(SP.getContainingType().getNode())));
   }
 
-  if (!SP.isDefinition()) {
+  if (MakeDecl || !SP.isDefinition()) {
     addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
 
     // Add arguments. Do not add arguments for subprogram definition. They will
-    // be handled through RecordVariable.
+    // be handled while processing variables.
+    DICompositeType SPTy = SP.getType();
+    DIArray Args = SPTy.getTypeArray();
+    unsigned SPTag = SPTy.getTag();
+
     if (SPTag == dwarf::DW_TAG_subroutine_type)
       for (unsigned i = 1, N =  Args.getNumElements(); i < N; ++i) {
         DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
-        addType(DW_Unit, Arg, DIType(Args.getElement(i).getNode()));
+        addType(Arg, DIType(Args.getElement(i).getNode()));
         addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); // ??
         SPDie->addChild(Arg);
       }
   }
 
   // DW_TAG_inlined_subroutine may refer to this DIE.
-  DW_Unit->insertDIE(SP.getNode(), SPDie);
+  ModuleCU->insertDIE(SP.getNode(), SPDie);
   return SPDie;
 }
 
 /// findCompileUnit - Get the compile unit for the given descriptor.
 ///
-CompileUnit &DwarfDebug::findCompileUnit(DICompileUnit Unit) const {
+CompileUnit *DwarfDebug::findCompileUnit(DICompileUnit Unit) {
   DenseMap<Value *, CompileUnit *>::const_iterator I =
     CompileUnitMap.find(Unit.getNode());
-  assert(I != CompileUnitMap.end() && "Missing compile unit.");
-  return *I->second;
-}
-
-/// createDbgScopeVariable - Create a new scope variable.
-///
-DIE *DwarfDebug::createDbgScopeVariable(DbgVariable *DV, CompileUnit *Unit) {
-  // Get the descriptor.
-  const DIVariable &VD = DV->getVariable();
-  StringRef Name = VD.getName();
-  if (Name.empty())
-    return NULL;
-
-  // Translate tag to proper Dwarf tag.  The result variable is dropped for
-  // now.
-  unsigned Tag;
-  switch (VD.getTag()) {
-  case dwarf::DW_TAG_return_variable:
-    return NULL;
-  case dwarf::DW_TAG_arg_variable:
-    Tag = dwarf::DW_TAG_formal_parameter;
-    break;
-  case dwarf::DW_TAG_auto_variable:    // fall thru
-  default:
-    Tag = dwarf::DW_TAG_variable;
-    break;
-  }
-
-  // Define variable debug information entry.
-  DIE *VariableDie = new DIE(Tag);
-  addString(VariableDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
-
-  // Add source line info if available.
-  addSourceLine(VariableDie, &VD);
-
-  // Add variable type.
-  // FIXME: isBlockByrefVariable should be reformulated in terms of complex
-  // addresses instead.
-  if (VD.isBlockByrefVariable())
-    addType(Unit, VariableDie, getBlockByrefType(VD.getType(), Name));
-  else
-    addType(Unit, VariableDie, VD.getType());
-
-  // Add variable address.
-  // Variables for abstract instances of inlined functions don't get a
-  // location.
-  MachineLocation Location;
-  unsigned FrameReg;
-  int Offset = RI->getFrameIndexReference(*MF, DV->getFrameIndex(), FrameReg);
-  Location.set(FrameReg, Offset);
-
-
-  if (VD.hasComplexAddress())
-    addComplexAddress(DV, VariableDie, dwarf::DW_AT_location, Location);
-  else if (VD.isBlockByrefVariable())
-    addBlockByrefAddress(DV, VariableDie, dwarf::DW_AT_location, Location);
-  else
-    addAddress(VariableDie, dwarf::DW_AT_location, Location);
-
-  return VariableDie;
+  if (I == CompileUnitMap.end())
+    return constructCompileUnit(Unit.getNode());
+  return I->second;
 }
 
 /// getUpdatedDbgScope - Find or create DbgScope assicated with the instruction.
@@ -1295,6 +1267,28 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(MDNode *SPNode) {
 
  DIE *SPDie = ModuleCU->getDIE(SPNode);
  assert (SPDie && "Unable to find subprogram DIE!");
+ DISubprogram SP(SPNode);
+ if (SP.isDefinition() && !SP.getContext().isCompileUnit()) {
+   addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+  // Add arguments. 
+   DICompositeType SPTy = SP.getType();
+   DIArray Args = SPTy.getTypeArray();
+   unsigned SPTag = SPTy.getTag();
+   if (SPTag == dwarf::DW_TAG_subroutine_type)
+     for (unsigned i = 1, N =  Args.getNumElements(); i < N; ++i) {
+       DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+       addType(Arg, DIType(Args.getElement(i).getNode()));
+       addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1); // ??
+       SPDie->addChild(Arg);
+     }
+   DIE *SPDeclDie = SPDie;
+   SPDie = new DIE(dwarf::DW_TAG_subprogram);
+   addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4, 
+               SPDeclDie);
+   
+   ModuleCU->addDie(SPDie);
+ }
+   
  addLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
           DWLabel("func_begin", SubprogramCount));
  addLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
@@ -1305,19 +1299,6 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(MDNode *SPNode) {
  if (!DISubprogram(SPNode).isLocalToUnit())
    addUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
 
- // If there are global variables at this scope then add their dies.
- for (SmallVector<WeakVH, 4>::iterator SGI = ScopedGVs.begin(),
-        SGE = ScopedGVs.end(); SGI != SGE; ++SGI) {
-   MDNode *N = dyn_cast_or_null<MDNode>(*SGI);
-   if (!N) continue;
-   DIGlobalVariable GV(N);
-   if (GV.getContext().getNode() == SPNode) {
-     DIE *ScopedGVDie = createGlobalVariableDIE(ModuleCU, GV);
-     if (ScopedGVDie)
-       SPDie->addChild(ScopedGVDie);
-   }
- }
- 
  return SPDie;
 }
 
@@ -1401,8 +1382,7 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) {
 
 
 /// constructVariableDIE - Construct a DIE for the given DbgVariable.
-DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV,
-                                      DbgScope *Scope, CompileUnit *Unit) {
+DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
   // Get the descriptor.
   const DIVariable &VD = DV->getVariable();
   StringRef Name = VD.getName();
@@ -1451,9 +1431,9 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV,
     // FIXME: isBlockByrefVariable should be reformulated in terms of complex
     // addresses instead.
     if (VD.isBlockByrefVariable())
-      addType(Unit, VariableDie, getBlockByrefType(VD.getType(), Name));
+      addType(VariableDie, getBlockByrefType(VD.getType(), Name));
     else
-      addType(Unit, VariableDie, VD.getType());
+      addType(VariableDie, VD.getType());
   }
 
   // Add variable address.
@@ -1522,7 +1502,7 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
   // Add variables to scope.
   SmallVector<DbgVariable *, 8> &Variables = Scope->getVariables();
   for (unsigned i = 0, N = Variables.size(); i < N; ++i) {
-    DIE *VariableDIE = constructVariableDIE(Variables[i], Scope, ModuleCU);
+    DIE *VariableDIE = constructVariableDIE(Variables[i], Scope);
     if (VariableDIE)
       ScopeDIE->addChild(VariableDIE);
   }
@@ -1579,7 +1559,7 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef DirName, StringRef FileName)
   return SrcId;
 }
 
-void DwarfDebug::constructCompileUnit(MDNode *N) {
+CompileUnit *DwarfDebug::constructCompileUnit(MDNode *N) {
   DICompileUnit DIUnit(N);
   StringRef FN = DIUnit.getFilename();
   StringRef Dir = DIUnit.getDirectory();
@@ -1618,6 +1598,7 @@ void DwarfDebug::constructCompileUnit(MDNode *N) {
 
   CompileUnitMap[DIUnit.getNode()] = Unit;
   CompileUnits.push_back(Unit);
+  return Unit;
 }
 
 void DwarfDebug::constructGlobalVariableDIE(MDNode *N) {
@@ -1631,14 +1612,16 @@ void DwarfDebug::constructGlobalVariableDIE(MDNode *N) {
   if (ModuleCU->getDIE(DI_GV.getNode()))
     return;
 
-  DIE *VariableDie = createGlobalVariableDIE(ModuleCU, DI_GV);
+  DIE *VariableDie = createGlobalVariableDIE(DI_GV);
+  if (!VariableDie)
+    return;
 
   // Add to map.
   ModuleCU->insertDIE(N, VariableDie);
 
   // Add to context owner.
-  ModuleCU->getCUDie()->addChild(VariableDie);
-
+  addToContextOwner(VariableDie, DI_GV.getContext());
+  
   // Expose as global. FIXME - need to check external flag.
   ModuleCU->addGlobal(DI_GV.getName(), VariableDie);
 
@@ -1663,13 +1646,15 @@ void DwarfDebug::constructSubprogramDIE(MDNode *N) {
     // class type.
     return;
 
-  DIE *SubprogramDie = createSubprogramDIE(ModuleCU, SP);
+  DIE *SubprogramDie = createSubprogramDIE(SP);
 
   // Add to map.
   ModuleCU->insertDIE(N, SubprogramDie);
 
   // Add to context owner.
-  ModuleCU->getCUDie()->addChild(SubprogramDie);
+  if (SP.getContext().getNode() == SP.getCompileUnit().getNode())
+    if (TopLevelDIEs.insert(SubprogramDie))
+      TopLevelDIEsVector.push_back(SubprogramDie);
 
   // Expose as global.
   ModuleCU->addGlobal(SP.getName(), SubprogramDie);
@@ -1709,21 +1694,16 @@ void DwarfDebug::beginModule(Module *M, MachineModuleInfo *mmi) {
   if (!ModuleCU)
     ModuleCU = CompileUnits[0];
 
-  // Create DIEs for each of the externally visible global variables.
-  for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
-         E = DbgFinder.global_variable_end(); I != E; ++I) {
-    DIGlobalVariable GV(*I);
-    if (GV.getContext().getNode() != GV.getCompileUnit().getNode())
-      ScopedGVs.push_back(*I);
-    else
-      constructGlobalVariableDIE(*I);
-  }
-
   // Create DIEs for each subprogram.
   for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
          E = DbgFinder.subprogram_end(); I != E; ++I)
     constructSubprogramDIE(*I);
 
+  // Create DIEs for each global variable.
+  for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
+         E = DbgFinder.global_variable_end(); I != E; ++I)
+    constructGlobalVariableDIE(*I);
+
   MMI = mmi;
   shouldEmit = true;
   MMI->setDebugInfoAvailability(true);
@@ -1770,6 +1750,22 @@ void DwarfDebug::endModule() {
     addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
   }
 
+  // Insert top level DIEs.
+  for (SmallVector<DIE *, 4>::iterator TI = TopLevelDIEsVector.begin(),
+         TE = TopLevelDIEsVector.end(); TI != TE; ++TI)
+    ModuleCU->getCUDie()->addChild(*TI);
+
+  for (DenseMap<DIE *, WeakVH>::iterator CI = ContainingTypeMap.begin(),
+         CE = ContainingTypeMap.end(); CI != CE; ++CI) {
+    DIE *SPDie = CI->first;
+    MDNode *N = dyn_cast_or_null<MDNode>(CI->second);
+    if (!N) continue;
+    DIE *NDie = ModuleCU->getDIE(N);
+    if (!NDie) continue;
+    addDIEEntry(SPDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie);
+    addDIEEntry(NDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie);
+  }
+
   // Standard sections final addresses.
   Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering().getTextSection());
   EmitLabel("text_end", 0);
@@ -1898,6 +1894,7 @@ void DwarfDebug::endScope(const MachineInstr *MI) {
 
   unsigned Label = MMI->NextLabelID();
   Asm->printLabel(Label);
+  O << '\n';
 
   SmallVector<DbgScope *, 2> &SD = I->second;
   for (SmallVector<DbgScope *, 2>::iterator SDI = SD.begin(), SDE = SD.end();
@@ -2092,17 +2089,15 @@ void DwarfDebug::endFunction(MachineFunction *MF) {
                                                MMI->getFrameMoves()));
 
   // Clear debug info
-  if (CurrentFnDbgScope) {
-    CurrentFnDbgScope = NULL;
-    DbgScopeMap.clear();
-    DbgScopeBeginMap.clear();
-    DbgScopeEndMap.clear();
-    ConcreteScopes.clear();
-    AbstractScopesList.clear();
-  }
+  CurrentFnDbgScope = NULL;
+  DbgScopeMap.clear();
+  DbgScopeBeginMap.clear();
+  DbgScopeEndMap.clear();
+  ConcreteScopes.clear();
+  AbstractScopesList.clear();
 
   Lines.clear();
-
+  
   if (TimePassesIsEnabled)
     DebugTimer->stopTimer();
 }
@@ -2337,13 +2332,16 @@ void DwarfDebug::emitDIE(DIE *Die) {
   }
 }
 
-/// emitDebugInfo / emitDebugInfoPerCU - Emit the debug info section.
+/// emitDebugInfo - Emit the debug info section.
 ///
-void DwarfDebug::emitDebugInfoPerCU(CompileUnit *Unit) {
-  DIE *Die = Unit->getCUDie();
+void DwarfDebug::emitDebugInfo() {
+  // Start debug info section.
+  Asm->OutStreamer.SwitchSection(
+                            Asm->getObjFileLowering().getDwarfInfoSection());
+  DIE *Die = ModuleCU->getCUDie();
 
   // Emit the compile units header.
-  EmitLabel("info_begin", Unit->getID());
+  EmitLabel("info_begin", ModuleCU->getID());
 
   // Emit size of content not including length itself
   unsigned ContentSize = Die->getSize() +
@@ -2364,17 +2362,10 @@ void DwarfDebug::emitDebugInfoPerCU(CompileUnit *Unit) {
   Asm->EmitInt8(0); Asm->EOL("Extra Pad For GDB");
   Asm->EmitInt8(0); Asm->EOL("Extra Pad For GDB");
   Asm->EmitInt8(0); Asm->EOL("Extra Pad For GDB");
-  EmitLabel("info_end", Unit->getID());
+  EmitLabel("info_end", ModuleCU->getID());
 
   Asm->EOL();
-}
-
-void DwarfDebug::emitDebugInfo() {
-  // Start debug info section.
-  Asm->OutStreamer.SwitchSection(
-                            Asm->getObjFileLowering().getDwarfInfoSection());
 
-  emitDebugInfoPerCU(ModuleCU);
 }
 
 /// emitAbbreviations - Emit the abbreviation section.
@@ -2534,9 +2525,9 @@ void DwarfDebug::emitDebugLines() {
         std::pair<unsigned, unsigned> SourceID =
           getSourceDirectoryAndFileIds(LineInfo.getSourceID());
         O << '\t' << MAI->getCommentString() << ' '
-          << getSourceDirectoryName(SourceID.first) << ' '
+          << getSourceDirectoryName(SourceID.first) << '/'
           << getSourceFileName(SourceID.second)
-          <<" :" << utostr_32(LineInfo.getLine()) << '\n';
+          << ':' << utostr_32(LineInfo.getLine()) << '\n';
       }
 
       // Define the line address.
@@ -2672,24 +2663,30 @@ DwarfDebug::emitFunctionDebugFrame(const FunctionDebugFrameInfo&DebugFrameInfo){
   Asm->EOL();
 }
 
-void DwarfDebug::emitDebugPubNamesPerCU(CompileUnit *Unit) {
-  EmitDifference("pubnames_end", Unit->getID(),
-                 "pubnames_begin", Unit->getID(), true);
+/// emitDebugPubNames - Emit visible names into a debug pubnames section.
+///
+void DwarfDebug::emitDebugPubNames() {
+  // Start the dwarf pubnames section.
+  Asm->OutStreamer.SwitchSection(
+                          Asm->getObjFileLowering().getDwarfPubNamesSection());
+
+  EmitDifference("pubnames_end", ModuleCU->getID(),
+                 "pubnames_begin", ModuleCU->getID(), true);
   Asm->EOL("Length of Public Names Info");
 
-  EmitLabel("pubnames_begin", Unit->getID());
+  EmitLabel("pubnames_begin", ModuleCU->getID());
 
   Asm->EmitInt16(dwarf::DWARF_VERSION); Asm->EOL("DWARF Version");
 
   EmitSectionOffset("info_begin", "section_info",
-                    Unit->getID(), 0, true, false);
+                    ModuleCU->getID(), 0, true, false);
   Asm->EOL("Offset of Compilation Unit Info");
 
-  EmitDifference("info_end", Unit->getID(), "info_begin", Unit->getID(),
+  EmitDifference("info_end", ModuleCU->getID(), "info_begin", ModuleCU->getID(),
                  true);
   Asm->EOL("Compilation Unit Length");
 
-  const StringMap<DIE*> &Globals = Unit->getGlobals();
+  const StringMap<DIE*> &Globals = ModuleCU->getGlobals();
   for (StringMap<DIE*>::const_iterator
          GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
     const char *Name = GI->getKeyData();
@@ -2700,21 +2697,11 @@ void DwarfDebug::emitDebugPubNamesPerCU(CompileUnit *Unit) {
   }
 
   Asm->EmitInt32(0); Asm->EOL("End Mark");
-  EmitLabel("pubnames_end", Unit->getID());
+  EmitLabel("pubnames_end", ModuleCU->getID());
 
   Asm->EOL();
 }
 
-/// emitDebugPubNames - Emit visible names into a debug pubnames section.
-///
-void DwarfDebug::emitDebugPubNames() {
-  // Start the dwarf pubnames section.
-  Asm->OutStreamer.SwitchSection(
-                          Asm->getObjFileLowering().getDwarfPubNamesSection());
-
-  emitDebugPubNamesPerCU(ModuleCU);
-}
-
 void DwarfDebug::emitDebugPubTypes() {
   // Start the dwarf pubnames section.
   Asm->OutStreamer.SwitchSection(
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 679d9b9..12ad322 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -154,12 +154,14 @@ class DwarfDebug : public Dwarf {
   /// (at the end of the module) as DW_AT_inline.
   SmallPtrSet<DIE *, 4> InlinedSubprogramDIEs;
 
+  DenseMap<DIE *, WeakVH> ContainingTypeMap;
+
   /// AbstractSubprogramDIEs - Collection of abstruct subprogram DIEs.
   SmallPtrSet<DIE *, 4> AbstractSubprogramDIEs;
 
-  /// ScopedGVs - Tracks global variables that are not at file scope.
-  /// For example void f() { static int b = 42; }
-  SmallVector<WeakVH, 4> ScopedGVs;
+  /// TopLevelDIEs - Collection of top level DIEs. 
+  SmallPtrSet<DIE *, 4> TopLevelDIEs;
+  SmallVector<DIE *, 4> TopLevelDIEsVector;
 
   typedef SmallVector<DbgScope *, 2> ScopeVector;
   typedef DenseMap<const MachineInstr *, ScopeVector>
@@ -307,53 +309,52 @@ class DwarfDebug : public Dwarf {
   void addBlockByrefAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
                             const MachineLocation &Location);
 
+  /// addToContextOwner - Add Die into the list of its context owner's children.
+  void addToContextOwner(DIE *Die, DIDescriptor Context);
+
   /// addType - Add a new type attribute to the specified entity.
-  void addType(CompileUnit *DW_Unit, DIE *Entity, DIType Ty);
+  void addType(DIE *Entity, DIType Ty);
+
+  /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
+  /// given DIType.
+  DIE *getOrCreateTypeDIE(DIType Ty);
 
   void addPubTypes(DISubprogram SP);
 
   /// constructTypeDIE - Construct basic type die from DIBasicType.
-  void constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+  void constructTypeDIE(DIE &Buffer,
                         DIBasicType BTy);
 
   /// constructTypeDIE - Construct derived type die from DIDerivedType.
-  void constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+  void constructTypeDIE(DIE &Buffer,
                         DIDerivedType DTy);
 
   /// constructTypeDIE - Construct type DIE from DICompositeType.
-  void constructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
+  void constructTypeDIE(DIE &Buffer,
                         DICompositeType CTy);
 
   /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
   void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy);
 
   /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-  void constructArrayTypeDIE(CompileUnit *DW_Unit, DIE &Buffer, 
+  void constructArrayTypeDIE(DIE &Buffer, 
                              DICompositeType *CTy);
 
   /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-  DIE *constructEnumTypeDIE(CompileUnit *DW_Unit, DIEnumerator *ETy);
+  DIE *constructEnumTypeDIE(DIEnumerator *ETy);
 
   /// createGlobalVariableDIE - Create new DIE using GV.
-  DIE *createGlobalVariableDIE(CompileUnit *DW_Unit,
-                               const DIGlobalVariable &GV);
+  DIE *createGlobalVariableDIE(const DIGlobalVariable &GV);
 
   /// createMemberDIE - Create new member DIE.
-  DIE *createMemberDIE(CompileUnit *DW_Unit, const DIDerivedType &DT);
+  DIE *createMemberDIE(const DIDerivedType &DT);
 
   /// createSubprogramDIE - Create new DIE using SP.
-  DIE *createSubprogramDIE(CompileUnit *DW_Unit,
-                           const DISubprogram &SP,
-                           bool IsConstructor = false,
-                           bool IsInlined = false);
+  DIE *createSubprogramDIE(const DISubprogram &SP, bool MakeDecl = false);
 
   /// findCompileUnit - Get the compile unit for the given descriptor. 
   ///
-  CompileUnit &findCompileUnit(DICompileUnit Unit) const;
-
-  /// createDbgScopeVariable - Create a new scope variable.
-  ///
-  DIE *createDbgScopeVariable(DbgVariable *DV, CompileUnit *Unit);
+  CompileUnit *findCompileUnit(DICompileUnit Unit);
 
   /// getUpdatedDbgScope - Find or create DbgScope assicated with 
   /// the instruction. Initialize scope and update scope hierarchy.
@@ -384,7 +385,7 @@ class DwarfDebug : public Dwarf {
   DIE *constructInlinedScopeDIE(DbgScope *Scope);
 
   /// constructVariableDIE - Construct a DIE for the given DbgVariable.
-  DIE *constructVariableDIE(DbgVariable *DV, DbgScope *S, CompileUnit *Unit);
+  DIE *constructVariableDIE(DbgVariable *DV, DbgScope *S);
 
   /// constructScopeDIE - Construct a DIE for this scope.
   DIE *constructScopeDIE(DbgScope *Scope);
@@ -405,10 +406,8 @@ class DwarfDebug : public Dwarf {
   ///
   void computeSizeAndOffsets();
 
-  /// EmitDebugInfo / emitDebugInfoPerCU - Emit the debug info section.
+  /// EmitDebugInfo - Emit the debug info section.
   ///
-  void emitDebugInfoPerCU(CompileUnit *Unit);
-
   void emitDebugInfo();
 
   /// emitAbbreviations - Emit the abbreviation section.
@@ -432,8 +431,6 @@ class DwarfDebug : public Dwarf {
   /// section.
   void emitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo);
 
-  void emitDebugPubNamesPerCU(CompileUnit *Unit);
-
   /// emitDebugPubNames - Emit visible names into a debug pubnames section.
   ///
   void emitDebugPubNames();
@@ -488,7 +485,7 @@ class DwarfDebug : public Dwarf {
   /// as well.
   unsigned GetOrCreateSourceID(StringRef DirName, StringRef FileName);
 
-  void constructCompileUnit(MDNode *N);
+  CompileUnit *constructCompileUnit(MDNode *N);
 
   void constructGlobalVariableDIE(MDNode *N);
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/DwarfException.cpp
index 1c8b8f4..3fd077f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfException.cpp
@@ -292,14 +292,13 @@ void DwarfException::EmitFDE(const FunctionEHFrameInfo &EHFrameInfo) {
       Asm->EmitULEB128Bytes(is4Byte ? 4 : 8);
       Asm->EOL("Augmentation size");
 
+      // We force 32-bits here because we've encoded our LSDA in the CIE with
+      // `dwarf::DW_EH_PE_sdata4'. And the CIE and FDE should agree.
       if (EHFrameInfo.hasLandingPads)
-        EmitReference("exception", EHFrameInfo.Number, true, false);
-      else {
-        if (is4Byte)
-          Asm->EmitInt32((int)0);
-        else
-          Asm->EmitInt64((int)0);
-      }
+        EmitReference("exception", EHFrameInfo.Number, true, true);
+      else
+        Asm->EmitInt32((int)0);
+
       Asm->EOL("Language Specific Data Area");
     } else {
       Asm->EmitULEB128Bytes(0);
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index aff1665..aa01c5b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -119,7 +119,6 @@ class DwarfException : public Dwarf {
     static inline unsigned getTombstoneKey() { return -2U; }
     static unsigned getHashValue(const unsigned &Key) { return Key; }
     static bool isEqual(unsigned LHS, unsigned RHS) { return LHS == RHS; }
-    static bool isPod() { return true; }
   };
 
   /// PadRange - Structure holding a try-range and the associated landing pad.
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 8a62eb2..3887e6d 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -427,7 +427,7 @@ static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
 static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,
                     const TargetInstrInfo *TII) {
   MachineFunction *MF = CurMBB->getParent();
-  MachineFunction::iterator I = next(MachineFunction::iterator(CurMBB));
+  MachineFunction::iterator I = llvm::next(MachineFunction::iterator(CurMBB));
   MachineBasicBlock *TBB = 0, *FBB = 0;
   SmallVector<MachineOperand, 4> Cond;
   if (I != MF->end() &&
@@ -805,7 +805,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
   // a compile-time infinite loop repeatedly doing and undoing the same
   // transformations.)
 
-  for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
+  for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
        I != E; ++I) {
     if (I->pred_size() >= 2 && I->pred_size() < TailMergeThreshold) {
       SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
@@ -833,7 +833,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
               continue;
             // This is the QBB case described above
             if (!FBB)
-              FBB = next(MachineFunction::iterator(PBB));
+              FBB = llvm::next(MachineFunction::iterator(PBB));
           }
           // Failing case:  the only way IBB can be reached from PBB is via
           // exception handling.  Happens for landing pads.  Would be nice
@@ -1140,7 +1140,7 @@ ReoptimizeBlock:
       // falls through into MBB and we can't understand the prior block's branch
       // condition.
       if (MBB->empty()) {
-        bool PredHasNoFallThrough = TII->BlockHasNoFallThrough(PrevBB);
+        bool PredHasNoFallThrough = !PrevBB.canFallThrough();
         if (PredHasNoFallThrough || !PriorUnAnalyzable ||
             !PrevBB.isSuccessor(MBB)) {
           // If the prior block falls through into us, turn it into an
@@ -1239,7 +1239,7 @@ ReoptimizeBlock:
           // B elsewhere
           // next:
           if (CurFallsThru) {
-            MachineBasicBlock *NextBB = next(MachineFunction::iterator(MBB));
+            MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
             CurCond.clear();
             TII->InsertBranch(*MBB, NextBB, 0, CurCond);
           }
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 6f86614..7a969f0 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_library(LLVMCodeGen
   AggressiveAntiDepBreaker.cpp
   BranchFolding.cpp
+  CalcSpillWeights.cpp
   CodePlacementOpt.cpp
   CriticalAntiDepBreaker.cpp
   DeadMachineInstructionElim.cpp
@@ -35,7 +36,9 @@ add_llvm_library(LLVMCodeGen
   MachinePassRegistry.cpp
   MachineRegisterInfo.cpp
   MachineSink.cpp
+  MachineSSAUpdater.cpp
   MachineVerifier.cpp
+  MaxStackAlignment.cpp
   ObjectCodeEmitter.cpp
   OcamlGC.cpp
   PHIElimination.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
new file mode 100644
index 0000000..dcffb8a2
--- /dev/null
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -0,0 +1,154 @@
+//===------------------------ CalcSpillWeights.cpp ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "calcspillweights"
+
+#include "llvm/Function.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+char CalculateSpillWeights::ID = 0;
+static RegisterPass<CalculateSpillWeights> X("calcspillweights",
+                                             "Calculate spill weights");
+
+void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const {
+  au.addRequired<LiveIntervals>();
+  au.addRequired<MachineLoopInfo>();
+  au.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(au);
+}
+
+bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &fn) {
+
+  DEBUG(errs() << "********** Compute Spill Weights **********\n"
+               << "********** Function: "
+               << fn.getFunction()->getName() << '\n');
+
+  LiveIntervals *lis = &getAnalysis<LiveIntervals>();
+  MachineLoopInfo *loopInfo = &getAnalysis<MachineLoopInfo>();
+  const TargetInstrInfo *tii = fn.getTarget().getInstrInfo();
+  MachineRegisterInfo *mri = &fn.getRegInfo();
+
+  SmallSet<unsigned, 4> processed;
+  for (MachineFunction::iterator mbbi = fn.begin(), mbbe = fn.end();
+       mbbi != mbbe; ++mbbi) {
+    MachineBasicBlock* mbb = mbbi;
+    SlotIndex mbbEnd = lis->getMBBEndIdx(mbb);
+    MachineLoop* loop = loopInfo->getLoopFor(mbb);
+    unsigned loopDepth = loop ? loop->getLoopDepth() : 0;
+    bool isExiting = loop ? loop->isLoopExiting(mbb) : false;
+
+    for (MachineBasicBlock::const_iterator mii = mbb->begin(), mie = mbb->end();
+         mii != mie; ++mii) {
+      const MachineInstr *mi = mii;
+      if (tii->isIdentityCopy(*mi))
+        continue;
+
+      if (mi->getOpcode() == TargetInstrInfo::IMPLICIT_DEF)
+        continue;
+
+      for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
+        const MachineOperand &mopi = mi->getOperand(i);
+        if (!mopi.isReg() || mopi.getReg() == 0)
+          continue;
+        unsigned reg = mopi.getReg();
+        if (!TargetRegisterInfo::isVirtualRegister(mopi.getReg()))
+          continue;
+        // Multiple uses of reg by the same instruction. It should not
+        // contribute to spill weight again.
+        if (!processed.insert(reg))
+          continue;
+
+        bool hasDef = mopi.isDef();
+        bool hasUse = !hasDef;
+        for (unsigned j = i+1; j != e; ++j) {
+          const MachineOperand &mopj = mi->getOperand(j);
+          if (!mopj.isReg() || mopj.getReg() != reg)
+            continue;
+          hasDef |= mopj.isDef();
+          hasUse |= mopj.isUse();
+          if (hasDef && hasUse)
+            break;
+        }
+
+        LiveInterval &regInt = lis->getInterval(reg);
+        float weight = lis->getSpillWeight(hasDef, hasUse, loopDepth);
+        if (hasDef && isExiting) {
+          // Looks like this is a loop count variable update.
+          SlotIndex defIdx = lis->getInstructionIndex(mi).getDefIndex();
+          const LiveRange *dlr =
+            lis->getInterval(reg).getLiveRangeContaining(defIdx);
+          if (dlr->end > mbbEnd)
+            weight *= 3.0F;
+        }
+        regInt.weight += weight;
+      }
+      processed.clear();
+    }
+  }
+
+  for (LiveIntervals::iterator I = lis->begin(), E = lis->end(); I != E; ++I) {
+    LiveInterval &li = *I->second;
+    if (TargetRegisterInfo::isVirtualRegister(li.reg)) {
+      // If the live interval length is essentially zero, i.e. in every live
+      // range the use follows def immediately, it doesn't make sense to spill
+      // it and hope it will be easier to allocate for this li.
+      if (isZeroLengthInterval(&li)) {
+        li.weight = HUGE_VALF;
+        continue;
+      }
+
+      bool isLoad = false;
+      SmallVector<LiveInterval*, 4> spillIs;
+      if (lis->isReMaterializable(li, spillIs, isLoad)) {
+        // If all of the definitions of the interval are re-materializable,
+        // it is a preferred candidate for spilling. If non of the defs are
+        // loads, then it's potentially very cheap to re-materialize.
+        // FIXME: this gets much more complicated once we support non-trivial
+        // re-materialization.
+        if (isLoad)
+          li.weight *= 0.9F;
+        else
+          li.weight *= 0.5F;
+      }
+
+      // Slightly prefer live interval that has been assigned a preferred reg.
+      std::pair<unsigned, unsigned> Hint = mri->getRegAllocationHint(li.reg);
+      if (Hint.first || Hint.second)
+        li.weight *= 1.01F;
+
+      // Divide the weight of the interval by its size.  This encourages
+      // spilling of intervals that are large and have few uses, and
+      // discourages spilling of small intervals with many uses.
+      li.weight /= lis->getApproximateInstructionCount(li) * SlotIndex::NUM;
+    }
+  }
+  
+  return false;
+}
+
+/// Returns true if the given live interval is zero length.
+bool CalculateSpillWeights::isZeroLengthInterval(LiveInterval *li) const {
+  for (LiveInterval::Ranges::const_iterator
+       i = li->ranges.begin(), e = li->ranges.end(); i != e; ++i)
+    if (i->end.getPrevIndex() > i->start)
+      return false;
+  return true;
+}
diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp
index e9844d8..ff71f6b 100644
--- a/lib/CodeGen/CodePlacementOpt.cpp
+++ b/lib/CodeGen/CodePlacementOpt.cpp
@@ -182,7 +182,7 @@ bool CodePlacementOpt::EliminateUnconditionalJumpsToTop(MachineFunction &MF,
       // Move it and all the blocks that can reach it via fallthrough edges
       // exclusively, to keep existing fallthrough edges intact.
       MachineFunction::iterator Begin = Pred;
-      MachineFunction::iterator End = next(Begin);
+      MachineFunction::iterator End = llvm::next(Begin);
       while (Begin != MF.begin()) {
         MachineFunction::iterator Prior = prior(Begin);
         if (Prior == MF.begin())
@@ -255,7 +255,8 @@ bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF,
   // to the top of the loop to avoid loosing that fallthrough. Otherwise append
   // them to the bottom, even if it previously had a fallthrough, on the theory
   // that it's worth an extra branch to keep the loop contiguous.
-  MachineFunction::iterator InsertPt = next(MachineFunction::iterator(BotMBB));
+  MachineFunction::iterator InsertPt =
+    llvm::next(MachineFunction::iterator(BotMBB));
   bool InsertAtTop = false;
   if (TopMBB != MF.begin() &&
       !HasFallthrough(prior(MachineFunction::iterator(TopMBB))) &&
@@ -268,7 +269,7 @@ bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF,
   // with the loop header.
   SmallPtrSet<MachineBasicBlock *, 8> ContiguousBlocks;
   for (MachineFunction::iterator I = TopMBB,
-       E = next(MachineFunction::iterator(BotMBB)); I != E; ++I)
+       E = llvm::next(MachineFunction::iterator(BotMBB)); I != E; ++I)
     ContiguousBlocks.insert(I);
 
   // Find non-contigous blocks and fix them.
@@ -301,7 +302,7 @@ bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF,
       // Process this block and all loop blocks contiguous with it, to keep
       // them in their relative order.
       MachineFunction::iterator Begin = BB;
-      MachineFunction::iterator End = next(MachineFunction::iterator(BB));
+      MachineFunction::iterator End = llvm::next(MachineFunction::iterator(BB));
       for (; End != MF.end(); ++End) {
         if (!L->contains(End)) break;
         if (!HasAnalyzableTerminator(End)) break;
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 1b39fec..3c7961c2 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -43,8 +43,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
             static_cast<const TargetRegisterClass *>(0));
 
   // Initialize the indices to indicate that no registers are live.
-  std::fill(KillIndices, array_endof(KillIndices), ~0u);
-  std::fill(DefIndices, array_endof(DefIndices), BB->size());
+  const unsigned BBSize = BB->size();
+  for (unsigned i = 0; i < TRI->getNumRegs(); ++i) {
+    KillIndices[i] = ~0u;
+    DefIndices[i] = BBSize;
+  }
 
   // Clear "do not change" set.
   KeepRegs.clear();
@@ -122,7 +125,7 @@ void CriticalAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
   // may have been rescheduled and its lifetime may overlap with registers
   // in ways not reflected in our current liveness state. For each such
   // register, adjust the liveness state to be conservatively correct.
-  for (unsigned Reg = 0; Reg != TargetRegisterInfo::FirstVirtualRegister; ++Reg)
+  for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg)
     if (DefIndices[Reg] < InsertPosIndex && DefIndices[Reg] >= Count) {
       assert(KillIndices[Reg] == ~0u && "Clobbered register is live!");
       // Mark this register to be non-renamable.
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 242cba5..297dd31 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -74,6 +74,9 @@ EnableFastISelOption("fast-isel", cl::Hidden,
 static cl::opt<bool> EnableSplitGEPGVN("split-gep-gvn", cl::Hidden,
     cl::desc("Split GEPs and run no-load GVN"));
 
+static cl::opt<bool> PreAllocTailDup("pre-regalloc-taildup", cl::Hidden,
+    cl::desc("Pre-register allocation tail duplication"));
+
 LLVMTargetMachine::LLVMTargetMachine(const Target &T,
                                      const std::string &TargetTriple)
   : TargetMachine(T) {
@@ -302,6 +305,13 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
                    /* allowDoubleDefs= */ true);
   }
 
+  // Pre-ra tail duplication.
+  if (OptLevel != CodeGenOpt::None &&
+      !DisableTailDuplicate && PreAllocTailDup) {
+    PM.add(createTailDuplicatePass(true));
+    printAndVerify(PM, "After Pre-RegAlloc TailDuplicate");
+  }
+
   // Run pre-ra passes.
   if (addPreRegAlloc(PM, OptLevel))
     printAndVerify(PM, "After PreRegAlloc passes",
@@ -348,7 +358,7 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
 
   // Tail duplication.
   if (OptLevel != CodeGenOpt::None && !DisableTailDuplicate) {
-    PM.add(createTailDuplicatePass());
+    PM.add(createTailDuplicatePass(false));
     printAndVerify(PM, "After TailDuplicate");
   }
 
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 8d632cb..cc286aa 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -847,7 +847,7 @@ void LiveInterval::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
       if (vni->isUnused()) {
         OS << "x";
       } else {
-        if (!vni->isDefAccurate())
+        if (!vni->isDefAccurate() && !vni->isPHIDef())
           OS << "?";
         else
           OS << vni->def;
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 24adf36..8806439f 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -149,44 +149,69 @@ void LiveIntervals::dumpInstrs() const {
   printInstrs(errs());
 }
 
-/// conflictsWithPhysRegDef - Returns true if the specified register
-/// is defined during the duration of the specified interval.
-bool LiveIntervals::conflictsWithPhysRegDef(const LiveInterval &li,
-                                            VirtRegMap &vrm, unsigned reg) {
-  for (LiveInterval::Ranges::const_iterator
-         I = li.ranges.begin(), E = li.ranges.end(); I != E; ++I) {
-    for (SlotIndex index = I->start.getBaseIndex(),
-           end = I->end.getPrevSlot().getBaseIndex().getNextIndex();
-           index != end;
-           index = index.getNextIndex()) {
-      // skip deleted instructions
-      while (index != end && !getInstructionFromIndex(index))
-        index = index.getNextIndex();
-      if (index == end) break;
+bool LiveIntervals::conflictsWithPhysReg(const LiveInterval &li,
+                                         VirtRegMap &vrm, unsigned reg) {
+  // We don't handle fancy stuff crossing basic block boundaries
+  if (li.ranges.size() != 1)
+    return true;
+  const LiveRange &range = li.ranges.front();
+  SlotIndex idx = range.start.getBaseIndex();
+  SlotIndex end = range.end.getPrevSlot().getBaseIndex().getNextIndex();
+
+  // Skip deleted instructions
+  MachineInstr *firstMI = getInstructionFromIndex(idx);
+  while (!firstMI && idx != end) {
+    idx = idx.getNextIndex();
+    firstMI = getInstructionFromIndex(idx);
+  }
+  if (!firstMI)
+    return false;
 
-      MachineInstr *MI = getInstructionFromIndex(index);
-      unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
-      if (tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg))
-        if (SrcReg == li.reg || DstReg == li.reg)
-          continue;
-      for (unsigned i = 0; i != MI->getNumOperands(); ++i) {
-        MachineOperand& mop = MI->getOperand(i);
-        if (!mop.isReg())
-          continue;
-        unsigned PhysReg = mop.getReg();
-        if (PhysReg == 0 || PhysReg == li.reg)
+  // Find last instruction in range
+  SlotIndex lastIdx = end.getPrevIndex();
+  MachineInstr *lastMI = getInstructionFromIndex(lastIdx);
+  while (!lastMI && lastIdx != idx) {
+    lastIdx = lastIdx.getPrevIndex();
+    lastMI = getInstructionFromIndex(lastIdx);
+  }
+  if (!lastMI)
+    return false;
+
+  // Range cannot cross basic block boundaries or terminators
+  MachineBasicBlock *MBB = firstMI->getParent();
+  if (MBB != lastMI->getParent() || lastMI->getDesc().isTerminator())
+    return true;
+
+  MachineBasicBlock::const_iterator E = lastMI;
+  ++E;
+  for (MachineBasicBlock::const_iterator I = firstMI; I != E; ++I) {
+    const MachineInstr &MI = *I;
+
+    // Allow copies to and from li.reg
+    unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+    if (tii_->isMoveInstr(MI, SrcReg, DstReg, SrcSubReg, DstSubReg))
+      if (SrcReg == li.reg || DstReg == li.reg)
+        continue;
+
+    // Check for operands using reg
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e;  ++i) {
+      const MachineOperand& mop = MI.getOperand(i);
+      if (!mop.isReg())
+        continue;
+      unsigned PhysReg = mop.getReg();
+      if (PhysReg == 0 || PhysReg == li.reg)
+        continue;
+      if (TargetRegisterInfo::isVirtualRegister(PhysReg)) {
+        if (!vrm.hasPhys(PhysReg))
           continue;
-        if (TargetRegisterInfo::isVirtualRegister(PhysReg)) {
-          if (!vrm.hasPhys(PhysReg))
-            continue;
-          PhysReg = vrm.getPhys(PhysReg);
-        }
-        if (PhysReg && tri_->regsOverlap(PhysReg, reg))
-          return true;
+        PhysReg = vrm.getPhys(PhysReg);
       }
+      if (PhysReg && tri_->regsOverlap(PhysReg, reg))
+        return true;
     }
   }
 
+  // No conflicts found.
   return false;
 }
 
@@ -201,15 +226,9 @@ bool LiveIntervals::conflictsWithPhysRegRef(LiveInterval &li,
            end = I->end.getPrevSlot().getBaseIndex().getNextIndex();
            index != end;
            index = index.getNextIndex()) {
-      // Skip deleted instructions.
-      MachineInstr *MI = 0;
-      while (index != end) {
-        MI = getInstructionFromIndex(index);
-        if (MI)
-          break;
-        index = index.getNextIndex();
-      }
-      if (index == end) break;
+      MachineInstr *MI = getInstructionFromIndex(index);
+      if (!MI)
+        continue;               // skip deleted instructions
 
       if (JoinedCopies.count(MI))
         continue;
@@ -374,8 +393,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
       // Value#0 is now defined by the 2-addr instruction.
       OldValNo->def  = RedefIndex;
       OldValNo->setCopy(0);
-      if (MO.isEarlyClobber())
-        OldValNo->setHasRedefByEC(true);
       
       // Add the new live interval which replaces the range for the input copy.
       LiveRange LR(DefIndex, RedefIndex, ValNo);
@@ -411,7 +428,7 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
         interval.removeRange(Start, End);        
         assert(interval.ranges.size() == 1 &&
                "Newly discovered PHI interval has >1 ranges.");
-        MachineBasicBlock *killMBB = getMBBFromIndex(interval.endIndex());
+        MachineBasicBlock *killMBB = getMBBFromIndex(VNI->def);
         VNI->addKill(indexes_->getTerminatorGap(killMBB));
         VNI->setHasPHIKill(true);
         DEBUG({
@@ -422,7 +439,7 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
         // Replace the interval with one of a NEW value number.  Note that this
         // value number isn't actually defined by an instruction, weird huh? :)
         LiveRange LR(Start, End,
-                     interval.getNextValue(SlotIndex(getMBBStartIdx(mbb), true),
+                     interval.getNextValue(SlotIndex(getMBBStartIdx(Killer->getParent()), true),
                        0, false, VNInfoAllocator));
         LR.valno->setIsPHIDef(true);
         DEBUG(errs() << " replace range with " << LR);
@@ -513,8 +530,6 @@ void LiveIntervals::handlePhysicalRegisterDef(MachineBasicBlock *MBB,
         if (mi->isRegTiedToUseOperand(DefIdx)) {
           // Two-address instruction.
           end = baseIndex.getDefIndex();
-          assert(!mi->getOperand(DefIdx).isEarlyClobber() &&
-                 "Two address instruction is an early clobber?"); 
         } else {
           // Another instruction redefines the register before it is ever read.
           // Then the register is essentially dead at the instruction that defines
@@ -730,8 +745,16 @@ unsigned LiveIntervals::getVNInfoSourceReg(const VNInfo *VNI) const {
   if (VNI->getCopy()->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
     // If it's extracting out of a physical register, return the sub-register.
     unsigned Reg = VNI->getCopy()->getOperand(1).getReg();
-    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      unsigned SrcSubReg = VNI->getCopy()->getOperand(2).getImm();
+      unsigned DstSubReg = VNI->getCopy()->getOperand(0).getSubReg();
+      if (SrcSubReg == DstSubReg)
+        // %reg1034:3<def> = EXTRACT_SUBREG %EDX, 3
+        // reg1034 can still be coalesced to EDX.
+        return Reg;
+      assert(DstSubReg == 0);
       Reg = tri_->getSubReg(Reg, VNI->getCopy()->getOperand(2).getImm());
+    }
     return Reg;
   } else if (VNI->getCopy()->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
              VNI->getCopy()->getOpcode() == TargetInstrInfo::SUBREG_TO_REG)
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 68f80ac..3c88e37 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -720,6 +720,51 @@ bool LiveVariables::VarInfo::isLiveIn(const MachineBasicBlock &MBB,
   return findKill(&MBB);
 }
 
+bool LiveVariables::isLiveOut(unsigned Reg, const MachineBasicBlock &MBB) {
+  LiveVariables::VarInfo &VI = getVarInfo(Reg);
+
+  // Loop over all of the successors of the basic block, checking to see if
+  // the value is either live in the block, or if it is killed in the block.
+  std::vector<MachineBasicBlock*> OpSuccBlocks;
+  for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
+         E = MBB.succ_end(); SI != E; ++SI) {
+    MachineBasicBlock *SuccMBB = *SI;
+
+    // Is it alive in this successor?
+    unsigned SuccIdx = SuccMBB->getNumber();
+    if (VI.AliveBlocks.test(SuccIdx))
+      return true;
+    OpSuccBlocks.push_back(SuccMBB);
+  }
+
+  // Check to see if this value is live because there is a use in a successor
+  // that kills it.
+  switch (OpSuccBlocks.size()) {
+  case 1: {
+    MachineBasicBlock *SuccMBB = OpSuccBlocks[0];
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (VI.Kills[i]->getParent() == SuccMBB)
+        return true;
+    break;
+  }
+  case 2: {
+    MachineBasicBlock *SuccMBB1 = OpSuccBlocks[0], *SuccMBB2 = OpSuccBlocks[1];
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (VI.Kills[i]->getParent() == SuccMBB1 ||
+          VI.Kills[i]->getParent() == SuccMBB2)
+        return true;
+    break;
+  }
+  default:
+    std::sort(OpSuccBlocks.begin(), OpSuccBlocks.end());
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (std::binary_search(OpSuccBlocks.begin(), OpSuccBlocks.end(),
+                             VI.Kills[i]->getParent()))
+        return true;
+  }
+  return false;
+}
+
 /// addNewBlock - Add a new basic block BB as an empty succcessor to DomBB. All
 /// variables that are live out of DomBB will be marked as passing live through
 /// BB.
diff --git a/lib/CodeGen/LowerSubregs.cpp b/lib/CodeGen/LowerSubregs.cpp
index 30636a8..80eb6cd 100644
--- a/lib/CodeGen/LowerSubregs.cpp
+++ b/lib/CodeGen/LowerSubregs.cpp
@@ -312,7 +312,7 @@ bool LowerSubregsInstructionPass::runOnMachineFunction(MachineFunction &MF) {
        mbbi != mbbe; ++mbbi) {
     for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
          mi != me;) {
-      MachineBasicBlock::iterator nmi = next(mi);
+      MachineBasicBlock::iterator nmi = llvm::next(mi);
       MachineInstr *MI = mi;
       if (MI->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
         MadeChange |= LowerExtract(MI);
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index e55e369..a58286d 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -290,7 +290,7 @@ void MachineBasicBlock::updateTerminator() {
     } else {
       // The block has a fallthrough conditional branch.
       MachineBasicBlock *MBBA = *succ_begin();
-      MachineBasicBlock *MBBB = *next(succ_begin());
+      MachineBasicBlock *MBBB = *llvm::next(succ_begin());
       if (MBBA == TBB) std::swap(MBBB, MBBA);
       if (isLayoutSuccessor(TBB)) {
         if (TII->ReverseBranchCondition(Cond)) {
@@ -359,15 +359,10 @@ bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const {
 
 bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {
   MachineFunction::const_iterator I(this);
-  return next(I) == MachineFunction::const_iterator(MBB);
+  return llvm::next(I) == MachineFunction::const_iterator(MBB);
 }
 
 bool MachineBasicBlock::canFallThrough() {
-  MachineBasicBlock *TBB = 0, *FBB = 0;
-  SmallVector<MachineOperand, 4> Cond;
-  const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo();
-  bool BranchUnAnalyzable = TII->AnalyzeBranch(*this, TBB, FBB, Cond, true);
-
   MachineFunction::iterator Fallthrough = this;
   ++Fallthrough;
   // If FallthroughBlock is off the end of the function, it can't fall through.
@@ -378,16 +373,21 @@ bool MachineBasicBlock::canFallThrough() {
   if (!isSuccessor(Fallthrough))
     return false;
 
-  // If we couldn't analyze the branch, examine the last instruction.
-  // If the block doesn't end in a known control barrier, assume fallthrough
-  // is possible. The isPredicable check is needed because this code can be
-  // called during IfConversion, where an instruction which is normally a
-  // Barrier is predicated and thus no longer an actual control barrier. This
-  // is over-conservative though, because if an instruction isn't actually
-  // predicated we could still treat it like a barrier.
-  if (BranchUnAnalyzable)
+  // Analyze the branches, if any, at the end of the block.
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo();
+  if (TII->AnalyzeBranch(*this, TBB, FBB, Cond, true)) {
+    // If we couldn't analyze the branch, examine the last instruction.
+    // If the block doesn't end in a known control barrier, assume fallthrough
+    // is possible. The isPredicable check is needed because this code can be
+    // called during IfConversion, where an instruction which is normally a
+    // Barrier is predicated and thus no longer an actual control barrier. This
+    // is over-conservative though, because if an instruction isn't actually
+    // predicated we could still treat it like a barrier.
     return empty() || !back().getDesc().isBarrier() ||
            back().getDesc().isPredicable();
+  }
 
   // If there is no branch, control always falls through.
   if (TBB == 0) return true;
@@ -461,18 +461,19 @@ bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA,
   bool MadeChange = false;
   bool AddedFallThrough = false;
 
-  MachineFunction::iterator FallThru = next(MachineFunction::iterator(this));
+  MachineFunction::iterator FallThru =
+    llvm::next(MachineFunction::iterator(this));
   
-  // If this block ends with a conditional branch that falls through to its
-  // successor, set DestB as the successor.
   if (isCond) {
+    // If this block ends with a conditional branch that falls through to its
+    // successor, set DestB as the successor.
     if (DestB == 0 && FallThru != getParent()->end()) {
       DestB = FallThru;
       AddedFallThrough = true;
     }
   } else {
     // If this is an unconditional branch with no explicit dest, it must just be
-    // a fallthrough into DestB.
+    // a fallthrough into DestA.
     if (DestA == 0 && FallThru != getParent()->end()) {
       DestA = FallThru;
       AddedFallThrough = true;
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index d20f446..dd6fd7e 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -328,7 +328,7 @@ void MachineFunction::print(raw_ostream &OS) const {
       if (I->second)
         OS << " in reg%" << I->second;
 
-      if (next(I) != E)
+      if (llvm::next(I) != E)
         OS << ", ";
     }
     OS << '\n';
@@ -342,7 +342,7 @@ void MachineFunction::print(raw_ostream &OS) const {
       else
         OS << "%physreg" << *I;
 
-      if (next(I) != E)
+      if (llvm::next(I) != E)
         OS << " ";
     }
     OS << '\n';
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index f73a5a3..12b974d 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -1058,6 +1058,22 @@ bool MachineInstr::isInvariantLoad(AliasAnalysis *AA) const {
   return true;
 }
 
+/// isConstantValuePHI - If the specified instruction is a PHI that always
+/// merges together the same virtual register, return the register, otherwise
+/// return 0.
+unsigned MachineInstr::isConstantValuePHI() const {
+  if (getOpcode() != TargetInstrInfo::PHI)
+    return 0;
+  assert(getNumOperands() >= 3 &&
+         "It's illegal to have a PHI without source operands");
+
+  unsigned Reg = getOperand(1).getReg();
+  for (unsigned i = 3, e = getNumOperands(); i < e; i += 2)
+    if (getOperand(i).getReg() != Reg)
+      return 0;
+  return Reg;
+}
+
 void MachineInstr::dump() const {
   errs() << "  " << *this;
 }
@@ -1150,9 +1166,14 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
     DebugLocTuple DLT = MF->getDebugLocTuple(debugLoc);
     DIScope Scope(DLT.Scope);
     OS << " dbg:";
+    // Omit the directory, since it's usually long and uninteresting.
     if (!Scope.isNull())
-      OS << Scope.getDirectory() << ':' << Scope.getFilename() << ':';
-    OS << DLT.Line << ":" << DLT.Col;
+      OS << Scope.getFilename();
+    else
+      OS << "<unknown>";
+    OS << ':' << DLT.Line;
+    if (DLT.Col != 0)
+      OS << ':' << DLT.Col;
   }
 
   OS << "\n";
diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp
index db77d19..63f4f18 100644
--- a/lib/CodeGen/MachineLoopInfo.cpp
+++ b/lib/CodeGen/MachineLoopInfo.cpp
@@ -62,11 +62,11 @@ MachineBasicBlock *MachineLoop::getBottomBlock() {
   MachineBasicBlock *BotMBB = getHeader();
   MachineFunction::iterator End = BotMBB->getParent()->end();
   if (BotMBB != prior(End)) {
-    MachineBasicBlock *NextMBB = next(MachineFunction::iterator(BotMBB));
+    MachineBasicBlock *NextMBB = llvm::next(MachineFunction::iterator(BotMBB));
     while (contains(NextMBB)) {
       BotMBB = NextMBB;
-      if (BotMBB == next(MachineFunction::iterator(BotMBB))) break;
-      NextMBB = next(MachineFunction::iterator(BotMBB));
+      if (BotMBB == llvm::next(MachineFunction::iterator(BotMBB))) break;
+      NextMBB = llvm::next(MachineFunction::iterator(BotMBB));
     }
   }
   return BotMBB;
diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp
new file mode 100644
index 0000000..292096f
--- /dev/null
+++ b/lib/CodeGen/MachineSSAUpdater.cpp
@@ -0,0 +1,393 @@
+//===- MachineSSAUpdater.cpp - Unstructured SSA Update Tool ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MachineSSAUpdater class. It's based on SSAUpdater
+// class in lib/Transforms/Utils.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+typedef DenseMap<MachineBasicBlock*, unsigned> AvailableValsTy;
+typedef std::vector<std::pair<MachineBasicBlock*, unsigned> >
+                IncomingPredInfoTy;
+
+static AvailableValsTy &getAvailableVals(void *AV) {
+  return *static_cast<AvailableValsTy*>(AV);
+}
+
+static IncomingPredInfoTy &getIncomingPredInfo(void *IPI) {
+  return *static_cast<IncomingPredInfoTy*>(IPI);
+}
+
+
+MachineSSAUpdater::MachineSSAUpdater(MachineFunction &MF,
+                                     SmallVectorImpl<MachineInstr*> *NewPHI)
+  : AV(0), IPI(0), InsertedPHIs(NewPHI) {
+  TII = MF.getTarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
+}
+
+MachineSSAUpdater::~MachineSSAUpdater() {
+  delete &getAvailableVals(AV);
+  delete &getIncomingPredInfo(IPI);
+}
+
+/// Initialize - Reset this object to get ready for a new set of SSA
+/// updates.  ProtoValue is the value used to name PHI nodes.
+void MachineSSAUpdater::Initialize(unsigned V) {
+  if (AV == 0)
+    AV = new AvailableValsTy();
+  else
+    getAvailableVals(AV).clear();
+
+  if (IPI == 0)
+    IPI = new IncomingPredInfoTy();
+  else
+    getIncomingPredInfo(IPI).clear();
+
+  VR = V;
+  VRC = MRI->getRegClass(VR);
+}
+
+/// HasValueForBlock - Return true if the MachineSSAUpdater already has a value for
+/// the specified block.
+bool MachineSSAUpdater::HasValueForBlock(MachineBasicBlock *BB) const {
+  return getAvailableVals(AV).count(BB);
+}
+
+/// AddAvailableValue - Indicate that a rewritten value is available in the
+/// specified block with the specified value.
+void MachineSSAUpdater::AddAvailableValue(MachineBasicBlock *BB, unsigned V) {
+  getAvailableVals(AV)[BB] = V;
+}
+
+/// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is
+/// live at the end of the specified block.
+unsigned MachineSSAUpdater::GetValueAtEndOfBlock(MachineBasicBlock *BB) {
+  return GetValueAtEndOfBlockInternal(BB);
+}
+
+static
+unsigned LookForIdenticalPHI(MachineBasicBlock *BB,
+          SmallVector<std::pair<MachineBasicBlock*, unsigned>, 8> &PredValues) {
+  if (BB->empty())
+    return 0;
+
+  MachineBasicBlock::iterator I = BB->front();
+  if (I->getOpcode() != TargetInstrInfo::PHI)
+    return 0;
+
+  AvailableValsTy AVals;
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+    AVals[PredValues[i].first] = PredValues[i].second;
+  while (I != BB->end() && I->getOpcode() == TargetInstrInfo::PHI) {
+    bool Same = true;
+    for (unsigned i = 1, e = I->getNumOperands(); i != e; i += 2) {
+      unsigned SrcReg = I->getOperand(i).getReg();
+      MachineBasicBlock *SrcBB = I->getOperand(i+1).getMBB();
+      if (AVals[SrcBB] != SrcReg) {
+        Same = false;
+        break;
+      }
+    }
+    if (Same)
+      return I->getOperand(0).getReg();
+    ++I;
+  }
+  return 0;
+}
+
+/// InsertNewDef - Insert an empty PHI or IMPLICIT_DEF instruction which define
+/// a value of the given register class at the start of the specified basic
+/// block. It returns the virtual register defined by the instruction.
+static
+MachineInstr *InsertNewDef(unsigned Opcode,
+                           MachineBasicBlock *BB, MachineBasicBlock::iterator I,
+                           const TargetRegisterClass *RC,
+                           MachineRegisterInfo *MRI, const TargetInstrInfo *TII) {
+  unsigned NewVR = MRI->createVirtualRegister(RC);
+  return BuildMI(*BB, I, DebugLoc::getUnknownLoc(), TII->get(Opcode), NewVR);
+}
+                          
+/// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that
+/// is live in the middle of the specified block.
+///
+/// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one
+/// important case: if there is a definition of the rewritten value after the
+/// 'use' in BB.  Consider code like this:
+///
+///      X1 = ...
+///   SomeBB:
+///      use(X)
+///      X2 = ...
+///      br Cond, SomeBB, OutBB
+///
+/// In this case, there are two values (X1 and X2) added to the AvailableVals
+/// set by the client of the rewriter, and those values are both live out of
+/// their respective blocks.  However, the use of X happens in the *middle* of
+/// a block.  Because of this, we need to insert a new PHI node in SomeBB to
+/// merge the appropriate values, and this value isn't live out of the block.
+///
+unsigned MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) {
+  // If there is no definition of the renamed variable in this block, just use
+  // GetValueAtEndOfBlock to do our work.
+  if (!getAvailableVals(AV).count(BB))
+    return GetValueAtEndOfBlockInternal(BB);
+
+  // If there are no predecessors, just return undef.
+  if (BB->pred_empty()) {
+    // Insert an implicit_def to represent an undef value.
+    MachineInstr *NewDef = InsertNewDef(TargetInstrInfo::IMPLICIT_DEF,
+                                        BB, BB->getFirstTerminator(),
+                                        VRC, MRI, TII);
+    return NewDef->getOperand(0).getReg();
+  }
+
+  // Otherwise, we have the hard case.  Get the live-in values for each
+  // predecessor.
+  SmallVector<std::pair<MachineBasicBlock*, unsigned>, 8> PredValues;
+  unsigned SingularValue = 0;
+
+  bool isFirstPred = true;
+  for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+         E = BB->pred_end(); PI != E; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+    unsigned PredVal = GetValueAtEndOfBlockInternal(PredBB);
+    PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+    // Compute SingularValue.
+    if (isFirstPred) {
+      SingularValue = PredVal;
+      isFirstPred = false;
+    } else if (PredVal != SingularValue)
+      SingularValue = 0;
+  }
+
+  // Otherwise, if all the merged values are the same, just use it.
+  if (SingularValue != 0)
+    return SingularValue;
+
+  // If an identical PHI is already in BB, just reuse it.
+  unsigned DupPHI = LookForIdenticalPHI(BB, PredValues);
+  if (DupPHI)
+    return DupPHI;
+
+  // Otherwise, we do need a PHI: insert one now.
+  MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->front();
+  MachineInstr *InsertedPHI = InsertNewDef(TargetInstrInfo::PHI, BB,
+                                           Loc, VRC, MRI, TII);
+
+  // Fill in all the predecessors of the PHI.
+  MachineInstrBuilder MIB(InsertedPHI);
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+    MIB.addReg(PredValues[i].second).addMBB(PredValues[i].first);
+
+  // See if the PHI node can be merged to a single value.  This can happen in
+  // loop cases when we get a PHI of itself and one other value.
+  if (unsigned ConstVal = InsertedPHI->isConstantValuePHI()) {
+    InsertedPHI->eraseFromParent();
+    return ConstVal;
+  }
+
+  // If the client wants to know about all new instructions, tell it.
+  if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+
+  DEBUG(errs() << "  Inserted PHI: " << *InsertedPHI << "\n");
+  return InsertedPHI->getOperand(0).getReg();
+}
+
+static
+MachineBasicBlock *findCorrespondingPred(const MachineInstr *MI,
+                                         MachineOperand *U) {
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+    if (&MI->getOperand(i) == U)
+      return MI->getOperand(i+1).getMBB();
+  }
+
+  llvm_unreachable("MachineOperand::getParent() failure?");
+  return 0;
+}
+
+/// RewriteUse - Rewrite a use of the symbolic value.  This handles PHI nodes,
+/// which use their value in the corresponding predecessor.
+void MachineSSAUpdater::RewriteUse(MachineOperand &U) {
+  MachineInstr *UseMI = U.getParent();
+  unsigned NewVR = 0;
+  if (UseMI->getOpcode() == TargetInstrInfo::PHI) {
+    MachineBasicBlock *SourceBB = findCorrespondingPred(UseMI, &U);
+    NewVR = GetValueAtEndOfBlockInternal(SourceBB);
+  } else {
+    NewVR = GetValueInMiddleOfBlock(UseMI->getParent());
+  }
+
+  U.setReg(NewVR);
+}
+
+void MachineSSAUpdater::ReplaceRegWith(unsigned OldReg, unsigned NewReg) {
+  MRI->replaceRegWith(OldReg, NewReg);
+
+  AvailableValsTy &AvailableVals = getAvailableVals(AV);
+  for (DenseMap<MachineBasicBlock*, unsigned>::iterator
+         I = AvailableVals.begin(), E = AvailableVals.end(); I != E; ++I)
+    if (I->second == OldReg)
+      I->second = NewReg;
+}
+
+/// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry
+/// for the specified BB and if so, return it.  If not, construct SSA form by
+/// walking predecessors inserting PHI nodes as needed until we get to a block
+/// where the value is available.
+///
+unsigned MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB){
+  AvailableValsTy &AvailableVals = getAvailableVals(AV);
+
+  // Query AvailableVals by doing an insertion of null.
+  std::pair<AvailableValsTy::iterator, bool> InsertRes =
+    AvailableVals.insert(std::make_pair(BB, 0));
+
+  // Handle the case when the insertion fails because we have already seen BB.
+  if (!InsertRes.second) {
+    // If the insertion failed, there are two cases.  The first case is that the
+    // value is already available for the specified block.  If we get this, just
+    // return the value.
+    if (InsertRes.first->second != 0)
+      return InsertRes.first->second;
+
+    // Otherwise, if the value we find is null, then this is the value is not
+    // known but it is being computed elsewhere in our recursion.  This means
+    // that we have a cycle.  Handle this by inserting a PHI node and returning
+    // it.  When we get back to the first instance of the recursion we will fill
+    // in the PHI node.
+    MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->front();
+    MachineInstr *NewPHI = InsertNewDef(TargetInstrInfo::PHI, BB, Loc,
+                                        VRC, MRI,TII);
+    unsigned NewVR = NewPHI->getOperand(0).getReg();
+    InsertRes.first->second = NewVR;
+    return NewVR;
+  }
+
+  // If there are no predecessors, then we must have found an unreachable block
+  // just return 'undef'.  Since there are no predecessors, InsertRes must not
+  // be invalidated.
+  if (BB->pred_empty()) {
+    // Insert an implicit_def to represent an undef value.
+    MachineInstr *NewDef = InsertNewDef(TargetInstrInfo::IMPLICIT_DEF,
+                                        BB, BB->getFirstTerminator(),
+                                        VRC, MRI, TII);
+    return InsertRes.first->second = NewDef->getOperand(0).getReg();
+  }
+
+  // Okay, the value isn't in the map and we just inserted a null in the entry
+  // to indicate that we're processing the block.  Since we have no idea what
+  // value is in this block, we have to recurse through our predecessors.
+  //
+  // While we're walking our predecessors, we keep track of them in a vector,
+  // then insert a PHI node in the end if we actually need one.  We could use a
+  // smallvector here, but that would take a lot of stack space for every level
+  // of the recursion, just use IncomingPredInfo as an explicit stack.
+  IncomingPredInfoTy &IncomingPredInfo = getIncomingPredInfo(IPI);
+  unsigned FirstPredInfoEntry = IncomingPredInfo.size();
+
+  // As we're walking the predecessors, keep track of whether they are all
+  // producing the same value.  If so, this value will capture it, if not, it
+  // will get reset to null.  We distinguish the no-predecessor case explicitly
+  // below.
+  unsigned SingularValue = 0;
+  bool isFirstPred = true;
+  for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+         E = BB->pred_end(); PI != E; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+    unsigned PredVal = GetValueAtEndOfBlockInternal(PredBB);
+    IncomingPredInfo.push_back(std::make_pair(PredBB, PredVal));
+
+    // Compute SingularValue.
+    if (isFirstPred) {
+      SingularValue = PredVal;
+      isFirstPred = false;
+    } else if (PredVal != SingularValue)
+      SingularValue = 0;
+  }
+
+  /// Look up BB's entry in AvailableVals.  'InsertRes' may be invalidated.  If
+  /// this block is involved in a loop, a no-entry PHI node will have been
+  /// inserted as InsertedVal.  Otherwise, we'll still have the null we inserted
+  /// above.
+  unsigned &InsertedVal = AvailableVals[BB];
+
+  // If all the predecessor values are the same then we don't need to insert a
+  // PHI.  This is the simple and common case.
+  if (SingularValue) {
+    // If a PHI node got inserted, replace it with the singlar value and delete
+    // it.
+    if (InsertedVal) {
+      MachineInstr *OldVal = MRI->getVRegDef(InsertedVal);
+      // Be careful about dead loops.  These RAUW's also update InsertedVal.
+      assert(InsertedVal != SingularValue && "Dead loop?");
+      ReplaceRegWith(InsertedVal, SingularValue);
+      OldVal->eraseFromParent();
+    }
+
+    InsertedVal = SingularValue;
+
+    // Drop the entries we added in IncomingPredInfo to restore the stack.
+    IncomingPredInfo.erase(IncomingPredInfo.begin()+FirstPredInfoEntry,
+                           IncomingPredInfo.end());
+    return InsertedVal;
+  }
+
+
+  // Otherwise, we do need a PHI: insert one now if we don't already have one.
+  MachineInstr *InsertedPHI;
+  if (InsertedVal == 0) {
+    MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->front();
+    InsertedPHI = InsertNewDef(TargetInstrInfo::PHI, BB, Loc,
+                               VRC, MRI, TII);
+    InsertedVal = InsertedPHI->getOperand(0).getReg();
+  } else {
+    InsertedPHI = MRI->getVRegDef(InsertedVal);
+  }
+
+  // Fill in all the predecessors of the PHI.
+  MachineInstrBuilder MIB(InsertedPHI);
+  for (IncomingPredInfoTy::iterator I =
+         IncomingPredInfo.begin()+FirstPredInfoEntry,
+         E = IncomingPredInfo.end(); I != E; ++I)
+    MIB.addReg(I->second).addMBB(I->first);
+
+  // Drop the entries we added in IncomingPredInfo to restore the stack.
+  IncomingPredInfo.erase(IncomingPredInfo.begin()+FirstPredInfoEntry,
+                         IncomingPredInfo.end());
+
+  // See if the PHI node can be merged to a single value.  This can happen in
+  // loop cases when we get a PHI of itself and one other value.
+  if (unsigned ConstVal = InsertedPHI->isConstantValuePHI()) {
+    MRI->replaceRegWith(InsertedVal, ConstVal);
+    InsertedPHI->eraseFromParent();
+    InsertedVal = ConstVal;
+  } else {
+    DEBUG(errs() << "  Inserted PHI: " << *InsertedPHI << "\n");
+
+    // If the client wants to know about all new instructions, tell it.
+    if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+  }
+
+  return InsertedVal;
+}
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index d9f4c99..917d053 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -376,15 +376,6 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
         report("MBB doesn't fall through but is empty!", MBB);
       }
     }
-    if (TII->BlockHasNoFallThrough(*MBB)) {
-      if (MBB->empty()) {
-        report("TargetInstrInfo says the block has no fall through, but the "
-               "block is empty!", MBB);
-      } else if (!MBB->back().getDesc().isBarrier()) {
-        report("TargetInstrInfo says the block has no fall through, but the "
-               "block does not end in a barrier!", MBB);
-      }
-    }
   } else {
     // Block is last in function.
     if (MBB->empty()) {
diff --git a/lib/CodeGen/MaxStackAlignment.cpp b/lib/CodeGen/MaxStackAlignment.cpp
new file mode 100644
index 0000000..d327cfa
--- /dev/null
+++ b/lib/CodeGen/MaxStackAlignment.cpp
@@ -0,0 +1,70 @@
+//===-- MaxStackAlignment.cpp - Compute the required stack alignment -- ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for vector register usage and aligned local objects to
+// calculate the maximum required alignment for a function. This is used by
+// targets which support it to determine if dynamic stack realignment is
+// necessary.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+
+using namespace llvm;
+
+namespace {
+  struct MaximalStackAlignmentCalculator : public MachineFunctionPass {
+    static char ID;
+    MaximalStackAlignmentCalculator() : MachineFunctionPass(&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      MachineRegisterInfo &RI = MF.getRegInfo();
+
+      // Calculate max stack alignment of all already allocated stack objects.
+      FFI->calculateMaxStackAlignment();
+      unsigned MaxAlign = FFI->getMaxAlignment();
+
+      // Be over-conservative: scan over all vreg defs and find whether vector
+      // registers are used. If yes, there is probability that vector registers
+      // will be spilled and thus the stack needs to be aligned properly.
+      // FIXME: It would be better to only do this if a spill actually
+      // happens rather than conseratively aligning the stack regardless.
+      for (unsigned RegNum = TargetRegisterInfo::FirstVirtualRegister;
+           RegNum < RI.getLastVirtReg(); ++RegNum)
+        MaxAlign = std::max(MaxAlign, RI.getRegClass(RegNum)->getAlignment());
+
+      if (FFI->getMaxAlignment() == MaxAlign)
+        return false;
+
+      FFI->setMaxAlignment(MaxAlign);
+      return true;
+    }
+
+    virtual const char *getPassName() const {
+      return "Stack Alignment Requirements Auto-Detector";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+
+  char MaximalStackAlignmentCalculator::ID = 0;
+}
+
+FunctionPass*
+llvm::createMaxStackAlignmentCalculatorPass() {
+  return new MaximalStackAlignmentCalculator();
+}
+
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index 2e30cc6..c62d179 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -287,7 +287,7 @@ void llvm::PHIElimination::LowerAtomicPHINode(
 
     // Okay, if we now know that the value is not live out of the block, we can
     // add a kill marker in this block saying that it kills the incoming value!
-    if (!ValueIsUsed && !isLiveOut(SrcReg, opBlock, *LV)) {
+    if (!ValueIsUsed && !LV->isLiveOut(SrcReg, opBlock)) {
       // In our final twist, we have to decide which instruction kills the
       // register.  In most cases this is the copy, however, the first
       // terminator instruction at the end of the block may also use the value.
@@ -301,8 +301,8 @@ void llvm::PHIElimination::LowerAtomicPHINode(
 
         // Check that no other terminators use values.
 #ifndef NDEBUG
-        for (MachineBasicBlock::iterator TI = next(Term); TI != opBlock.end();
-             ++TI) {
+        for (MachineBasicBlock::iterator TI = llvm::next(Term);
+             TI != opBlock.end(); ++TI) {
           assert(!TI->readsRegister(SrcReg) &&
                  "Terminator instructions cannot use virtual registers unless"
                  "they are the first terminator in a block!");
@@ -353,59 +353,13 @@ bool llvm::PHIElimination::SplitPHIEdges(MachineFunction &MF,
       // We break edges when registers are live out from the predecessor block
       // (not considering PHI nodes). If the register is live in to this block
       // anyway, we would gain nothing from splitting.
-      if (!LV.isLiveIn(Reg, MBB) && isLiveOut(Reg, *PreMBB, LV))
+      if (!LV.isLiveIn(Reg, MBB) && LV.isLiveOut(Reg, *PreMBB))
         SplitCriticalEdge(PreMBB, &MBB);
     }
   }
   return true;
 }
 
-bool llvm::PHIElimination::isLiveOut(unsigned Reg, const MachineBasicBlock &MBB,
-                                     LiveVariables &LV) {
-  LiveVariables::VarInfo &VI = LV.getVarInfo(Reg);
-
-  // Loop over all of the successors of the basic block, checking to see if
-  // the value is either live in the block, or if it is killed in the block.
-  std::vector<MachineBasicBlock*> OpSuccBlocks;
-  for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
-         E = MBB.succ_end(); SI != E; ++SI) {
-    MachineBasicBlock *SuccMBB = *SI;
-
-    // Is it alive in this successor?
-    unsigned SuccIdx = SuccMBB->getNumber();
-    if (VI.AliveBlocks.test(SuccIdx))
-      return true;
-    OpSuccBlocks.push_back(SuccMBB);
-  }
-
-  // Check to see if this value is live because there is a use in a successor
-  // that kills it.
-  switch (OpSuccBlocks.size()) {
-  case 1: {
-    MachineBasicBlock *SuccMBB = OpSuccBlocks[0];
-    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
-      if (VI.Kills[i]->getParent() == SuccMBB)
-        return true;
-    break;
-  }
-  case 2: {
-    MachineBasicBlock *SuccMBB1 = OpSuccBlocks[0], *SuccMBB2 = OpSuccBlocks[1];
-    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
-      if (VI.Kills[i]->getParent() == SuccMBB1 ||
-          VI.Kills[i]->getParent() == SuccMBB2)
-        return true;
-    break;
-  }
-  default:
-    std::sort(OpSuccBlocks.begin(), OpSuccBlocks.end());
-    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
-      if (std::binary_search(OpSuccBlocks.begin(), OpSuccBlocks.end(),
-                             VI.Kills[i]->getParent()))
-        return true;
-  }
-  return false;
-}
-
 MachineBasicBlock *PHIElimination::SplitCriticalEdge(MachineBasicBlock *A,
                                                      MachineBasicBlock *B) {
   assert(A && B && "Missing MBB end point");
@@ -423,7 +377,7 @@ MachineBasicBlock *PHIElimination::SplitCriticalEdge(MachineBasicBlock *A,
   ++NumSplits;
 
   MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock();
-  MF->insert(next(MachineFunction::iterator(A)), NMBB);
+  MF->insert(llvm::next(MachineFunction::iterator(A)), NMBB);
   DEBUG(errs() << "PHIElimination splitting critical edge:"
         " BB#" << A->getNumber()
         << " -- BB#" << NMBB->getNumber()
diff --git a/lib/CodeGen/PHIElimination.h b/lib/CodeGen/PHIElimination.h
index f5872cb..b0b71ce 100644
--- a/lib/CodeGen/PHIElimination.h
+++ b/lib/CodeGen/PHIElimination.h
@@ -93,12 +93,6 @@ namespace llvm {
     bool SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB,
                        LiveVariables &LV);
 
-    /// isLiveOut - Determine if Reg is live out from MBB, when not
-    /// considering PHI nodes. This means that Reg is either killed by
-    /// a successor block or passed through one.
-    bool isLiveOut(unsigned Reg, const MachineBasicBlock &MBB,
-                   LiveVariables &LV);
-
     /// SplitCriticalEdge - Split a critical edge from A to B by
     /// inserting a new MBB. Update branches in A and PHI instructions
     /// in B. Return the new block.
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 9101fce2..79be295 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -373,7 +373,8 @@ void SchedulePostRATDList::FinishBlock() {
 ///
 void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) {
   // Initialize the indices to indicate that no registers are live.
-  std::fill(KillIndices, array_endof(KillIndices), ~0u);
+  for (unsigned i = 0; i < TRI->getNumRegs(); ++i)
+    KillIndices[i] = ~0u;
 
   // Determine the live-out physregs for this block.
   if (!BB->empty() && BB->back().getDesc().isReturn()) {
@@ -510,12 +511,9 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
       }
       
       if (MO.isKill() != kill) {
-        bool removed = ToggleKillFlag(MI, MO);
-        if (removed) {
-          DEBUG(errs() << "Fixed <removed> in ");
-        } else {
-          DEBUG(errs() << "Fixed " << MO << " in ");
-        }
+        DEBUG(errs() << "Fixing " << MO << " in ");
+        // Warning: ToggleKillFlag may invalidate MO.
+        ToggleKillFlag(MI, MO);
         DEBUG(MI->dump());
       }
       
diff --git a/lib/CodeGen/PreAllocSplitting.cpp b/lib/CodeGen/PreAllocSplitting.cpp
index 8f62345..b0d7a47 100644
--- a/lib/CodeGen/PreAllocSplitting.cpp
+++ b/lib/CodeGen/PreAllocSplitting.cpp
@@ -16,6 +16,7 @@
 
 #define DEBUG_TYPE "pre-alloc-split"
 #include "VirtRegMap.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -104,6 +105,7 @@ namespace {
       AU.addRequired<LiveStacks>();
       AU.addPreserved<LiveStacks>();
       AU.addPreserved<RegisterCoalescer>();
+      AU.addPreserved<CalculateSpillWeights>();
       if (StrongPHIElim)
         AU.addPreservedID(StrongPHIEliminationID);
       else
@@ -876,7 +878,7 @@ bool PreAllocSplitting::Rematerialize(unsigned VReg, VNInfo* ValNo,
   if (!ValNo->isDefAccurate() || DefMI->getParent() == BarrierMBB)
     KillPt = findSpillPoint(BarrierMBB, Barrier, NULL, RefsInMBB);
   else
-    KillPt = next(MachineBasicBlock::iterator(DefMI));
+    KillPt = llvm::next(MachineBasicBlock::iterator(DefMI));
   
   if (KillPt == DefMI->getParent()->end())
     return false;
@@ -1118,7 +1120,7 @@ bool PreAllocSplitting::SplitRegLiveInterval(LiveInterval *LI) {
           return false; // No gap to insert spill.
         }
       } else {
-        SpillPt = next(MachineBasicBlock::iterator(DefMI));
+        SpillPt = llvm::next(MachineBasicBlock::iterator(DefMI));
         if (SpillPt == DefMBB->end()) {
           DEBUG(errs() << "FAILED (could not find a suitable spill point).\n");
           return false; // No gap to insert spill.
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 8905f75..e94247f 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -136,9 +136,10 @@ void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
 /// pseudo instructions.
 void PEI::calculateCallsInformation(MachineFunction &Fn) {
   const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+  MachineFrameInfo *FFI = Fn.getFrameInfo();
 
   unsigned MaxCallFrameSize = 0;
-  bool HasCalls = false;
+  bool HasCalls = FFI->hasCalls();
 
   // Get the function call frame set-up and tear-down instruction opcode
   int FrameSetupOpcode   = RegInfo->getCallFrameSetupOpcode();
@@ -166,7 +167,6 @@ void PEI::calculateCallsInformation(MachineFunction &Fn) {
         HasCalls = true;
       }
 
-  MachineFrameInfo *FFI = Fn.getFrameInfo();
   FFI->setHasCalls(HasCalls);
   FFI->setMaxCallFrameSize(MaxCallFrameSize);
 
@@ -674,7 +674,7 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) {
         if (PrevI == BB->end())
           I = BB->begin();     // The replaced instr was the first in the block.
         else
-          I = next(PrevI);
+          I = llvm::next(PrevI);
         continue;
       }
 
diff --git a/lib/CodeGen/RegAllocLinearScan.cpp b/lib/CodeGen/RegAllocLinearScan.cpp
index 4ff5129..c02d47b 100644
--- a/lib/CodeGen/RegAllocLinearScan.cpp
+++ b/lib/CodeGen/RegAllocLinearScan.cpp
@@ -16,6 +16,7 @@
 #include "VirtRegRewriter.h"
 #include "Spiller.h"
 #include "llvm/Function.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -59,6 +60,11 @@ PreSplitIntervals("pre-alloc-split",
                   cl::desc("Pre-register allocation live interval splitting"),
                   cl::init(false), cl::Hidden);
 
+static cl::opt<bool>
+TrivCoalesceEnds("trivial-coalesce-ends",
+                  cl::desc("Attempt trivial coalescing of interval ends"),
+                  cl::init(false), cl::Hidden);
+
 static RegisterRegAlloc
 linearscanRegAlloc("linearscan", "linear scan register allocator",
                    createLinearScanRegisterAllocator);
@@ -182,6 +188,7 @@ namespace {
       // Make sure PassManager knows which analyses to make available
       // to coalescing and which analyses coalescing invalidates.
       AU.addRequiredTransitive<RegisterCoalescer>();
+      AU.addRequired<CalculateSpillWeights>();
       if (PreSplitIntervals)
         AU.addRequiredID(PreAllocSplittingID);
       AU.addRequired<LiveStacks>();
@@ -390,66 +397,71 @@ void RALinScan::ComputeRelatedRegClasses() {
         RelatedRegClasses.unionSets(I->second, OneClassForEachPhysReg[*AS]);
 }
 
-/// attemptTrivialCoalescing - If a simple interval is defined by a copy,
-/// try allocate the definition the same register as the source register
-/// if the register is not defined during live time of the interval. This
-/// eliminate a copy. This is used to coalesce copies which were not
-/// coalesced away before allocation either due to dest and src being in
-/// different register classes or because the coalescer was overly
-/// conservative.
+/// attemptTrivialCoalescing - If a simple interval is defined by a copy, try
+/// allocate the definition the same register as the source register if the
+/// register is not defined during live time of the interval. If the interval is
+/// killed by a copy, try to use the destination register. This eliminates a
+/// copy. This is used to coalesce copies which were not coalesced away before
+/// allocation either due to dest and src being in different register classes or
+/// because the coalescer was overly conservative.
 unsigned RALinScan::attemptTrivialCoalescing(LiveInterval &cur, unsigned Reg) {
   unsigned Preference = vrm_->getRegAllocPref(cur.reg);
   if ((Preference && Preference == Reg) || !cur.containsOneValue())
     return Reg;
 
-  VNInfo *vni = cur.begin()->valno;
-  if ((vni->def == SlotIndex()) ||
-      vni->isUnused() || !vni->isDefAccurate())
+  // We cannot handle complicated live ranges. Simple linear stuff only.
+  if (cur.ranges.size() != 1)
     return Reg;
-  MachineInstr *CopyMI = li_->getInstructionFromIndex(vni->def);
-  unsigned SrcReg, DstReg, SrcSubReg, DstSubReg, PhysReg;
-  if (!CopyMI ||
-      !tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg))
+
+  const LiveRange &range = cur.ranges.front();
+
+  VNInfo *vni = range.valno;
+  if (vni->isUnused())
     return Reg;
-  PhysReg = SrcReg;
-  if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
-    if (!vrm_->isAssignedReg(SrcReg))
+
+  unsigned CandReg;
+  {
+    MachineInstr *CopyMI;
+    unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
+    if (vni->def != SlotIndex() && vni->isDefAccurate() &&
+        (CopyMI = li_->getInstructionFromIndex(vni->def)) &&
+        tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg))
+      // Defined by a copy, try to extend SrcReg forward
+      CandReg = SrcReg;
+    else if (TrivCoalesceEnds &&
+             (CopyMI =
+              li_->getInstructionFromIndex(range.end.getBaseIndex())) &&
+             tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubReg, DstSubReg) &&
+             cur.reg == SrcReg)
+      // Only used by a copy, try to extend DstReg backwards
+      CandReg = DstReg;
+    else
+      return Reg;
+  }
+
+  if (TargetRegisterInfo::isVirtualRegister(CandReg)) {
+    if (!vrm_->isAssignedReg(CandReg))
       return Reg;
-    PhysReg = vrm_->getPhys(SrcReg);
+    CandReg = vrm_->getPhys(CandReg);
   }
-  if (Reg == PhysReg)
+  if (Reg == CandReg)
     return Reg;
 
   const TargetRegisterClass *RC = mri_->getRegClass(cur.reg);
-  if (!RC->contains(PhysReg))
+  if (!RC->contains(CandReg))
     return Reg;
 
-  // Try to coalesce.
-  if (!li_->conflictsWithPhysRegDef(cur, *vrm_, PhysReg)) {
-    DEBUG(errs() << "Coalescing: " << cur << " -> " << tri_->getName(PhysReg)
-                 << '\n');
-    vrm_->clearVirt(cur.reg);
-    vrm_->assignVirt2Phys(cur.reg, PhysReg);
-
-    // Remove unnecessary kills since a copy does not clobber the register.
-    if (li_->hasInterval(SrcReg)) {
-      LiveInterval &SrcLI = li_->getInterval(SrcReg);
-      for (MachineRegisterInfo::use_iterator I = mri_->use_begin(cur.reg),
-             E = mri_->use_end(); I != E; ++I) {
-        MachineOperand &O = I.getOperand();
-        if (!O.isKill())
-          continue;
-        MachineInstr *MI = &*I;
-        if (SrcLI.liveAt(li_->getInstructionIndex(MI).getDefIndex()))
-          O.setIsKill(false);
-      }
-    }
+  if (li_->conflictsWithPhysReg(cur, *vrm_, CandReg))
+    return Reg;
 
-    ++NumCoalesce;
-    return PhysReg;
-  }
+  // Try to coalesce.
+  DEBUG(errs() << "Coalescing: " << cur << " -> " << tri_->getName(CandReg)
+        << '\n');
+  vrm_->clearVirt(cur.reg);
+  vrm_->assignVirt2Phys(cur.reg, CandReg);
 
-  return Reg;
+  ++NumCoalesce;
+  return CandReg;
 }
 
 bool RALinScan::runOnMachineFunction(MachineFunction &fn) {
@@ -1261,9 +1273,9 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) {
 
   // The earliest start of a Spilled interval indicates up to where
   // in handled we need to roll back
+  assert(!spillIs.empty() && "No spill intervals?"); 
+  SlotIndex earliestStart = spillIs[0]->beginIndex();
   
-  LiveInterval *earliestStartInterval = cur;
-
   // Spill live intervals of virtual regs mapped to the physical register we
   // want to clear (and its aliases).  We only spill those that overlap with the
   // current interval as the rest do not affect its allocation. we also keep
@@ -1274,19 +1286,16 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) {
     LiveInterval *sli = spillIs.back();
     spillIs.pop_back();
     DEBUG(errs() << "\t\t\tspilling(a): " << *sli << '\n');
-    earliestStartInterval =
-      (earliestStartInterval->beginIndex() < sli->beginIndex()) ?
-         earliestStartInterval : sli;
+    if (sli->beginIndex() < earliestStart)
+      earliestStart = sli->beginIndex();
        
     std::vector<LiveInterval*> newIs;
-    newIs = spiller_->spill(sli, spillIs);
+    newIs = spiller_->spill(sli, spillIs, &earliestStart);
     addStackInterval(sli, ls_, li_, mri_, *vrm_);
     std::copy(newIs.begin(), newIs.end(), std::back_inserter(added));
     spilled.insert(sli->reg);
   }
 
-  SlotIndex earliestStart = earliestStartInterval->beginIndex();
-
   DEBUG(errs() << "\t\trolling back to: " << earliestStart << '\n');
 
   // Scan handled in reverse order up to the earliest start of a
@@ -1295,7 +1304,7 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) {
   while (!handled_.empty()) {
     LiveInterval* i = handled_.back();
     // If this interval starts before t we are done.
-    if (i->beginIndex() < earliestStart)
+    if (!i->empty() && i->beginIndex() < earliestStart)
       break;
     DEBUG(errs() << "\t\t\tundo changes for: " << *i << '\n');
     handled_.pop_back();
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index c677d34..c2014a7 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -36,6 +36,7 @@
 #include "PBQP/Heuristics/Briggs.h"
 #include "VirtRegMap.h"
 #include "VirtRegRewriter.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -90,6 +91,7 @@ namespace {
       au.addRequired<LiveIntervals>();
       //au.addRequiredID(SplitCriticalEdgesID);
       au.addRequired<RegisterCoalescer>();
+      au.addRequired<CalculateSpillWeights>();
       au.addRequired<LiveStacks>();
       au.addPreserved<LiveStacks>();
       au.addRequired<MachineLoopInfo>();
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 94680ed..67bf209 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -125,7 +125,7 @@ void RegScavenger::forward() {
     Tracking = true;
   } else {
     assert(MBBI != MBB->end() && "Already at the end of the basic block!");
-    MBBI = next(MBBI);
+    MBBI = llvm::next(MBBI);
   }
 
   MachineInstr *MI = MBBI;
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 06ffdd6..2b52187 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -119,7 +119,8 @@ namespace {
     /// it can be simplified or if things it uses can be simplified by bit
     /// propagation.  If so, return true.
     bool SimplifyDemandedBits(SDValue Op) {
-      APInt Demanded = APInt::getAllOnesValue(Op.getValueSizeInBits());
+      unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+      APInt Demanded = APInt::getAllOnesValue(BitWidth);
       return SimplifyDemandedBits(Op, Demanded);
     }
 
@@ -546,7 +547,8 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
         To[0].getNode()->dump(&DAG);
         errs() << " and " << NumTo-1 << " other values\n";
         for (unsigned i = 0, e = NumTo; i != e; ++i)
-          assert(N->getValueType(i) == To[i].getValueType() &&
+          assert((!To[i].getNode() ||
+                  N->getValueType(i) == To[i].getValueType()) &&
                  "Cannot combine value to value of different type!"));
   WorkListRemover DeadNodes(*this);
   DAG.ReplaceAllUsesWith(N, To, &DeadNodes);
@@ -1687,10 +1689,14 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
   // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
   // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
+  //
+  // do not sink logical op inside of a vector extend, since it may combine
+  // into a vsetcc.
   if ((N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND||
        N0.getOpcode() == ISD::SIGN_EXTEND ||
        (N0.getOpcode() == ISD::TRUNCATE &&
         !TLI.isTruncateFree(N0.getOperand(0).getValueType(), VT))) &&
+      !VT.isVector() &&
       N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
       (!LegalOperations ||
        TLI.isOperationLegal(N->getOpcode(), N0.getOperand(0).getValueType()))) {
@@ -1943,8 +1949,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
   }
 
   // fold (or x, undef) -> -1
-  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
-    return DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
+  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF) {
+    EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
+    return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT);
+  }
   // fold (or c1, c2) -> c1|c2
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::OR, VT, N0C, N1C);
@@ -2434,7 +2442,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
-  unsigned OpSizeInBits = VT.getSizeInBits();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
   // fold (shl c1, c2) -> c1<<c2
   if (N0C && N1C)
@@ -2450,7 +2458,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     return N0;
   // if (shl x, c) is known to be zero, return 0
   if (DAG.MaskedValueIsZero(SDValue(N, 0),
-                            APInt::getAllOnesValue(VT.getSizeInBits())))
+                            APInt::getAllOnesValue(OpSizeInBits)))
     return DAG.getConstant(0, VT);
   // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
   if (N1.getOpcode() == ISD::TRUNCATE &&
@@ -2526,6 +2534,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
   // fold (sra c1, c2) -> (sra c1, c2)
   if (N0C && N1C)
@@ -2537,7 +2546,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   if (N0C && N0C->isAllOnesValue())
     return N0;
   // fold (sra x, (setge c, size(x))) -> undef
-  if (N1C && N1C->getZExtValue() >= VT.getSizeInBits())
+  if (N1C && N1C->getZExtValue() >= OpSizeInBits)
     return DAG.getUNDEF(VT);
   // fold (sra x, 0) -> x
   if (N1C && N1C->isNullValue())
@@ -2545,7 +2554,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
   // sext_inreg.
   if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
-    unsigned LowBits = VT.getSizeInBits() - (unsigned)N1C->getZExtValue();
+    unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
     EVT EVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
     if ((!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, EVT)))
       return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
@@ -2556,7 +2565,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   if (N1C && N0.getOpcode() == ISD::SRA) {
     if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
       unsigned Sum = N1C->getZExtValue() + C1->getZExtValue();
-      if (Sum >= VT.getSizeInBits()) Sum = VT.getSizeInBits()-1;
+      if (Sum >= OpSizeInBits) Sum = OpSizeInBits-1;
       return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0.getOperand(0),
                          DAG.getConstant(Sum, N1C->getValueType(0)));
     }
@@ -2572,9 +2581,8 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
     const ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
     if (N01C && N1C) {
       // Determine what the truncate's result bitsize and type would be.
-      unsigned VTValSize = VT.getSizeInBits();
       EVT TruncVT =
-        EVT::getIntegerVT(*DAG.getContext(), VTValSize - N1C->getZExtValue());
+        EVT::getIntegerVT(*DAG.getContext(), OpSizeInBits - N1C->getZExtValue());
       // Determine the residual right-shift amount.
       signed ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
 
@@ -2607,7 +2615,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
       EVT TruncVT = N1.getValueType();
       SDValue N100 = N1.getOperand(0).getOperand(0);
       APInt TruncC = N101C->getAPIntValue();
-      TruncC.trunc(TruncVT.getSizeInBits());
+      TruncC.trunc(TruncVT.getScalarType().getSizeInBits());
       return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0,
                          DAG.getNode(ISD::AND, N->getDebugLoc(),
                                      TruncVT,
@@ -2636,7 +2644,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
-  unsigned OpSizeInBits = VT.getSizeInBits();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
   // fold (srl c1, c2) -> c1 >>u c2
   if (N0C && N1C)
@@ -3029,7 +3037,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       else if (Op.getValueType().bitsGT(VT))
         Op = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), VT, Op);
       return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, Op,
-                         DAG.getValueType(N0.getValueType()));
+                         DAG.getValueType(N0.getValueType().getScalarType()));
     }
   }
 
@@ -3170,7 +3178,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
     } else if (Op.getValueType().bitsGT(VT)) {
       Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
     }
-    return DAG.getZeroExtendInReg(Op, N->getDebugLoc(), N0.getValueType());
+    return DAG.getZeroExtendInReg(Op, N->getDebugLoc(),
+                                  N0.getValueType().getScalarType());
   }
 
   // Fold (zext (and (trunc x), cst)) -> (and x, cst),
@@ -3193,6 +3202,19 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
                        X, DAG.getConstant(Mask, VT));
   }
 
+  // Fold (zext (and x, cst)) -> (and (zext x), cst)
+  if (N0.getOpcode() == ISD::AND &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      N0.getOperand(0).getOpcode() != ISD::TRUNCATE &&
+      N0.getOperand(0).hasOneUse()) {
+    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+    Mask.zext(VT.getSizeInBits());
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                       DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT,
+                                   N0.getOperand(0)),
+                       DAG.getConstant(Mask, VT));
+  }
+
   // fold (zext (load x)) -> (zext (truncate (zextload x)))
   if (ISD::isNON_EXTLoad(N0.getNode()) &&
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
@@ -3269,6 +3291,26 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
     if (SCC.getNode()) return SCC;
   }
 
+  // (zext (shl (zext x), cst)) -> (shl (zext x), cst)
+  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
+      isa<ConstantSDNode>(N0.getOperand(1)) &&
+      N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
+      N0.hasOneUse()) {
+    if (N0.getOpcode() == ISD::SHL) {
+      // If the original shl may be shifting out bits, do not perform this
+      // transformation.
+      unsigned ShAmt = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+      unsigned KnownZeroBits = N0.getOperand(0).getValueType().getSizeInBits() -
+        N0.getOperand(0).getOperand(0).getValueType().getSizeInBits();
+      if (ShAmt > KnownZeroBits)
+        return SDValue();
+    }
+    DebugLoc dl = N->getDebugLoc();
+    return DAG.getNode(N0.getOpcode(), dl, VT,
+                       DAG.getNode(ISD::ZERO_EXTEND, dl, VT, N0.getOperand(0)),
+                       DAG.getNode(ISD::ZERO_EXTEND, dl, VT, N0.getOperand(1)));
+  }
+
   return SDValue();
 }
 
@@ -3529,7 +3571,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   EVT EVT = cast<VTSDNode>(N1)->getVT();
-  unsigned VTBits = VT.getSizeInBits();
+  unsigned VTBits = VT.getScalarType().getSizeInBits();
   unsigned EVTBits = EVT.getSizeInBits();
 
   // fold (sext_in_reg c1) -> c1
@@ -3537,7 +3579,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
     return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, N0, N1);
 
   // If the input is already sign extended, just drop the extension.
-  if (DAG.ComputeNumSignBits(N0) >= VT.getSizeInBits()-EVTBits+1)
+  if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1)
     return N0;
 
   // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
@@ -3552,7 +3594,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
   // if x is small enough.
   if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
     SDValue N00 = N0.getOperand(0);
-    if (N00.getValueType().getSizeInBits() < EVTBits)
+    if (N00.getValueType().getScalarType().getSizeInBits() < EVTBits)
       return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, N00, N1);
   }
 
@@ -3576,11 +3618,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
   // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
   if (N0.getOpcode() == ISD::SRL) {
     if (ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
-      if (ShAmt->getZExtValue()+EVTBits <= VT.getSizeInBits()) {
+      if (ShAmt->getZExtValue()+EVTBits <= VTBits) {
         // We can turn this into an SRA iff the input to the SRL is already sign
         // extended enough.
         unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
-        if (VT.getSizeInBits()-(ShAmt->getZExtValue()+EVTBits) < InSignBits)
+        if (VTBits-(ShAmt->getZExtValue()+EVTBits) < InSignBits)
           return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT,
                              N0.getOperand(0), N0.getOperand(1));
       }
@@ -3681,7 +3723,6 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
   if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse())
     return SDValue();
   EVT LD1VT = LD1->getValueType(0);
-  const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
 
   if (ISD::isNON_EXTLoad(LD2) &&
       LD2->hasOneUse() &&
@@ -3689,7 +3730,7 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
       // If one is volatile it might be ok, but play conservative and bail out.
       !LD1->isVolatile() &&
       !LD2->isVolatile() &&
-      TLI.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1, MFI)) {
+      DAG.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1)) {
     unsigned Align = LD1->getAlignment();
     unsigned NewAlign = TLI.getTargetData()->
       getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
@@ -4804,49 +4845,6 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
   return false;
 }
 
-/// InferAlignment - If we can infer some alignment information from this
-/// pointer, return it.
-static unsigned InferAlignment(SDValue Ptr, SelectionDAG &DAG) {
-  // If this is a direct reference to a stack slot, use information about the
-  // stack slot's alignment.
-  int FrameIdx = 1 << 31;
-  int64_t FrameOffset = 0;
-  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
-    FrameIdx = FI->getIndex();
-  } else if (Ptr.getOpcode() == ISD::ADD &&
-             isa<ConstantSDNode>(Ptr.getOperand(1)) &&
-             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
-    FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
-    FrameOffset = Ptr.getConstantOperandVal(1);
-  }
-
-  if (FrameIdx != (1 << 31)) {
-    // FIXME: Handle FI+CST.
-    const MachineFrameInfo &MFI = *DAG.getMachineFunction().getFrameInfo();
-    if (MFI.isFixedObjectIndex(FrameIdx)) {
-      int64_t ObjectOffset = MFI.getObjectOffset(FrameIdx) + FrameOffset;
-
-      // The alignment of the frame index can be determined from its offset from
-      // the incoming frame position.  If the frame object is at offset 32 and
-      // the stack is guaranteed to be 16-byte aligned, then we know that the
-      // object is 16-byte aligned.
-      unsigned StackAlign = DAG.getTarget().getFrameInfo()->getStackAlignment();
-      unsigned Align = MinAlign(ObjectOffset, StackAlign);
-
-      // Finally, the frame object itself may have a known alignment.  Factor
-      // the alignment + offset into a new alignment.  For example, if we know
-      // the  FI is 8 byte aligned, but the pointer is 4 off, we really have a
-      // 4-byte alignment of the resultant pointer.  Likewise align 4 + 4-byte
-      // offset = 4-byte alignment, align 4 + 1-byte offset = align 1, etc.
-      unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
-                                      FrameOffset);
-      return std::max(Align, FIInfoAlign);
-    }
-  }
-
-  return 0;
-}
-
 SDValue DAGCombiner::visitLOAD(SDNode *N) {
   LoadSDNode *LD  = cast<LoadSDNode>(N);
   SDValue Chain = LD->getChain();
@@ -4854,7 +4852,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
 
   // Try to infer better alignment information than the load already has.
   if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
-    if (unsigned Align = InferAlignment(Ptr, DAG)) {
+    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
       if (Align > LD->getAlignment())
         return DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(),
                               LD->getValueType(0),
@@ -5079,7 +5077,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 
   // Try to infer better alignment information than the store already has.
   if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
-    if (unsigned Align = InferAlignment(Ptr, DAG)) {
+    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
       if (Align > ST->getAlignment())
         return DAG.getTruncStore(Chain, N->getDebugLoc(), Value,
                                  Ptr, ST->getSrcValue(),
@@ -5231,7 +5229,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     // SimplifyDemandedBits, which only works if the value has a single use.
     if (SimplifyDemandedBits(Value,
                              APInt::getLowBitsSet(
-                               Value.getValueSizeInBits(),
+                               Value.getValueType().getScalarType().getSizeInBits(),
                                ST->getMemoryVT().getSizeInBits())))
       return SDValue(N, 0);
   }
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 5eb9ca1..4ead9c9 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -532,7 +532,15 @@ bool FastISel::SelectBitCast(User *I) {
 
 bool
 FastISel::SelectInstruction(Instruction *I) {
-  return SelectOperator(I, I->getOpcode());
+  // First, try doing target-independent selection.
+  if (SelectOperator(I, I->getOpcode()))
+    return true;
+
+  // Next, try calling the target to attempt to handle the instruction.
+  if (TargetSelectInstruction(I))
+    return true;
+
+  return false;
 }
 
 /// FastEmitBranch - Emit an unconditional branch to the given block,
@@ -541,7 +549,7 @@ FastISel::SelectInstruction(Instruction *I) {
 void
 FastISel::FastEmitBranch(MachineBasicBlock *MSucc) {
   MachineFunction::iterator NextMBB =
-     next(MachineFunction::iterator(MBB));
+     llvm::next(MachineFunction::iterator(MBB));
 
   if (MBB->isLayoutSuccessor(MSucc)) {
     // The unconditional fall-through case, which needs no instructions.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 273dbf0..f9c05d0 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -232,7 +232,7 @@ void SelectionDAGLegalize::LegalizeDAG() {
   // node is only legalized after all of its operands are legalized.
   DAG.AssignTopologicalOrder();
   for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
-       E = prior(DAG.allnodes_end()); I != next(E); ++I)
+       E = prior(DAG.allnodes_end()); I != llvm::next(E); ++I)
     LegalizeOp(SDValue(I, 0));
 
   // Finally, it's possible the root changed.  Get the new root.
@@ -2294,9 +2294,15 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
     // NOTE: we could fall back on load/store here too for targets without
     // SAR.  However, it is doubtful that any exist.
     EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
-    unsigned BitsDiff = Node->getValueType(0).getSizeInBits() -
+    EVT VT = Node->getValueType(0);
+    EVT ShiftAmountTy = TLI.getShiftAmountTy();
+    if (VT.isVector()) {
+      ShiftAmountTy = VT;
+      VT = VT.getVectorElementType();
+    }
+    unsigned BitsDiff = VT.getSizeInBits() -
                         ExtraVT.getSizeInBits();
-    SDValue ShiftCst = DAG.getConstant(BitsDiff, TLI.getShiftAmountTy());
+    SDValue ShiftCst = DAG.getConstant(BitsDiff, ShiftAmountTy);
     Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0),
                        Node->getOperand(0), ShiftCst);
     Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst);
@@ -3059,8 +3065,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node,
 
 // SelectionDAG::Legalize - This is the entry point for the file.
 //
-void SelectionDAG::Legalize(bool TypesNeedLegalizing,
-                            CodeGenOpt::Level OptLevel) {
+void SelectionDAG::Legalize(CodeGenOpt::Level OptLevel) {
   /// run - This is the main entry point to this class.
   ///
   SelectionDAGLegalize(*this, OptLevel).LegalizeDAG();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 8ac8063..2f4457e 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1167,55 +1167,62 @@ ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
   GetExpandedInteger(N->getOperand(0), InL, InH);
 
   SDValue NVBitsNode = DAG.getConstant(NVTBits, ShTy);
-  SDValue Amt2 = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt);
-  SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(ShTy),
-                             Amt, NVBitsNode, ISD::SETULT);
+  SDValue AmtExcess = DAG.getNode(ISD::SUB, dl, ShTy, Amt, NVBitsNode);
+  SDValue AmtLack = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt);
+  SDValue isShort = DAG.getSetCC(dl, TLI.getSetCCResultType(ShTy),
+                                 Amt, NVBitsNode, ISD::SETULT);
 
-  SDValue Lo1, Hi1, Lo2, Hi2;
+  SDValue LoS, HiS, LoL, HiL;
   switch (N->getOpcode()) {
   default: llvm_unreachable("Unknown shift");
   case ISD::SHL:
-    // ShAmt < NVTBits
-    Lo1 = DAG.getConstant(0, NVT);                  // Low part is zero.
-    Hi1 = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); // High part from Lo part.
-
-    // ShAmt >= NVTBits
-    Lo2 = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt);
-    Hi2 = DAG.getNode(ISD::OR, dl, NVT,
+    // Short: ShAmt < NVTBits
+    LoS = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt);
+    HiS = DAG.getNode(ISD::OR, dl, NVT,
                       DAG.getNode(ISD::SHL, dl, NVT, InH, Amt),
-                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt2));
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    LoL = DAG.getConstant(0, NVT);                        // Lo part is zero.
+    HiL = DAG.getNode(ISD::SHL, dl, NVT, InL, AmtExcess); // Hi from Lo part.
 
-    Lo = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Lo1, Lo2);
-    Hi = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Hi1, Hi2);
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
     return true;
   case ISD::SRL:
-    // ShAmt < NVTBits
-    Hi1 = DAG.getConstant(0, NVT);                  // Hi part is zero.
-    Lo1 = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); // Lo part from Hi part.
-
-    // ShAmt >= NVTBits
-    Hi2 = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt);
-    Lo2 = DAG.getNode(ISD::OR, dl, NVT,
-                     DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
-                     DAG.getNode(ISD::SHL, dl, NVT, InH, Amt2));
-
-    Lo = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Lo1, Lo2);
-    Hi = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Hi1, Hi2);
+    // Short: ShAmt < NVTBits
+    HiS = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt);
+    LoS = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    HiL = DAG.getConstant(0, NVT);                        // Hi part is zero.
+    LoL = DAG.getNode(ISD::SRL, dl, NVT, InH, AmtExcess); // Lo from Hi part.
+
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
     return true;
   case ISD::SRA:
-    // ShAmt < NVTBits
-    Hi1 = DAG.getNode(ISD::SRA, dl, NVT, InH,       // Sign extend high part.
-                       DAG.getConstant(NVTBits-1, ShTy));
-    Lo1 = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); // Lo part from Hi part.
-
-    // ShAmt >= NVTBits
-    Hi2 = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt);
-    Lo2 = DAG.getNode(ISD::OR, dl, NVT,
+    // Short: ShAmt < NVTBits
+    HiS = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt);
+    LoS = DAG.getNode(ISD::OR, dl, NVT,
                       DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
-                      DAG.getNode(ISD::SHL, dl, NVT, InH, Amt2));
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    HiL = DAG.getNode(ISD::SRA, dl, NVT, InH,             // Sign of Hi part.
+                      DAG.getConstant(NVTBits-1, ShTy));
+    LoL = DAG.getNode(ISD::SRA, dl, NVT, InH, AmtExcess); // Lo from Hi part.
 
-    Lo = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Lo1, Lo2);
-    Hi = DAG.getNode(ISD::SELECT, dl, NVT, Cmp, Hi1, Hi2);
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
     return true;
   }
 
@@ -1989,7 +1996,9 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
   case ISD::SRA:
   case ISD::SRL:
   case ISD::ROTL:
-  case ISD::ROTR: Res = ExpandIntOp_Shift(N); break;
+  case ISD::ROTR:              Res = ExpandIntOp_Shift(N); break;
+  case ISD::RETURNADDR:
+  case ISD::FRAMEADDR:         Res = ExpandIntOp_RETURNADDR(N); break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
@@ -2173,6 +2182,15 @@ SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) {
   return DAG.UpdateNodeOperands(SDValue(N, 0), N->getOperand(0), Lo);
 }
 
+SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) {
+  // The argument of RETURNADDR / FRAMEADDR builtin is 32 bit contant.  This
+  // surely makes pretty nice problems on 8/16 bit targets. Just truncate this
+  // constant to valid type.
+  SDValue Lo, Hi;
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  return DAG.UpdateNodeOperands(SDValue(N, 0), Lo);
+}
+
 SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
   SDValue Op = N->getOperand(0);
   EVT DstVT = N->getValueType(0);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 2ee9f8a..c35f7ad 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -362,6 +362,7 @@ private:
   SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo);
   SDValue ExpandIntOp_TRUNCATE(SDNode *N);
   SDValue ExpandIntOp_UINT_TO_FP(SDNode *N);
+  SDValue ExpandIntOp_RETURNADDR(SDNode *N);
 
   void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
                                   ISD::CondCode &CCCode, DebugLoc dl);
@@ -516,6 +517,7 @@ private:
   SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N);
   SDValue ScalarizeVecRes_LOAD(LoadSDNode *N);
   SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
+  SDValue ScalarizeVecRes_SIGN_EXTEND_INREG(SDNode *N);
   SDValue ScalarizeVecRes_SELECT(SDNode *N);
   SDValue ScalarizeVecRes_SELECT_CC(SDNode *N);
   SDValue ScalarizeVecRes_SETCC(SDNode *N);
@@ -559,6 +561,7 @@ private:
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
@@ -601,6 +604,7 @@ private:
   SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
   SDValue WidenVecRes_LOAD(SDNode* N);
   SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
+  SDValue WidenVecRes_SIGN_EXTEND_INREG(SDNode* N);
   SDValue WidenVecRes_SELECT(SDNode* N);
   SDValue WidenVecRes_SELECT_CC(SDNode* N);
   SDValue WidenVecRes_UNDEF(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 785c2ad..2625245 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -79,7 +79,7 @@ bool VectorLegalizer::Run() {
   // node is only legalized after all of its operands are legalized.
   DAG.AssignTopologicalOrder();
   for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
-       E = prior(DAG.allnodes_end()); I != next(E); ++I)
+       E = prior(DAG.allnodes_end()); I != llvm::next(E); ++I)
     LegalizeOp(SDValue(I, 0));
 
   // Finally, it's possible the root changed.  Get the new root.
@@ -179,6 +179,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FRINT:
   case ISD::FNEARBYINT:
   case ISD::FFLOOR:
+  case ISD::SIGN_EXTEND_INREG:
     QueryType = Node->getValueType(0);
     break;
   case ISD::SINT_TO_FP:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 023324b..cf67ab9 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -54,6 +54,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
   case ISD::LOAD:           R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
   case ISD::SCALAR_TO_VECTOR:  R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
+  case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_SIGN_EXTEND_INREG(N); break;
   case ISD::SELECT:            R = ScalarizeVecRes_SELECT(N); break;
   case ISD::SELECT_CC:         R = ScalarizeVecRes_SELECT_CC(N); break;
   case ISD::SETCC:             R = ScalarizeVecRes_SETCC(N); break;
@@ -195,6 +196,13 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
   return InOp;
 }
 
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SIGN_EXTEND_INREG(SDNode *N) {
+  EVT EltVT = N->getValueType(0).getVectorElementType();
+  SDValue LHS = GetScalarizedVector(N->getOperand(0));
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), EltVT,
+                     LHS, N->getOperand(1));
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
   SDValue LHS = GetScalarizedVector(N->getOperand(1));
   return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
@@ -401,6 +409,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FPOWI:             SplitVecRes_FPOWI(N, Lo, Hi); break;
   case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
   case ISD::SCALAR_TO_VECTOR:  SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break;
+  case ISD::SIGN_EXTEND_INREG: SplitVecRes_SIGN_EXTEND_INREG(N, Lo, Hi); break;
   case ISD::LOAD:
     SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
     break;
@@ -700,6 +709,18 @@ void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
   Hi = DAG.getUNDEF(HiVT);
 }
 
+void DAGTypeLegalizer::SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo,
+                                                     SDValue &Hi) {
+  SDValue LHSLo, LHSHi;
+  GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+  DebugLoc dl = N->getDebugLoc();
+
+  Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
+                   N->getOperand(1));
+  Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi,
+                   N->getOperand(1));
+}
+
 void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
                                         SDValue &Hi) {
   assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
@@ -1141,6 +1162,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
   case ISD::LOAD:              Res = WidenVecRes_LOAD(N); break;
   case ISD::SCALAR_TO_VECTOR:  Res = WidenVecRes_SCALAR_TO_VECTOR(N); break;
+  case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_SIGN_EXTEND_INREG(N); break;
   case ISD::SELECT:            Res = WidenVecRes_SELECT(N); break;
   case ISD::SELECT_CC:         Res = WidenVecRes_SELECT_CC(N); break;
   case ISD::UNDEF:             Res = WidenVecRes_UNDEF(N); break;
@@ -1691,6 +1713,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
                      WidenVT, N->getOperand(0));
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_SIGN_EXTEND_INREG(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue WidenLHS = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(),
+                     WidenVT, WidenLHS, N->getOperand(1));
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   unsigned WidenNumElts = WidenVT.getVectorNumElements();
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index d53de34..b2ee8bb 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -20,10 +20,16 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+cl::opt<bool>
+DisableInstScheduling("disable-inst-scheduling",
+                      cl::init(false),
+                      cl::desc("Disable instruction scheduling"));
+
 ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
   : ScheduleDAG(mf) {
 }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c38c79b..da55e6b 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -27,6 +27,7 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -47,6 +48,8 @@
 #include <cmath>
 using namespace llvm;
 
+extern cl::opt<bool> DisableInstScheduling;
+
 /// makeVTList - Return an instance of the SDVTList struct initialized with the
 /// specified members.
 static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) {
@@ -551,6 +554,9 @@ void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes,
     }
 
     DeallocateNode(N);
+
+    // Remove the ordering of this node.
+    if (Ordering) Ordering->remove(N);
   }
 }
 
@@ -576,6 +582,9 @@ void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
   N->DropOperands();
 
   DeallocateNode(N);
+
+  // Remove the ordering of this node.
+  if (Ordering) Ordering->remove(N);
 }
 
 void SelectionDAG::DeallocateNode(SDNode *N) {
@@ -587,6 +596,9 @@ void SelectionDAG::DeallocateNode(SDNode *N) {
   N->NodeType = ISD::DELETED_NODE;
 
   NodeAllocator.Deallocate(AllNodes.remove(N));
+
+  // Remove the ordering of this node.
+  if (Ordering) Ordering->remove(N);
 }
 
 /// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
@@ -690,7 +702,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 1);
   AddNodeIDCustom(ID, N);
-  return CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  if (Ordering) Ordering->remove(Node);
+  return Node;
 }
 
 /// FindModifiedNodeSlot - Find a slot for the specified node if its operands
@@ -707,7 +721,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 2);
   AddNodeIDCustom(ID, N);
-  return CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  if (Ordering) Ordering->remove(Node);
+  return Node;
 }
 
 
@@ -724,7 +740,9 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, NumOps);
   AddNodeIDCustom(ID, N);
-  return CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  if (Ordering) Ordering->remove(Node);
+  return Node;
 }
 
 /// VerifyNode - Sanity check the given node.  Aborts if it is invalid.
@@ -777,8 +795,13 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
 SelectionDAG::SelectionDAG(TargetLowering &tli, FunctionLoweringInfo &fli)
   : TLI(tli), FLI(fli), DW(0),
     EntryNode(ISD::EntryToken, DebugLoc::getUnknownLoc(),
-    getVTList(MVT::Other)), Root(getEntryNode()) {
+              getVTList(MVT::Other)),
+    Root(getEntryNode()), Ordering(0) {
   AllNodes.push_back(&EntryNode);
+  if (DisableInstScheduling) {
+    Ordering = new NodeOrdering();
+    Ordering->add(&EntryNode);
+  }
 }
 
 void SelectionDAG::init(MachineFunction &mf, MachineModuleInfo *mmi,
@@ -791,6 +814,7 @@ void SelectionDAG::init(MachineFunction &mf, MachineModuleInfo *mmi,
 
 SelectionDAG::~SelectionDAG() {
   allnodes_clear();
+  delete Ordering;
 }
 
 void SelectionDAG::allnodes_clear() {
@@ -816,6 +840,10 @@ void SelectionDAG::clear() {
   EntryNode.UseList = 0;
   AllNodes.push_back(&EntryNode);
   Root = getEntryNode();
+  if (DisableInstScheduling) {
+    Ordering = new NodeOrdering();
+    Ordering->add(&EntryNode);
+  }
 }
 
 SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
@@ -831,8 +859,12 @@ SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
 }
 
 SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, DebugLoc DL, EVT VT) {
+  assert(!VT.isVector() &&
+         "getZeroExtendInReg should use the vector element type instead of "
+         "the vector type!");
   if (Op.getValueType() == VT) return Op;
-  APInt Imm = APInt::getLowBitsSet(Op.getValueSizeInBits(),
+  unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+  APInt Imm = APInt::getLowBitsSet(BitWidth,
                                    VT.getSizeInBits());
   return getNode(ISD::AND, DL, Op.getValueType(), Op,
                  getConstant(Imm, Op.getValueType()));
@@ -872,14 +904,17 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT) {
   ID.AddPointer(&Val);
   void *IP = 0;
   SDNode *N = NULL;
-  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP)))
+  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP))) {
+    if (Ordering) Ordering->add(N);
     if (!VT.isVector())
       return SDValue(N, 0);
+  }
   if (!N) {
     N = NodeAllocator.Allocate<ConstantSDNode>();
     new (N) ConstantSDNode(isT, &Val, EltVT);
     CSEMap.InsertNode(N, IP);
     AllNodes.push_back(N);
+    if (Ordering) Ordering->add(N);
   }
 
   SDValue Result(N, 0);
@@ -916,14 +951,17 @@ SDValue SelectionDAG::getConstantFP(const ConstantFP& V, EVT VT, bool isTarget){
   ID.AddPointer(&V);
   void *IP = 0;
   SDNode *N = NULL;
-  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP)))
+  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP))) {
+    if (Ordering) Ordering->add(N);
     if (!VT.isVector())
       return SDValue(N, 0);
+  }
   if (!N) {
     N = NodeAllocator.Allocate<ConstantFPSDNode>();
     new (N) ConstantFPSDNode(isTarget, &V, EltVT);
     CSEMap.InsertNode(N, IP);
     AllNodes.push_back(N);
+    if (Ordering) Ordering->add(N);
   }
 
   SDValue Result(N, 0);
@@ -978,12 +1016,15 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV,
   ID.AddInteger(Offset);
   ID.AddInteger(TargetFlags);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<GlobalAddressSDNode>();
   new (N) GlobalAddressSDNode(Opc, GV, VT, Offset, TargetFlags);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -993,12 +1034,15 @@ SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
   AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
   ID.AddInteger(FI);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<FrameIndexSDNode>();
   new (N) FrameIndexSDNode(FI, VT, isTarget);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1012,12 +1056,15 @@ SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
   ID.AddInteger(JTI);
   ID.AddInteger(TargetFlags);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<JumpTableSDNode>();
   new (N) JumpTableSDNode(JTI, VT, isTarget, TargetFlags);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1037,12 +1084,15 @@ SDValue SelectionDAG::getConstantPool(Constant *C, EVT VT,
   ID.AddPointer(C);
   ID.AddInteger(TargetFlags);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<ConstantPoolSDNode>();
   new (N) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1063,12 +1113,15 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
   C->AddSelectionDAGCSEId(ID);
   ID.AddInteger(TargetFlags);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<ConstantPoolSDNode>();
   new (N) ConstantPoolSDNode(isTarget, C, VT, Offset, Alignment, TargetFlags);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1077,12 +1130,15 @@ SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
   AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), 0, 0);
   ID.AddPointer(MBB);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<BasicBlockSDNode>();
   new (N) BasicBlockSDNode(MBB);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1098,6 +1154,7 @@ SDValue SelectionDAG::getValueType(EVT VT) {
   N = NodeAllocator.Allocate<VTSDNode>();
   new (N) VTSDNode(VT);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1107,6 +1164,7 @@ SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
   N = NodeAllocator.Allocate<ExternalSymbolSDNode>();
   new (N) ExternalSymbolSDNode(false, Sym, 0, VT);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1119,6 +1177,7 @@ SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
   N = NodeAllocator.Allocate<ExternalSymbolSDNode>();
   new (N) ExternalSymbolSDNode(true, Sym, TargetFlags, VT);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1131,6 +1190,7 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
     new (N) CondCodeSDNode(Cond);
     CondCodeNodes[Cond] = N;
     AllNodes.push_back(N);
+    if (Ordering) Ordering->add(N);
   }
   return SDValue(CondCodeNodes[Cond], 0);
 }
@@ -1223,8 +1283,10 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, DebugLoc dl, SDValue N1,
     ID.AddInteger(MaskVec[i]);
 
   void* IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
 
   // Allocate the mask array for the node out of the BumpPtrAllocator, since
   // SDNode doesn't have access to it.  This memory will be "leaked" when
@@ -1236,6 +1298,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, DebugLoc dl, SDValue N1,
   new (N) ShuffleVectorSDNode(VT, dl, N1, N2, MaskAlloc);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1253,12 +1316,15 @@ SDValue SelectionDAG::getConvertRndSat(EVT VT, DebugLoc dl,
   SDValue Ops[] = { Val, DTy, STy, Rnd, Sat };
   AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), &Ops[0], 5);
   void* IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   CvtRndSatSDNode *N = NodeAllocator.Allocate<CvtRndSatSDNode>();
   new (N) CvtRndSatSDNode(VT, dl, Ops, 5, Code);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1267,12 +1333,15 @@ SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
   AddNodeIDNode(ID, ISD::Register, getVTList(VT), 0, 0);
   ID.AddInteger(RegNo);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<RegisterSDNode>();
   new (N) RegisterSDNode(RegNo, VT);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1284,12 +1353,15 @@ SDValue SelectionDAG::getLabel(unsigned Opcode, DebugLoc dl,
   AddNodeIDNode(ID, Opcode, getVTList(MVT::Other), &Ops[0], 1);
   ID.AddInteger(LabelID);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<LabelSDNode>();
   new (N) LabelSDNode(Opcode, dl, Root, LabelID);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1303,12 +1375,15 @@ SDValue SelectionDAG::getBlockAddress(BlockAddress *BA, EVT VT,
   ID.AddPointer(BA);
   ID.AddInteger(TargetFlags);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<BlockAddressSDNode>();
   new (N) BlockAddressSDNode(Opc, VT, BA, TargetFlags);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1321,13 +1396,16 @@ SDValue SelectionDAG::getSrcValue(const Value *V) {
   ID.AddPointer(V);
 
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
 
   SDNode *N = NodeAllocator.Allocate<SrcValueSDNode>();
   new (N) SrcValueSDNode(V);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -1480,7 +1558,7 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
   if (Op.getValueType().isVector())
     return false;
 
-  unsigned BitWidth = Op.getValueSizeInBits();
+  unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
   return MaskedValueIsZero(Op, APInt::getSignBit(BitWidth), Depth);
 }
 
@@ -1503,7 +1581,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
                                      APInt &KnownZero, APInt &KnownOne,
                                      unsigned Depth) const {
   unsigned BitWidth = Mask.getBitWidth();
-  assert(BitWidth == Op.getValueType().getSizeInBits() &&
+  assert(BitWidth == Op.getValueType().getScalarType().getSizeInBits() &&
          "Mask size mismatches value type size!");
 
   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
@@ -1760,7 +1838,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
   }
   case ISD::ZERO_EXTEND: {
     EVT InVT = Op.getOperand(0).getValueType();
-    unsigned InBits = InVT.getSizeInBits();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
     APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits) & Mask;
     APInt InMask    = Mask;
     InMask.trunc(InBits);
@@ -1774,7 +1852,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
   }
   case ISD::SIGN_EXTEND: {
     EVT InVT = Op.getOperand(0).getValueType();
-    unsigned InBits = InVT.getSizeInBits();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
     APInt InSignBit = APInt::getSignBit(InBits);
     APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits) & Mask;
     APInt InMask = Mask;
@@ -1815,7 +1893,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
   }
   case ISD::ANY_EXTEND: {
     EVT InVT = Op.getOperand(0).getValueType();
-    unsigned InBits = InVT.getSizeInBits();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
     APInt InMask = Mask;
     InMask.trunc(InBits);
     KnownZero.trunc(InBits);
@@ -1827,7 +1905,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
   }
   case ISD::TRUNCATE: {
     EVT InVT = Op.getOperand(0).getValueType();
-    unsigned InBits = InVT.getSizeInBits();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
     APInt InMask = Mask;
     InMask.zext(InBits);
     KnownZero.zext(InBits);
@@ -1960,7 +2038,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
 unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
   EVT VT = Op.getValueType();
   assert(VT.isInteger() && "Invalid VT!");
-  unsigned VTBits = VT.getSizeInBits();
+  unsigned VTBits = VT.getScalarType().getSizeInBits();
   unsigned Tmp, Tmp2;
   unsigned FirstAnswer = 1;
 
@@ -1987,7 +2065,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
   }
 
   case ISD::SIGN_EXTEND:
-    Tmp = VTBits-Op.getOperand(0).getValueType().getSizeInBits();
+    Tmp = VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
     return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
 
   case ISD::SIGN_EXTEND_INREG:
@@ -2238,13 +2316,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT) {
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, Opcode, getVTList(VT), 0, 0);
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<SDNode>();
   new (N) SDNode(Opcode, DL, getVTList(VT));
   CSEMap.InsertNode(N, IP);
 
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -2349,6 +2430,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     assert(VT.isFloatingPoint() &&
            Operand.getValueType().isFloatingPoint() && "Invalid FP cast!");
     if (Operand.getValueType() == VT) return Operand;  // noop conversion.
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
     if (Operand.getOpcode() == ISD::UNDEF)
       return getUNDEF(VT);
     break;
@@ -2356,8 +2441,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
            "Invalid SIGN_EXTEND!");
     if (Operand.getValueType() == VT) return Operand;   // noop extension
-    assert(Operand.getValueType().bitsLT(VT)
-           && "Invalid sext node, dst < src!");
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid sext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
     if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
       return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
     break;
@@ -2365,8 +2454,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
            "Invalid ZERO_EXTEND!");
     if (Operand.getValueType() == VT) return Operand;   // noop extension
-    assert(Operand.getValueType().bitsLT(VT)
-           && "Invalid zext node, dst < src!");
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid zext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
     if (OpOpcode == ISD::ZERO_EXTEND)   // (zext (zext x)) -> (zext x)
       return getNode(ISD::ZERO_EXTEND, DL, VT,
                      Operand.getNode()->getOperand(0));
@@ -2375,8 +2468,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
            "Invalid ANY_EXTEND!");
     if (Operand.getValueType() == VT) return Operand;   // noop extension
-    assert(Operand.getValueType().bitsLT(VT)
-           && "Invalid anyext node, dst < src!");
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid anyext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
     if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND)
       // (ext (zext x)) -> (zext x)  and  (ext (sext x)) -> (sext x)
       return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
@@ -2385,14 +2482,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     assert(VT.isInteger() && Operand.getValueType().isInteger() &&
            "Invalid TRUNCATE!");
     if (Operand.getValueType() == VT) return Operand;   // noop truncate
-    assert(Operand.getValueType().bitsGT(VT)
-           && "Invalid truncate node, src < dst!");
+    assert(Operand.getValueType().getScalarType().bitsGT(VT.getScalarType()) &&
+           "Invalid truncate node, src < dst!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
     if (OpOpcode == ISD::TRUNCATE)
       return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
     else if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
              OpOpcode == ISD::ANY_EXTEND) {
       // If the source is smaller than the dest, we still need an extend.
-      if (Operand.getNode()->getOperand(0).getValueType().bitsLT(VT))
+      if (Operand.getNode()->getOperand(0).getValueType().getScalarType()
+            .bitsLT(VT.getScalarType()))
         return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
       else if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT))
         return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
@@ -2447,8 +2549,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     SDValue Ops[1] = { Operand };
     AddNodeIDNode(ID, Opcode, VTs, Ops, 1);
     void *IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
+    }
     N = NodeAllocator.Allocate<UnarySDNode>();
     new (N) UnarySDNode(Opcode, DL, VTs, Operand);
     CSEMap.InsertNode(N, IP);
@@ -2458,6 +2562,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
   }
 
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -2623,6 +2728,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     assert(VT == N1.getValueType() && "Not an inreg extend!");
     assert(VT.isInteger() && EVT.isInteger() &&
            "Cannot *_EXTEND_INREG FP types");
+    assert(!EVT.isVector() &&
+           "AssertSExt/AssertZExt type should be the vector element type "
+           "rather than the vector type!");
     assert(EVT.bitsLE(VT) && "Not extending!");
     if (VT == EVT) return N1; // noop assertion.
     break;
@@ -2632,12 +2740,15 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     assert(VT == N1.getValueType() && "Not an inreg extend!");
     assert(VT.isInteger() && EVT.isInteger() &&
            "Cannot *_EXTEND_INREG FP types");
-    assert(EVT.bitsLE(VT) && "Not extending!");
+    assert(!EVT.isVector() &&
+           "SIGN_EXTEND_INREG type should be the vector element type rather "
+           "than the vector type!");
+    assert(EVT.bitsLE(VT.getScalarType()) && "Not extending!");
     if (EVT == VT) return N1;  // Not actually extending
 
     if (N1C) {
       APInt Val = N1C->getAPIntValue();
-      unsigned FromBits = cast<VTSDNode>(N2)->getVT().getSizeInBits();
+      unsigned FromBits = EVT.getSizeInBits();
       Val <<= Val.getBitWidth()-FromBits;
       Val = Val.ashr(Val.getBitWidth()-FromBits);
       return getConstant(Val, VT);
@@ -2859,8 +2970,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTs, Ops, 2);
     void *IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
+    }
     N = NodeAllocator.Allocate<BinarySDNode>();
     new (N) BinarySDNode(Opcode, DL, VTs, N1, N2);
     CSEMap.InsertNode(N, IP);
@@ -2870,6 +2983,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
   }
 
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -2936,8 +3050,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
     void *IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
+    }
     N = NodeAllocator.Allocate<TernarySDNode>();
     new (N) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
     CSEMap.InsertNode(N, IP);
@@ -2945,7 +3061,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     N = NodeAllocator.Allocate<TernarySDNode>();
     new (N) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
   }
+
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -3541,12 +3659,14 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
   void* IP = 0;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
     cast<AtomicSDNode>(E)->refineAlignment(MMO);
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
   }
   SDNode* N = NodeAllocator.Allocate<AtomicSDNode>();
   new (N) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain, Ptr, Cmp, Swp, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3604,12 +3724,14 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
   void* IP = 0;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
     cast<AtomicSDNode>(E)->refineAlignment(MMO);
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
   }
   SDNode* N = NodeAllocator.Allocate<AtomicSDNode>();
   new (N) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain, Ptr, Val, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3682,6 +3804,7 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
     void *IP = 0;
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
       cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
     }
 
@@ -3693,6 +3816,7 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
     new (N) MemIntrinsicSDNode(Opcode, dl, VTList, Ops, NumOps, MemVT, MMO);
   }
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3732,16 +3856,15 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, DebugLoc dl,
     assert(VT == MemVT && "Non-extending load from different memory type!");
   } else {
     // Extending load.
-    if (VT.isVector())
-      assert(MemVT.getVectorNumElements() == VT.getVectorNumElements() &&
-             "Invalid vector extload!");
-    else
-      assert(MemVT.bitsLT(VT) &&
-             "Should only be an extending load, not truncating!");
-    assert((ExtType == ISD::EXTLOAD || VT.isInteger()) &&
-           "Cannot sign/zero extend a FP/Vector load!");
+    assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
+           "Should only be an extending load, not truncating!");
     assert(VT.isInteger() == MemVT.isInteger() &&
            "Cannot convert from FP to Int or Int -> FP!");
+    assert(VT.isVector() == MemVT.isVector() &&
+           "Cannot use trunc store to convert to or from a vector!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
+           "Cannot use trunc store to change the number of vector elements!");
   }
 
   bool Indexed = AM != ISD::UNINDEXED;
@@ -3758,12 +3881,14 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, DebugLoc dl,
   void *IP = 0;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
     cast<LoadSDNode>(E)->refineAlignment(MMO);
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
   }
   SDNode *N = NodeAllocator.Allocate<LoadSDNode>();
   new (N) LoadSDNode(Ops, dl, VTs, AM, ExtType, MemVT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3834,12 +3959,14 @@ SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
   void *IP = 0;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
     cast<StoreSDNode>(E)->refineAlignment(MMO);
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
   }
   SDNode *N = NodeAllocator.Allocate<StoreSDNode>();
   new (N) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED, false, VT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3874,10 +4001,15 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
   if (VT == SVT)
     return getStore(Chain, dl, Val, Ptr, MMO);
 
-  assert(VT.bitsGT(SVT) && "Not a truncation?");
+  assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
+         "Should only be a truncating store, not extending!");
   assert(VT.isInteger() == SVT.isInteger() &&
          "Can't do FP-INT conversion!");
-
+  assert(VT.isVector() == SVT.isVector() &&
+         "Cannot use trunc store to convert to or from a vector!");
+  assert((!VT.isVector() ||
+          VT.getVectorNumElements() == SVT.getVectorNumElements()) &&
+         "Cannot use trunc store to change the number of vector elements!");
 
   SDVTList VTs = getVTList(MVT::Other);
   SDValue Undef = getUNDEF(Ptr.getValueType());
@@ -3889,12 +4021,14 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
   void *IP = 0;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
     cast<StoreSDNode>(E)->refineAlignment(MMO);
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
   }
   SDNode *N = NodeAllocator.Allocate<StoreSDNode>();
   new (N) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED, true, SVT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3911,14 +4045,17 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base,
   ID.AddInteger(ST->getMemoryVT().getRawBits());
   ID.AddInteger(ST->getRawSubclassData());
   void *IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    if (Ordering) Ordering->add(E);
     return SDValue(E, 0);
+  }
   SDNode *N = NodeAllocator.Allocate<StoreSDNode>();
   new (N) StoreSDNode(Ops, dl, VTs, AM,
                       ST->isTruncatingStore(), ST->getMemoryVT(),
                       ST->getMemOperand());
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
   return SDValue(N, 0);
 }
 
@@ -3984,8 +4121,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     AddNodeIDNode(ID, Opcode, VTs, Ops, NumOps);
     void *IP = 0;
 
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
+    }
 
     N = NodeAllocator.Allocate<SDNode>();
     new (N) SDNode(Opcode, DL, VTs, Ops, NumOps);
@@ -3996,6 +4135,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
   }
 
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -4051,8 +4191,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
     void *IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return SDValue(E, 0);
+    }
     if (NumOps == 1) {
       N = NodeAllocator.Allocate<UnarySDNode>();
       new (N) UnarySDNode(Opcode, DL, VTList, Ops[0]);
@@ -4083,6 +4225,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
     }
   }
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -4166,7 +4309,7 @@ SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
                           I->VTs[2] == VT3 && I->VTs[3] == VT4)
       return *I;
 
-  EVT *Array = Allocator.Allocate<EVT>(3);
+  EVT *Array = Allocator.Allocate<EVT>(4);
   Array[0] = VT1;
   Array[1] = VT2;
   Array[2] = VT3;
@@ -4545,8 +4688,10 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
   if (VTs.VTs[VTs.NumVTs-1] != MVT::Flag) {
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opc, VTs, Ops, NumOps);
-    if (SDNode *ON = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *ON = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(ON);
       return ON;
+    }
   }
 
   if (!RemoveNodeFromCSEMaps(N))
@@ -4610,6 +4755,7 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
 
   if (IP)
     CSEMap.InsertNode(N, IP);   // Memoize the new node.
+  if (Ordering) Ordering->add(N);
   return N;
 }
 
@@ -4748,8 +4894,10 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc DL, SDVTList VTs,
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, ~Opcode, VTs, Ops, NumOps);
     IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return cast<MachineSDNode>(E);
+    }
   }
 
   // Allocate a new MachineSDNode.
@@ -4771,6 +4919,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc DL, SDVTList VTs,
     CSEMap.InsertNode(N, IP);
 
   AllNodes.push_back(N);
+  if (Ordering) Ordering->add(N);
 #ifndef NDEBUG
   VerifyNode(N);
 #endif
@@ -4807,8 +4956,10 @@ SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
     void *IP = 0;
-    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      if (Ordering) Ordering->add(E);
       return E;
+    }
   }
   return NULL;
 }
@@ -5867,6 +6018,99 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
                  &Scalars[0], Scalars.size());
 }
 
+
+/// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a 
+/// location that is 'Dist' units away from the location that the 'Base' load 
+/// is loading from.
+bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, 
+                                     unsigned Bytes, int Dist) const {
+  if (LD->getChain() != Base->getChain())
+    return false;
+  EVT VT = LD->getValueType(0);
+  if (VT.getSizeInBits() / 8 != Bytes)
+    return false;
+
+  SDValue Loc = LD->getOperand(1);
+  SDValue BaseLoc = Base->getOperand(1);
+  if (Loc.getOpcode() == ISD::FrameIndex) {
+    if (BaseLoc.getOpcode() != ISD::FrameIndex)
+      return false;
+    const MachineFrameInfo *MFI = getMachineFunction().getFrameInfo();
+    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
+    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+    int FS  = MFI->getObjectSize(FI);
+    int BFS = MFI->getObjectSize(BFI);
+    if (FS != BFS || FS != (int)Bytes) return false;
+    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
+  }
+  if (Loc.getOpcode() == ISD::ADD && Loc.getOperand(0) == BaseLoc) {
+    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Loc.getOperand(1));
+    if (V && (V->getSExtValue() == Dist*Bytes))
+      return true;
+  }
+
+  GlobalValue *GV1 = NULL;
+  GlobalValue *GV2 = NULL;
+  int64_t Offset1 = 0;
+  int64_t Offset2 = 0;
+  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+  if (isGA1 && isGA2 && GV1 == GV2)
+    return Offset1 == (Offset2 + Dist*Bytes);
+  return false;
+}
+
+
+/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
+/// it cannot be inferred.
+unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
+  // If this is a GlobalAddress + cst, return the alignment.
+  GlobalValue *GV;
+  int64_t GVOffset = 0;
+  if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset))
+    return MinAlign(GV->getAlignment(), GVOffset);
+
+  // If this is a direct reference to a stack slot, use information about the
+  // stack slot's alignment.
+  int FrameIdx = 1 << 31;
+  int64_t FrameOffset = 0;
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
+    FrameIdx = FI->getIndex();
+  } else if (Ptr.getOpcode() == ISD::ADD &&
+             isa<ConstantSDNode>(Ptr.getOperand(1)) &&
+             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+    FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+    FrameOffset = Ptr.getConstantOperandVal(1);
+  }
+
+  if (FrameIdx != (1 << 31)) {
+    // FIXME: Handle FI+CST.
+    const MachineFrameInfo &MFI = *getMachineFunction().getFrameInfo();
+    unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
+                                    FrameOffset);
+    if (MFI.isFixedObjectIndex(FrameIdx)) {
+      int64_t ObjectOffset = MFI.getObjectOffset(FrameIdx) + FrameOffset;
+
+      // The alignment of the frame index can be determined from its offset from
+      // the incoming frame position.  If the frame object is at offset 32 and
+      // the stack is guaranteed to be 16-byte aligned, then we know that the
+      // object is 16-byte aligned.
+      unsigned StackAlign = getTarget().getFrameInfo()->getStackAlignment();
+      unsigned Align = MinAlign(ObjectOffset, StackAlign);
+
+      // Finally, the frame object itself may have a known alignment.  Factor
+      // the alignment + offset into a new alignment.  For example, if we know
+      // the FI is 8 byte aligned, but the pointer is 4 off, we really have a
+      // 4-byte alignment of the resultant pointer.  Likewise align 4 + 4-byte
+      // offset = 4-byte alignment, align 4 + 1-byte offset = align 1, etc.
+      return std::max(Align, FIInfoAlign);
+    }
+    return FIInfoAlign;
+  }
+
+  return 0;
+}
+
 void SelectionDAG::dump() const {
   errs() << "SelectionDAG has " << AllNodes.size() << " nodes:";
 
@@ -5882,6 +6126,9 @@ void SelectionDAG::dump() const {
   errs() << "\n\n";
 }
 
+void SelectionDAG::NodeOrdering::dump() const {
+}
+
 void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const {
   print_types(OS, G);
   print_details(OS, G);
@@ -6022,4 +6269,3 @@ bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
       return false;
   return true;
 }
-
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 57d8903..7568384 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -583,6 +583,9 @@ void SelectionDAGBuilder::visit(Instruction &I) {
 }
 
 void SelectionDAGBuilder::visit(unsigned Opcode, User &I) {
+  // Tell the DAG that we're processing a new instruction.
+  DAG.NewInst();
+
   // Note: this doesn't use InstVisitor, because it has to work with
   // ConstantExpr's in addition to instructions.
   switch (Opcode) {
@@ -2108,7 +2111,7 @@ void SelectionDAGBuilder::visitSelect(User &I) {
 
     for (unsigned i = 0; i != NumValues; ++i)
       Values[i] = DAG.getNode(ISD::SELECT, getCurDebugLoc(),
-                              TrueVal.getValueType(), Cond,
+                              TrueVal.getNode()->getValueType(i), Cond,
                               SDValue(TrueVal.getNode(), TrueVal.getResNo() + i),
                               SDValue(FalseVal.getNode(), FalseVal.getResNo() + i));
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index c39437f..a640c7d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -60,8 +60,6 @@
 using namespace llvm;
 
 static cl::opt<bool>
-DisableLegalizeTypes("disable-legalize-types", cl::Hidden);
-static cl::opt<bool>
 EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
           cl::desc("Enable verbose messages in the \"fast\" "
                    "instruction selector"));
@@ -362,6 +360,39 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   return true;
 }
 
+/// SetDebugLoc - Update MF's and SDB's DebugLocs if debug information is
+/// attached with this instruction.
+static void SetDebugLoc(unsigned MDDbgKind,
+                        MetadataContext &TheMetadata,
+                        Instruction *I,
+                        SelectionDAGBuilder *SDB,
+                        FastISel *FastIS,
+                        MachineFunction *MF) {
+  if (!isa<DbgInfoIntrinsic>(I)) 
+    if (MDNode *Dbg = TheMetadata.getMD(MDDbgKind, I)) {
+      DILocation DILoc(Dbg);
+      DebugLoc Loc = ExtractDebugLocation(DILoc, MF->getDebugLocInfo());
+
+      SDB->setCurDebugLoc(Loc);
+
+      if (FastIS)
+        FastIS->setCurDebugLoc(Loc);
+
+      // If the function doesn't have a default debug location yet, set
+      // it. This is kind of a hack.
+      if (MF->getDefaultDebugLoc().isUnknown())
+        MF->setDefaultDebugLoc(Loc);
+    }
+}
+
+/// ResetDebugLoc - Set MF's and SDB's DebugLocs to Unknown.
+static void ResetDebugLoc(SelectionDAGBuilder *SDB,
+                          FastISel *FastIS) {
+  SDB->setCurDebugLoc(DebugLoc::getUnknownLoc());
+  if (FastIS)
+    FastIS->setCurDebugLoc(DebugLoc::getUnknownLoc());
+}
+
 void SelectionDAGISel::SelectBasicBlock(BasicBlock *LLVMBB,
                                         BasicBlock::iterator Begin,
                                         BasicBlock::iterator End,
@@ -373,20 +404,16 @@ void SelectionDAGISel::SelectBasicBlock(BasicBlock *LLVMBB,
   // Lower all of the non-terminator instructions. If a call is emitted
   // as a tail call, cease emitting nodes for this block.
   for (BasicBlock::iterator I = Begin; I != End && !SDB->HasTailCall; ++I) {
-    if (MDDbgKind) {
-      // Update DebugLoc if debug information is attached with this
-      // instruction.
-      if (!isa<DbgInfoIntrinsic>(I)) 
-        if (MDNode *Dbg = TheMetadata.getMD(MDDbgKind, I)) {
-          DILocation DILoc(Dbg);
-          DebugLoc Loc = ExtractDebugLocation(DILoc, MF->getDebugLocInfo());
-          SDB->setCurDebugLoc(Loc);
-          if (MF->getDefaultDebugLoc().isUnknown())
-            MF->setDefaultDebugLoc(Loc);
-        }
-    }
-    if (!isa<TerminatorInst>(I))
+    if (MDDbgKind)
+      SetDebugLoc(MDDbgKind, TheMetadata, I, SDB, 0, MF);
+
+    if (!isa<TerminatorInst>(I)) {
       SDB->visit(*I);
+
+      // Set the current debug location back to "unknown" so that it doesn't
+      // spuriously apply to subsequent instructions.
+      ResetDebugLoc(SDB, 0);
+    }
   }
 
   if (!SDB->HasTailCall) {
@@ -401,7 +428,9 @@ void SelectionDAGISel::SelectBasicBlock(BasicBlock *LLVMBB,
       HandlePHINodesInSuccessorBlocks(LLVMBB);
 
       // Lower the terminator after the copies are emitted.
+      SetDebugLoc(MDDbgKind, TheMetadata, LLVMBB->getTerminator(), SDB, 0, MF);
       SDB->visit(*LLVMBB->getTerminator());
+      ResetDebugLoc(SDB, 0);
     }
   }
 
@@ -498,75 +527,73 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
 
   // Second step, hack on the DAG until it only uses operations and types that
   // the target supports.
-  if (!DisableLegalizeTypes) {
-    if (ViewLegalizeTypesDAGs) CurDAG->viewGraph("legalize-types input for " +
-                                                 BlockName);
+  if (ViewLegalizeTypesDAGs) CurDAG->viewGraph("legalize-types input for " +
+                                               BlockName);
+
+  bool Changed;
+  if (TimePassesIsEnabled) {
+    NamedRegionTimer T("Type Legalization", GroupName);
+    Changed = CurDAG->LegalizeTypes();
+  } else {
+    Changed = CurDAG->LegalizeTypes();
+  }
+
+  DEBUG(errs() << "Type-legalized selection DAG:\n");
+  DEBUG(CurDAG->dump());
 
-    bool Changed;
+  if (Changed) {
+    if (ViewDAGCombineLT)
+      CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
+
+    // Run the DAG combiner in post-type-legalize mode.
     if (TimePassesIsEnabled) {
-      NamedRegionTimer T("Type Legalization", GroupName);
-      Changed = CurDAG->LegalizeTypes();
+      NamedRegionTimer T("DAG Combining after legalize types", GroupName);
+      CurDAG->Combine(NoIllegalTypes, *AA, OptLevel);
     } else {
-      Changed = CurDAG->LegalizeTypes();
+      CurDAG->Combine(NoIllegalTypes, *AA, OptLevel);
     }
 
-    DEBUG(errs() << "Type-legalized selection DAG:\n");
+    DEBUG(errs() << "Optimized type-legalized selection DAG:\n");
     DEBUG(CurDAG->dump());
+  }
 
-    if (Changed) {
-      if (ViewDAGCombineLT)
-        CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
-
-      // Run the DAG combiner in post-type-legalize mode.
-      if (TimePassesIsEnabled) {
-        NamedRegionTimer T("DAG Combining after legalize types", GroupName);
-        CurDAG->Combine(NoIllegalTypes, *AA, OptLevel);
-      } else {
-        CurDAG->Combine(NoIllegalTypes, *AA, OptLevel);
-      }
-
-      DEBUG(errs() << "Optimized type-legalized selection DAG:\n");
-      DEBUG(CurDAG->dump());
-    }
+  if (TimePassesIsEnabled) {
+    NamedRegionTimer T("Vector Legalization", GroupName);
+    Changed = CurDAG->LegalizeVectors();
+  } else {
+    Changed = CurDAG->LegalizeVectors();
+  }
 
+  if (Changed) {
     if (TimePassesIsEnabled) {
-      NamedRegionTimer T("Vector Legalization", GroupName);
-      Changed = CurDAG->LegalizeVectors();
+      NamedRegionTimer T("Type Legalization 2", GroupName);
+      Changed = CurDAG->LegalizeTypes();
     } else {
-      Changed = CurDAG->LegalizeVectors();
+      Changed = CurDAG->LegalizeTypes();
     }
 
-    if (Changed) {
-      if (TimePassesIsEnabled) {
-        NamedRegionTimer T("Type Legalization 2", GroupName);
-        Changed = CurDAG->LegalizeTypes();
-      } else {
-        Changed = CurDAG->LegalizeTypes();
-      }
-
-      if (ViewDAGCombineLT)
-        CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
+    if (ViewDAGCombineLT)
+      CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
 
-      // Run the DAG combiner in post-type-legalize mode.
-      if (TimePassesIsEnabled) {
-        NamedRegionTimer T("DAG Combining after legalize vectors", GroupName);
-        CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
-      } else {
-        CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
-      }
-
-      DEBUG(errs() << "Optimized vector-legalized selection DAG:\n");
-      DEBUG(CurDAG->dump());
+    // Run the DAG combiner in post-type-legalize mode.
+    if (TimePassesIsEnabled) {
+      NamedRegionTimer T("DAG Combining after legalize vectors", GroupName);
+      CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
+    } else {
+      CurDAG->Combine(NoIllegalOperations, *AA, OptLevel);
     }
+
+    DEBUG(errs() << "Optimized vector-legalized selection DAG:\n");
+    DEBUG(CurDAG->dump());
   }
 
   if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName);
 
   if (TimePassesIsEnabled) {
     NamedRegionTimer T("DAG Legalization", GroupName);
-    CurDAG->Legalize(DisableLegalizeTypes, OptLevel);
+    CurDAG->Legalize(OptLevel);
   } else {
-    CurDAG->Legalize(DisableLegalizeTypes, OptLevel);
+    CurDAG->Legalize(OptLevel);
   }
 
   DEBUG(errs() << "Legalized selection DAG:\n");
@@ -738,24 +765,11 @@ void SelectionDAGISel::SelectAllBasicBlocks(Function &Fn,
       FastIS->startNewBlock(BB);
       // Do FastISel on as many instructions as possible.
       for (; BI != End; ++BI) {
-        if (MDDbgKind) {
-          // Update DebugLoc if debug information is attached with this
-          // instruction.
-          if (!isa<DbgInfoIntrinsic>(BI)) 
-            if (MDNode *Dbg = TheMetadata.getMD(MDDbgKind, BI)) {
-              DILocation DILoc(Dbg);
-              DebugLoc Loc = ExtractDebugLocation(DILoc,
-                                                  MF.getDebugLocInfo());
-              FastIS->setCurDebugLoc(Loc);
-              if (MF.getDefaultDebugLoc().isUnknown())
-                MF.setDefaultDebugLoc(Loc);
-            }
-        }
-
         // Just before the terminator instruction, insert instructions to
         // feed PHI nodes in successor blocks.
         if (isa<TerminatorInst>(BI))
           if (!HandlePHINodesInSuccessorBlocksFast(LLVMBB, FastIS)) {
+            ResetDebugLoc(SDB, FastIS);
             if (EnableFastISelVerbose || EnableFastISelAbort) {
               errs() << "FastISel miss: ";
               BI->dump();
@@ -765,13 +779,18 @@ void SelectionDAGISel::SelectAllBasicBlocks(Function &Fn,
             break;
           }
 
+        if (MDDbgKind)
+          SetDebugLoc(MDDbgKind, TheMetadata, BI, SDB, FastIS, &MF);
+
         // First try normal tablegen-generated "fast" selection.
-        if (FastIS->SelectInstruction(BI))
+        if (FastIS->SelectInstruction(BI)) {
+          ResetDebugLoc(SDB, FastIS);
           continue;
+        }
 
-        // Next, try calling the target to attempt to handle the instruction.
-        if (FastIS->TargetSelectInstruction(BI))
-          continue;
+        // Clear out the debug location so that it doesn't carry over to
+        // unrelated instructions.
+        ResetDebugLoc(SDB, FastIS);
 
         // Then handle certain instructions as single-LLVM-Instruction blocks.
         if (isa<CallInst>(BI)) {
@@ -786,10 +805,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(Function &Fn,
               R = FuncInfo->CreateRegForValue(BI);
           }
 
-          SDB->setCurDebugLoc(FastIS->getCurDebugLoc());
-
           bool HadTailCall = false;
-          SelectBasicBlock(LLVMBB, BI, next(BI), HadTailCall);
+          SelectBasicBlock(LLVMBB, BI, llvm::next(BI), HadTailCall);
 
           // If the call was emitted as a tail call, we're done with the block.
           if (HadTailCall) {
@@ -823,9 +840,6 @@ void SelectionDAGISel::SelectAllBasicBlocks(Function &Fn,
     // not handled by FastISel. If FastISel is not run, this is the entire
     // block.
     if (BI != End) {
-      // If FastISel is run and it has known DebugLoc then use it.
-      if (FastIS && !FastIS->getCurDebugLoc().isUnknown())
-        SDB->setCurDebugLoc(FastIS->getCurDebugLoc());
       bool HadTailCall;
       SelectBasicBlock(LLVMBB, BI, End, HadTailCall);
     }
@@ -1313,14 +1327,6 @@ SDNode *SelectionDAGISel::Select_UNDEF(const SDValue &N) {
                               N.getValueType());
 }
 
-SDNode *SelectionDAGISel::Select_DBG_LABEL(const SDValue &N) {
-  SDValue Chain = N.getOperand(0);
-  unsigned C = cast<LabelSDNode>(N)->getLabelID();
-  SDValue Tmp = CurDAG->getTargetConstant(C, MVT::i32);
-  return CurDAG->SelectNodeTo(N.getNode(), TargetInstrInfo::DBG_LABEL,
-                              MVT::Other, Tmp, Chain);
-}
-
 SDNode *SelectionDAGISel::Select_EH_LABEL(const SDValue &N) {
   SDValue Chain = N.getOperand(0);
   unsigned C = cast<LabelSDNode>(N)->getLabelID();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
index c5adc50..83fa5a8 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -29,14 +29,14 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/config.h"
-#include <fstream>
 using namespace llvm;
 
 namespace llvm {
   template<>
   struct DOTGraphTraits<SelectionDAG*> : public DefaultDOTGraphTraits {
 
-    DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+    explicit DOTGraphTraits(bool isSimple=false) :
+      DefaultDOTGraphTraits(isSimple) {}
 
     static bool hasEdgeDestLabels() {
       return true;
@@ -50,6 +50,11 @@ namespace llvm {
       return ((const SDNode *) Node)->getValueType(i).getEVTString();
     }
 
+    template<typename EdgeIter>
+    static std::string getEdgeSourceLabel(const void *Node, EdgeIter I) {
+      return itostr(I - SDNodeIterator::begin((SDNode *) Node));
+    }
+
     /// edgeTargetsEdgeSource - This method returns true if this outgoing edge
     /// should actually target another edge source, not a node.  If this method
     /// is implemented, getEdgeTarget should be implemented.
@@ -76,12 +81,12 @@ namespace llvm {
     static bool renderGraphFromBottomUp() {
       return true;
     }
-    
+
     static bool hasNodeAddressLabel(const SDNode *Node,
                                     const SelectionDAG *Graph) {
       return true;
     }
-    
+
     /// If you want to override the dot attributes printed for a particular
     /// edge, override this method.
     template<typename EdgeIter>
@@ -94,7 +99,7 @@ namespace llvm {
         return "color=blue,style=dashed";
       return "";
     }
-    
+
 
     static std::string getSimpleNodeLabel(const SDNode *Node,
                                           const SelectionDAG *G) {
@@ -132,7 +137,7 @@ namespace llvm {
 
 std::string DOTGraphTraits<SelectionDAG*>::getNodeLabel(const SDNode *Node,
                                                         const SelectionDAG *G) {
-  return DOTGraphTraits<SelectionDAG*>::getSimpleNodeLabel (Node, G);
+  return DOTGraphTraits<SelectionDAG*>::getSimpleNodeLabel(Node, G);
 }
 
 
@@ -142,7 +147,7 @@ std::string DOTGraphTraits<SelectionDAG*>::getNodeLabel(const SDNode *Node,
 void SelectionDAG::viewGraph(const std::string &Title) {
 // This code is only for debugging!
 #ifndef NDEBUG
-  ViewGraph(this, "dag." + getMachineFunction().getFunction()->getNameStr(), 
+  ViewGraph(this, "dag." + getMachineFunction().getFunction()->getNameStr(),
             false, Title);
 #else
   errs() << "SelectionDAG::viewGraph is only available in debug builds on "
@@ -186,7 +191,7 @@ const std::string SelectionDAG::getGraphAttrs(const SDNode *N) const {
 #ifndef NDEBUG
   std::map<const SDNode *, std::string>::const_iterator I =
     NodeGraphAttrs.find(N);
-    
+
   if (I != NodeGraphAttrs.end())
     return I->second;
   else
@@ -252,8 +257,7 @@ void SelectionDAG::setSubgraphColor(SDNode *N, const char *Color) {
     // Visually mark that we hit the limit
     if (strcmp(Color, "red") == 0) {
       setSubgraphColorHelper(N, "blue", visited, 0, printed);
-    }
-    else if (strcmp(Color, "yellow") == 0) {
+    } else if (strcmp(Color, "yellow") == 0) {
       setSubgraphColorHelper(N, "green", visited, 0, printed);
     }
   }
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 68bc2d6..1026169 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -911,7 +911,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
                                           TargetLoweringOpt &TLO,
                                           unsigned Depth) const {
   unsigned BitWidth = DemandedMask.getBitWidth();
-  assert(Op.getValueSizeInBits() == BitWidth &&
+  assert(Op.getValueType().getScalarType().getSizeInBits() == BitWidth &&
          "Mask size mismatches value type size!");
   APInt NewMask = DemandedMask;
   DebugLoc dl = Op.getDebugLoc();
@@ -1240,7 +1240,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       // demand the input sign bit.
       APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
       if (HighBits.intersects(NewMask))
-        InDemandedMask |= APInt::getSignBit(VT.getSizeInBits());
+        InDemandedMask |= APInt::getSignBit(VT.getScalarType().getSizeInBits());
       
       if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask,
                                KnownZero, KnownOne, TLO, Depth+1))
@@ -2184,48 +2184,6 @@ bool TargetLowering::isGAPlusOffset(SDNode *N, GlobalValue* &GA,
 }
 
 
-/// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a 
-/// location that is 'Dist' units away from the location that the 'Base' load 
-/// is loading from.
-bool TargetLowering::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, 
-                                       unsigned Bytes, int Dist, 
-                                       const MachineFrameInfo *MFI) const {
-  if (LD->getChain() != Base->getChain())
-    return false;
-  EVT VT = LD->getValueType(0);
-  if (VT.getSizeInBits() / 8 != Bytes)
-    return false;
-
-  SDValue Loc = LD->getOperand(1);
-  SDValue BaseLoc = Base->getOperand(1);
-  if (Loc.getOpcode() == ISD::FrameIndex) {
-    if (BaseLoc.getOpcode() != ISD::FrameIndex)
-      return false;
-    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
-    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
-    int FS  = MFI->getObjectSize(FI);
-    int BFS = MFI->getObjectSize(BFI);
-    if (FS != BFS || FS != (int)Bytes) return false;
-    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
-  }
-  if (Loc.getOpcode() == ISD::ADD && Loc.getOperand(0) == BaseLoc) {
-    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Loc.getOperand(1));
-    if (V && (V->getSExtValue() == Dist*Bytes))
-      return true;
-  }
-
-  GlobalValue *GV1 = NULL;
-  GlobalValue *GV2 = NULL;
-  int64_t Offset1 = 0;
-  int64_t Offset2 = 0;
-  bool isGA1 = isGAPlusOffset(Loc.getNode(), GV1, Offset1);
-  bool isGA2 = isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
-  if (isGA1 && isGA2 && GV1 == GV2)
-    return Offset1 == (Offset2 + Dist*Bytes);
-  return false;
-}
-
-
 SDValue TargetLowering::
 PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
   // Default implementation: no optimization.
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp
index 5876371..ed407eb 100644
--- a/lib/CodeGen/SimpleRegisterCoalescing.cpp
+++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp
@@ -130,7 +130,8 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(LiveInterval &IntA,
   // See PR3149:
   // 172     %ECX<def> = MOV32rr %reg1039<kill>
   // 180     INLINEASM <es:subl $5,$1
-  //         sbbl $3,$0>, 10, %EAX<def>, 14, %ECX<earlyclobber,def>, 9, %EAX<kill>,
+  //         sbbl $3,$0>, 10, %EAX<def>, 14, %ECX<earlyclobber,def>, 9,
+  //         %EAX<kill>,
   // 36, <fi#0>, 1, %reg0, 0, 9, %ECX<kill>, 36, <fi#1>, 1, %reg0, 0
   // 188     %EAX<def> = MOV32rr %EAX<kill>
   // 196     %ECX<def> = MOV32rr %ECX<kill>
@@ -281,12 +282,12 @@ TransferImplicitOps(MachineInstr *MI, MachineInstr *NewMI) {
   }
 }
 
-/// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy with IntA
-/// being the source and IntB being the dest, thus this defines a value number
-/// in IntB.  If the source value number (in IntA) is defined by a commutable
-/// instruction and its other operand is coalesced to the copy dest register,
-/// see if we can transform the copy into a noop by commuting the definition. For
-/// example,
+/// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy with
+/// IntA being the source and IntB being the dest, thus this defines a value
+/// number in IntB.  If the source value number (in IntA) is defined by a
+/// commutable instruction and its other operand is coalesced to the copy dest
+/// register, see if we can transform the copy into a noop by commuting the
+/// definition. For example,
 ///
 ///  A3 = op A2 B0<kill>
 ///    ...
@@ -508,7 +509,8 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(LiveInterval &IntA,
     if (BHasSubRegs) {
       for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) {
         LiveInterval &SRLI = li_->getInterval(*SR);
-        SRLI.MergeInClobberRange(*li_, AI->start, End, li_->getVNInfoAllocator());
+        SRLI.MergeInClobberRange(*li_, AI->start, End,
+                                 li_->getVNInfoAllocator());
       }
     }
   }
@@ -708,7 +710,8 @@ bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt,
       checkForDeadDef = true;
     }
 
-  MachineBasicBlock::iterator MII = next(MachineBasicBlock::iterator(CopyMI));
+  MachineBasicBlock::iterator MII =
+    llvm::next(MachineBasicBlock::iterator(CopyMI));
   tii_->reMaterialize(*MBB, MII, DstReg, DstSubIdx, DefMI, tri_);
   MachineInstr *NewMI = prior(MII);
 
@@ -1314,7 +1317,13 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
                       "coalesced to another register.\n");
       return false;  // Not coalescable.
     }
-  } else if (!tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)){
+  } else if (tii_->isMoveInstr(*CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) {
+    if (SrcSubIdx && DstSubIdx && SrcSubIdx != DstSubIdx) {
+      // e.g. %reg16404:1<def> = MOV8rr %reg16412:2<kill>
+      Again = true;
+      return false;  // Not coalescable.
+    }
+  } else {
     llvm_unreachable("Unrecognized copy instruction!");
   }
 
@@ -1611,9 +1620,9 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
           }
         }
       } else {
-        // If the virtual register live interval is long but it has low use desity,
-        // do not join them, instead mark the physical register as its allocation
-        // preference.
+        // If the virtual register live interval is long but it has low use
+        // density, do not join them, instead mark the physical register as its
+        // allocation preference.
         LiveInterval &JoinVInt = SrcIsPhys ? DstInt : SrcInt;
         unsigned JoinVReg = SrcIsPhys ? DstReg : SrcReg;
         unsigned JoinPReg = SrcIsPhys ? SrcReg : DstReg;
@@ -1938,6 +1947,10 @@ bool SimpleRegisterCoalescing::SimpleJoin(LiveInterval &LHS, LiveInterval &RHS){
     if (Overlaps) {
       // If we haven't already recorded that this value # is safe, check it.
       if (!InVector(LHSIt->valno, EliminatedLHSVals)) {
+        // If it's re-defined by an early clobber somewhere in the live range,
+        // then conservatively abort coalescing.
+        if (LHSIt->valno->hasRedefByEC())
+          return false;
         // Copy from the RHS?
         if (!RangeIsDefinedByCopyFromReg(LHS, LHSIt, RHS.reg))
           return false;    // Nope, bail out.
@@ -1977,6 +1990,10 @@ bool SimpleRegisterCoalescing::SimpleJoin(LiveInterval &LHS, LiveInterval &RHS){
           // if coalescing succeeds.  Just skip the liverange.
           if (++LHSIt == LHSEnd) break;
         } else {
+          // If it's re-defined by an early clobber somewhere in the live range,
+          // then conservatively abort coalescing.
+          if (LHSIt->valno->hasRedefByEC())
+            return false;
           // Otherwise, if this is a copy from the RHS, mark it as being merged
           // in.
           if (RangeIsDefinedByCopyFromReg(LHS, LHSIt, RHS.reg)) {
@@ -2316,6 +2333,10 @@ SimpleRegisterCoalescing::JoinIntervals(LiveInterval &LHS, LiveInterval &RHS,
       if (LHSValNoAssignments[I->valno->id] !=
           RHSValNoAssignments[J->valno->id])
         return false;
+      // If it's re-defined by an early clobber somewhere in the live range,
+      // then conservatively abort coalescing.
+      if (NewVNInfo[LHSValNoAssignments[I->valno->id]]->hasRedefByEC())
+        return false;
     }
 
     if (I->end < J->end) {
@@ -2401,9 +2422,15 @@ void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB,
 
     // If this isn't a copy nor a extract_subreg, we can't join intervals.
     unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+    bool isInsUndef = false;
     if (Inst->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
       DstReg = Inst->getOperand(0).getReg();
       SrcReg = Inst->getOperand(1).getReg();
+    } else if (Inst->getOpcode() == TargetInstrInfo::INSERT_SUBREG) {
+      DstReg = Inst->getOperand(0).getReg();
+      SrcReg = Inst->getOperand(2).getReg();
+      if (Inst->getOperand(1).isUndef())
+        isInsUndef = true;
     } else if (Inst->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
                Inst->getOpcode() == TargetInstrInfo::SUBREG_TO_REG) {
       DstReg = Inst->getOperand(0).getReg();
@@ -2413,7 +2440,8 @@ void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB,
 
     bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
     bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
-    if (li_->hasInterval(SrcReg) && li_->getInterval(SrcReg).empty())
+    if (isInsUndef ||
+        (li_->hasInterval(SrcReg) && li_->getInterval(SrcReg).empty()))
       ImpDefCopies.push_back(CopyRec(Inst, 0));
     else if (SrcIsPhys || DstIsPhys)
       PhysCopies.push_back(CopyRec(Inst, 0));
@@ -2421,9 +2449,9 @@ void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB,
       VirtCopies.push_back(CopyRec(Inst, 0));
   }
 
-  // Try coalescing implicit copies first, followed by copies to / from
-  // physical registers, then finally copies from virtual registers to
-  // virtual registers.
+  // Try coalescing implicit copies and insert_subreg <undef> first,
+  // followed by copies to / from physical registers, then finally copies
+  // from virtual registers to virtual registers.
   for (unsigned i = 0, e = ImpDefCopies.size(); i != e; ++i) {
     CopyRec &TheCopy = ImpDefCopies[i];
     bool Again = false;
@@ -2594,114 +2622,6 @@ void SimpleRegisterCoalescing::releaseMemory() {
   ReMatDefs.clear();
 }
 
-/// Returns true if the given live interval is zero length.
-static bool isZeroLengthInterval(LiveInterval *li, LiveIntervals *li_) {
-  for (LiveInterval::Ranges::const_iterator
-         i = li->ranges.begin(), e = li->ranges.end(); i != e; ++i)
-    if (i->end.getPrevIndex() > i->start)
-      return false;
-  return true;
-}
-
-
-void SimpleRegisterCoalescing::CalculateSpillWeights() {
-  SmallSet<unsigned, 4> Processed;
-  for (MachineFunction::iterator mbbi = mf_->begin(), mbbe = mf_->end();
-       mbbi != mbbe; ++mbbi) {
-    MachineBasicBlock* MBB = mbbi;
-    SlotIndex MBBEnd = li_->getMBBEndIdx(MBB);
-    MachineLoop* loop = loopInfo->getLoopFor(MBB);
-    unsigned loopDepth = loop ? loop->getLoopDepth() : 0;
-    bool isExiting = loop ? loop->isLoopExiting(MBB) : false;
-
-    for (MachineBasicBlock::const_iterator mii = MBB->begin(), mie = MBB->end();
-         mii != mie; ++mii) {
-      const MachineInstr *MI = mii;
-      if (tii_->isIdentityCopy(*MI))
-        continue;
-
-      if (MI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF)
-        continue;
-
-      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-        const MachineOperand &mopi = MI->getOperand(i);
-        if (!mopi.isReg() || mopi.getReg() == 0)
-          continue;
-        unsigned Reg = mopi.getReg();
-        if (!TargetRegisterInfo::isVirtualRegister(mopi.getReg()))
-          continue;
-        // Multiple uses of reg by the same instruction. It should not
-        // contribute to spill weight again.
-        if (!Processed.insert(Reg))
-          continue;
-
-        bool HasDef = mopi.isDef();
-        bool HasUse = !HasDef;
-        for (unsigned j = i+1; j != e; ++j) {
-          const MachineOperand &mopj = MI->getOperand(j);
-          if (!mopj.isReg() || mopj.getReg() != Reg)
-            continue;
-          HasDef |= mopj.isDef();
-          HasUse |= mopj.isUse();
-          if (HasDef && HasUse)
-            break;
-        }
-
-        LiveInterval &RegInt = li_->getInterval(Reg);
-        float Weight = li_->getSpillWeight(HasDef, HasUse, loopDepth);
-        if (HasDef && isExiting) {
-          // Looks like this is a loop count variable update.
-          SlotIndex DefIdx = li_->getInstructionIndex(MI).getDefIndex();
-          const LiveRange *DLR =
-            li_->getInterval(Reg).getLiveRangeContaining(DefIdx);
-          if (DLR->end > MBBEnd)
-            Weight *= 3.0F;
-        }
-        RegInt.weight += Weight;
-      }
-      Processed.clear();
-    }
-  }
-
-  for (LiveIntervals::iterator I = li_->begin(), E = li_->end(); I != E; ++I) {
-    LiveInterval &LI = *I->second;
-    if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
-      // If the live interval length is essentially zero, i.e. in every live
-      // range the use follows def immediately, it doesn't make sense to spill
-      // it and hope it will be easier to allocate for this li.
-      if (isZeroLengthInterval(&LI, li_)) {
-        LI.weight = HUGE_VALF;
-        continue;
-      }
-
-      bool isLoad = false;
-      SmallVector<LiveInterval*, 4> SpillIs;
-      if (li_->isReMaterializable(LI, SpillIs, isLoad)) {
-        // If all of the definitions of the interval are re-materializable,
-        // it is a preferred candidate for spilling. If non of the defs are
-        // loads, then it's potentially very cheap to re-materialize.
-        // FIXME: this gets much more complicated once we support non-trivial
-        // re-materialization.
-        if (isLoad)
-          LI.weight *= 0.9F;
-        else
-          LI.weight *= 0.5F;
-      }
-
-      // Slightly prefer live interval that has been assigned a preferred reg.
-      std::pair<unsigned, unsigned> Hint = mri_->getRegAllocationHint(LI.reg);
-      if (Hint.first || Hint.second)
-        LI.weight *= 1.01F;
-
-      // Divide the weight of the interval by its size.  This encourages
-      // spilling of intervals that are large and have few uses, and
-      // discourages spilling of small intervals with many uses.
-      LI.weight /= li_->getApproximateInstructionCount(LI) * InstrSlots::NUM;
-    }
-  }
-}
-
-
 bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
   mf_ = &fn;
   mri_ = &fn.getRegInfo();
@@ -2727,7 +2647,8 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
     joinIntervals();
     DEBUG({
         errs() << "********** INTERVALS POST JOINING **********\n";
-        for (LiveIntervals::iterator I = li_->begin(), E = li_->end(); I != E; ++I){
+        for (LiveIntervals::iterator I = li_->begin(), E = li_->end();
+             I != E; ++I){
           I->second->print(errs(), tri_);
           errs() << "\n";
         }
@@ -2768,7 +2689,7 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
           DoDelete = true;
         }
         if (!DoDelete)
-          mii = next(mii);
+          mii = llvm::next(mii);
         else {
           li_->RemoveMachineInstrFromMaps(MI);
           mii = mbbi->erase(mii);
@@ -2831,8 +2752,6 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
     }
   }
 
-  CalculateSpillWeights();
-
   DEBUG(dump());
   return true;
 }
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.h b/lib/CodeGen/SimpleRegisterCoalescing.h
index 78f8a9a..605a740 100644
--- a/lib/CodeGen/SimpleRegisterCoalescing.h
+++ b/lib/CodeGen/SimpleRegisterCoalescing.h
@@ -244,10 +244,6 @@ namespace llvm {
     MachineOperand *lastRegisterUse(SlotIndex Start, SlotIndex End,
                                     unsigned Reg, SlotIndex &LastUseIdx) const;
 
-    /// CalculateSpillWeights - Compute spill weights for all virtual register
-    /// live intervals.
-    void CalculateSpillWeights();
-
     void printRegName(unsigned reg) const;
   };
 
diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp
index 237d0b5..bc246c1 100644
--- a/lib/CodeGen/Spiller.cpp
+++ b/lib/CodeGen/Spiller.cpp
@@ -20,22 +20,25 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include <set>
 
 using namespace llvm;
 
 namespace {
-  enum SpillerName { trivial, standard };
+  enum SpillerName { trivial, standard, splitting };
 }
 
 static cl::opt<SpillerName>
 spillerOpt("spiller",
            cl::desc("Spiller to use: (default: standard)"),
            cl::Prefix,
-           cl::values(clEnumVal(trivial, "trivial spiller"),
-                      clEnumVal(standard, "default spiller"),
+           cl::values(clEnumVal(trivial,   "trivial spiller"),
+                      clEnumVal(standard,  "default spiller"),
+                      clEnumVal(splitting, "splitting spiller"),
                       clEnumValEnd),
            cl::init(standard));
 
+// Spiller virtual destructor implementation.
 Spiller::~Spiller() {}
 
 namespace {
@@ -140,9 +143,9 @@ protected:
 
       // Insert store if necessary.
       if (hasDef) {
-        tii->storeRegToStackSlot(*mi->getParent(), next(miItr), newVReg, true,
+        tii->storeRegToStackSlot(*mi->getParent(), llvm::next(miItr), newVReg, true,
                                  ss, trc);
-        MachineInstr *storeInstr(next(miItr));
+        MachineInstr *storeInstr(llvm::next(miItr));
         SlotIndex storeIndex =
           lis->InsertMachineInstrInMaps(storeInstr).getDefIndex();
         SlotIndex beginIndex = storeIndex.getPrevIndex();
@@ -170,7 +173,8 @@ public:
     : SpillerBase(mf, lis, vrm) {}
 
   std::vector<LiveInterval*> spill(LiveInterval *li,
-                                   SmallVectorImpl<LiveInterval*> &spillIs) {
+                                   SmallVectorImpl<LiveInterval*> &spillIs,
+                                   SlotIndex*) {
     // Ignore spillIs - we don't use it.
     return trivialSpillEverywhere(li);
   }
@@ -179,23 +183,336 @@ public:
 
 /// Falls back on LiveIntervals::addIntervalsForSpills.
 class StandardSpiller : public Spiller {
-private:
+protected:
   LiveIntervals *lis;
   const MachineLoopInfo *loopInfo;
   VirtRegMap *vrm;
 public:
-  StandardSpiller(MachineFunction *mf, LiveIntervals *lis,
-                  const MachineLoopInfo *loopInfo, VirtRegMap *vrm)
+  StandardSpiller(LiveIntervals *lis, const MachineLoopInfo *loopInfo,
+                  VirtRegMap *vrm)
     : lis(lis), loopInfo(loopInfo), vrm(vrm) {}
 
   /// Falls back on LiveIntervals::addIntervalsForSpills.
   std::vector<LiveInterval*> spill(LiveInterval *li,
-                                   SmallVectorImpl<LiveInterval*> &spillIs) {
+                                   SmallVectorImpl<LiveInterval*> &spillIs,
+                                   SlotIndex*) {
     return lis->addIntervalsForSpills(*li, spillIs, loopInfo, *vrm);
   }
 
 };
 
+/// When a call to spill is placed this spiller will first try to break the
+/// interval up into its component values (one new interval per value).
+/// If this fails, or if a call is placed to spill a previously split interval
+/// then the spiller falls back on the standard spilling mechanism. 
+class SplittingSpiller : public StandardSpiller {
+public:
+  SplittingSpiller(MachineFunction *mf, LiveIntervals *lis,
+                   const MachineLoopInfo *loopInfo, VirtRegMap *vrm)
+    : StandardSpiller(lis, loopInfo, vrm) {
+
+    mri = &mf->getRegInfo();
+    tii = mf->getTarget().getInstrInfo();
+    tri = mf->getTarget().getRegisterInfo();
+  }
+
+  std::vector<LiveInterval*> spill(LiveInterval *li,
+                                   SmallVectorImpl<LiveInterval*> &spillIs,
+                                   SlotIndex *earliestStart) {
+    
+    if (worthTryingToSplit(li)) {
+      return tryVNISplit(li, earliestStart);
+    }
+    // else
+    return StandardSpiller::spill(li, spillIs, earliestStart);
+  }
+
+private:
+
+  MachineRegisterInfo *mri;
+  const TargetInstrInfo *tii;
+  const TargetRegisterInfo *tri;  
+  DenseSet<LiveInterval*> alreadySplit;
+
+  bool worthTryingToSplit(LiveInterval *li) const {
+    return (!alreadySplit.count(li) && li->getNumValNums() > 1);
+  }
+
+  /// Try to break a LiveInterval into its component values.
+  std::vector<LiveInterval*> tryVNISplit(LiveInterval *li,
+                                         SlotIndex *earliestStart) {
+
+    DEBUG(errs() << "Trying VNI split of %reg" << *li << "\n");
+
+    std::vector<LiveInterval*> added;
+    SmallVector<VNInfo*, 4> vnis;
+
+    std::copy(li->vni_begin(), li->vni_end(), std::back_inserter(vnis));
+   
+    for (SmallVectorImpl<VNInfo*>::iterator vniItr = vnis.begin(),
+         vniEnd = vnis.end(); vniItr != vniEnd; ++vniItr) {
+      VNInfo *vni = *vniItr;
+      
+      // Skip unused VNIs, or VNIs with no kills.
+      if (vni->isUnused() || vni->kills.empty())
+        continue;
+
+      DEBUG(errs() << "  Extracted Val #" << vni->id << " as ");
+      LiveInterval *splitInterval = extractVNI(li, vni);
+      
+      if (splitInterval != 0) {
+        DEBUG(errs() << *splitInterval << "\n");
+        added.push_back(splitInterval);
+        alreadySplit.insert(splitInterval);
+        if (earliestStart != 0) {
+          if (splitInterval->beginIndex() < *earliestStart)
+            *earliestStart = splitInterval->beginIndex();
+        }
+      } else {
+        DEBUG(errs() << "0\n");
+      }
+    } 
+
+    DEBUG(errs() << "Original LI: " << *li << "\n");
+
+    // If there original interval still contains some live ranges
+    // add it to added and alreadySplit.    
+    if (!li->empty()) {
+      added.push_back(li);
+      alreadySplit.insert(li);
+      if (earliestStart != 0) {
+        if (li->beginIndex() < *earliestStart)
+          *earliestStart = li->beginIndex();
+      }
+    }
+
+    return added;
+  }
+
+  /// Extract the given value number from the interval.
+  LiveInterval* extractVNI(LiveInterval *li, VNInfo *vni) const {
+    assert(vni->isDefAccurate() || vni->isPHIDef());
+    assert(!vni->kills.empty());
+
+    // Create a new vreg and live interval, copy VNI kills & ranges over.                                                                                                                                                     
+    const TargetRegisterClass *trc = mri->getRegClass(li->reg);
+    unsigned newVReg = mri->createVirtualRegister(trc);
+    vrm->grow();
+    LiveInterval *newLI = &lis->getOrCreateInterval(newVReg);
+    VNInfo *newVNI = newLI->createValueCopy(vni, lis->getVNInfoAllocator());
+
+    // Start by copying all live ranges in the VN to the new interval.                                                                                                                                                        
+    for (LiveInterval::iterator rItr = li->begin(), rEnd = li->end();
+         rItr != rEnd; ++rItr) {
+      if (rItr->valno == vni) {
+        newLI->addRange(LiveRange(rItr->start, rItr->end, newVNI));
+      }
+    }
+
+    // Erase the old VNI & ranges.                                                                                                                                                                                            
+    li->removeValNo(vni);
+
+    // Collect all current uses of the register belonging to the given VNI.
+    // We'll use this to rename the register after we've dealt with the def.
+    std::set<MachineInstr*> uses;
+    for (MachineRegisterInfo::use_iterator
+         useItr = mri->use_begin(li->reg), useEnd = mri->use_end();
+         useItr != useEnd; ++useItr) {
+      uses.insert(&*useItr);
+    }
+
+    // Process the def instruction for this VNI.
+    if (newVNI->isPHIDef()) {
+      // Insert a copy at the start of the MBB. The range proceeding the
+      // copy will be attached to the original LiveInterval.
+      MachineBasicBlock *defMBB = lis->getMBBFromIndex(newVNI->def);
+      tii->copyRegToReg(*defMBB, defMBB->begin(), newVReg, li->reg, trc, trc);
+      MachineInstr *copyMI = defMBB->begin();
+      copyMI->addRegisterKilled(li->reg, tri);
+      SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+      VNInfo *phiDefVNI = li->getNextValue(lis->getMBBStartIdx(defMBB),
+                                           0, false, lis->getVNInfoAllocator());
+      phiDefVNI->setIsPHIDef(true);
+      phiDefVNI->addKill(copyIdx.getDefIndex());
+      li->addRange(LiveRange(phiDefVNI->def, copyIdx.getDefIndex(), phiDefVNI));
+      LiveRange *oldPHIDefRange =
+        newLI->getLiveRangeContaining(lis->getMBBStartIdx(defMBB));
+
+      // If the old phi def starts in the middle of the range chop it up.
+      if (oldPHIDefRange->start < lis->getMBBStartIdx(defMBB)) {
+        LiveRange oldPHIDefRange2(copyIdx.getDefIndex(), oldPHIDefRange->end,
+                                  oldPHIDefRange->valno);
+        oldPHIDefRange->end = lis->getMBBStartIdx(defMBB);
+        newLI->addRange(oldPHIDefRange2);
+      } else if (oldPHIDefRange->start == lis->getMBBStartIdx(defMBB)) {
+        // Otherwise if it's at the start of the range just trim it.
+        oldPHIDefRange->start = copyIdx.getDefIndex();
+      } else {
+        assert(false && "PHI def range doesn't cover PHI def?");
+      }
+
+      newVNI->def = copyIdx.getDefIndex();
+      newVNI->setCopy(copyMI);
+      newVNI->setIsPHIDef(false); // not a PHI def anymore.
+      newVNI->setIsDefAccurate(true);
+    } else {
+      // non-PHI def. Rename the def. If it's two-addr that means renaming the use
+      // and inserting a new copy too.
+      MachineInstr *defInst = lis->getInstructionFromIndex(newVNI->def);
+      // We'll rename this now, so we can remove it from uses.
+      uses.erase(defInst);
+      unsigned defOpIdx = defInst->findRegisterDefOperandIdx(li->reg);
+      bool isTwoAddr = defInst->isRegTiedToUseOperand(defOpIdx),
+        twoAddrUseIsUndef = false;
+
+      for (unsigned i = 0; i < defInst->getNumOperands(); ++i) {
+        MachineOperand &mo = defInst->getOperand(i);
+        if (mo.isReg() && (mo.isDef() || isTwoAddr) && (mo.getReg()==li->reg)) {
+          mo.setReg(newVReg);
+          if (isTwoAddr && mo.isUse() && mo.isUndef())
+            twoAddrUseIsUndef = true;
+        }
+      }
+    
+      SlotIndex defIdx = lis->getInstructionIndex(defInst);
+      newVNI->def = defIdx.getDefIndex();
+
+      if (isTwoAddr && !twoAddrUseIsUndef) {
+        MachineBasicBlock *defMBB = defInst->getParent();
+        tii->copyRegToReg(*defMBB, defInst, newVReg, li->reg, trc, trc);
+        MachineInstr *copyMI = prior(MachineBasicBlock::iterator(defInst));
+        SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+        copyMI->addRegisterKilled(li->reg, tri);
+        LiveRange *origUseRange =
+          li->getLiveRangeContaining(newVNI->def.getUseIndex());
+        VNInfo *origUseVNI = origUseRange->valno;
+        origUseRange->end = copyIdx.getDefIndex();
+        bool updatedKills = false;
+        for (unsigned k = 0; k < origUseVNI->kills.size(); ++k) {
+          if (origUseVNI->kills[k] == defIdx.getDefIndex()) {
+            origUseVNI->kills[k] = copyIdx.getDefIndex();
+            updatedKills = true;
+            break;
+          }
+        }
+        assert(updatedKills && "Failed to update VNI kill list.");
+        VNInfo *copyVNI = newLI->getNextValue(copyIdx.getDefIndex(), copyMI,
+                                              true, lis->getVNInfoAllocator());
+        copyVNI->addKill(defIdx.getDefIndex());
+        LiveRange copyRange(copyIdx.getDefIndex(),defIdx.getDefIndex(),copyVNI);
+        newLI->addRange(copyRange);
+      }    
+    }
+    
+    for (std::set<MachineInstr*>::iterator
+         usesItr = uses.begin(), usesEnd = uses.end();
+         usesItr != usesEnd; ++usesItr) {
+      MachineInstr *useInst = *usesItr;
+      SlotIndex useIdx = lis->getInstructionIndex(useInst);
+      LiveRange *useRange =
+        newLI->getLiveRangeContaining(useIdx.getUseIndex());
+
+      // If this use doesn't belong to the new interval skip it.
+      if (useRange == 0)
+        continue;
+
+      // This use doesn't belong to the VNI, skip it.
+      if (useRange->valno != newVNI)
+        continue;
+
+      // Check if this instr is two address.
+      unsigned useOpIdx = useInst->findRegisterUseOperandIdx(li->reg);
+      bool isTwoAddress = useInst->isRegTiedToDefOperand(useOpIdx);
+      
+      // Rename uses (and defs for two-address instrs).
+      for (unsigned i = 0; i < useInst->getNumOperands(); ++i) {
+        MachineOperand &mo = useInst->getOperand(i);
+        if (mo.isReg() && (mo.isUse() || isTwoAddress) &&
+            (mo.getReg() == li->reg)) {
+          mo.setReg(newVReg);
+        }
+      }
+
+      // If this is a two address instruction we've got some extra work to do.
+      if (isTwoAddress) {
+        // We modified the def operand, so we need to copy back to the original
+        // reg.
+        MachineBasicBlock *useMBB = useInst->getParent();
+        MachineBasicBlock::iterator useItr(useInst);
+        tii->copyRegToReg(*useMBB, next(useItr), li->reg, newVReg, trc, trc);
+        MachineInstr *copyMI = next(useItr);
+        copyMI->addRegisterKilled(newVReg, tri);
+        SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+
+        // Change the old two-address defined range & vni to start at
+        // (and be defined by) the copy.
+        LiveRange *origDefRange =
+          li->getLiveRangeContaining(useIdx.getDefIndex());
+        origDefRange->start = copyIdx.getDefIndex();
+        origDefRange->valno->def = copyIdx.getDefIndex();
+        origDefRange->valno->setCopy(copyMI);
+
+        // Insert a new range & vni for the two-address-to-copy value. This
+        // will be attached to the new live interval.
+        VNInfo *copyVNI =
+          newLI->getNextValue(useIdx.getDefIndex(), 0, true,
+                              lis->getVNInfoAllocator());
+        copyVNI->addKill(copyIdx.getDefIndex());
+        LiveRange copyRange(useIdx.getDefIndex(),copyIdx.getDefIndex(),copyVNI);
+        newLI->addRange(copyRange);
+      }
+    }
+    
+    // Iterate over any PHI kills - we'll need to insert new copies for them.
+    for (VNInfo::KillSet::iterator
+         killItr = newVNI->kills.begin(), killEnd = newVNI->kills.end();
+         killItr != killEnd; ++killItr) {
+      SlotIndex killIdx(*killItr);
+      if (killItr->isPHI()) {
+        MachineBasicBlock *killMBB = lis->getMBBFromIndex(killIdx);
+        LiveRange *oldKillRange =
+          newLI->getLiveRangeContaining(killIdx);
+
+        assert(oldKillRange != 0 && "No kill range?");
+
+        tii->copyRegToReg(*killMBB, killMBB->getFirstTerminator(),
+                          li->reg, newVReg, trc, trc);
+        MachineInstr *copyMI = prior(killMBB->getFirstTerminator());
+        copyMI->addRegisterKilled(newVReg, tri);
+        SlotIndex copyIdx = lis->InsertMachineInstrInMaps(copyMI);
+
+        // Save the current end. We may need it to add a new range if the
+        // current range runs of the end of the MBB.
+        SlotIndex newKillRangeEnd = oldKillRange->end;
+        oldKillRange->end = copyIdx.getDefIndex();
+
+        if (newKillRangeEnd != lis->getMBBEndIdx(killMBB).getNextSlot()) {
+          assert(newKillRangeEnd > lis->getMBBEndIdx(killMBB).getNextSlot() &&
+                 "PHI kill range doesn't reach kill-block end. Not sane.");
+          newLI->addRange(LiveRange(lis->getMBBEndIdx(killMBB).getNextSlot(),
+                                    newKillRangeEnd, newVNI));
+        }
+
+        *killItr = oldKillRange->end;
+        VNInfo *newKillVNI = li->getNextValue(copyIdx.getDefIndex(),
+                                              copyMI, true,
+                                              lis->getVNInfoAllocator());
+        newKillVNI->addKill(lis->getMBBTerminatorGap(killMBB));
+        newKillVNI->setHasPHIKill(true);
+        li->addRange(LiveRange(copyIdx.getDefIndex(),
+                               lis->getMBBEndIdx(killMBB).getNextSlot(),
+                               newKillVNI));
+      }
+
+    }
+
+    newVNI->setHasPHIKill(false);
+
+    return newLI;
+  }
+
+};
+
 }
 
 llvm::Spiller* llvm::createSpiller(MachineFunction *mf, LiveIntervals *lis,
@@ -203,7 +520,8 @@ llvm::Spiller* llvm::createSpiller(MachineFunction *mf, LiveIntervals *lis,
                                    VirtRegMap *vrm) {
   switch (spillerOpt) {
     case trivial: return new TrivialSpiller(mf, lis, vrm); break;
-    case standard: return new StandardSpiller(mf, lis, loopInfo, vrm); break;
+    case standard: return new StandardSpiller(lis, loopInfo, vrm); break;
+    case splitting: return new SplittingSpiller(mf, lis, loopInfo, vrm); break;
     default: llvm_unreachable("Unreachable!"); break;
   }
 }
diff --git a/lib/CodeGen/Spiller.h b/lib/CodeGen/Spiller.h
index c6bd985..dda52e8 100644
--- a/lib/CodeGen/Spiller.h
+++ b/lib/CodeGen/Spiller.h
@@ -21,6 +21,7 @@ namespace llvm {
   class MachineFunction;
   class MachineInstr;
   class MachineLoopInfo;
+  class SlotIndex;
   class VirtRegMap;
   class VNInfo;
 
@@ -35,7 +36,8 @@ namespace llvm {
     /// Spill the given live range. The method used will depend on the Spiller
     /// implementation selected.
     virtual std::vector<LiveInterval*> spill(LiveInterval *li,
-                                   SmallVectorImpl<LiveInterval*> &spillIs) = 0;
+					     SmallVectorImpl<LiveInterval*> &spillIs,
+                                             SlotIndex *earliestIndex = 0) = 0;
 
   };
 
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index c299192..fd25a37 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -668,7 +668,7 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
     if (DCELimit != -1 && (int)NumDead >= DCELimit)
       break;
     
-    MachineBasicBlock::iterator NextMI = next(I);
+    MachineBasicBlock::iterator NextMI = llvm::next(I);
     if (NextMI == MBB->end()) continue;
     
     int FirstSS, SecondSS;
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 9c0b596..bf58902 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -17,15 +17,19 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
+STATISTIC(NumTails     , "Number of tails duplicated");
 STATISTIC(NumTailDups  , "Number of tail duplicated blocks");
 STATISTIC(NumInstrDups , "Additional instructions due to tail duplication");
 STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
@@ -36,34 +40,71 @@ TailDuplicateSize("tail-dup-size",
                   cl::desc("Maximum instructions to consider tail duplicating"),
                   cl::init(2), cl::Hidden);
 
+static cl::opt<bool>
+TailDupVerify("tail-dup-verify",
+              cl::desc("Verify sanity of PHI instructions during taildup"),
+              cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned>
+TailDupLimit("tail-dup-limit", cl::init(~0U), cl::Hidden);
+
+typedef std::vector<std::pair<MachineBasicBlock*,unsigned> > AvailableValsTy;
+
 namespace {
   /// TailDuplicatePass - Perform tail duplication.
   class TailDuplicatePass : public MachineFunctionPass {
+    bool PreRegAlloc;
     const TargetInstrInfo *TII;
     MachineModuleInfo *MMI;
+    MachineRegisterInfo *MRI;
+
+    // SSAUpdateVRs - A list of virtual registers for which to update SSA form.
+    SmallVector<unsigned, 16> SSAUpdateVRs;
+
+    // SSAUpdateVals - For each virtual register in SSAUpdateVals keep a list of
+    // source virtual registers.
+    DenseMap<unsigned, AvailableValsTy> SSAUpdateVals;
 
   public:
     static char ID;
-    explicit TailDuplicatePass() : MachineFunctionPass(&ID) {}
+    explicit TailDuplicatePass(bool PreRA) :
+      MachineFunctionPass(&ID), PreRegAlloc(PreRA) {}
 
     virtual bool runOnMachineFunction(MachineFunction &MF);
     virtual const char *getPassName() const { return "Tail Duplication"; }
 
   private:
+    void AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
+                           MachineBasicBlock *BB);
+    void ProcessPHI(MachineInstr *MI, MachineBasicBlock *TailBB,
+                    MachineBasicBlock *PredBB,
+                    DenseMap<unsigned, unsigned> &LocalVRMap,
+                    SmallVector<std::pair<unsigned,unsigned>, 4> &Copies);
+    void DuplicateInstruction(MachineInstr *MI,
+                              MachineBasicBlock *TailBB,
+                              MachineBasicBlock *PredBB,
+                              MachineFunction &MF,
+                              DenseMap<unsigned, unsigned> &LocalVRMap);
+    void UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
+                              SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                              SmallSetVector<MachineBasicBlock*, 8> &Succs);
     bool TailDuplicateBlocks(MachineFunction &MF);
-    bool TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF);
+    bool TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
+                       SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                       SmallVector<MachineInstr*, 16> &Copies);
     void RemoveDeadBlock(MachineBasicBlock *MBB);
   };
 
   char TailDuplicatePass::ID = 0;
 }
 
-FunctionPass *llvm::createTailDuplicatePass() {
-  return new TailDuplicatePass();
+FunctionPass *llvm::createTailDuplicatePass(bool PreRegAlloc) {
+  return new TailDuplicatePass(PreRegAlloc);
 }
 
 bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getTarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
   MMI = getAnalysisIfAvailable<MachineModuleInfo>();
 
   bool MadeChange = false;
@@ -77,36 +118,325 @@ bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
   return MadeChange;
 }
 
+static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
+  for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = I;
+    SmallSetVector<MachineBasicBlock*, 8> Preds(MBB->pred_begin(),
+                                                MBB->pred_end());
+    MachineBasicBlock::iterator MI = MBB->begin();
+    while (MI != MBB->end()) {
+      if (MI->getOpcode() != TargetInstrInfo::PHI)
+        break;
+      for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+             PE = Preds.end(); PI != PE; ++PI) {
+        MachineBasicBlock *PredBB = *PI;
+        bool Found = false;
+        for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+          MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
+          if (PHIBB == PredBB) {
+            Found = true;
+            break;
+          }
+        }
+        if (!Found) {
+          errs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+          errs() << "  missing input from predecessor BB#"
+                 << PredBB->getNumber() << '\n';
+          llvm_unreachable(0);
+        }
+      }
+
+      for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+        MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
+        if (CheckExtra && !Preds.count(PHIBB)) {
+          // This is not a hard error.
+          errs() << "Warning: malformed PHI in BB#" << MBB->getNumber()
+                 << ": " << *MI;
+          errs() << "  extra input from predecessor BB#"
+                 << PHIBB->getNumber() << '\n';
+        }
+        if (PHIBB->getNumber() < 0) {
+          errs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+          errs() << "  non-existing BB#" << PHIBB->getNumber() << '\n';
+          llvm_unreachable(0);
+        }
+      }
+      ++MI;
+    }
+  }
+}
+
 /// TailDuplicateBlocks - Look for small blocks that are unconditionally
 /// branched to and do not fall through. Tail-duplicate their instructions
 /// into their predecessors to eliminate (dynamic) branches.
 bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) {
   bool MadeChange = false;
 
+  if (PreRegAlloc && TailDupVerify) {
+    DEBUG(errs() << "\n*** Before tail-duplicating\n");
+    VerifyPHIs(MF, true);
+  }
+
+  SmallVector<MachineInstr*, 8> NewPHIs;
+  MachineSSAUpdater SSAUpdate(MF, &NewPHIs);
+
   for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) {
     MachineBasicBlock *MBB = I++;
 
+    if (NumTails == TailDupLimit)
+      break;
+
     // Only duplicate blocks that end with unconditional branches.
     if (MBB->canFallThrough())
       continue;
 
-    MadeChange |= TailDuplicate(MBB, MF);
-
-    // If it is dead, remove it.
-    if (MBB->pred_empty()) {
-      NumInstrDups -= MBB->size();
-      RemoveDeadBlock(MBB);
+    // Save the successors list.
+    SmallSetVector<MachineBasicBlock*, 8> Succs(MBB->succ_begin(),
+                                                MBB->succ_end());
+
+    SmallVector<MachineBasicBlock*, 8> TDBBs;
+    SmallVector<MachineInstr*, 16> Copies;
+    if (TailDuplicate(MBB, MF, TDBBs, Copies)) {
+      ++NumTails;
+
+      // TailBB's immediate successors are now successors of those predecessors
+      // which duplicated TailBB. Add the predecessors as sources to the PHI
+      // instructions.
+      bool isDead = MBB->pred_empty();
+      if (PreRegAlloc)
+        UpdateSuccessorsPHIs(MBB, isDead, TDBBs, Succs);
+
+      // If it is dead, remove it.
+      if (isDead) {
+        NumInstrDups -= MBB->size();
+        RemoveDeadBlock(MBB);
+        ++NumDeadBlocks;
+      }
+
+      // Update SSA form.
+      if (!SSAUpdateVRs.empty()) {
+        for (unsigned i = 0, e = SSAUpdateVRs.size(); i != e; ++i) {
+          unsigned VReg = SSAUpdateVRs[i];
+          SSAUpdate.Initialize(VReg);
+
+          // If the original definition is still around, add it as an available
+          // value.
+          MachineInstr *DefMI = MRI->getVRegDef(VReg);
+          MachineBasicBlock *DefBB = 0;
+          if (DefMI) {
+            DefBB = DefMI->getParent();
+            SSAUpdate.AddAvailableValue(DefBB, VReg);
+          }
+
+          // Add the new vregs as available values.
+          DenseMap<unsigned, AvailableValsTy>::iterator LI =
+            SSAUpdateVals.find(VReg);  
+          for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+            MachineBasicBlock *SrcBB = LI->second[j].first;
+            unsigned SrcReg = LI->second[j].second;
+            SSAUpdate.AddAvailableValue(SrcBB, SrcReg);
+          }
+
+          // Rewrite uses that are outside of the original def's block.
+          MachineRegisterInfo::use_iterator UI = MRI->use_begin(VReg);
+          while (UI != MRI->use_end()) {
+            MachineOperand &UseMO = UI.getOperand();
+            MachineInstr *UseMI = &*UI;
+            ++UI;
+            if (UseMI->getParent() == DefBB)
+              continue;
+            SSAUpdate.RewriteUse(UseMO);
+          }
+        }
+
+        SSAUpdateVRs.clear();
+        SSAUpdateVals.clear();
+      }
+
+      // Eliminate some of the copies inserted tail duplication to maintain
+      // SSA form.
+      for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
+        MachineInstr *Copy = Copies[i];
+        unsigned Src, Dst, SrcSR, DstSR;
+        if (TII->isMoveInstr(*Copy, Src, Dst, SrcSR, DstSR)) {
+          MachineRegisterInfo::use_iterator UI = MRI->use_begin(Src);
+          if (++UI == MRI->use_end()) {
+            // Copy is the only use. Do trivial copy propagation here.
+            MRI->replaceRegWith(Dst, Src);
+            Copy->eraseFromParent();
+          }
+        }
+      }
+
+      if (PreRegAlloc && TailDupVerify)
+        VerifyPHIs(MF, false);
       MadeChange = true;
-      ++NumDeadBlocks;
     }
   }
+
   return MadeChange;
 }
 
+static bool isDefLiveOut(unsigned Reg, MachineBasicBlock *BB,
+                         const MachineRegisterInfo *MRI) {
+  for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
+         UE = MRI->use_end(); UI != UE; ++UI) {
+    MachineInstr *UseMI = &*UI;
+    if (UseMI->getParent() != BB)
+      return true;
+  }
+  return false;
+}
+
+static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2)
+    if (MI->getOperand(i+1).getMBB() == SrcBB)
+      return i;
+  return 0;
+}
+
+/// AddSSAUpdateEntry - Add a definition and source virtual registers pair for
+/// SSA update.
+void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
+                                          MachineBasicBlock *BB) {
+  DenseMap<unsigned, AvailableValsTy>::iterator LI= SSAUpdateVals.find(OrigReg);
+  if (LI != SSAUpdateVals.end())
+    LI->second.push_back(std::make_pair(BB, NewReg));
+  else {
+    AvailableValsTy Vals;
+    Vals.push_back(std::make_pair(BB, NewReg));
+    SSAUpdateVals.insert(std::make_pair(OrigReg, Vals));
+    SSAUpdateVRs.push_back(OrigReg);
+  }
+}
+
+/// ProcessPHI - Process PHI node in TailBB by turning it into a copy in PredBB.
+/// Remember the source register that's contributed by PredBB and update SSA
+/// update map.
+void TailDuplicatePass::ProcessPHI(MachineInstr *MI,
+                                   MachineBasicBlock *TailBB,
+                                   MachineBasicBlock *PredBB,
+                                   DenseMap<unsigned, unsigned> &LocalVRMap,
+                         SmallVector<std::pair<unsigned,unsigned>, 4> &Copies) {
+  unsigned DefReg = MI->getOperand(0).getReg();
+  unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
+  assert(SrcOpIdx && "Unable to find matching PHI source?");
+  unsigned SrcReg = MI->getOperand(SrcOpIdx).getReg();
+  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+  LocalVRMap.insert(std::make_pair(DefReg, SrcReg));
+
+  // Insert a copy from source to the end of the block. The def register is the
+  // available value liveout of the block.
+  unsigned NewDef = MRI->createVirtualRegister(RC);
+  Copies.push_back(std::make_pair(NewDef, SrcReg));
+  if (isDefLiveOut(DefReg, TailBB, MRI))
+    AddSSAUpdateEntry(DefReg, NewDef, PredBB);
+
+  // Remove PredBB from the PHI node.
+  MI->RemoveOperand(SrcOpIdx+1);
+  MI->RemoveOperand(SrcOpIdx);
+  if (MI->getNumOperands() == 1)
+    MI->eraseFromParent();
+}
+
+/// DuplicateInstruction - Duplicate a TailBB instruction to PredBB and update
+/// the source operands due to earlier PHI translation.
+void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI,
+                                     MachineBasicBlock *TailBB,
+                                     MachineBasicBlock *PredBB,
+                                     MachineFunction &MF,
+                                     DenseMap<unsigned, unsigned> &LocalVRMap) {
+  MachineInstr *NewMI = MF.CloneMachineInstr(MI);
+  for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = NewMI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+    if (MO.isDef()) {
+      const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+      unsigned NewReg = MRI->createVirtualRegister(RC);
+      MO.setReg(NewReg);
+      LocalVRMap.insert(std::make_pair(Reg, NewReg));
+      if (isDefLiveOut(Reg, TailBB, MRI))
+        AddSSAUpdateEntry(Reg, NewReg, PredBB);
+    } else {
+      DenseMap<unsigned, unsigned>::iterator VI = LocalVRMap.find(Reg);
+      if (VI != LocalVRMap.end())
+        MO.setReg(VI->second);
+    }
+  }
+  PredBB->insert(PredBB->end(), NewMI);
+}
+
+/// UpdateSuccessorsPHIs - After FromBB is tail duplicated into its predecessor
+/// blocks, the successors have gained new predecessors. Update the PHI
+/// instructions in them accordingly.
+void
+TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
+                                  SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                  SmallSetVector<MachineBasicBlock*,8> &Succs) {
+  for (SmallSetVector<MachineBasicBlock*, 8>::iterator SI = Succs.begin(),
+         SE = Succs.end(); SI != SE; ++SI) {
+    MachineBasicBlock *SuccBB = *SI;
+    for (MachineBasicBlock::iterator II = SuccBB->begin(), EE = SuccBB->end();
+         II != EE; ++II) {
+      if (II->getOpcode() != TargetInstrInfo::PHI)
+        break;
+      unsigned Idx = 0;
+      for (unsigned i = 1, e = II->getNumOperands(); i != e; i += 2) {
+        MachineOperand &MO = II->getOperand(i+1);
+        if (MO.getMBB() == FromBB) {
+          Idx = i;
+          break;
+        }
+      }
+
+      assert(Idx != 0);
+      MachineOperand &MO0 = II->getOperand(Idx);
+      unsigned Reg = MO0.getReg();
+      if (isDead) {
+        // Folded into the previous BB.
+        // There could be duplicate phi source entries. FIXME: Should sdisel
+        // or earlier pass fixed this?
+        for (unsigned i = II->getNumOperands()-2; i != Idx; i -= 2) {
+          MachineOperand &MO = II->getOperand(i+1);
+          if (MO.getMBB() == FromBB) {
+            II->RemoveOperand(i+1);
+            II->RemoveOperand(i);
+          }
+        }
+        II->RemoveOperand(Idx+1);
+        II->RemoveOperand(Idx);
+      }
+      DenseMap<unsigned,AvailableValsTy>::iterator LI=SSAUpdateVals.find(Reg);
+      if (LI != SSAUpdateVals.end()) {
+        // This register is defined in the tail block.
+        for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+          MachineBasicBlock *SrcBB = LI->second[j].first;
+          unsigned SrcReg = LI->second[j].second;
+          II->addOperand(MachineOperand::CreateReg(SrcReg, false));
+          II->addOperand(MachineOperand::CreateMBB(SrcBB));
+        }
+      } else {
+        // Live in tail block, must also be live in predecessors.
+        for (unsigned j = 0, ee = TDBBs.size(); j != ee; ++j) {
+          MachineBasicBlock *SrcBB = TDBBs[j];
+          II->addOperand(MachineOperand::CreateReg(Reg, false));
+          II->addOperand(MachineOperand::CreateMBB(SrcBB));
+        }
+      }
+    }
+  }
+}
+
 /// TailDuplicate - If it is profitable, duplicate TailBB's contents in each
 /// of its predecessors.
-bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
-                                        MachineFunction &MF) {
+bool
+TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
+                                 SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                 SmallVector<MachineInstr*, 16> &Copies) {
   // Don't try to tail-duplicate single-block loops.
   if (TailBB->isSuccessor(TailBB))
     return false;
@@ -129,28 +459,36 @@ bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
 
   // Check the instructions in the block to determine whether tail-duplication
   // is invalid or unlikely to be profitable.
-  unsigned i = 0;
+  unsigned InstrCount = 0;
   bool HasCall = false;
   for (MachineBasicBlock::iterator I = TailBB->begin();
-       I != TailBB->end(); ++I, ++i) {
+       I != TailBB->end(); ++I) {
     // Non-duplicable things shouldn't be tail-duplicated.
     if (I->getDesc().isNotDuplicable()) return false;
+    // Do not duplicate 'return' instructions if this is a pre-regalloc run.
+    // A return may expand into a lot more instructions (e.g. reload of callee
+    // saved registers) after PEI.
+    if (PreRegAlloc && I->getDesc().isReturn()) return false;
     // Don't duplicate more than the threshold.
-    if (i == MaxDuplicateCount) return false;
+    if (InstrCount == MaxDuplicateCount) return false;
     // Remember if we saw a call.
     if (I->getDesc().isCall()) HasCall = true;
+    if (I->getOpcode() != TargetInstrInfo::PHI)
+      InstrCount += 1;
   }
   // Heuristically, don't tail-duplicate calls if it would expand code size,
   // as it's less likely to be worth the extra cost.
-  if (i > 1 && HasCall)
+  if (InstrCount > 1 && HasCall)
     return false;
 
+  DEBUG(errs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
+
   // Iterate through all the unique predecessors and tail-duplicate this
   // block into them, if possible. Copying the list ahead of time also
   // avoids trouble with the predecessor list reallocating.
   bool Changed = false;
-  SmallSetVector<MachineBasicBlock *, 8> Preds(TailBB->pred_begin(),
-                                               TailBB->pred_end());
+  SmallSetVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(),
+                                              TailBB->pred_end());
   for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
        PE = Preds.end(); PI != PE; ++PI) {
     MachineBasicBlock *PredBB = *PI;
@@ -175,13 +513,35 @@ bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
     DEBUG(errs() << "\nTail-duplicating into PredBB: " << *PredBB
                  << "From Succ: " << *TailBB);
 
+    TDBBs.push_back(PredBB);
+
     // Remove PredBB's unconditional branch.
     TII->RemoveBranch(*PredBB);
+
     // Clone the contents of TailBB into PredBB.
-    for (MachineBasicBlock::iterator I = TailBB->begin(), E = TailBB->end();
-         I != E; ++I) {
-      MachineInstr *NewMI = MF.CloneMachineInstr(I);
-      PredBB->insert(PredBB->end(), NewMI);
+    DenseMap<unsigned, unsigned> LocalVRMap;
+    SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
+    MachineBasicBlock::iterator I = TailBB->begin();
+    while (I != TailBB->end()) {
+      MachineInstr *MI = &*I;
+      ++I;
+      if (MI->getOpcode() == TargetInstrInfo::PHI) {
+        // Replace the uses of the def of the PHI with the register coming
+        // from PredBB.
+        ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos);
+      } else {
+        // Replace def of virtual registers with new registers, and update
+        // uses with PHI source register or the new registers.
+        DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap);
+      }
+    }
+    MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator();
+    for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
+      const TargetRegisterClass *RC = MRI->getRegClass(CopyInfos[i].first);
+      TII->copyRegToReg(*PredBB, Loc, CopyInfos[i].first,
+                        CopyInfos[i].second, RC,RC);
+      MachineInstr *CopyMI = prior(Loc);
+      Copies.push_back(CopyMI);
     }
     NumInstrDups += TailBB->size() - 1; // subtract one for removed branch
 
@@ -190,8 +550,8 @@ bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
     assert(PredBB->succ_empty() &&
            "TailDuplicate called on block with multiple successors!");
     for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(),
-         E = TailBB->succ_end(); I != E; ++I)
-       PredBB->addSuccessor(*I);
+           E = TailBB->succ_end(); I != E; ++I)
+      PredBB->addSuccessor(*I);
 
     Changed = true;
     ++NumTailDups;
@@ -200,22 +560,56 @@ bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
   // If TailBB was duplicated into all its predecessors except for the prior
   // block, which falls through unconditionally, move the contents of this
   // block into the prior block.
-  MachineBasicBlock &PrevBB = *prior(MachineFunction::iterator(TailBB));
+  MachineBasicBlock *PrevBB = prior(MachineFunction::iterator(TailBB));
   MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0;
   SmallVector<MachineOperand, 4> PriorCond;
   bool PriorUnAnalyzable =
-    TII->AnalyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
+    TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true);
   // This has to check PrevBB->succ_size() because EH edges are ignored by
   // AnalyzeBranch.
   if (!PriorUnAnalyzable && PriorCond.empty() && !PriorTBB &&
-      TailBB->pred_size() == 1 && PrevBB.succ_size() == 1 &&
+      TailBB->pred_size() == 1 && PrevBB->succ_size() == 1 &&
       !TailBB->hasAddressTaken()) {
-    DEBUG(errs() << "\nMerging into block: " << PrevBB
+    DEBUG(errs() << "\nMerging into block: " << *PrevBB
           << "From MBB: " << *TailBB);
-    PrevBB.splice(PrevBB.end(), TailBB, TailBB->begin(), TailBB->end());
-    PrevBB.removeSuccessor(PrevBB.succ_begin());;
-    assert(PrevBB.succ_empty());
-    PrevBB.transferSuccessors(TailBB);
+    if (PreRegAlloc) {
+      DenseMap<unsigned, unsigned> LocalVRMap;
+      SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
+      MachineBasicBlock::iterator I = TailBB->begin();
+      // Process PHI instructions first.
+      while (I != TailBB->end() && I->getOpcode() == TargetInstrInfo::PHI) {
+        // Replace the uses of the def of the PHI with the register coming
+        // from PredBB.
+        MachineInstr *MI = &*I++;
+        ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos);
+        if (MI->getParent())
+          MI->eraseFromParent();
+      }
+
+      // Now copy the non-PHI instructions.
+      while (I != TailBB->end()) {
+        // Replace def of virtual registers with new registers, and update
+        // uses with PHI source register or the new registers.
+        MachineInstr *MI = &*I++;
+        DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap);
+        MI->eraseFromParent();
+      }
+      MachineBasicBlock::iterator Loc = PrevBB->getFirstTerminator();
+      for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
+        const TargetRegisterClass *RC = MRI->getRegClass(CopyInfos[i].first);
+        TII->copyRegToReg(*PrevBB, Loc, CopyInfos[i].first,
+                          CopyInfos[i].second, RC, RC);
+        MachineInstr *CopyMI = prior(Loc);
+        Copies.push_back(CopyMI);
+      }
+    } else {
+      // No PHIs to worry about, just splice the instructions over.
+      PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end());
+    }
+    PrevBB->removeSuccessor(PrevBB->succ_begin());
+    assert(PrevBB->succ_empty());
+    PrevBB->transferSuccessors(TailBB);
+    TDBBs.push_back(PrevBB);
     Changed = true;
   }
 
diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp
index 102e2a3..393e315 100644
--- a/lib/CodeGen/TargetInstrInfoImpl.cpp
+++ b/lib/CodeGen/TargetInstrInfoImpl.cpp
@@ -329,7 +329,7 @@ TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(const MachineInstr *
       return false;
 
     // For the def, it should be the only def of that register.
-    if (MO.isDef() && (next(MRI.def_begin(Reg)) != MRI.def_end() ||
+    if (MO.isDef() && (llvm::next(MRI.def_begin(Reg)) != MRI.def_end() ||
                        MRI.isLiveIn(Reg)))
       return false;
 
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 5fa690b..98b95ac 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -211,7 +211,7 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
   ++KillPos;
 
   unsigned NumVisited = 0;
-  for (MachineBasicBlock::iterator I = next(OldPos); I != KillPos; ++I) {
+  for (MachineBasicBlock::iterator I = llvm::next(OldPos); I != KillPos; ++I) {
     MachineInstr *OtherMI = I;
     if (NumVisited > 30)  // FIXME: Arbitrary limit to reduce compile time cost.
       return false;
@@ -412,7 +412,7 @@ static bool isKilled(MachineInstr &MI, unsigned Reg,
     MachineRegisterInfo::def_iterator Begin = MRI->def_begin(Reg);
     // If there are multiple defs, we can't do a simple analysis, so just
     // go with what the kill flag says.
-    if (next(Begin) != MRI->def_end())
+    if (llvm::next(Begin) != MRI->def_end())
       return true;
     DefMI = &*Begin;
     bool IsSrcPhys, IsDstPhys;
@@ -643,7 +643,7 @@ TwoAddressInstructionPass::ConvertInstTo3Addr(MachineBasicBlock::iterator &mi,
     if (!Sunk) {
       DistanceMap.insert(std::make_pair(NewMI, Dist));
       mi = NewMI;
-      nmi = next(mi);
+      nmi = llvm::next(mi);
     }
     return true;
   }
@@ -923,7 +923,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
     Processed.clear();
     for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
          mi != me; ) {
-      MachineBasicBlock::iterator nmi = next(mi);
+      MachineBasicBlock::iterator nmi = llvm::next(mi);
       const TargetInstrDesc &TID = mi->getDesc();
       bool FirstTied = true;
 
diff --git a/lib/CodeGen/VirtRegRewriter.cpp b/lib/CodeGen/VirtRegRewriter.cpp
index 10c8066..054c3b6 100644
--- a/lib/CodeGen/VirtRegRewriter.cpp
+++ b/lib/CodeGen/VirtRegRewriter.cpp
@@ -754,7 +754,7 @@ void AvailableSpills::AddAvailableRegsToLiveIn(MachineBasicBlock &MBB,
     }
 
     // Skip over the same register.
-    std::multimap<unsigned, int>::iterator NI = next(I);
+    std::multimap<unsigned, int>::iterator NI = llvm::next(I);
     while (NI != E && NI->first == Reg) {
       ++I;
       ++NI;
@@ -1133,7 +1133,7 @@ private:
                          std::vector<MachineOperand*> &KillOps,
                          VirtRegMap &VRM) {
 
-    MachineBasicBlock::iterator NextMII = next(MII);
+    MachineBasicBlock::iterator NextMII = llvm::next(MII);
     if (NextMII == MBB.end())
       return false;
 
@@ -1186,7 +1186,7 @@ private:
     // Unfold next instructions that fold the same SS.
     do {
       MachineInstr &NextMI = *NextMII;
-      NextMII = next(NextMII);
+      NextMII = llvm::next(NextMII);
       NewMIs.clear();
       if (!TII->unfoldMemoryOperand(MF, &NextMI, VirtReg, false, false, NewMIs))
         llvm_unreachable("Unable unfold the load / store folding instruction!");
@@ -1463,8 +1463,8 @@ private:
                            std::vector<MachineOperand*> &KillOps,
                            VirtRegMap &VRM) {
 
-    MachineBasicBlock::iterator oldNextMII = next(MII);
-    TII->storeRegToStackSlot(MBB, next(MII), PhysReg, true, StackSlot, RC);
+    MachineBasicBlock::iterator oldNextMII = llvm::next(MII);
+    TII->storeRegToStackSlot(MBB, llvm::next(MII), PhysReg, true, StackSlot, RC);
     MachineInstr *StoreMI = prior(oldNextMII);
     VRM.addSpillSlotUse(StackSlot, StoreMI);
     DEBUG(errs() << "Store:\t" << *StoreMI);
@@ -1626,14 +1626,14 @@ private:
     DistanceMap.clear();
     for (MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
          MII != E; ) {
-      MachineBasicBlock::iterator NextMII = next(MII);
+      MachineBasicBlock::iterator NextMII = llvm::next(MII);
 
       VirtRegMap::MI2VirtMapTy::const_iterator I, End;
       bool Erased = false;
       bool BackTracked = false;
       if (OptimizeByUnfold(MBB, MII,
                            MaybeDeadStores, Spills, RegKills, KillOps, VRM))
-        NextMII = next(MII);
+        NextMII = llvm::next(MII);
 
       MachineInstr &MI = *MII;
 
@@ -1657,7 +1657,7 @@ private:
 
           // Back-schedule reloads and remats.
           MachineBasicBlock::iterator InsertLoc =
-            ComputeReloadLoc(next(MII), MBB.begin(), PhysReg, TRI, false,
+            ComputeReloadLoc(llvm::next(MII), MBB.begin(), PhysReg, TRI, false,
                              SS, TII, MF);
 
           TII->loadRegFromStackSlot(MBB, InsertLoc, PhysReg, SS, RC);
@@ -1667,7 +1667,7 @@ private:
           ++NumPSpills;
           DistanceMap.insert(std::make_pair(LoadMI, Dist++));
         }
-        NextMII = next(MII);
+        NextMII = llvm::next(MII);
       }
 
       // Insert restores here if asked to.
@@ -1785,14 +1785,14 @@ private:
           const TargetRegisterClass *RC = RegInfo->getRegClass(VirtReg);
           unsigned Phys = VRM.getPhys(VirtReg);
           int StackSlot = VRM.getStackSlot(VirtReg);
-          MachineBasicBlock::iterator oldNextMII = next(MII);
-          TII->storeRegToStackSlot(MBB, next(MII), Phys, isKill, StackSlot, RC);
+          MachineBasicBlock::iterator oldNextMII = llvm::next(MII);
+          TII->storeRegToStackSlot(MBB, llvm::next(MII), Phys, isKill, StackSlot, RC);
           MachineInstr *StoreMI = prior(oldNextMII);
           VRM.addSpillSlotUse(StackSlot, StoreMI);
           DEBUG(errs() << "Store:\t" << *StoreMI);
           VRM.virtFolded(VirtReg, StoreMI, VirtRegMap::isMod);
         }
-        NextMII = next(MII);
+        NextMII = llvm::next(MII);
       }
 
       /// ReusedOperands - Keep track of operand reuse in case we need to undo
@@ -2265,7 +2265,7 @@ private:
 
               if (CommuteToFoldReload(MBB, MII, VirtReg, SrcReg, StackSlot,
                                       Spills, RegKills, KillOps, TRI, VRM)) {
-                NextMII = next(MII);
+                NextMII = llvm::next(MII);
                 BackTracked = true;
                 goto ProcessNextInst;
               }
@@ -2381,7 +2381,7 @@ private:
           MachineInstr *&LastStore = MaybeDeadStores[StackSlot];
           SpillRegToStackSlot(MBB, MII, -1, PhysReg, StackSlot, RC, true,
                             LastStore, Spills, ReMatDefs, RegKills, KillOps, VRM);
-          NextMII = next(MII);
+          NextMII = llvm::next(MII);
 
           // Check to see if this is a noop copy.  If so, eliminate the
           // instruction before considering the dest reg to be changed.