Update LLVM to r84949.

author: rdivacky <rdivacky@FreeBSD.org> 2009-10-23 14:19:52 +0000
committer: rdivacky <rdivacky@FreeBSD.org> 2009-10-23 14:19:52 +0000
commit: 9643cca39fb9fb3b49a8912926de98acf882283c (patch)
tree: 22cc59e4b240d84c3a5a60531119c4eca914a256 /lib
parent: 1adacceba9c9ee0f16e54388e56c9a249b296f75 (diff)
download: FreeBSD-src-9643cca39fb9fb3b49a8912926de98acf882283c.zip
FreeBSD-src-9643cca39fb9fb3b49a8912926de98acf882283c.tar.gz
182 files changed, 6893 insertions, 3684 deletions
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index c5be616..756ffea 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -40,10 +40,6 @@ using namespace llvm;
 // Useful predicates
 //===----------------------------------------------------------------------===//
 
-static const GEPOperator *isGEP(const Value *V) {
-  return dyn_cast<GEPOperator>(V);
-}
-
 static const Value *GetGEPOperands(const Value *V, 
                                    SmallVector<Value*, 16> &GEPOps) {
   assert(GEPOps.empty() && "Expect empty list to populate!");
@@ -53,7 +49,7 @@ static const Value *GetGEPOperands(const Value *V,
   // Accumulate all of the chained indexes into the operand array
   V = cast<User>(V)->getOperand(0);
 
-  while (const User *G = isGEP(V)) {
+  while (const GEPOperator *G = dyn_cast<GEPOperator>(V)) {
     if (!isa<Constant>(GEPOps[0]) || isa<GlobalValue>(GEPOps[0]) ||
         !cast<Constant>(GEPOps[0])->isNullValue())
       break;  // Don't handle folding arbitrary pointer offsets yet...
@@ -222,7 +218,7 @@ namespace {
 
   private:
     // VisitedPHIs - Track PHI nodes visited by a aliasCheck() call.
-    SmallSet<const PHINode*, 16> VisitedPHIs;
+    SmallPtrSet<const PHINode*, 16> VisitedPHIs;
 
     // aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction
     // against another.
@@ -358,11 +354,13 @@ BasicAliasAnalysis::getModRefInfo(CallSite CS, Value *P, unsigned Size) {
         if (alias(II->getOperand(2), PtrSize, P, Size) == NoAlias)
           return NoModRef;
       }
+      break;
       case Intrinsic::invariant_end: {
         unsigned PtrSize = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
         if (alias(II->getOperand(3), PtrSize, P, Size) == NoAlias)
           return NoModRef;
       }
+      break;
       }
     }
   }
@@ -400,7 +398,7 @@ BasicAliasAnalysis::aliasGEP(const Value *V1, unsigned V1Size,
   // Note that we also handle chains of getelementptr instructions as well as
   // constant expression getelementptrs here.
   //
-  if (isGEP(V1) && isGEP(V2)) {
+  if (isa<GEPOperator>(V1) && isa<GEPOperator>(V2)) {
     const User *GEP1 = cast<User>(V1);
     const User *GEP2 = cast<User>(V2);
     
@@ -419,13 +417,13 @@ BasicAliasAnalysis::aliasGEP(const Value *V1, unsigned V1Size,
     
     // Drill down into the first non-gep value, to test for must-aliasing of
     // the base pointers.
-    while (isGEP(GEP1->getOperand(0)) &&
+    while (isa<GEPOperator>(GEP1->getOperand(0)) &&
            GEP1->getOperand(1) ==
            Constant::getNullValue(GEP1->getOperand(1)->getType()))
       GEP1 = cast<User>(GEP1->getOperand(0));
     const Value *BasePtr1 = GEP1->getOperand(0);
 
-    while (isGEP(GEP2->getOperand(0)) &&
+    while (isa<GEPOperator>(GEP2->getOperand(0)) &&
            GEP2->getOperand(1) ==
            Constant::getNullValue(GEP2->getOperand(1)->getType()))
       GEP2 = cast<User>(GEP2->getOperand(0));
@@ -531,7 +529,7 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, unsigned PNSize,
   if (!VisitedPHIs.insert(PN))
     return MayAlias;
 
-  SmallSet<Value*, 4> UniqueSrc;
+  SmallPtrSet<Value*, 4> UniqueSrc;
   SmallVector<Value*, 4> V1Srcs;
   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
     Value *PV1 = PN->getIncomingValue(i);
@@ -555,7 +553,7 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, unsigned PNSize,
   // NoAlias / MustAlias. Otherwise, returns MayAlias.
   for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) {
     Value *V = V1Srcs[i];
-    AliasResult ThisAlias = aliasCheck(V, PNSize, V2, V2Size);
+    AliasResult ThisAlias = aliasCheck(V2, V2Size, V, PNSize);
     if (ThisAlias != Alias || ThisAlias == MayAlias)
       return MayAlias;
   }
@@ -617,11 +615,11 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, unsigned V1Size,
       isNonEscapingLocalObject(O1) && O1 != O2)
     return NoAlias;
 
-  if (!isGEP(V1) && isGEP(V2)) {
+  if (!isa<GEPOperator>(V1) && isa<GEPOperator>(V2)) {
     std::swap(V1, V2);
     std::swap(V1Size, V2Size);
   }
-  if (isGEP(V1))
+  if (isa<GEPOperator>(V1))
     return aliasGEP(V1, V1Size, V2, V2Size);
 
   if (isa<PHINode>(V2) && !isa<PHINode>(V1)) {
diff --git a/lib/Analysis/CFGPrinter.cpp b/lib/Analysis/CFGPrinter.cpp
index 6fed400..08f070c 100644
--- a/lib/Analysis/CFGPrinter.cpp
+++ b/lib/Analysis/CFGPrinter.cpp
@@ -17,71 +17,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Pass.h"
 #include "llvm/Analysis/CFGPrinter.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/Support/CFG.h"
+
+#include "llvm/Pass.h"
 #include "llvm/Support/Compiler.h"
-#include "llvm/Support/GraphWriter.h"
 using namespace llvm;
 
-namespace llvm {
-template<>
-struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
-  static std::string getGraphName(const Function *F) {
-    return "CFG for '" + F->getNameStr() + "' function";
-  }
-
-  static std::string getNodeLabel(const BasicBlock *Node,
-                                  const Function *Graph,
-                                  bool ShortNames) {
-    if (ShortNames && !Node->getName().empty())
-      return Node->getNameStr() + ":";
-
-    std::string Str;
-    raw_string_ostream OS(Str);
-
-    if (ShortNames) {
-      WriteAsOperand(OS, Node, false);
-      return OS.str();
-    }
-
-    if (Node->getName().empty()) {
-      WriteAsOperand(OS, Node, false);
-      OS << ":";
-    }
-    
-    OS << *Node;
-    std::string OutStr = OS.str();
-    if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
-
-    // Process string output to make it nicer...
-    for (unsigned i = 0; i != OutStr.length(); ++i)
-      if (OutStr[i] == '\n') {                            // Left justify
-        OutStr[i] = '\\';
-        OutStr.insert(OutStr.begin()+i+1, 'l');
-      } else if (OutStr[i] == ';') {                      // Delete comments!
-        unsigned Idx = OutStr.find('\n', i+1);            // Find end of line
-        OutStr.erase(OutStr.begin()+i, OutStr.begin()+Idx);
-        --i;
-      }
-
-    return OutStr;
-  }
-
-  static std::string getEdgeSourceLabel(const BasicBlock *Node,
-                                        succ_const_iterator I) {
-    // Label source of conditional branches with "T" or "F"
-    if (const BranchInst *BI = dyn_cast<BranchInst>(Node->getTerminator()))
-      if (BI->isConditional())
-        return (I == succ_begin(Node)) ? "T" : "F";
-    return "";
-  }
-};
-}
-
 namespace {
   struct VISIBILITY_HIDDEN CFGViewer : public FunctionPass {
     static char ID; // Pass identifcation, replacement for typeid
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 1d2f118..d4be986 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_library(LLVMAnalysis
   ConstantFolding.cpp
   DbgInfoPrinter.cpp
   DebugInfo.cpp
+  DomPrinter.cpp
   IVUsers.cpp
   InlineCost.cpp
   InstCount.cpp
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 0ce1c24..214caeb 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -24,9 +24,10 @@
 #include "llvm/Instructions.h"
 #include "llvm/Intrinsics.h"
 #include "llvm/LLVMContext.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Target/TargetData.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
-#include "llvm/Target/TargetData.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/MathExtras.h"
@@ -92,6 +93,265 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
   return false;
 }
 
+/// ReadDataFromGlobal - Recursive helper to read bits out of global.  C is the
+/// constant being copied out of. ByteOffset is an offset into C.  CurPtr is the
+/// pointer to copy results into and BytesLeft is the number of bytes left in
+/// the CurPtr buffer.  TD is the target data.
+static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
+                               unsigned char *CurPtr, unsigned BytesLeft,
+                               const TargetData &TD) {
+  assert(ByteOffset <= TD.getTypeAllocSize(C->getType()) &&
+         "Out of range access");
+  
+  if (isa<ConstantAggregateZero>(C) || isa<UndefValue>(C))
+    return true;
+  
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+    if (CI->getBitWidth() > 64 ||
+        (CI->getBitWidth() & 7) != 0)
+      return false;
+    
+    uint64_t Val = CI->getZExtValue();
+    unsigned IntBytes = unsigned(CI->getBitWidth()/8);
+    
+    for (unsigned i = 0; i != BytesLeft && ByteOffset != IntBytes; ++i) {
+      CurPtr[i] = (unsigned char)(Val >> ByteOffset * 8);
+      ++ByteOffset;
+    }
+    return true;
+  }
+  
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+    if (CFP->getType()->isDoubleTy()) {
+      C = ConstantExpr::getBitCast(C, Type::getInt64Ty(C->getContext()));
+      return ReadDataFromGlobal(C, ByteOffset, CurPtr, BytesLeft, TD);
+    }
+    if (CFP->getType()->isFloatTy()){
+      C = ConstantExpr::getBitCast(C, Type::getInt32Ty(C->getContext()));
+      return ReadDataFromGlobal(C, ByteOffset, CurPtr, BytesLeft, TD);
+    }
+  }
+
+  if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) {
+    const StructLayout *SL = TD.getStructLayout(CS->getType());
+    unsigned Index = SL->getElementContainingOffset(ByteOffset);
+    uint64_t CurEltOffset = SL->getElementOffset(Index);
+    ByteOffset -= CurEltOffset;
+    
+    while (1) {
+      // If the element access is to the element itself and not to tail padding,
+      // read the bytes from the element.
+      uint64_t EltSize = TD.getTypeAllocSize(CS->getOperand(Index)->getType());
+
+      if (ByteOffset < EltSize &&
+          !ReadDataFromGlobal(CS->getOperand(Index), ByteOffset, CurPtr,
+                              BytesLeft, TD))
+        return false;
+      
+      ++Index;
+      
+      // Check to see if we read from the last struct element, if so we're done.
+      if (Index == CS->getType()->getNumElements())
+        return true;
+
+      // If we read all of the bytes we needed from this element we're done.
+      uint64_t NextEltOffset = SL->getElementOffset(Index);
+
+      if (BytesLeft <= NextEltOffset-CurEltOffset-ByteOffset)
+        return true;
+
+      // Move to the next element of the struct.
+      BytesLeft -= NextEltOffset-CurEltOffset-ByteOffset;
+      ByteOffset = 0;
+      CurEltOffset = NextEltOffset;
+    }
+    // not reached.
+  }
+
+  if (ConstantArray *CA = dyn_cast<ConstantArray>(C)) {
+    uint64_t EltSize = TD.getTypeAllocSize(CA->getType()->getElementType());
+    uint64_t Index = ByteOffset / EltSize;
+    uint64_t Offset = ByteOffset - Index * EltSize;
+    for (; Index != CA->getType()->getNumElements(); ++Index) {
+      if (!ReadDataFromGlobal(CA->getOperand(Index), Offset, CurPtr,
+                              BytesLeft, TD))
+        return false;
+      if (EltSize >= BytesLeft)
+        return true;
+      
+      Offset = 0;
+      BytesLeft -= EltSize;
+      CurPtr += EltSize;
+    }
+    return true;
+  }
+  
+  if (ConstantVector *CV = dyn_cast<ConstantVector>(C)) {
+    uint64_t EltSize = TD.getTypeAllocSize(CV->getType()->getElementType());
+    uint64_t Index = ByteOffset / EltSize;
+    uint64_t Offset = ByteOffset - Index * EltSize;
+    for (; Index != CV->getType()->getNumElements(); ++Index) {
+      if (!ReadDataFromGlobal(CV->getOperand(Index), Offset, CurPtr,
+                              BytesLeft, TD))
+        return false;
+      if (EltSize >= BytesLeft)
+        return true;
+      
+      Offset = 0;
+      BytesLeft -= EltSize;
+      CurPtr += EltSize;
+    }
+    return true;
+  }
+  
+  // Otherwise, unknown initializer type.
+  return false;
+}
+
+static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
+                                                 const TargetData &TD) {
+  const Type *LoadTy = cast<PointerType>(C->getType())->getElementType();
+  const IntegerType *IntType = dyn_cast<IntegerType>(LoadTy);
+  
+  // If this isn't an integer load we can't fold it directly.
+  if (!IntType) {
+    // If this is a float/double load, we can try folding it as an int32/64 load
+    // and then bitcast the result.  This can be useful for union cases.  Note
+    // that address spaces don't matter here since we're not going to result in
+    // an actual new load.
+    const Type *MapTy;
+    if (LoadTy->isFloatTy())
+      MapTy = Type::getInt32PtrTy(C->getContext());
+    else if (LoadTy->isDoubleTy())
+      MapTy = Type::getInt64PtrTy(C->getContext());
+    else if (isa<VectorType>(LoadTy)) {
+      MapTy = IntegerType::get(C->getContext(),
+                               TD.getTypeAllocSizeInBits(LoadTy));
+      MapTy = PointerType::getUnqual(MapTy);
+    } else
+      return 0;
+
+    C = ConstantExpr::getBitCast(C, MapTy);
+    if (Constant *Res = FoldReinterpretLoadFromConstPtr(C, TD))
+      return ConstantExpr::getBitCast(Res, LoadTy);
+    return 0;
+  }
+  
+  unsigned BytesLoaded = (IntType->getBitWidth() + 7) / 8;
+  if (BytesLoaded > 32 || BytesLoaded == 0) return 0;
+  
+  GlobalValue *GVal;
+  int64_t Offset;
+  if (!IsConstantOffsetFromGlobal(C, GVal, Offset, TD))
+    return 0;
+  
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(GVal);
+  if (!GV || !GV->isConstant() || !GV->hasInitializer() ||
+      !GV->hasDefinitiveInitializer() ||
+      !GV->getInitializer()->getType()->isSized())
+    return 0;
+
+  // If we're loading off the beginning of the global, some bytes may be valid,
+  // but we don't try to handle this.
+  if (Offset < 0) return 0;
+  
+  // If we're not accessing anything in this constant, the result is undefined.
+  if (uint64_t(Offset) >= TD.getTypeAllocSize(GV->getInitializer()->getType()))
+    return UndefValue::get(IntType);
+  
+  unsigned char RawBytes[32] = {0};
+  if (!ReadDataFromGlobal(GV->getInitializer(), Offset, RawBytes,
+                          BytesLoaded, TD))
+    return 0;
+
+  APInt ResultVal(IntType->getBitWidth(), 0);
+  for (unsigned i = 0; i != BytesLoaded; ++i) {
+    ResultVal <<= 8;
+    ResultVal |= APInt(IntType->getBitWidth(), RawBytes[BytesLoaded-1-i]);
+  }
+
+  return ConstantInt::get(IntType->getContext(), ResultVal);
+}
+
+/// ConstantFoldLoadFromConstPtr - Return the value that a load from C would
+/// produce if it is constant and determinable.  If this is not determinable,
+/// return null.
+Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
+                                             const TargetData *TD) {
+  // First, try the easy cases:
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+    if (GV->isConstant() && GV->hasDefinitiveInitializer())
+      return GV->getInitializer();
+
+  // If the loaded value isn't a constant expr, we can't handle it.
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
+  if (!CE) return 0;
+  
+  if (CE->getOpcode() == Instruction::GetElementPtr) {
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
+      if (GV->isConstant() && GV->hasDefinitiveInitializer())
+        if (Constant *V = 
+             ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE))
+          return V;
+  }
+  
+  // Instead of loading constant c string, use corresponding integer value
+  // directly if string length is small enough.
+  std::string Str;
+  if (TD && GetConstantStringInfo(CE->getOperand(0), Str) && !Str.empty()) {
+    unsigned StrLen = Str.length();
+    const Type *Ty = cast<PointerType>(CE->getType())->getElementType();
+    unsigned NumBits = Ty->getPrimitiveSizeInBits();
+    // Replace LI with immediate integer store.
+    if ((NumBits >> 3) == StrLen + 1) {
+      APInt StrVal(NumBits, 0);
+      APInt SingleChar(NumBits, 0);
+      if (TD->isLittleEndian()) {
+        for (signed i = StrLen-1; i >= 0; i--) {
+          SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
+          StrVal = (StrVal << 8) | SingleChar;
+        }
+      } else {
+        for (unsigned i = 0; i < StrLen; i++) {
+          SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
+          StrVal = (StrVal << 8) | SingleChar;
+        }
+        // Append NULL at the end.
+        SingleChar = 0;
+        StrVal = (StrVal << 8) | SingleChar;
+      }
+      return ConstantInt::get(CE->getContext(), StrVal);
+    }
+  }
+  
+  // If this load comes from anywhere in a constant global, and if the global
+  // is all undef or zero, we know what it loads.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getUnderlyingObject())){
+    if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+      const Type *ResTy = cast<PointerType>(C->getType())->getElementType();
+      if (GV->getInitializer()->isNullValue())
+        return Constant::getNullValue(ResTy);
+      if (isa<UndefValue>(GV->getInitializer()))
+        return UndefValue::get(ResTy);
+    }
+  }
+  
+  // Try hard to fold loads from bitcasted strange and non-type-safe things.  We
+  // currently don't do any of this for big endian systems.  It can be
+  // generalized in the future if someone is interested.
+  if (TD && TD->isLittleEndian())
+    return FoldReinterpretLoadFromConstPtr(CE, *TD);
+  return 0;
+}
+
+static Constant *ConstantFoldLoadInst(const LoadInst *LI, const TargetData *TD){
+  if (LI->isVolatile()) return 0;
+  
+  if (Constant *C = dyn_cast<Constant>(LI->getOperand(0)))
+    return ConstantFoldLoadFromConstPtr(C, TD);
+
+  return 0;
+}
 
 /// SymbolicallyEvaluateBinop - One of Op0/Op1 is a constant expression.
 /// Attempt to symbolically evaluate the result of a binary operator merging
@@ -380,6 +640,9 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, LLVMContext &Context,
                                            Ops.data(), Ops.size(), 
                                            Context, TD);
   
+  if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+    return ConstantFoldLoadInst(LI, TD);
+  
   return ConstantFoldInstOperands(I->getOpcode(), I->getType(),
                                   Ops.data(), Ops.size(), Context, TD);
 }
@@ -640,15 +903,15 @@ Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
           C = UndefValue::get(ATy->getElementType());
         else
           return 0;
-      } else if (const VectorType *PTy = dyn_cast<VectorType>(*I)) {
-        if (CI->getZExtValue() >= PTy->getNumElements())
+      } else if (const VectorType *VTy = dyn_cast<VectorType>(*I)) {
+        if (CI->getZExtValue() >= VTy->getNumElements())
           return 0;
         if (ConstantVector *CP = dyn_cast<ConstantVector>(C))
           C = CP->getOperand(CI->getZExtValue());
         else if (isa<ConstantAggregateZero>(C))
-          C = Constant::getNullValue(PTy->getElementType());
+          C = Constant::getNullValue(VTy->getElementType());
         else if (isa<UndefValue>(C))
-          C = UndefValue::get(PTy->getElementType());
+          C = UndefValue::get(VTy->getElementType());
         else
           return 0;
       } else {
diff --git a/lib/Analysis/DomPrinter.cpp b/lib/Analysis/DomPrinter.cpp
new file mode 100644
index 0000000..f1b44d0
--- /dev/null
+++ b/lib/Analysis/DomPrinter.cpp
@@ -0,0 +1,265 @@
+//===- DomPrinter.cpp - DOT printer for the dominance trees    ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines '-dot-dom' and '-dot-postdom' analysis passes, which emit
+// a dom.<fnname>.dot or postdom.<fnname>.dot file for each function in the
+// program, with a graph of the dominance/postdominance tree of that
+// function.
+//
+// There are also passes available to directly call dotty ('-view-dom' or
+// '-view-postdom'). By appending '-only' like '-dot-dom-only' only the
+// names of the bbs are printed, but the content is hidden.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/DomPrinter.h"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/PostDominators.h"
+
+using namespace llvm;
+
+namespace llvm {
+template<>
+struct DOTGraphTraits<DomTreeNode*> : public DefaultDOTGraphTraits {
+  static std::string getNodeLabel(DomTreeNode *Node, DomTreeNode *Graph,
+                                  bool ShortNames) {
+
+    BasicBlock *BB = Node->getBlock();
+
+    if (!BB)
+      return "Post dominance root node";
+
+    return DOTGraphTraits<const Function*>::getNodeLabel(BB, BB->getParent(),
+                                                         ShortNames);
+  }
+};
+
+template<>
+struct DOTGraphTraits<DominatorTree*> : public DOTGraphTraits<DomTreeNode*> {
+
+  static std::string getGraphName(DominatorTree *DT) {
+    return "Dominator tree";
+  }
+
+  static std::string getNodeLabel(DomTreeNode *Node,
+                                  DominatorTree *G,
+                                  bool ShortNames) {
+    return DOTGraphTraits<DomTreeNode*>::getNodeLabel(Node, G->getRootNode(),
+                                                      ShortNames);
+  }
+};
+
+template<>
+struct DOTGraphTraits<PostDominatorTree*>
+  : public DOTGraphTraits<DomTreeNode*> {
+  static std::string getGraphName(PostDominatorTree *DT) {
+    return "Post dominator tree";
+  }
+  static std::string getNodeLabel(DomTreeNode *Node,
+                                  PostDominatorTree *G,
+                                  bool ShortNames) {
+    return DOTGraphTraits<DomTreeNode*>::getNodeLabel(Node,
+                                                      G->getRootNode(),
+                                                      ShortNames);
+  }
+};
+}
+
+namespace {
+template <class Analysis, bool OnlyBBS>
+struct GenericGraphViewer : public FunctionPass {
+  std::string Name;
+
+  GenericGraphViewer(std::string GraphName, const void *ID) : FunctionPass(ID) {
+    Name = GraphName;
+  }
+
+  virtual bool runOnFunction(Function &F) {
+    Analysis *Graph;
+
+    Graph = &getAnalysis<Analysis>();
+    ViewGraph(Graph, Name, OnlyBBS);
+
+    return false;
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    AU.addRequired<Analysis>();
+  }
+};
+
+struct DomViewer
+  : public GenericGraphViewer<DominatorTree, false> {
+  static char ID;
+  DomViewer() : GenericGraphViewer<DominatorTree, false>("dom", &ID){}
+};
+
+struct DomOnlyViewer
+  : public GenericGraphViewer<DominatorTree, true> {
+  static char ID;
+  DomOnlyViewer() : GenericGraphViewer<DominatorTree, true>("domonly", &ID){}
+};
+
+struct PostDomViewer
+  : public GenericGraphViewer<PostDominatorTree, false> {
+  static char ID;
+  PostDomViewer() :
+    GenericGraphViewer<PostDominatorTree, false>("postdom", &ID){}
+};
+
+struct PostDomOnlyViewer
+  : public GenericGraphViewer<PostDominatorTree, true> {
+  static char ID;
+  PostDomOnlyViewer() :
+    GenericGraphViewer<PostDominatorTree, true>("postdomonly", &ID){}
+};
+} // end anonymous namespace
+
+char DomViewer::ID = 0;
+RegisterPass<DomViewer> A("view-dom",
+                          "View dominance tree of function");
+
+char DomOnlyViewer::ID = 0;
+RegisterPass<DomOnlyViewer> B("view-dom-only",
+                              "View dominance tree of function "
+                              "(with no function bodies)");
+
+char PostDomViewer::ID = 0;
+RegisterPass<PostDomViewer> C("view-postdom",
+                              "View postdominance tree of function");
+
+char PostDomOnlyViewer::ID = 0;
+RegisterPass<PostDomOnlyViewer> D("view-postdom-only",
+                                  "View postdominance tree of function "
+                                  "(with no function bodies)");
+
+namespace {
+template <class Analysis, bool OnlyBBS>
+struct GenericGraphPrinter : public FunctionPass {
+
+  std::string Name;
+
+  GenericGraphPrinter(std::string GraphName, const void *ID)
+    : FunctionPass(ID) {
+    Name = GraphName;
+  }
+
+  virtual bool runOnFunction(Function &F) {
+    Analysis *Graph;
+    std::string Filename = Name + "." + F.getNameStr() + ".dot";
+    errs() << "Writing '" << Filename << "'...";
+
+    std::string ErrorInfo;
+    raw_fd_ostream File(Filename.c_str(), ErrorInfo);
+    Graph = &getAnalysis<Analysis>();
+
+    if (ErrorInfo.empty())
+      WriteGraph(File, Graph, OnlyBBS);
+    else
+      errs() << "  error opening file for writing!";
+    errs() << "\n";
+    return false;
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    AU.addRequired<Analysis>();
+  }
+};
+
+struct DomPrinter
+  : public GenericGraphPrinter<DominatorTree, false> {
+  static char ID;
+  DomPrinter() : GenericGraphPrinter<DominatorTree, false>("dom", &ID) {}
+};
+
+struct DomOnlyPrinter
+  : public GenericGraphPrinter<DominatorTree, true> {
+  static char ID;
+  DomOnlyPrinter() : GenericGraphPrinter<DominatorTree, true>("domonly", &ID) {}
+};
+
+struct PostDomPrinter
+  : public GenericGraphPrinter<PostDominatorTree, false> {
+  static char ID;
+  PostDomPrinter() :
+    GenericGraphPrinter<PostDominatorTree, false>("postdom", &ID) {}
+};
+
+struct PostDomOnlyPrinter
+  : public GenericGraphPrinter<PostDominatorTree, true> {
+  static char ID;
+  PostDomOnlyPrinter() :
+    GenericGraphPrinter<PostDominatorTree, true>("postdomonly", &ID) {}
+};
+} // end anonymous namespace
+
+
+
+char DomPrinter::ID = 0;
+RegisterPass<DomPrinter> E("dot-dom",
+                           "Print dominance tree of function "
+                           "to 'dot' file");
+
+char DomOnlyPrinter::ID = 0;
+RegisterPass<DomOnlyPrinter> F("dot-dom-only",
+                               "Print dominance tree of function "
+                               "to 'dot' file "
+                               "(with no function bodies)");
+
+char PostDomPrinter::ID = 0;
+RegisterPass<PostDomPrinter> G("dot-postdom",
+                               "Print postdominance tree of function "
+                               "to 'dot' file");
+
+char PostDomOnlyPrinter::ID = 0;
+RegisterPass<PostDomOnlyPrinter> H("dot-postdom-only",
+                                   "Print postdominance tree of function "
+                                   "to 'dot' file "
+                                   "(with no function bodies)");
+
+// Create methods available outside of this file, to use them
+// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by
+// the link time optimization.
+
+FunctionPass *llvm::createDomPrinterPass() {
+  return new DomPrinter();
+}
+
+FunctionPass *llvm::createDomOnlyPrinterPass() {
+  return new DomOnlyPrinter();
+}
+
+FunctionPass *llvm::createDomViewerPass() {
+  return new DomViewer();
+}
+
+FunctionPass *llvm::createDomOnlyViewerPass() {
+  return new DomOnlyViewer();
+}
+
+FunctionPass *llvm::createPostDomPrinterPass() {
+  return new PostDomPrinter();
+}
+
+FunctionPass *llvm::createPostDomOnlyPrinterPass() {
+  return new PostDomOnlyPrinter();
+}
+
+FunctionPass *llvm::createPostDomViewerPass() {
+  return new PostDomViewer();
+}
+
+FunctionPass *llvm::createPostDomOnlyViewerPass() {
+  return new PostDomOnlyViewer();
+}
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
index f5c1108..7949288 100644
--- a/lib/Analysis/IPA/GlobalsModRef.cpp
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -303,7 +303,7 @@ bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) {
       // Check the value being stored.
       Value *Ptr = SI->getOperand(0)->getUnderlyingObject();
 
-      if (isa<MallocInst>(Ptr) || isMalloc(Ptr)) {
+      if (isMalloc(Ptr)) {
         // Okay, easy case.
       } else if (CallInst *CI = dyn_cast<CallInst>(Ptr)) {
         Function *F = CI->getCalledFunction();
@@ -439,8 +439,7 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) {
           if (cast<StoreInst>(*II).isVolatile())
             // Treat volatile stores as reading memory somewhere.
             FunctionEffect |= Ref;
-        } else if (isa<MallocInst>(*II) || isa<FreeInst>(*II) ||
-                   isMalloc(&cast<Instruction>(*II))) {
+        } else if (isMalloc(&cast<Instruction>(*II)) || isa<FreeInst>(*II)) {
           FunctionEffect |= ModRef;
         }
 
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 3b0d2c9..b833baa 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -131,7 +131,7 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB) {
     }
     
     // These, too, are calls.
-    if (isa<MallocInst>(II) || isa<FreeInst>(II))
+    if (isa<FreeInst>(II))
       NumInsts += InlineConstants::CallPenalty;
 
     if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
diff --git a/lib/Analysis/InstCount.cpp b/lib/Analysis/InstCount.cpp
index 83724ca..4cde793 100644
--- a/lib/Analysis/InstCount.cpp
+++ b/lib/Analysis/InstCount.cpp
@@ -76,11 +76,11 @@ FunctionPass *llvm::createInstCountPass() { return new InstCount(); }
 bool InstCount::runOnFunction(Function &F) {
   unsigned StartMemInsts =
     NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst +
-    NumInvokeInst + NumAllocaInst + NumMallocInst + NumFreeInst;
+    NumInvokeInst + NumAllocaInst + NumFreeInst;
   visit(F);
   unsigned EndMemInsts =
     NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst +
-    NumInvokeInst + NumAllocaInst + NumMallocInst + NumFreeInst;
+    NumInvokeInst + NumAllocaInst + NumFreeInst;
   TotalMemInst += EndMemInsts-StartMemInsts;
   return false;
 }
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index ce2d29f..e9256b7 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -292,6 +292,9 @@ bool Loop::isLoopSimplifyForm() const {
   // Normal-form loops have a single backedge.
   if (!getLoopLatch())
     return false;
+  // Sort the blocks vector so that we can use binary search to do quick
+  // lookups.
+  SmallPtrSet<BasicBlock *, 16> LoopBBs(block_begin(), block_end());
   // Each predecessor of each exit block of a normal loop is contained
   // within the loop.
   SmallVector<BasicBlock *, 4> ExitBlocks;
@@ -299,7 +302,7 @@ bool Loop::isLoopSimplifyForm() const {
   for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
     for (pred_iterator PI = pred_begin(ExitBlocks[i]),
          PE = pred_end(ExitBlocks[i]); PI != PE; ++PI)
-      if (!contains(*PI))
+      if (!LoopBBs.count(*PI))
         return false;
   // All the requirements are met.
   return true;
diff --git a/lib/Analysis/MallocHelper.cpp b/lib/Analysis/MallocHelper.cpp
index 89051d1..e7bb41e 100644
--- a/lib/Analysis/MallocHelper.cpp
+++ b/lib/Analysis/MallocHelper.cpp
@@ -130,9 +130,9 @@ static bool isArrayMallocHelper(const CallInst *CI, LLVMContext &Context,
 /// matches the malloc call IR generated by CallInst::CreateMalloc().  This 
 /// means that it is a malloc call with one bitcast use AND the malloc call's 
 /// size argument is:
-///  1. a constant not equal to the malloc's allocated type
+///  1. a constant not equal to the size of the malloced type
 /// or
-///  2. the result of a multiplication by the malloc's allocated type
+///  2. the result of a multiplication by the size of the malloced type
 /// Otherwise it returns NULL.
 /// The unique bitcast is needed to determine the type/size of the array
 /// allocation.
@@ -183,25 +183,60 @@ const Type* llvm::getMallocAllocatedType(const CallInst* CI) {
   return PT ? PT->getElementType() : NULL;
 }
 
+/// isSafeToGetMallocArraySize - Returns true if the array size of a malloc can
+/// be determined.  It can be determined in these 3 cases of malloc codegen:
+/// 1. non-array malloc: The malloc's size argument is a constant and equals the ///    size of the type being malloced.
+/// 2. array malloc: This is a malloc call with one bitcast use AND the malloc
+///    call's size argument is a constant multiple of the size of the malloced
+///    type.
+/// 3. array malloc: This is a malloc call with one bitcast use AND the malloc
+///    call's size argument is the result of a multiplication by the size of the
+///    malloced type.
+/// Otherwise returns false.
+static bool isSafeToGetMallocArraySize(const CallInst *CI,
+                                       LLVMContext &Context,
+                                       const TargetData* TD) {
+  if (!CI)
+    return false;
+
+  // Type must be known to determine array size.
+  const Type* T = getMallocAllocatedType(CI);
+  if (!T) return false;
+
+  Value* MallocArg = CI->getOperand(1);
+  Constant *ElementSize = ConstantExpr::getSizeOf(T);
+  ElementSize = ConstantExpr::getTruncOrBitCast(ElementSize, 
+                                                MallocArg->getType());
+
+  // First, check if it is a non-array malloc.
+  if (isa<ConstantExpr>(MallocArg) && (MallocArg == ElementSize))
+    return true;
+
+  // Second, check if it can be determined that this is an array malloc.
+  return isArrayMallocHelper(CI, Context, TD);
+}
+
 /// isConstantOne - Return true only if val is constant int 1.
 static bool isConstantOne(Value *val) {
   return isa<ConstantInt>(val) && cast<ConstantInt>(val)->isOne();
 }
 
-/// getMallocArraySize - Returns the array size of a malloc call.  The array
-/// size is computated in 1 of 3 ways:
-///  1. If the element type if of size 1, then array size is the argument to 
+/// getMallocArraySize - Returns the array size of a malloc call.  For array
+/// mallocs, the size is computated in 1 of 3 ways:
+///  1. If the element type is of size 1, then array size is the argument to 
 ///     malloc.
 ///  2. Else if the malloc's argument is a constant, the array size is that
 ///     argument divided by the element type's size.
 ///  3. Else the malloc argument must be a multiplication and the array size is
 ///     the first operand of the multiplication.
-/// This function returns constant 1 if:
-///  1. The malloc call's allocated type cannot be determined.
-///  2. IR wasn't created by a call to CallInst::CreateMalloc() with a non-NULL
-///     ArraySize.
+/// For non-array mallocs, the computed size is constant 1. 
+/// This function returns NULL for all mallocs whose array size cannot be
+/// determined.
 Value* llvm::getMallocArraySize(CallInst* CI, LLVMContext &Context,
                                 const TargetData* TD) {
+  if (!isSafeToGetMallocArraySize(CI, Context, TD))
+    return NULL;
+
   // Match CreateMalloc's use of constant 1 array-size for non-array mallocs.
   if (!isArrayMalloc(CI, Context, TD))
     return ConstantInt::get(CI->getOperand(1)->getType(), 1);
diff --git a/lib/Analysis/PointerTracking.cpp b/lib/Analysis/PointerTracking.cpp
index 43f4af3..2309fbc 100644
--- a/lib/Analysis/PointerTracking.cpp
+++ b/lib/Analysis/PointerTracking.cpp
@@ -102,8 +102,9 @@ const SCEV *PointerTracking::computeAllocationCount(Value *P,
 
   if (CallInst *CI = extractMallocCall(V)) {
     Value *arraySize = getMallocArraySize(CI, P->getContext(), TD);
-    Ty = getMallocAllocatedType(CI);
-    if (!Ty || !arraySize) return SE->getCouldNotCompute();
+    const Type* AllocTy = getMallocAllocatedType(CI);
+    if (!AllocTy || !arraySize) return SE->getCouldNotCompute();
+    Ty = AllocTy;
     // arraySize elements of type Ty.
     return SE->getSCEV(arraySize);
   }
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index baa347a..dc0d489 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -469,26 +469,11 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
     break;
   }
 
-  case Instruction::Alloca:
-  case Instruction::Malloc: {
+  case Instruction::Alloca: {
     AllocationInst *AI = cast<AllocationInst>(V);
     unsigned Align = AI->getAlignment();
-    if (Align == 0 && TD) {
-      if (isa<AllocaInst>(AI))
-        Align = TD->getABITypeAlignment(AI->getType()->getElementType());
-      else if (isa<MallocInst>(AI)) {
-        // Malloc returns maximally aligned memory.
-        Align = TD->getABITypeAlignment(AI->getType()->getElementType());
-        Align =
-          std::max(Align,
-                   (unsigned)TD->getABITypeAlignment(
-                     Type::getDoubleTy(V->getContext())));
-        Align =
-          std::max(Align,
-                   (unsigned)TD->getABITypeAlignment(
-                      Type::getInt64Ty(V->getContext())));
-      }
-    }
+    if (Align == 0 && TD)
+      Align = TD->getABITypeAlignment(AI->getType()->getElementType());
     
     if (Align > 0)
       KnownZero = Mask & APInt::getLowBitsSet(BitWidth,
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 0e9f1a0..f6cea88 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -529,7 +529,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(module);
   KEYWORD(asm);
   KEYWORD(sideeffect);
-  KEYWORD(msasm);
+  KEYWORD(alignstack);
   KEYWORD(gc);
 
   KEYWORD(ccc);
@@ -602,6 +602,10 @@ lltok::Kind LLLexer::LexIdentifier() {
     // Scan CurPtr ahead, seeing if there is just whitespace before the newline.
     if (JustWhitespaceNewLine(CurPtr))
       return lltok::kw_zeroext;
+  } else if (Len == 6 && !memcmp(StartChar, "malloc", 6)) {
+    // FIXME: Remove in LLVM 3.0.
+    // Autoupgrade malloc instruction.
+    return lltok::kw_malloc;
   }
 
   // Keywords for instructions.
@@ -641,7 +645,6 @@ lltok::Kind LLLexer::LexIdentifier() {
   INSTKEYWORD(unwind,      Unwind);
   INSTKEYWORD(unreachable, Unreachable);
 
-  INSTKEYWORD(malloc,      Malloc);
   INSTKEYWORD(alloca,      Alloca);
   INSTKEYWORD(free,        Free);
   INSTKEYWORD(load,        Load);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 09bc5f7..271567b 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -69,6 +69,23 @@ bool LLParser::Run() {
 /// ValidateEndOfModule - Do final validity and sanity checks at the end of the
 /// module.
 bool LLParser::ValidateEndOfModule() {
+  // Update auto-upgraded malloc calls to "malloc".
+  // FIXME: Remove in LLVM 3.0.
+  if (MallocF) {
+    MallocF->setName("malloc");
+    // If setName() does not set the name to "malloc", then there is already a 
+    // declaration of "malloc".  In that case, iterate over all calls to MallocF
+    // and get them to call the declared "malloc" instead.
+    if (MallocF->getName() != "malloc") {
+      Constant* RealMallocF = M->getFunction("malloc");
+      if (RealMallocF->getType() != MallocF->getType())
+        RealMallocF = ConstantExpr::getBitCast(RealMallocF, MallocF->getType());
+      MallocF->replaceAllUsesWith(RealMallocF);
+      MallocF->eraseFromParent();
+      MallocF = NULL;
+    }
+  }
+
   if (!ForwardRefTypes.empty())
     return Error(ForwardRefTypes.begin()->second.second,
                  "use of undefined type named '" +
@@ -1029,14 +1046,12 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
 ///   ::= /* empty */
 ///   ::= !dbg !42
 bool LLParser::ParseOptionalCustomMetadata() {
-
-  std::string Name;
-  if (Lex.getKind() == lltok::NamedOrCustomMD) {
-    Name = Lex.getStrVal();
-    Lex.Lex();
-  } else
+  if (Lex.getKind() != lltok::NamedOrCustomMD)
     return false;
 
+  std::string Name = Lex.getStrVal();
+  Lex.Lex();
+
   if (Lex.getKind() != lltok::Metadata)
     return TokError("Expected '!' here");
   Lex.Lex();
@@ -1047,7 +1062,7 @@ bool LLParser::ParseOptionalCustomMetadata() {
   MetadataContext &TheMetadata = M->getContext().getMetadata();
   unsigned MDK = TheMetadata.getMDKind(Name.c_str());
   if (!MDK)
-    MDK = TheMetadata.RegisterMDKind(Name.c_str());
+    MDK = TheMetadata.registerMDKind(Name.c_str());
   MDsOnInst.push_back(std::make_pair(MDK, cast<MDNode>(Node)));
 
   return false;
@@ -1959,17 +1974,17 @@ bool LLParser::ParseValID(ValID &ID) {
     return false;
 
   case lltok::kw_asm: {
-    // ValID ::= 'asm' SideEffect? MsAsm? STRINGCONSTANT ',' STRINGCONSTANT
-    bool HasSideEffect, MsAsm;
+    // ValID ::= 'asm' SideEffect? AlignStack? STRINGCONSTANT ',' STRINGCONSTANT
+    bool HasSideEffect, AlignStack;
     Lex.Lex();
     if (ParseOptionalToken(lltok::kw_sideeffect, HasSideEffect) ||
-        ParseOptionalToken(lltok::kw_msasm, MsAsm) ||
+        ParseOptionalToken(lltok::kw_alignstack, AlignStack) ||
         ParseStringConstant(ID.StrVal) ||
         ParseToken(lltok::comma, "expected comma in inline asm expression") ||
         ParseToken(lltok::StringConstant, "expected constraint string"))
       return true;
     ID.StrVal2 = Lex.getStrVal();
-    ID.UIntVal = HasSideEffect | ((unsigned)MsAsm<<1);
+    ID.UIntVal = HasSideEffect | ((unsigned)AlignStack<<1);
     ID.Kind = ValID::t_InlineAsm;
     return false;
   }
@@ -2783,8 +2798,8 @@ bool LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
   case lltok::kw_call:           return ParseCall(Inst, PFS, false);
   case lltok::kw_tail:           return ParseCall(Inst, PFS, true);
   // Memory.
-  case lltok::kw_alloca:
-  case lltok::kw_malloc:         return ParseAlloc(Inst, PFS, KeywordVal);
+  case lltok::kw_alloca:         return ParseAlloc(Inst, PFS);
+  case lltok::kw_malloc:         return ParseAlloc(Inst, PFS, BB, false);
   case lltok::kw_free:           return ParseFree(Inst, PFS);
   case lltok::kw_load:           return ParseLoad(Inst, PFS, false);
   case lltok::kw_store:          return ParseStore(Inst, PFS, false);
@@ -2858,8 +2873,6 @@ bool LLParser::ParseRet(Instruction *&Inst, BasicBlock *BB,
   if (ParseType(Ty, true /*void allowed*/)) return true;
 
   if (Ty->isVoidTy()) {
-    if (EatIfPresent(lltok::comma))
-      if (ParseOptionalCustomMetadata()) return true;
     Inst = ReturnInst::Create(Context);
     return false;
   }
@@ -2895,8 +2908,6 @@ bool LLParser::ParseRet(Instruction *&Inst, BasicBlock *BB,
       }
     }
   }
-  if (EatIfPresent(lltok::comma))
-    if (ParseOptionalCustomMetadata()) return true;
 
   Inst = ReturnInst::Create(Context, RV);
   return false;
@@ -3294,7 +3305,7 @@ bool LLParser::ParseShuffleVector(Instruction *&Inst, PerFunctionState &PFS) {
 }
 
 /// ParsePHI
-///   ::= 'phi' Type '[' Value ',' Value ']' (',' '[' Value ',' Valueß ']')*
+///   ::= 'phi' Type '[' Value ',' Value ']' (',' '[' Value ',' Value ']')*
 bool LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) {
   PATypeHolder Ty(Type::getVoidTy(Context));
   Value *Op0, *Op1;
@@ -3315,6 +3326,9 @@ bool LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) {
     if (!EatIfPresent(lltok::comma))
       break;
 
+    if (Lex.getKind() == lltok::NamedOrCustomMD)
+      break;
+
     if (ParseToken(lltok::lsquare, "expected '[' in phi value list") ||
         ParseValue(Ty, Op0, PFS) ||
         ParseToken(lltok::comma, "expected ',' after insertelement value") ||
@@ -3323,6 +3337,9 @@ bool LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) {
       return true;
   }
 
+  if (Lex.getKind() == lltok::NamedOrCustomMD)
+    if (ParseOptionalCustomMetadata()) return true;
+
   if (!Ty->isFirstClassType())
     return Error(TypeLoc, "phi node must have first class type");
 
@@ -3439,7 +3456,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
 ///   ::= 'malloc' Type (',' TypeAndValue)? (',' OptionalInfo)?
 ///   ::= 'alloca' Type (',' TypeAndValue)? (',' OptionalInfo)?
 bool LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS,
-                          unsigned Opc) {
+                          BasicBlock* BB, bool isAlloca) {
   PATypeHolder Ty(Type::getVoidTy(Context));
   Value *Size = 0;
   LocTy SizeLoc;
@@ -3460,10 +3477,20 @@ bool LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS,
   if (Size && Size->getType() != Type::getInt32Ty(Context))
     return Error(SizeLoc, "element count must be i32");
 
-  if (Opc == Instruction::Malloc)
-    Inst = new MallocInst(Ty, Size, Alignment);
-  else
+  if (isAlloca) {
     Inst = new AllocaInst(Ty, Size, Alignment);
+    return false;
+  }
+
+  // Autoupgrade old malloc instruction to malloc call.
+  // FIXME: Remove in LLVM 3.0.
+  const Type *IntPtrTy = Type::getInt32Ty(Context);
+  if (!MallocF)
+    // Prototype malloc as "void *(int32)".
+    // This function is renamed as "malloc" in ValidateEndOfModule().
+    MallocF = cast<Function>(
+       M->getOrInsertFunction("", Type::getInt8PtrTy(Context), IntPtrTy, NULL));
+  Inst = CallInst::CreateMalloc(BB, IntPtrTy, Ty, Size, MallocF);
   return false;
 }
 
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index 97bf2f3..5dd6a2e 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -75,9 +75,11 @@ namespace llvm {
     std::map<std::string, std::pair<GlobalValue*, LocTy> > ForwardRefVals;
     std::map<unsigned, std::pair<GlobalValue*, LocTy> > ForwardRefValIDs;
     std::vector<GlobalValue*> NumberedVals;
+    Function* MallocF;
   public:
     LLParser(MemoryBuffer *F, SourceMgr &SM, SMDiagnostic &Err, Module *m) : 
-      Context(m->getContext()), Lex(F, SM, Err, m->getContext()), M(m) {}
+      Context(m->getContext()), Lex(F, SM, Err, m->getContext()),
+      M(m), MallocF(NULL) {}
     bool Run();
 
     LLVMContext& getContext() { return Context; }
@@ -276,7 +278,8 @@ namespace llvm {
     bool ParseShuffleVector(Instruction *&I, PerFunctionState &PFS);
     bool ParsePHI(Instruction *&I, PerFunctionState &PFS);
     bool ParseCall(Instruction *&I, PerFunctionState &PFS, bool isTail);
-    bool ParseAlloc(Instruction *&I, PerFunctionState &PFS, unsigned Opc);
+    bool ParseAlloc(Instruction *&I, PerFunctionState &PFS,
+                    BasicBlock *BB = 0, bool isAlloca = true);
     bool ParseFree(Instruction *&I, PerFunctionState &PFS);
     bool ParseLoad(Instruction *&I, PerFunctionState &PFS, bool isVolatile);
     bool ParseStore(Instruction *&I, PerFunctionState &PFS, bool isVolatile);
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index b3c59ee..f5072fe 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -62,9 +62,8 @@ namespace lltok {
     kw_module,
     kw_asm,
     kw_sideeffect,
-    kw_msasm,
+    kw_alignstack,
     kw_gc,
-    kw_dbg,
     kw_c,
 
     kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 4eb12c6..3a385cb 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -840,7 +840,15 @@ bool BitcodeReader::ParseMetadata() {
       for (unsigned i = 1; i != RecordLength; ++i)
         Name[i-1] = Record[i];
       MetadataContext &TheMetadata = Context.getMetadata();
-      TheMetadata.MDHandlerNames[Name.str()] = Kind;
+      unsigned ExistingKind = TheMetadata.getMDKind(Name.str());
+      if (ExistingKind == 0) {
+        unsigned NewKind = TheMetadata.registerMDKind(Name.str());
+        assert (Kind == NewKind 
+                && "Unable to handle custom metadata mismatch!");
+      } else {
+        assert (ExistingKind == Kind 
+                && "Unable to handle custom metadata mismatch!");
+      }
       break;
     }
     }
@@ -1165,7 +1173,7 @@ bool BitcodeReader::ParseConstants() {
       if (Record.size() < 2) return Error("Invalid INLINEASM record");
       std::string AsmStr, ConstrStr;
       bool HasSideEffects = Record[0] & 1;
-      bool IsMsAsm = Record[0] >> 1;
+      bool IsAlignStack = Record[0] >> 1;
       unsigned AsmStrSize = Record[1];
       if (2+AsmStrSize >= Record.size())
         return Error("Invalid INLINEASM record");
@@ -1179,7 +1187,7 @@ bool BitcodeReader::ParseConstants() {
         ConstrStr += (char)Record[3+AsmStrSize+i];
       const PointerType *PTy = cast<PointerType>(CurTy);
       V = InlineAsm::get(cast<FunctionType>(PTy->getElementType()),
-                         AsmStr, ConstrStr, HasSideEffects, IsMsAsm);
+                         AsmStr, ConstrStr, HasSideEffects, IsAlignStack);
       break;
     }
     }
@@ -2044,14 +2052,18 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     }
 
     case bitc::FUNC_CODE_INST_MALLOC: { // MALLOC: [instty, op, align]
+      // Autoupgrade malloc instruction to malloc call.
+      // FIXME: Remove in LLVM 3.0.
       if (Record.size() < 3)
         return Error("Invalid MALLOC record");
       const PointerType *Ty =
         dyn_cast_or_null<PointerType>(getTypeByID(Record[0]));
       Value *Size = getFnValueByID(Record[1], Type::getInt32Ty(Context));
-      unsigned Align = Record[2];
       if (!Ty || !Size) return Error("Invalid MALLOC record");
-      I = new MallocInst(Ty->getElementType(), Size, (1 << Align) >> 1);
+      if (!CurBB) return Error("Invalid malloc instruction with no BB");
+      const Type *Int32Ty = IntegerType::getInt32Ty(CurBB->getContext());
+      I = CallInst::CreateMalloc(CurBB, Int32Ty, Ty->getElementType(),
+                                 Size, NULL);
       InstructionList.push_back(I);
       break;
     }
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 12a1f5e..037854e 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -517,9 +517,7 @@ static void WriteModuleMetadata(const ValueEnumerator &VE,
       }
 
       // Code: [strchar x N]
-      const char *StrBegin = MDS->begin();
-      for (unsigned i = 0, e = MDS->length(); i != e; ++i)
-        Record.push_back(StrBegin[i]);
+      Record.append(MDS->begin(), MDS->end());
 
       // Emit the finished record.
       Stream.EmitRecord(bitc::METADATA_STRING, Record, MDSAbbrev);
@@ -563,22 +561,22 @@ static void WriteMetadataAttachment(const Function &F,
   // Write metadata attachments
   // METADATA_ATTACHMENT - [m x [value, [n x [id, mdnode]]]
   MetadataContext &TheMetadata = F.getContext().getMetadata();
+  typedef SmallVector<std::pair<unsigned, TrackingVH<MDNode> >, 2> MDMapTy;
+  MDMapTy MDs;
   for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
     for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
          I != E; ++I) {
-      const MetadataContext::MDMapTy *P = TheMetadata.getMDs(I);
-      if (!P) continue;
+      MDs.clear();
+      TheMetadata.getMDs(I, MDs);
       bool RecordedInstruction = false;
-      for (MetadataContext::MDMapTy::const_iterator PI = P->begin(), 
-             PE = P->end(); PI != PE; ++PI) {
-        if (MDNode *ND = dyn_cast_or_null<MDNode>(PI->second)) {
-          if (RecordedInstruction == false) {
-            Record.push_back(VE.getInstructionID(I));
-            RecordedInstruction = true;
-          }
-          Record.push_back(PI->first);
-          Record.push_back(VE.getValueID(ND));
+      for (MDMapTy::const_iterator PI = MDs.begin(), PE = MDs.end();
+             PI != PE; ++PI) {
+        if (RecordedInstruction == false) {
+          Record.push_back(VE.getInstructionID(I));
+          RecordedInstruction = true;
         }
+        Record.push_back(PI->first);
+        Record.push_back(VE.getValueID(PI->second));
       }
       if (!Record.empty()) {
         if (!StartedMetadataBlock)  {
@@ -604,11 +602,13 @@ static void WriteModuleMetadataStore(const Module *M,
   // Write metadata kinds
   // METADATA_KIND - [n x [id, name]]
   MetadataContext &TheMetadata = M->getContext().getMetadata();
-  const StringMap<unsigned> *Kinds = TheMetadata.getHandlerNames();
-  for (StringMap<unsigned>::const_iterator
-         I = Kinds->begin(), E = Kinds->end(); I != E; ++I) {
-    Record.push_back(I->second);
-    StringRef KName = I->first();
+  SmallVector<std::pair<unsigned, StringRef>, 4> Names;
+  TheMetadata.getHandlerNames(Names);
+  for (SmallVector<std::pair<unsigned, StringRef>, 4>::iterator 
+         I = Names.begin(),
+         E = Names.end(); I != E; ++I) {
+    Record.push_back(I->first);
+    StringRef KName = I->second;
     for (unsigned i = 0, e = KName.size(); i != e; ++i)
       Record.push_back(KName[i]);
     if (!StartedMetadataBlock)  {
@@ -680,7 +680,7 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
 
     if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
       Record.push_back(unsigned(IA->hasSideEffects()) |
-                       unsigned(IA->isMsAsm()) << 1);
+                       unsigned(IA->isAlignStack()) << 1);
 
       // Add the asm string.
       const std::string &AsmStr = IA->getAsmString();
@@ -1054,13 +1054,6 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
       Vals.push_back(VE.getValueID(I.getOperand(i)));
     break;
 
-  case Instruction::Malloc:
-    Code = bitc::FUNC_CODE_INST_MALLOC;
-    Vals.push_back(VE.getTypeID(I.getType()));
-    Vals.push_back(VE.getValueID(I.getOperand(0))); // size.
-    Vals.push_back(Log2_32(cast<MallocInst>(I).getAlignment())+1);
-    break;
-
   case Instruction::Free:
     Code = bitc::FUNC_CODE_INST_FREE;
     PushValueAndType(I.getOperand(0), InstID, Vals, VE);
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 60253ad..85aa5fa 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -87,6 +87,8 @@ ValueEnumerator::ValueEnumerator(const Module *M) {
       EnumerateType(I->getType());
 
     MetadataContext &TheMetadata = F->getContext().getMetadata();
+    typedef SmallVector<std::pair<unsigned, TrackingVH<MDNode> >, 2> MDMapTy;
+    MDMapTy MDs;
     for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
       for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;++I){
         for (User::const_op_iterator OI = I->op_begin(), E = I->op_end();
@@ -99,12 +101,11 @@ ValueEnumerator::ValueEnumerator(const Module *M) {
           EnumerateAttributes(II->getAttributes());
 
         // Enumerate metadata attached with this instruction.
-        const MetadataContext::MDMapTy *MDs = TheMetadata.getMDs(I);
-        if (MDs)
-          for (MetadataContext::MDMapTy::const_iterator MI = MDs->begin(),
-                 ME = MDs->end(); MI != ME; ++MI)
-            if (MDNode *MDN = dyn_cast_or_null<MDNode>(MI->second))
-              EnumerateMetadata(MDN);
+        MDs.clear();
+        TheMetadata.getMDs(I, MDs);
+        for (MDMapTy::const_iterator MI = MDs.begin(), ME = MDs.end(); MI != ME;
+             ++MI)
+          EnumerateMetadata(MI->second);
       }
   }
 
@@ -214,10 +215,9 @@ void ValueEnumerator::EnumerateMetadata(const MetadataBase *MD) {
     MDValues.push_back(std::make_pair(MD, 1U));
     MDValueMap[MD] = MDValues.size();
     MDValueID = MDValues.size();
-    for (MDNode::const_elem_iterator I = N->elem_begin(), E = N->elem_end();
-         I != E; ++I) {
-      if (*I)
-        EnumerateValue(*I);
+    for (unsigned i = 0, e = N->getNumElements(); i != e; ++i) {    
+      if (Value *V = N->getElement(i))
+        EnumerateValue(V);
       else
         EnumerateType(Type::getVoidTy(MD->getContext()));
     }
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 2914851..d50e5e3 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -923,7 +923,7 @@ void DwarfDebug::ConstructTypeDIE(CompileUnit *DW_Unit, DIE &Buffer,
   AddType(DW_Unit, &Buffer, FromTy);
 
   // Add name if not anonymous or intermediate type.
-  if (Name)
+  if (Name && Tag != dwarf::DW_TAG_pointer_type)
     AddString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
 
   // Add size if non-zero (derived types might be zero-sized.)
@@ -1811,10 +1811,18 @@ void DwarfDebug::CollectVariableInfo() {
     DIVariable DV (Var);
     if (DV.isNull()) continue;
     unsigned VSlot = VI->second;
+    DbgScope *Scope = NULL;
     DenseMap<MDNode *, DbgScope *>::iterator DSI = 
       DbgScopeMap.find(DV.getContext().getNode());
-    assert (DSI != DbgScopeMap.end() && "Unable to find variable scope!");
-    DbgScope *Scope = DSI->second;
+    if (DSI != DbgScopeMap.end()) 
+      Scope = DSI->second;
+    else 
+      // There is not any instruction assocated with this scope, so get
+      // a new scope.
+      Scope = getDbgScope(DV.getContext().getNode(), 
+                          NULL /* Not an instruction */,
+                          NULL /* Not inlined */);
+    assert (Scope && "Unable to find variable scope!");
     Scope->AddVariable(new DbgVariable(DV, VSlot, false));
   }
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/DwarfException.cpp
index 626523b..6c03b55 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfException.cpp
@@ -871,28 +871,7 @@ void DwarfException::EmitExceptionTable() {
     Asm->EOL("Next action");
   }
 
-  // Emit the Catch Clauses. The code for the catch clauses following the same
-  // try is similar to a switch statement. The catch clause action record
-  // informs the runtime about the type of a catch clause and about the
-  // associated switch value.
-  //
-  //  Action Record Fields:
-  //
-  //   * Filter Value
-  //     Positive value, starting at 1. Index in the types table of the
-  //     __typeinfo for the catch-clause type. 1 is the first word preceding
-  //     TTBase, 2 is the second word, and so on. Used by the runtime to check
-  //     if the thrown exception type matches the catch-clause type. Back-end
-  //     generated switch statements check against this value.
-  //
-  //   * Next
-  //     Signed offset, in bytes from the start of this field, to the next
-  //     chained action record, or zero if none.
-  //
-  // The order of the action records determined by the next field is the order
-  // of the catch clauses as they appear in the source code, and must be kept in
-  // the same order. As a result, changing the order of the catch clause would
-  // change the semantics of the program.
+  // Emit the Catch TypeInfos.
   for (std::vector<GlobalVariable *>::const_reverse_iterator
          I = TypeInfos.rbegin(), E = TypeInfos.rend(); I != E; ++I) {
     const GlobalVariable *GV = *I;
@@ -907,12 +886,15 @@ void DwarfException::EmitExceptionTable() {
     Asm->EOL("TypeInfo");
   }
 
-  // Emit the Type Table.
+  // Emit the Exception Specifications.
   for (std::vector<unsigned>::const_iterator
          I = FilterIds.begin(), E = FilterIds.end(); I < E; ++I) {
     unsigned TypeID = *I;
     Asm->EmitULEB128Bytes(TypeID);
-    Asm->EOL("Filter TypeInfo index");
+    if (TypeID != 0)
+      Asm->EOL("Exception specification");
+    else
+      Asm->EOL();
   }
 
   Asm->EmitAlignment(2, 0, 0, false);
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index f9abeac..66c5aa5 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -945,15 +945,15 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
       }
     }
     
-    // If this block doesn't fall through (e.g. it ends with an uncond branch or
-    // has no successors) and if the pred falls through into this block, and if
-    // it would otherwise fall through into the block after this, move this
-    // block to the end of the function.
+    // If this block has no successors (e.g. it is a return block or ends with
+    // a call to a no-return function like abort or __cxa_throw) and if the pred
+    // falls through into this block, and if it would otherwise fall through
+    // into the block after this, move this block to the end of the function.
     //
     // We consider it more likely that execution will stay in the function (e.g.
     // due to loops) than it is to exit it.  This asserts in loops etc, moving
     // the assert condition out of the loop body.
-    if (!PriorCond.empty() && PriorFBB == 0 &&
+    if (MBB->succ_empty() && !PriorCond.empty() && PriorFBB == 0 &&
         MachineFunction::iterator(PriorTBB) == FallThrough &&
         !CanFallThrough(MBB)) {
       bool DoTransform = true;
diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp
index 42b24a6..6fff12c 100644
--- a/lib/CodeGen/CodePlacementOpt.cpp
+++ b/lib/CodeGen/CodePlacementOpt.cpp
@@ -34,14 +34,6 @@ namespace {
     const TargetInstrInfo *TII;
     const TargetLowering  *TLI;
 
-    /// ChangedMBBs - BBs which are modified by OptimizeIntraLoopEdges.
-    SmallPtrSet<MachineBasicBlock*, 8> ChangedMBBs;
-
-    /// UncondJmpMBBs - A list of BBs which are in loops and end with
-    /// unconditional branches.
-    SmallVector<std::pair<MachineBasicBlock*,MachineBasicBlock*>, 4>
-    UncondJmpMBBs;
-
   public:
     static char ID;
     CodePlacementOpt() : MachineFunctionPass(&ID) {}
@@ -58,7 +50,19 @@ namespace {
     }
 
   private:
-    bool OptimizeIntraLoopEdges();
+    bool HasFallthrough(MachineBasicBlock *MBB);
+    bool HasAnalyzableTerminator(MachineBasicBlock *MBB);
+    void Splice(MachineFunction &MF,
+                MachineFunction::iterator InsertPt,
+                MachineFunction::iterator Begin,
+                MachineFunction::iterator End);
+    void UpdateTerminator(MachineBasicBlock *MBB);
+    bool EliminateUnconditionalJumpsToTop(MachineFunction &MF,
+                                          MachineLoop *L);
+    bool MoveDiscontiguousLoopBlocks(MachineFunction &MF,
+                                     MachineLoop *L);
+    bool OptimizeIntraLoopEdgesInLoopNest(MachineFunction &MF, MachineLoop *L);
+    bool OptimizeIntraLoopEdges(MachineFunction &MF);
     bool AlignLoops(MachineFunction &MF);
     bool AlignLoop(MachineFunction &MF, MachineLoop *L, unsigned Align);
   };
@@ -70,168 +74,354 @@ FunctionPass *llvm::createCodePlacementOptPass() {
   return new CodePlacementOpt();
 }
 
-/// OptimizeBackEdges - Place loop back edges to move unconditional branches
-/// out of the loop.
-///
-///       A:
-///       ...
-///       <fallthrough to B>
+/// HasFallthrough - Test whether the given branch has a fallthrough, either as
+/// a plain fallthrough or as a fallthrough case of a conditional branch.
 ///
-///       B:  --> loop header
-///       ...
-///       jcc <cond> C, [exit]
-///
-///       C:
-///       ...
-///       jmp B
+bool CodePlacementOpt::HasFallthrough(MachineBasicBlock *MBB) {
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond))
+    return false;
+  // This conditional branch has no fallthrough.
+  if (FBB)
+    return false;
+  // An unconditional branch has no fallthrough.
+  if (Cond.empty() && TBB)
+    return false;
+  // It has a fallthrough.
+  return true;
+}
+
+/// HasAnalyzableTerminator - Test whether AnalyzeBranch will succeed on MBB.
+/// This is called before major changes are begun to test whether it will be
+/// possible to complete the changes.
 ///
-/// ==>
+/// Target-specific code is hereby encouraged to make AnalyzeBranch succeed
+/// whenever possible.
 ///
-///       A:
-///       ...
-///       jmp B
+bool CodePlacementOpt::HasAnalyzableTerminator(MachineBasicBlock *MBB) {
+  // Conservatively ignore EH landing pads.
+  if (MBB->isLandingPad()) return false;
+
+  // Ignore blocks which look like they might have EH-related control flow.
+  // At the time of this writing, there are blocks which AnalyzeBranch
+  // thinks end in single uncoditional branches, yet which have two CFG
+  // successors. Code in this file is not prepared to reason about such things.
+  if (!MBB->empty() && MBB->back().getOpcode() == TargetInstrInfo::EH_LABEL)
+    return false;
+
+  // Aggressively handle return blocks and similar constructs.
+  if (MBB->succ_empty()) return true;
+
+  // Ask the target's AnalyzeBranch if it can handle this block.
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  // Make the the terminator is understood.
+  if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond))
+    return false;
+  // Make sure we have the option of reversing the condition.
+  if (!Cond.empty() && TII->ReverseBranchCondition(Cond))
+    return false;
+  return true;
+}
+
+/// Splice - Move the sequence of instructions [Begin,End) to just before
+/// InsertPt. Update branch instructions as needed to account for broken
+/// fallthrough edges and to take advantage of newly exposed fallthrough
+/// opportunities.
 ///
-///       C:
-///       ...
-///       <fallthough to B>
-///       
-///       B:  --> loop header
-///       ...
-///       jcc <cond> C, [exit]
+void CodePlacementOpt::Splice(MachineFunction &MF,
+                              MachineFunction::iterator InsertPt,
+                              MachineFunction::iterator Begin,
+                              MachineFunction::iterator End) {
+  assert(Begin != MF.begin() && End != MF.begin() && InsertPt != MF.begin() &&
+         "Splice can't change the entry block!");
+  MachineFunction::iterator OldBeginPrior = prior(Begin);
+  MachineFunction::iterator OldEndPrior = prior(End);
+
+  MF.splice(InsertPt, Begin, End);
+
+  UpdateTerminator(prior(Begin));
+  UpdateTerminator(OldBeginPrior);
+  UpdateTerminator(OldEndPrior);
+}
+
+/// UpdateTerminator - Update the terminator instructions in MBB to account
+/// for changes to the layout. If the block previously used a fallthrough,
+/// it may now need a branch, and if it previously used branching it may now
+/// be able to use a fallthrough.
 ///
-bool CodePlacementOpt::OptimizeIntraLoopEdges() {
-  if (!TLI->shouldOptimizeCodePlacement())
-    return false;
+void CodePlacementOpt::UpdateTerminator(MachineBasicBlock *MBB) {
+  // A block with no successors has no concerns with fall-through edges.
+  if (MBB->succ_empty()) return;
+
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  bool B = TII->AnalyzeBranch(*MBB, TBB, FBB, Cond);
+  (void) B;
+  assert(!B && "UpdateTerminators requires analyzable predecessors!");
+  if (Cond.empty()) {
+    if (TBB) {
+      // The block has an unconditional branch. If its successor is now
+      // its layout successor, delete the branch.
+      if (MBB->isLayoutSuccessor(TBB))
+        TII->RemoveBranch(*MBB);
+    } else {
+      // The block has an unconditional fallthrough. If its successor is not
+      // its layout successor, insert a branch.
+      TBB = *MBB->succ_begin();
+      if (!MBB->isLayoutSuccessor(TBB))
+        TII->InsertBranch(*MBB, TBB, 0, Cond);
+    }
+  } else {
+    if (FBB) {
+      // The block has a non-fallthrough conditional branch. If one of its
+      // successors is its layout successor, rewrite it to a fallthrough
+      // conditional branch.
+      if (MBB->isLayoutSuccessor(TBB)) {
+        TII->RemoveBranch(*MBB);
+        TII->ReverseBranchCondition(Cond);
+        TII->InsertBranch(*MBB, FBB, 0, Cond);
+      } else if (MBB->isLayoutSuccessor(FBB)) {
+        TII->RemoveBranch(*MBB);
+        TII->InsertBranch(*MBB, TBB, 0, Cond);
+      }
+    } else {
+      // The block has a fallthrough conditional branch.
+      MachineBasicBlock *MBBA = *MBB->succ_begin();
+      MachineBasicBlock *MBBB = *next(MBB->succ_begin());
+      if (MBBA == TBB) std::swap(MBBB, MBBA);
+      if (MBB->isLayoutSuccessor(TBB)) {
+        TII->RemoveBranch(*MBB);
+        TII->ReverseBranchCondition(Cond);
+        TII->InsertBranch(*MBB, MBBA, 0, Cond);
+      } else if (!MBB->isLayoutSuccessor(MBBA)) {
+        TII->RemoveBranch(*MBB);
+        TII->InsertBranch(*MBB, TBB, MBBA, Cond);
+      }
+    }
+  }
+}
 
+/// EliminateUnconditionalJumpsToTop - Move blocks which unconditionally jump
+/// to the loop top to the top of the loop so that they have a fall through.
+/// This can introduce a branch on entry to the loop, but it can eliminate a
+/// branch within the loop. See the @simple case in
+/// test/CodeGen/X86/loop_blocks.ll for an example of this.
+bool CodePlacementOpt::EliminateUnconditionalJumpsToTop(MachineFunction &MF,
+                                                        MachineLoop *L) {
   bool Changed = false;
-  for (unsigned i = 0, e = UncondJmpMBBs.size(); i != e; ++i) {
-    MachineBasicBlock *MBB = UncondJmpMBBs[i].first;
-    MachineBasicBlock *SuccMBB = UncondJmpMBBs[i].second;
-    MachineLoop *L = MLI->getLoopFor(MBB);
-    assert(L && "BB is expected to be in a loop!");
-
-    if (ChangedMBBs.count(MBB)) {
-      // BB has been modified, re-analyze.
-      MachineBasicBlock *TBB = 0, *FBB = 0;
-      SmallVector<MachineOperand, 4> Cond;
-      if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond) || !Cond.empty())
+  MachineBasicBlock *TopMBB = L->getTopBlock();
+
+  bool BotHasFallthrough = HasFallthrough(L->getBottomBlock());
+
+  if (TopMBB == MF.begin() ||
+      HasAnalyzableTerminator(prior(MachineFunction::iterator(TopMBB)))) {
+  new_top:
+    for (MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin(),
+         PE = TopMBB->pred_end(); PI != PE; ++PI) {
+      MachineBasicBlock *Pred = *PI;
+      if (Pred == TopMBB) continue;
+      if (HasFallthrough(Pred)) continue;
+      if (!L->contains(Pred)) continue;
+
+      // Verify that we can analyze all the loop entry edges before beginning
+      // any changes which will require us to be able to analyze them.
+      if (Pred == MF.begin())
         continue;
-      if (MLI->getLoopFor(TBB) != L || TBB->isLandingPad())
+      if (!HasAnalyzableTerminator(Pred))
+        continue;
+      if (!HasAnalyzableTerminator(prior(MachineFunction::iterator(Pred))))
         continue;
-      SuccMBB = TBB;
-    } else {
-      assert(MLI->getLoopFor(SuccMBB) == L &&
-             "Successor is not in the same loop!");
-    }
 
-    if (MBB->isLayoutSuccessor(SuccMBB)) {
-      // Successor is right after MBB, just eliminate the unconditional jmp.
-      // Can this happen?
-      TII->RemoveBranch(*MBB);
-      ChangedMBBs.insert(MBB);
-      ++NumIntraElim;
+      // Move the block.
       Changed = true;
-      continue;
-    }
 
-    // Now check if the predecessor is fallthrough from any BB. If there is,
-    // that BB should be from outside the loop since edge will become a jmp.
-    bool OkToMove = true;
-    MachineBasicBlock *FtMBB = 0, *FtTBB = 0, *FtFBB = 0;
-    SmallVector<MachineOperand, 4> FtCond;    
-    for (MachineBasicBlock::pred_iterator PI = SuccMBB->pred_begin(),
-           PE = SuccMBB->pred_end(); PI != PE; ++PI) {
-      MachineBasicBlock *PredMBB = *PI;
-      if (PredMBB->isLayoutSuccessor(SuccMBB)) {
-        if (TII->AnalyzeBranch(*PredMBB, FtTBB, FtFBB, FtCond)) {
-          OkToMove = false;
+      // Move it and all the blocks that can reach it via fallthrough edges
+      // exclusively, to keep existing fallthrough edges intact.
+      MachineFunction::iterator Begin = Pred;
+      MachineFunction::iterator End = next(Begin);
+      while (Begin != MF.begin()) {
+        MachineFunction::iterator Prior = prior(Begin);
+        if (Prior == MF.begin())
+          break;
+        // Stop when a non-fallthrough edge is found.
+        if (!HasFallthrough(Prior))
+          break;
+        // Stop if a block which could fall-through out of the loop is found.
+        if (Prior->isSuccessor(End))
+          break;
+        // If we've reached the top, stop scanning.
+        if (Prior == MachineFunction::iterator(TopMBB)) {
+          // We know top currently has a fall through (because we just checked
+          // it) which would be lost if we do the transformation, so it isn't
+          // worthwhile to do the transformation unless it would expose a new
+          // fallthrough edge.
+          if (!Prior->isSuccessor(End))
+            goto next_pred;
+          // Otherwise we can stop scanning and procede to move the blocks.
           break;
         }
-        if (!FtTBB)
-          FtTBB = SuccMBB;
-        else if (!FtFBB) {
-          assert(FtFBB != SuccMBB && "Unexpected control flow!");
-          FtFBB = SuccMBB;
-        }
-        
-        // A fallthrough.
-        FtMBB = PredMBB;
-        MachineLoop *PL = MLI->getLoopFor(PredMBB);
-        if (PL && (PL == L || PL->getLoopDepth() >= L->getLoopDepth()))
-          OkToMove = false;
-
-        break;
+        // If we hit a switch or something complicated, don't move anything
+        // for this predecessor.
+        if (!HasAnalyzableTerminator(prior(MachineFunction::iterator(Prior))))
+          break;
+        // Ok, the block prior to Begin will be moved along with the rest.
+        // Extend the range to include it.
+        Begin = Prior;
+        ++NumIntraMoved;
       }
+
+      // Move the blocks.
+      Splice(MF, TopMBB, Begin, End);
+
+      // Update TopMBB.
+      TopMBB = L->getTopBlock();
+
+      // We have a new loop top. Iterate on it. We shouldn't have to do this
+      // too many times if BranchFolding has done a reasonable job.
+      goto new_top;
+    next_pred:;
     }
+  }
+
+  // If the loop previously didn't exit with a fall-through and it now does,
+  // we eliminated a branch.
+  if (Changed &&
+      !BotHasFallthrough &&
+      HasFallthrough(L->getBottomBlock())) {
+    ++NumIntraElim;
+    BotHasFallthrough = true;
+  }
+
+  return Changed;
+}
+
+/// MoveDiscontiguousLoopBlocks - Move any loop blocks that are not in the
+/// portion of the loop contiguous with the header. This usually makes the loop
+/// contiguous, provided that AnalyzeBranch can handle all the relevant
+/// branching. See the @cfg_islands case in test/CodeGen/X86/loop_blocks.ll
+/// for an example of this.
+bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF,
+                                                   MachineLoop *L) {
+  bool Changed = false;
+  MachineBasicBlock *TopMBB = L->getTopBlock();
+  MachineBasicBlock *BotMBB = L->getBottomBlock();
+
+  // Determine a position to move orphaned loop blocks to. If TopMBB is not
+  // entered via fallthrough and BotMBB is exited via fallthrough, prepend them
+  // to the top of the loop to avoid loosing that fallthrough. Otherwise append
+  // them to the bottom, even if it previously had a fallthrough, on the theory
+  // that it's worth an extra branch to keep the loop contiguous.
+  MachineFunction::iterator InsertPt = next(MachineFunction::iterator(BotMBB));
+  bool InsertAtTop = false;
+  if (TopMBB != MF.begin() &&
+      !HasFallthrough(prior(MachineFunction::iterator(TopMBB))) &&
+      HasFallthrough(BotMBB)) {
+    InsertPt = TopMBB;
+    InsertAtTop = true;
+  }
+
+  // Keep a record of which blocks are in the portion of the loop contiguous
+  // with the loop header.
+  SmallPtrSet<MachineBasicBlock *, 8> ContiguousBlocks;
+  for (MachineFunction::iterator I = TopMBB,
+       E = next(MachineFunction::iterator(BotMBB)); I != E; ++I)
+    ContiguousBlocks.insert(I);
+
+  // Find non-contigous blocks and fix them.
+  if (InsertPt != MF.begin() && HasAnalyzableTerminator(prior(InsertPt)))
+    for (MachineLoop::block_iterator BI = L->block_begin(), BE = L->block_end();
+         BI != BE; ++BI) {
+      MachineBasicBlock *BB = *BI;
+
+      // Verify that we can analyze all the loop entry edges before beginning
+      // any changes which will require us to be able to analyze them.
+      if (!HasAnalyzableTerminator(BB))
+        continue;
+      if (!HasAnalyzableTerminator(prior(MachineFunction::iterator(BB))))
+        continue;
+
+      // If the layout predecessor is part of the loop, this block will be
+      // processed along with it. This keeps them in their relative order.
+      if (BB != MF.begin() &&
+          L->contains(prior(MachineFunction::iterator(BB))))
+        continue;
 
-    if (!OkToMove)
-      continue;
-
-    // Is it profitable? If SuccMBB can fallthrough itself, that can be changed
-    // into a jmp.
-    MachineBasicBlock *TBB = 0, *FBB = 0;
-    SmallVector<MachineOperand, 4> Cond;
-    if (TII->AnalyzeBranch(*SuccMBB, TBB, FBB, Cond))
-      continue;
-    if (!TBB && Cond.empty())
-      TBB = next(MachineFunction::iterator(SuccMBB));
-    else if (!FBB && !Cond.empty())
-      FBB = next(MachineFunction::iterator(SuccMBB));
-
-    // This calculate the cost of the transformation. Also, it finds the *only*
-    // intra-loop edge if there is one.
-    int Cost = 0;
-    bool HasOneIntraSucc = true;
-    MachineBasicBlock *IntraSucc = 0;
-    for (MachineBasicBlock::succ_iterator SI = SuccMBB->succ_begin(),
-           SE = SuccMBB->succ_end(); SI != SE; ++SI) {
-      MachineBasicBlock *SSMBB = *SI;
-      if (MLI->getLoopFor(SSMBB) == L) {
-        if (!IntraSucc)
-          IntraSucc = SSMBB;
-        else
-          HasOneIntraSucc = false;
+      // Check to see if this block is already contiguous with the main
+      // portion of the loop.
+      if (!ContiguousBlocks.insert(BB))
+        continue;
+
+      // Move the block.
+      Changed = true;
+
+      // Process this block and all loop blocks contiguous with it, to keep
+      // them in their relative order.
+      MachineFunction::iterator Begin = BB;
+      MachineFunction::iterator End = next(MachineFunction::iterator(BB));
+      for (; End != MF.end(); ++End) {
+        if (!L->contains(End)) break;
+        if (!HasAnalyzableTerminator(End)) break;
+        ContiguousBlocks.insert(End);
+        ++NumIntraMoved;
       }
 
-      if (SuccMBB->isLayoutSuccessor(SSMBB))
-        // This will become a jmp.
-        ++Cost;
-      else if (MBB->isLayoutSuccessor(SSMBB)) {
-        // One of the successor will become the new fallthrough.
-        if (SSMBB == FBB) {
-          FBB = 0;
-          --Cost;
-        } else if (!FBB && SSMBB == TBB && Cond.empty()) {
-          TBB = 0;
-          --Cost;
-        } else if (!Cond.empty() && !TII->ReverseBranchCondition(Cond)) {
-          assert(SSMBB == TBB);
-          TBB = FBB;
-          FBB = 0;
-          --Cost;
+      // If we're inserting at the bottom of the loop, and the code we're
+      // moving originally had fall-through successors, bring the sucessors
+      // up with the loop blocks to preserve the fall-through edges.
+      if (!InsertAtTop)
+        for (; End != MF.end(); ++End) {
+          if (L->contains(End)) break;
+          if (!HasAnalyzableTerminator(End)) break;
+          if (!HasFallthrough(prior(End))) break;
         }
-      }
-    }
-    if (Cost)
-      continue;
-
-    // Now, let's move the successor to below the BB to eliminate the jmp.
-    SuccMBB->moveAfter(MBB);
-    TII->RemoveBranch(*MBB);
-    TII->RemoveBranch(*SuccMBB);
-    if (TBB)
-      TII->InsertBranch(*SuccMBB, TBB, FBB, Cond);
-    ChangedMBBs.insert(MBB);
-    ChangedMBBs.insert(SuccMBB);
-    if (FtMBB) {
-      TII->RemoveBranch(*FtMBB);
-      TII->InsertBranch(*FtMBB, FtTBB, FtFBB, FtCond);
-      ChangedMBBs.insert(FtMBB);
+
+      // Move the blocks. This may invalidate TopMBB and/or BotMBB, but
+      // we don't need them anymore at this point.
+      Splice(MF, InsertPt, Begin, End);
     }
-    Changed = true;
-  }
 
-  ++NumIntraMoved;
+  return Changed;
+}
+
+/// OptimizeIntraLoopEdgesInLoopNest - Reposition loop blocks to minimize
+/// intra-loop branching and to form contiguous loops.
+///
+/// This code takes the approach of making minor changes to the existing
+/// layout to fix specific loop-oriented problems. Also, it depends on
+/// AnalyzeBranch, which can't understand complex control instructions.
+///
+bool CodePlacementOpt::OptimizeIntraLoopEdgesInLoopNest(MachineFunction &MF,
+                                                        MachineLoop *L) {
+  bool Changed = false;
+
+  // Do optimization for nested loops.
+  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    Changed |= OptimizeIntraLoopEdgesInLoopNest(MF, *I);
+
+  // Do optimization for this loop.
+  Changed |= EliminateUnconditionalJumpsToTop(MF, L);
+  Changed |= MoveDiscontiguousLoopBlocks(MF, L);
+
+  return Changed;
+}
+
+/// OptimizeIntraLoopEdges - Reposition loop blocks to minimize
+/// intra-loop branching and to form contiguous loops.
+///
+bool CodePlacementOpt::OptimizeIntraLoopEdges(MachineFunction &MF) {
+  bool Changed = false;
+
+  if (!TLI->shouldOptimizeCodePlacement())
+    return Changed;
+
+  // Do optimization for each loop in the function.
+  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
+       I != E; ++I)
+    if (!(*I)->getParentLoop())
+      Changed |= OptimizeIntraLoopEdgesInLoopNest(MF, *I);
+
   return Changed;
 }
 
@@ -255,6 +445,8 @@ bool CodePlacementOpt::AlignLoops(MachineFunction &MF) {
   return Changed;
 }
 
+/// AlignLoop - Align loop headers to target preferred alignments.
+///
 bool CodePlacementOpt::AlignLoop(MachineFunction &MF, MachineLoop *L,
                                  unsigned Align) {
   bool Changed = false;
@@ -263,17 +455,7 @@ bool CodePlacementOpt::AlignLoop(MachineFunction &MF, MachineLoop *L,
   for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
     Changed |= AlignLoop(MF, *I, Align);
 
-  MachineBasicBlock *TopMBB = L->getHeader();
-  if (TopMBB == MF.begin()) return Changed;
-
-  MachineBasicBlock *PredMBB = prior(MachineFunction::iterator(TopMBB));
-  while (MLI->getLoopFor(PredMBB) == L) {
-    TopMBB = PredMBB;
-    if (TopMBB == MF.begin()) return Changed;
-    PredMBB = prior(MachineFunction::iterator(TopMBB));
-  }
-
-  TopMBB->setAlignment(Align);
+  L->getTopBlock()->setAlignment(Align);
   Changed = true;
   ++NumLoopsAligned;
 
@@ -288,30 +470,9 @@ bool CodePlacementOpt::runOnMachineFunction(MachineFunction &MF) {
   TLI = MF.getTarget().getTargetLowering();
   TII = MF.getTarget().getInstrInfo();
 
-  // Analyze the BBs first and keep track of BBs that
-  // end with an unconditional jmp to another block in the same loop.
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
-    MachineBasicBlock *MBB = I;
-    if (MBB->isLandingPad())
-      continue;
-    MachineLoop *L = MLI->getLoopFor(MBB);
-    if (!L)
-      continue;
-
-    MachineBasicBlock *TBB = 0, *FBB = 0;
-    SmallVector<MachineOperand, 4> Cond;
-    if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond) || !Cond.empty())
-      continue;
-    if (MLI->getLoopFor(TBB) == L && !TBB->isLandingPad())
-      UncondJmpMBBs.push_back(std::make_pair(MBB, TBB));
-  }
-
-  bool Changed = OptimizeIntraLoopEdges();
+  bool Changed = OptimizeIntraLoopEdges(MF);
 
   Changed |= AlignLoops(MF);
 
-  ChangedMBBs.clear();
-  UncondJmpMBBs.clear();
-
   return Changed;
 }
diff --git a/lib/CodeGen/ExactHazardRecognizer.cpp b/lib/CodeGen/ExactHazardRecognizer.cpp
index 4f32c2b..f35d196 100644
--- a/lib/CodeGen/ExactHazardRecognizer.cpp
+++ b/lib/CodeGen/ExactHazardRecognizer.cpp
@@ -22,7 +22,8 @@
 
 using namespace llvm;
 
-ExactHazardRecognizer::ExactHazardRecognizer(const InstrItineraryData &LItinData) :
+ExactHazardRecognizer::
+ExactHazardRecognizer(const InstrItineraryData &LItinData) :
   ScheduleHazardRecognizer(), ItinData(LItinData) 
 {
   // Determine the maximum depth of any itinerary. This determines the
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 4e713a6..e58a9ca 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -323,7 +323,7 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
 
   // Second pass scheduler.
   if (OptLevel != CodeGenOpt::None) {
-    PM.add(createPostRAScheduler());
+    PM.add(createPostRAScheduler(OptLevel));
     printAndVerify(PM);
   }
 
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 93d3d4c..79f46f3 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -28,7 +28,6 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
@@ -2603,7 +2602,19 @@ bool LiveIntervals::spillPhysRegAroundRegDefsUses(const LiveInterval &li,
            tri_->isSuperRegister(*AS, SpillReg));
 
   bool Cut = false;
-  LiveInterval &pli = getInterval(SpillReg);
+  SmallVector<unsigned, 4> PRegs;
+  if (hasInterval(SpillReg))
+    PRegs.push_back(SpillReg);
+  else {
+    SmallSet<unsigned, 4> Added;
+    for (const unsigned* AS = tri_->getSubRegisters(SpillReg); *AS; ++AS)
+      if (Added.insert(*AS) && hasInterval(*AS)) {
+        PRegs.push_back(*AS);
+        for (const unsigned* ASS = tri_->getSubRegisters(*AS); *ASS; ++ASS)
+          Added.insert(*ASS);
+      }
+  }
+
   SmallPtrSet<MachineInstr*, 8> SeenMIs;
   for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(li.reg),
          E = mri_->reg_end(); I != E; ++I) {
@@ -2613,8 +2624,12 @@ bool LiveIntervals::spillPhysRegAroundRegDefsUses(const LiveInterval &li,
       continue;
     SeenMIs.insert(MI);
     LiveIndex Index = getInstructionIndex(MI);
-    if (pli.liveAt(Index)) {
-      vrm.addEmergencySpill(SpillReg, MI);
+    for (unsigned i = 0, e = PRegs.size(); i != e; ++i) {
+      unsigned PReg = PRegs[i];
+      LiveInterval &pli = getInterval(PReg);
+      if (!pli.liveAt(Index))
+        continue;
+      vrm.addEmergencySpill(PReg, MI);
       LiveIndex StartIdx = getLoadIndex(Index);
       LiveIndex EndIdx = getNextSlot(getStoreIndex(Index));
       if (pli.isInOneLiveRange(StartIdx, EndIdx)) {
@@ -2626,12 +2641,12 @@ bool LiveIntervals::spillPhysRegAroundRegDefsUses(const LiveInterval &li,
         Msg << "Ran out of registers during register allocation!";
         if (MI->getOpcode() == TargetInstrInfo::INLINEASM) {
           Msg << "\nPlease check your inline asm statement for invalid "
-               << "constraints:\n";
+              << "constraints:\n";
           MI->print(Msg, tm_);
         }
         llvm_report_error(Msg.str());
       }
-      for (const unsigned* AS = tri_->getSubRegisters(SpillReg); *AS; ++AS) {
+      for (const unsigned* AS = tri_->getSubRegisters(PReg); *AS; ++AS) {
         if (!hasInterval(*AS))
           continue;
         LiveInterval &spli = getInterval(*AS);
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index b9d3ba7..1f85e92 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -220,8 +220,10 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
           OS << "imp-";
         OS << "def";
         NeedComma = true;
-      } else if (isImplicit())
+      } else if (isImplicit()) {
           OS << "imp-use";
+          NeedComma = true;
+      }
 
       if (isKill() || isDead() || isUndef()) {
         if (NeedComma) OS << ',';
diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp
index 2da8e37..db77d19 100644
--- a/lib/CodeGen/MachineLoopInfo.cpp
+++ b/lib/CodeGen/MachineLoopInfo.cpp
@@ -43,3 +43,31 @@ void MachineLoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<MachineDominatorTree>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
+
+MachineBasicBlock *MachineLoop::getTopBlock() {
+  MachineBasicBlock *TopMBB = getHeader();
+  MachineFunction::iterator Begin = TopMBB->getParent()->begin();
+  if (TopMBB != Begin) {
+    MachineBasicBlock *PriorMBB = prior(MachineFunction::iterator(TopMBB));
+    while (contains(PriorMBB)) {
+      TopMBB = PriorMBB;
+      if (TopMBB == Begin) break;
+      PriorMBB = prior(MachineFunction::iterator(TopMBB));
+    }
+  }
+  return TopMBB;
+}
+
+MachineBasicBlock *MachineLoop::getBottomBlock() {
+  MachineBasicBlock *BotMBB = getHeader();
+  MachineFunction::iterator End = BotMBB->getParent()->end();
+  if (BotMBB != prior(End)) {
+    MachineBasicBlock *NextMBB = next(MachineFunction::iterator(BotMBB));
+    while (contains(NextMBB)) {
+      BotMBB = NextMBB;
+      if (BotMBB == next(MachineFunction::iterator(BotMBB))) break;
+      NextMBB = next(MachineFunction::iterator(BotMBB));
+    }
+  }
+  return BotMBB;
+}
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index 0f3b33f..a00bebb 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -34,10 +34,8 @@ STATISTIC(NumSunk, "Number of machine instructions sunk");
 
 namespace {
   class VISIBILITY_HIDDEN MachineSinking : public MachineFunctionPass {
-    const TargetMachine   *TM;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
-    MachineFunction       *CurMF; // Current MachineFunction
     MachineRegisterInfo  *RegInfo; // Machine register information
     MachineDominatorTree *DT;   // Machine dominator tree
     AliasAnalysis *AA;
@@ -92,19 +90,16 @@ bool MachineSinking::AllUsesDominatedByBlock(unsigned Reg,
   return true;
 }
 
-
-
 bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
   DEBUG(errs() << "******** Machine Sinking ********\n");
   
-  CurMF = &MF;
-  TM = &CurMF->getTarget();
-  TII = TM->getInstrInfo();
-  TRI = TM->getRegisterInfo();
-  RegInfo = &CurMF->getRegInfo();
+  const TargetMachine &TM = MF.getTarget();
+  TII = TM.getInstrInfo();
+  TRI = TM.getRegisterInfo();
+  RegInfo = &MF.getRegInfo();
   DT = &getAnalysis<MachineDominatorTree>();
   AA = &getAnalysis<AliasAnalysis>();
-  AllocatableSet = TRI->getAllocatableSet(*CurMF);
+  AllocatableSet = TRI->getAllocatableSet(MF);
 
   bool EverMadeChange = false;
   
@@ -112,7 +107,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
     bool MadeChange = false;
 
     // Process all basic blocks.
-    for (MachineFunction::iterator I = CurMF->begin(), E = CurMF->end(); 
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); 
          I != E; ++I)
       MadeChange |= ProcessBlock(*I);
     
@@ -256,7 +251,7 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) {
   if (SuccToSinkTo->isLandingPad())
     return false;
   
-  // If is not possible to sink an instruction into its own block.  This can
+  // It is not possible to sink an instruction into its own block.  This can
   // happen with loops.
   if (MI->getParent() == SuccToSinkTo)
     return false;
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index e52158c..8fdbe9b 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -78,10 +78,12 @@ DebugMod("postra-sched-debugmod",
 namespace {
   class VISIBILITY_HIDDEN PostRAScheduler : public MachineFunctionPass {
     AliasAnalysis *AA;
+    CodeGenOpt::Level OptLevel;
 
   public:
     static char ID;
-    PostRAScheduler() : MachineFunctionPass(&ID) {}
+    PostRAScheduler(CodeGenOpt::Level ol) :
+      MachineFunctionPass(&ID), OptLevel(ol) {}
 
     void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.setPreservesCFG();
@@ -126,6 +128,9 @@ namespace {
     /// AA - AliasAnalysis for making memory reference queries.
     AliasAnalysis *AA;
 
+    /// AntiDepMode - Anti-dependence breaking mode
+    TargetSubtarget::AntiDepBreakMode AntiDepMode;
+
     /// Classes - For live regs that are only used in one register class in a
     /// live range, the register class. If the register is not live, the
     /// corresponding value is null. If the register is live but used in
@@ -154,10 +159,11 @@ namespace {
                          const MachineLoopInfo &MLI,
                          const MachineDominatorTree &MDT,
                          ScheduleHazardRecognizer *HR,
-                         AliasAnalysis *aa)
+                         AliasAnalysis *aa,
+                         TargetSubtarget::AntiDepBreakMode adm)
       : ScheduleDAGInstrs(MF, MLI, MDT), Topo(SUnits),
         AllocatableSet(TRI->getAllocatableSet(MF)),
-        HazardRec(HR), AA(aa) {}
+      HazardRec(HR), AA(aa), AntiDepMode(adm) {}
 
     ~SchedulePostRATDList() {
       delete HazardRec;
@@ -232,14 +238,21 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
   AA = &getAnalysis<AliasAnalysis>();
 
   // Check for explicit enable/disable of post-ra scheduling.
+  TargetSubtarget::AntiDepBreakMode AntiDepMode = TargetSubtarget::ANTIDEP_NONE;
   if (EnablePostRAScheduler.getPosition() > 0) {
     if (!EnablePostRAScheduler)
-      return true;
+      return false;
   } else {
-    // Check that post-RA scheduling is enabled for this function
+    // Check that post-RA scheduling is enabled for this target.
     const TargetSubtarget &ST = Fn.getTarget().getSubtarget<TargetSubtarget>();
-    if (!ST.enablePostRAScheduler())
-      return true;
+    if (!ST.enablePostRAScheduler(OptLevel, AntiDepMode))
+      return false;
+  }
+
+  // Check for antidep breaking override...
+  if (EnableAntiDepBreaking.getPosition() > 0) {
+    AntiDepMode = (EnableAntiDepBreaking) ? 
+      TargetSubtarget::ANTIDEP_CRITICAL : TargetSubtarget::ANTIDEP_NONE;
   }
 
   DEBUG(errs() << "PostRAScheduler\n");
@@ -251,7 +264,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
     (ScheduleHazardRecognizer *)new ExactHazardRecognizer(InstrItins) :
     (ScheduleHazardRecognizer *)new SimpleHazardRecognizer();
 
-  SchedulePostRATDList Scheduler(Fn, MLI, MDT, HR, AA);
+  SchedulePostRATDList Scheduler(Fn, MLI, MDT, HR, AA, AntiDepMode);
 
   // Loop over all of the basic blocks
   for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
@@ -391,7 +404,7 @@ void SchedulePostRATDList::Schedule() {
   // Build the scheduling graph.
   BuildSchedGraph(AA);
 
-  if (EnableAntiDepBreaking) {
+  if (AntiDepMode != TargetSubtarget::ANTIDEP_NONE) {
     if (BreakAntiDependencies()) {
       // We made changes. Update the dependency graph.
       // Theoretically we could update the graph in place:
@@ -1195,6 +1208,6 @@ void SchedulePostRATDList::ListScheduleTopDown() {
 //                         Public Constructor Functions
 //===----------------------------------------------------------------------===//
 
-FunctionPass *llvm::createPostRAScheduler() {
-  return new PostRAScheduler();
+FunctionPass *llvm::createPostRAScheduler(CodeGenOpt::Level OptLevel) {
+  return new PostRAScheduler(OptLevel);
 }
diff --git a/lib/CodeGen/PreAllocSplitting.cpp b/lib/CodeGen/PreAllocSplitting.cpp
index 8fa07d4..726869a 100644
--- a/lib/CodeGen/PreAllocSplitting.cpp
+++ b/lib/CodeGen/PreAllocSplitting.cpp
@@ -952,7 +952,7 @@ MachineInstr* PreAllocSplitting::FoldSpill(unsigned vreg,
   if (I != IntervalSSMap.end()) {
     SS = I->second;
   } else {
-    SS = MFI->CreateStackObject(RC->getSize(), RC->getAlignment());    
+    SS = MFI->CreateStackObject(RC->getSize(), RC->getAlignment());
   }
   
   MachineInstr* FMI = TII->foldMemoryOperand(*MBB->getParent(),
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 7af0bba..a0860a1 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -259,7 +259,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) {
       // the TargetRegisterClass if the stack alignment is smaller. Use the
       // min.
       Align = std::min(Align, StackAlign);
-      FrameIdx = FFI->CreateStackObject(RC->getSize(), Align);
+      FrameIdx = FFI->CreateStackObject(RC->getSize(), Align, true);
       if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
       if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
     } else {
@@ -729,10 +729,12 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) {
 /// the instruciton range. Return the operand number of the kill in Operand.
 static MachineBasicBlock::iterator
 findLastUseReg(MachineBasicBlock::iterator I, MachineBasicBlock::iterator ME,
-               unsigned Reg, unsigned *Operand) {
+               unsigned Reg) {
   // Scan forward to find the last use of this virtual register
   for (++I; I != ME; ++I) {
     MachineInstr *MI = I;
+    bool isDefInsn = false;
+    bool isKillInsn = false;
     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
       if (MI->getOperand(i).isReg()) {
         unsigned OpReg = MI->getOperand(i).getReg();
@@ -740,13 +742,14 @@ findLastUseReg(MachineBasicBlock::iterator I, MachineBasicBlock::iterator ME,
           continue;
         assert (OpReg == Reg
                 && "overlapping use of scavenged index register!");
-        // If this is the killing use, we're done
-        if (MI->getOperand(i).isKill()) {
-          if (Operand)
-            *Operand = i;
-          return I;
-        }
+        // If this is the killing use, we have a candidate.
+        if (MI->getOperand(i).isKill())
+          isKillInsn = true;
+        else if (MI->getOperand(i).isDef())
+          isDefInsn = true;
       }
+    if (isKillInsn && !isDefInsn)
+      return I;
   }
   // If we hit the end of the basic block, there was no kill of
   // the virtual register, which is wrong.
@@ -763,10 +766,13 @@ void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
        E = Fn.end(); BB != E; ++BB) {
     RS->enterBasicBlock(BB);
 
+    // FIXME: The logic flow in this function is still too convoluted.
+    // It needs a cleanup refactoring. Do that in preparation for tracking
+    // more than one scratch register value and using ranges to find
+    // available scratch registers.
     unsigned CurrentVirtReg = 0;
     unsigned CurrentScratchReg = 0;
     bool havePrevValue = false;
-    unsigned PrevScratchReg = 0;
     int PrevValue = 0;
     MachineInstr *PrevLastUseMI = NULL;
     unsigned PrevLastUseOp = 0;
@@ -776,37 +782,39 @@ void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
 
     // The instruction stream may change in the loop, so check BB->end()
     // directly.
-    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
       MachineInstr *MI = I;
-      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
+      bool isDefInsn = false;
+      bool isKillInsn = false;
+      bool clobbersScratchReg = false;
+      bool DoIncr = true;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
         if (MI->getOperand(i).isReg()) {
           MachineOperand &MO = MI->getOperand(i);
           unsigned Reg = MO.getReg();
           if (Reg == 0)
             continue;
           if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
-            // If we have an active scavenged register, we shouldn't be
-            // seeing any references to it.
-            assert (Reg != CurrentScratchReg
-                    && "overlapping use of scavenged frame index register!");
-
             // If we have a previous scratch reg, check and see if anything
             // here kills whatever value is in there.
-            if (Reg == PrevScratchReg) {
+            if (Reg == CurrentScratchReg) {
               if (MO.isUse()) {
                 // Two-address operands implicitly kill
-                if (MO.isKill() || MI->isRegTiedToDefOperand(i)) {
-                  havePrevValue = false;
-                  PrevScratchReg = 0;
-                }
+                if (MO.isKill() || MI->isRegTiedToDefOperand(i))
+                  clobbersScratchReg = true;
               } else {
                 assert (MO.isDef());
-                havePrevValue = false;
-                PrevScratchReg = 0;
+                clobbersScratchReg = true;
               }
             }
             continue;
           }
+          // If this is a def, remember that this insn defines the value.
+          // This lets us properly consider insns which re-use the scratch
+          // register, such as r2 = sub r2, #imm, in the middle of the
+          // scratch range.
+          if (MO.isDef())
+            isDefInsn = true;
 
           // Have we already allocated a scratch register for this virtual?
           if (Reg != CurrentVirtReg) {
@@ -837,50 +845,75 @@ void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
               // for the virtual register are exclusively for the purpose
               // of populating the value in the register. That's reasonable
               // for these frame index registers, but it's still a very, very
-              // strong assumption. Perhaps this implies that the frame index
-              // elimination should be before register allocation, with
-              // conservative heuristics since we'll know less then, and
-              // the reuse calculations done directly when doing the code-gen?
+              // strong assumption. rdar://7322732. Better would be to
+              // explicitly check each instruction in the range for references
+              // to the virtual register. Only delete those insns that
+              // touch the virtual register.
 
               // Find the last use of the new virtual register. Remove all
               // instruction between here and there, and update the current
               // instruction to reference the last use insn instead.
               MachineBasicBlock::iterator LastUseMI =
-                findLastUseReg(I, BB->end(), Reg, &i);
+                findLastUseReg(I, BB->end(), Reg);
+
               // Remove all instructions up 'til the last use, since they're
               // just calculating the value we already have.
               BB->erase(I, LastUseMI);
               MI = I = LastUseMI;
-              e = MI->getNumOperands();
 
-              CurrentScratchReg = PrevScratchReg;
-              // Extend the live range of the register
+              // Extend the live range of the scratch register
               PrevLastUseMI->getOperand(PrevLastUseOp).setIsKill(false);
               RS->setUsed(CurrentScratchReg);
-            } else {
               CurrentVirtReg = Reg;
-              const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
-              CurrentScratchReg = RS->FindUnusedReg(RC);
-              if (CurrentScratchReg == 0)
-                // No register is "free". Scavenge a register.
-                CurrentScratchReg = RS->scavengeRegister(RC, I, SPAdj);
 
-              PrevValue = Value;
+              // We deleted the instruction we were scanning the operands of.
+              // Jump back to the instruction iterator loop. Don't increment
+              // past this instruction since we updated the iterator already.
+              DoIncr = false;
+              break;
             }
+
+            // Scavenge a new scratch register
+            CurrentVirtReg = Reg;
+            const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
+            CurrentScratchReg = RS->FindUnusedReg(RC);
+            if (CurrentScratchReg == 0)
+              // No register is "free". Scavenge a register.
+              CurrentScratchReg = RS->scavengeRegister(RC, I, SPAdj);
+
+            PrevValue = Value;
           }
+          // replace this reference to the virtual register with the
+          // scratch register.
           assert (CurrentScratchReg && "Missing scratch register!");
           MI->getOperand(i).setReg(CurrentScratchReg);
 
-          // If this is the last use of the register, stop tracking it.
           if (MI->getOperand(i).isKill()) {
-            PrevScratchReg = CurrentScratchReg;
-            PrevLastUseMI = MI;
+            isKillInsn = true;
             PrevLastUseOp = i;
-            CurrentScratchReg = CurrentVirtReg = 0;
-            havePrevValue = trackingCurrentValue;
+            PrevLastUseMI = MI;
           }
         }
-      RS->forward(MI);
+      }
+      // If this is the last use of the scratch, stop tracking it. The
+      // last use will be a kill operand in an instruction that does
+      // not also define the scratch register.
+      if (isKillInsn && !isDefInsn) {
+        CurrentVirtReg = 0;
+        havePrevValue = trackingCurrentValue;
+      }
+      // Similarly, notice if instruction clobbered the value in the
+      // register we're tracking for possible later reuse. This is noted
+      // above, but enforced here since the value is still live while we
+      // process the rest of the operands of the instruction.
+      if (clobbersScratchReg) {
+        havePrevValue = false;
+        CurrentScratchReg = 0;
+      }
+      if (DoIncr) {
+        RS->forward(I);
+        ++I;
+      }
     }
   }
 }
diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp
index 00c5d46..70e8640 100644
--- a/lib/CodeGen/PseudoSourceValue.cpp
+++ b/lib/CodeGen/PseudoSourceValue.cpp
@@ -63,6 +63,8 @@ namespace {
 
     virtual bool isConstant(const MachineFrameInfo *MFI) const;
 
+    virtual bool isAliased(const MachineFrameInfo *MFI) const;
+
     virtual void printCustom(raw_ostream &OS) const {
       OS << "FixedStack" << FI;
     }
@@ -89,6 +91,26 @@ bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const {
   return false;
 }
 
+bool PseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const {
+  if (this == getStack() ||
+      this == getGOT() ||
+      this == getConstantPool() ||
+      this == getJumpTable())
+    return false;
+  llvm_unreachable("Unknown PseudoSourceValue!");
+  return true;
+}
+
 bool FixedStackPseudoSourceValue::isConstant(const MachineFrameInfo *MFI) const{
   return MFI && MFI->isImmutableObjectIndex(FI);
 }
+
+bool FixedStackPseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const {
+  // Negative frame indices are used for special things that don't
+  // appear in LLVM IR. Non-negative indices may be used for things
+  // like static allocas.
+  if (!MFI)
+    return FI >= 0;
+  // Spill slots should not alias others.
+  return !MFI->isFixedObjectIndex(FI) && !MFI->isSpillSlotObjectIndex(FI);
+}
diff --git a/lib/CodeGen/RegAllocLocal.cpp b/lib/CodeGen/RegAllocLocal.cpp
index 6caa2d3..28ede55 100644
--- a/lib/CodeGen/RegAllocLocal.cpp
+++ b/lib/CodeGen/RegAllocLocal.cpp
@@ -263,7 +263,7 @@ int RALocal::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) {
 
   // Allocate a new stack object for this spill location...
   int FrameIdx = MF->getFrameInfo()->CreateStackObject(RC->getSize(),
-                                                       RC->getAlignment());
+                                                       RC->getAlignment(),true);
 
   // Assign the slot...
   StackSlotForVirtReg[VirtReg] = FrameIdx;
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 5f1c4e2..2518ce1 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -300,7 +300,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
 
   // If the target knows how to save/restore the register, let it do so;
   // otherwise, use the emergency stack spill slot.
-  if (!TRI->saveScavengerRegister(*MBB, I, RC, SReg)) {
+  if (!TRI->saveScavengerRegister(*MBB, I, UseMI, RC, SReg)) {
     // Spill the scavenged register before I.
     assert(ScavengingFrameIndex >= 0 &&
            "Cannot scavenge register without an emergency spill slot!");
@@ -310,8 +310,9 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
 
     // Restore the scavenged register before its use (or first terminator).
     TII->loadRegFromStackSlot(*MBB, UseMI, SReg, ScavengingFrameIndex, RC);
-  } else
-    TRI->restoreScavengerRegister(*MBB, UseMI, RC, SReg);
+    II = prior(UseMI);
+    TRI->eliminateFrameIndex(II, SPAdj, NULL, this);
+  }
 
   ScavengeRestore = prior(UseMI);
 
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 44e9296..43454dd 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -32,7 +32,9 @@ using namespace llvm;
 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      const MachineLoopInfo &mli,
                                      const MachineDominatorTree &mdt)
-  : ScheduleDAG(mf), MLI(mli), MDT(mdt), LoopRegs(MLI, MDT) {}
+  : ScheduleDAG(mf), MLI(mli), MDT(mdt), LoopRegs(MLI, MDT) {
+  MFI = mf.getFrameInfo();
+}
 
 /// Run - perform scheduling.
 ///
@@ -95,7 +97,8 @@ static const Value *getUnderlyingObject(const Value *V) {
 /// getUnderlyingObjectForInstr - If this machine instr has memory reference
 /// information and it can be tracked to a normal reference to a known
 /// object, return the Value for that object. Otherwise return null.
-static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI) {
+static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI,
+                                                const MachineFrameInfo *MFI) {
   if (!MI->hasOneMemOperand() ||
       !(*MI->memoperands_begin())->getValue() ||
       (*MI->memoperands_begin())->isVolatile())
@@ -106,10 +109,19 @@ static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI) {
     return 0;
 
   V = getUnderlyingObject(V);
-  if (!isa<PseudoSourceValue>(V) && !isIdentifiedObject(V))
-    return 0;
+  if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
+    // For now, ignore PseudoSourceValues which may alias LLVM IR values
+    // because the code that uses this function has no way to cope with
+    // such aliases.
+    if (PSV->isAliased(MFI))
+      return 0;
+    return V;
+  }
 
-  return V;
+  if (isIdentifiedObject(V))
+    return V;
+
+  return 0;
 }
 
 void ScheduleDAGInstrs::StartBlock(MachineBasicBlock *BB) {
@@ -344,7 +356,7 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
         // Unknown memory accesses. Assume the worst.
         ChainMMO = 0;
     } else if (TID.mayStore()) {
-      if (const Value *V = getUnderlyingObjectForInstr(MI)) {
+      if (const Value *V = getUnderlyingObjectForInstr(MI, MFI)) {
         // A store to a specific PseudoSourceValue. Add precise dependencies.
         // Handle the def in MemDefs, if there is one.
         std::map<const Value *, SUnit *>::iterator I = MemDefs.find(V);
@@ -377,7 +389,7 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
     } else if (TID.mayLoad()) {
       if (MI->isInvariantLoad(AA)) {
         // Invariant load, no chain dependencies needed!
-      } else if (const Value *V = getUnderlyingObjectForInstr(MI)) {
+      } else if (const Value *V = getUnderlyingObjectForInstr(MI, MFI)) {
         // A load from a specific PseudoSourceValue. Add precise dependencies.
         std::map<const Value *, SUnit *>::iterator I = MemDefs.find(V);
         if (I != MemDefs.end())
diff --git a/lib/CodeGen/ScheduleDAGInstrs.h b/lib/CodeGen/ScheduleDAGInstrs.h
index 29e1c98..366c3a8 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.h
+++ b/lib/CodeGen/ScheduleDAGInstrs.h
@@ -98,6 +98,7 @@ namespace llvm {
   class VISIBILITY_HIDDEN ScheduleDAGInstrs : public ScheduleDAG {
     const MachineLoopInfo &MLI;
     const MachineDominatorTree &MDT;
+    const MachineFrameInfo *MFI;
 
     /// Defs, Uses - Remember where defs and uses of each physical register
     /// are as we iterate upward through the instructions. This is allocated
diff --git a/lib/CodeGen/ScheduleDAGPrinter.cpp b/lib/CodeGen/ScheduleDAGPrinter.cpp
index 95ad05e..4851d49 100644
--- a/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -18,7 +18,6 @@
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Support/Debug.h"
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1ed3082..e3f8f0f 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4381,15 +4381,17 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
 
 SDValue DAGCombiner::visitFNEG(SDNode *N) {
   SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
 
   if (isNegatibleForFree(N0, LegalOperations))
     return GetNegatedExpression(N0, DAG, LegalOperations);
 
   // Transform fneg(bitconvert(x)) -> bitconvert(x^sign) to avoid loading
   // constant pool values.
-  if (N0.getOpcode() == ISD::BIT_CONVERT && N0.getNode()->hasOneUse() &&
-      N0.getOperand(0).getValueType().isInteger() &&
-      !N0.getOperand(0).getValueType().isVector()) {
+  if (N0.getOpcode() == ISD::BIT_CONVERT && 
+      !VT.isVector() &&
+      N0.getNode()->hasOneUse() &&
+      N0.getOperand(0).getValueType().isInteger()) {
     SDValue Int = N0.getOperand(0);
     EVT IntVT = Int.getValueType();
     if (IntVT.isInteger() && !IntVT.isVector()) {
@@ -4397,7 +4399,7 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
               DAG.getConstant(APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
       AddToWorkList(Int.getNode());
       return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(),
-                         N->getValueType(0), Int);
+                         VT, Int);
     }
   }
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index fc01b07..7138dd2 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1655,8 +1655,7 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
 }
 
 /// LegalizeSetCCCondCode - Legalize a SETCC with given LHS and RHS and
-/// condition code CC on the current target. This routine assumes LHS and rHS
-/// have already been legalized by LegalizeSetCCOperands. It expands SETCC with
+/// condition code CC on the current target. This routine expands SETCC with
 /// illegal condition code into AND / OR of multiple SETCC values.
 void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
                                                  SDValue &LHS, SDValue &RHS,
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 859c656..e1b7022 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -617,6 +617,7 @@ private:
   SDValue WidenVecOp_BIT_CONVERT(SDNode *N);
   SDValue WidenVecOp_CONCAT_VECTORS(SDNode *N);
   SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue WidenVecOp_STORE(SDNode* N);
 
   SDValue WidenVecOp_Convert(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 0eafe62..dbd3e39 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -115,7 +115,8 @@ void DAGTypeLegalizer::ExpandRes_BIT_CONVERT(SDNode *N, SDValue &Lo,
   // Create the stack frame object.  Make sure it is aligned for both
   // the source and expanded destination types.
   unsigned Alignment =
-    TLI.getTargetData()->getPrefTypeAlignment(NOutVT.getTypeForEVT(*DAG.getContext()));
+    TLI.getTargetData()->getPrefTypeAlignment(NOutVT.
+                                              getTypeForEVT(*DAG.getContext()));
   SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
   int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
   const Value *SV = PseudoSourceValue::getFixedStack(SPFI);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index a03f825..75e1239 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1789,6 +1789,7 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned ResNo) {
 
   case ISD::BIT_CONVERT:        Res = WidenVecOp_BIT_CONVERT(N); break;
   case ISD::CONCAT_VECTORS:     Res = WidenVecOp_CONCAT_VECTORS(N); break;
+  case ISD::EXTRACT_SUBVECTOR:  Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
   case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::STORE:              Res = WidenVecOp_STORE(N); break;
 
@@ -1893,6 +1894,12 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
 }
 
+SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, N->getDebugLoc(),
+                     N->getValueType(0), InOp, N->getOperand(1));
+}
+
 SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
   SDValue InOp = GetWidenedVector(N->getOperand(0));
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 542bf64..37736c0 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -30,6 +30,7 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -4600,7 +4601,7 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
       N->InitOperands(new SDUse[NumOps], Ops, NumOps);
       N->OperandsNeedDelete = true;
     } else
-      MN->InitOperands(MN->OperandList, Ops, NumOps);
+      N->InitOperands(N->OperandList, Ops, NumOps);
   }
 
   // Delete any nodes that are still dead after adding the uses for the
@@ -5404,14 +5405,16 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::EH_RETURN: return "EH_RETURN";
   case ISD::ConstantPool:  return "ConstantPool";
   case ISD::ExternalSymbol: return "ExternalSymbol";
-  case ISD::INTRINSIC_WO_CHAIN: {
-    unsigned IID = cast<ConstantSDNode>(getOperand(0))->getZExtValue();
-    return Intrinsic::getName((Intrinsic::ID)IID);
-  }
+  case ISD::INTRINSIC_WO_CHAIN:
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN: {
-    unsigned IID = cast<ConstantSDNode>(getOperand(1))->getZExtValue();
-    return Intrinsic::getName((Intrinsic::ID)IID);
+    unsigned OpNo = getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 0 : 1;
+    unsigned IID = cast<ConstantSDNode>(getOperand(OpNo))->getZExtValue();
+    if (IID < Intrinsic::num_intrinsics)
+      return Intrinsic::getName((Intrinsic::ID)IID);
+    else if (const TargetIntrinsicInfo *TII = G->getTarget().getIntrinsicInfo())
+      return TII->getName(IID);
+    llvm_unreachable("Invalid intrinsic ID");
   }
 
   case ISD::BUILD_VECTOR:   return "BUILD_VECTOR";
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp
index 9017e43..adcc532 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp
@@ -5485,48 +5485,6 @@ void SelectionDAGLowering::visitInlineAsm(CallSite CS) {
   DAG.setRoot(Chain);
 }
 
-
-void SelectionDAGLowering::visitMalloc(MallocInst &I) {
-  SDValue Src = getValue(I.getOperand(0));
-
-  // Scale up by the type size in the original i32 type width.  Various
-  // mid-level optimizers may make assumptions about demanded bits etc from the
-  // i32-ness of the optimizer: we do not want to promote to i64 and then
-  // multiply on 64-bit targets.
-  // FIXME: Malloc inst should go away: PR715.
-  uint64_t ElementSize = TD->getTypeAllocSize(I.getType()->getElementType());
-  if (ElementSize != 1) {
-    // Src is always 32-bits, make sure the constant fits.
-    assert(Src.getValueType() == MVT::i32);
-    ElementSize = (uint32_t)ElementSize;
-    Src = DAG.getNode(ISD::MUL, getCurDebugLoc(), Src.getValueType(),
-                      Src, DAG.getConstant(ElementSize, Src.getValueType()));
-  }
-  
-  EVT IntPtr = TLI.getPointerTy();
-
-  Src = DAG.getZExtOrTrunc(Src, getCurDebugLoc(), IntPtr);
-
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
-  Entry.Node = Src;
-  Entry.Ty = TLI.getTargetData()->getIntPtrType(*DAG.getContext());
-  Args.push_back(Entry);
-
-  bool isTailCall = PerformTailCallOpt &&
-                    isInTailCallPosition(&I, Attribute::None, TLI);
-  std::pair<SDValue,SDValue> Result =
-    TLI.LowerCallTo(getRoot(), I.getType(), false, false, false, false,
-                    0, CallingConv::C, isTailCall,
-                    /*isReturnValueUsed=*/true,
-                    DAG.getExternalSymbol("malloc", IntPtr),
-                    Args, DAG, getCurDebugLoc());
-  if (Result.first.getNode())
-    setValue(&I, Result.first);  // Pointers always fit in registers
-  if (Result.second.getNode())
-    DAG.setRoot(Result.second);
-}
-
 void SelectionDAGLowering::visitFree(FreeInst &I) {
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuild.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.h
index 06acc8a..722b1d8 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuild.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.h
@@ -60,7 +60,6 @@ class MachineFunction;
 class MachineInstr;
 class MachineModuleInfo;
 class MachineRegisterInfo;
-class MallocInst;
 class PHINode;
 class PtrToIntInst;
 class ReturnInst;
@@ -529,7 +528,6 @@ private:
   void visitGetElementPtr(User &I);
   void visitSelect(User &I);
 
-  void visitMalloc(MallocInst &I);
   void visitFree(FreeInst &I);
   void visitAlloca(AllocaInst &I);
   void visitLoad(LoadInst &I);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index ae98da5..72e7f58 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -226,7 +226,7 @@ static void EmitLiveInCopy(MachineBasicBlock *MBB,
   assert(Emitted && "Unable to issue a live-in copy instruction!\n");
   (void) Emitted;
 
-CopyRegMap.insert(std::make_pair(prior(Pos), VirtReg));
+  CopyRegMap.insert(std::make_pair(prior(Pos), VirtReg));
   if (Coalesced) {
     if (&*InsertPos == UseMI) ++InsertPos;
     MBB->erase(UseMI);
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index 350bc6e..0204969 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -111,11 +111,16 @@ bool StackProtector::RequiresStackProtector() const {
           // protectors.
           return true;
 
-        if (const ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType()))
+        if (const ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType())) {
+          // We apparently only care about character arrays.
+          if (AT->getElementType() != Type::getInt8Ty(AT->getContext()))
+            continue;
+
           // If an array has more than SSPBufferSize bytes of allocated space,
           // then we emit stack protectors.
           if (SSPBufferSize <= TD->getTypeAllocSize(AT))
             return true;
+        }
       }
   }
 
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index c78f35b..cac098b 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -118,7 +118,7 @@ int VirtRegMap::assignVirt2StackSlot(unsigned virtReg) {
          "attempt to assign stack slot to already spilled register");
   const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(virtReg);
   int SS = MF->getFrameInfo()->CreateStackObject(RC->getSize(),
-                                                RC->getAlignment());
+                                              RC->getAlignment(), /*isSS*/true);
   if (LowSpillSlot == NO_STACK_SLOT)
     LowSpillSlot = SS;
   if (HighSpillSlot == NO_STACK_SLOT || SS > HighSpillSlot)
@@ -162,7 +162,7 @@ int VirtRegMap::getEmergencySpillSlot(const TargetRegisterClass *RC) {
   if (I != EmergencySpillSlots.end())
     return I->second;
   int SS = MF->getFrameInfo()->CreateStackObject(RC->getSize(),
-                                                RC->getAlignment());
+                                              RC->getAlignment(), /*isSS*/true);
   if (LowSpillSlot == NO_STACK_SLOT)
     LowSpillSlot = SS;
   if (HighSpillSlot == NO_STACK_SLOT || SS > HighSpillSlot)
diff --git a/lib/CompilerDriver/Main.cpp b/lib/CompilerDriver/Main.cpp
index 3e1fc9f..c581809 100644
--- a/lib/CompilerDriver/Main.cpp
+++ b/lib/CompilerDriver/Main.cpp
@@ -95,8 +95,7 @@ int Main(int argc, char** argv) {
       (argc, argv, "LLVM Compiler Driver (Work In Progress)", true);
 
     PluginLoader Plugins;
-    Plugins.PopulateLanguageMap(langMap);
-    Plugins.PopulateCompilationGraph(graph);
+    Plugins.RunInitialization(langMap, graph);
 
     if (CheckGraph) {
       int ret = graph.Check();
diff --git a/lib/CompilerDriver/Plugin.cpp b/lib/CompilerDriver/Plugin.cpp
index 7310d12..0fdfef4 100644
--- a/lib/CompilerDriver/Plugin.cpp
+++ b/lib/CompilerDriver/Plugin.cpp
@@ -62,18 +62,17 @@ namespace llvmc {
     pluginListInitialized = false;
   }
 
-  void PluginLoader::PopulateLanguageMap(LanguageMap& langMap) {
+  void PluginLoader::RunInitialization(LanguageMap& langMap,
+                                       CompilationGraph& graph) const
+  {
     llvm::sys::SmartScopedLock<true> Lock(*PluginMutex);
     for (PluginList::iterator B = Plugins.begin(), E = Plugins.end();
-         B != E; ++B)
-      (*B)->PopulateLanguageMap(langMap);
-  }
-
-  void PluginLoader::PopulateCompilationGraph(CompilationGraph& graph) {
-    llvm::sys::SmartScopedLock<true> Lock(*PluginMutex);
-    for (PluginList::iterator B = Plugins.begin(), E = Plugins.end();
-         B != E; ++B)
-      (*B)->PopulateCompilationGraph(graph);
+         B != E; ++B) {
+      const BasePlugin* BP = *B;
+      BP->PreprocessOptions();
+      BP->PopulateLanguageMap(langMap);
+      BP->PopulateCompilationGraph(graph);
+    }
   }
 
 }
diff --git a/lib/ExecutionEngine/JIT/CMakeLists.txt b/lib/ExecutionEngine/JIT/CMakeLists.txt
index 41b3b4e..42020d6 100644
--- a/lib/ExecutionEngine/JIT/CMakeLists.txt
+++ b/lib/ExecutionEngine/JIT/CMakeLists.txt
@@ -8,7 +8,6 @@ add_llvm_library(LLVMJIT
   JITDwarfEmitter.cpp
   JITEmitter.cpp
   JITMemoryManager.cpp
-  MacOSJITEventListener.cpp
   OProfileJITEventListener.cpp
   TargetSelect.cpp
   )
diff --git a/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp b/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp
index fa64010..49faf64 100644
--- a/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp
+++ b/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MutexGuard.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/System/Mutex.h"
diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp
index eacd9f9..073d6fb 100644
--- a/lib/ExecutionEngine/JIT/JITEmitter.cpp
+++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp
@@ -42,6 +42,7 @@
 #include "llvm/System/Disassembler.h"
 #include "llvm/System/Memory.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -63,17 +64,20 @@ static JIT *TheJIT = 0;
 namespace {
   class JITResolverState {
   public:
-    typedef std::map<AssertingVH<Function>, void*> FunctionToStubMapTy;
-    typedef std::map<void*, AssertingVH<Function> > StubToFunctionMapTy;
+    typedef DenseMap<AssertingVH<Function>, void*> FunctionToStubMapTy;
+    typedef std::map<void*, AssertingVH<Function> > CallSiteToFunctionMapTy;
+    typedef DenseMap<AssertingVH<Function>, SmallPtrSet<void*, 1> >
+            FunctionToCallSitesMapTy;
     typedef std::map<AssertingVH<GlobalValue>, void*> GlobalToIndirectSymMapTy;
   private:
     /// FunctionToStubMap - Keep track of the stub created for a particular
     /// function so that we can reuse them if necessary.
     FunctionToStubMapTy FunctionToStubMap;
 
-    /// StubToFunctionMap - Keep track of the function that each stub
-    /// corresponds to.
-    StubToFunctionMapTy StubToFunctionMap;
+    /// CallSiteToFunctionMap - Keep track of the function that each lazy call
+    /// site corresponds to, and vice versa.
+    CallSiteToFunctionMapTy CallSiteToFunctionMap;
+    FunctionToCallSitesMapTy FunctionToCallSitesMap;
 
     /// GlobalToIndirectSymMap - Keep track of the indirect symbol created for a
     /// particular GlobalVariable so that we can reuse them if necessary.
@@ -85,14 +89,78 @@ namespace {
       return FunctionToStubMap;
     }
 
-    StubToFunctionMapTy& getStubToFunctionMap(const MutexGuard& locked) {
+    GlobalToIndirectSymMapTy& getGlobalToIndirectSymMap(const MutexGuard& locked) {
       assert(locked.holds(TheJIT->lock));
-      return StubToFunctionMap;
+      return GlobalToIndirectSymMap;
     }
 
-    GlobalToIndirectSymMapTy& getGlobalToIndirectSymMap(const MutexGuard& locked) {
+    pair<void *, Function *> LookupFunctionFromCallSite(
+        const MutexGuard &locked, void *CallSite) const {
       assert(locked.holds(TheJIT->lock));
-      return GlobalToIndirectSymMap;
+
+      // The address given to us for the stub may not be exactly right, it might be
+      // a little bit after the stub.  As such, use upper_bound to find it.
+      CallSiteToFunctionMapTy::const_iterator I =
+        CallSiteToFunctionMap.upper_bound(CallSite);
+      assert(I != CallSiteToFunctionMap.begin() &&
+             "This is not a known call site!");
+      --I;
+      return *I;
+    }
+
+    void AddCallSite(const MutexGuard &locked, void *CallSite, Function *F) {
+      assert(locked.holds(TheJIT->lock));
+
+      assert(CallSiteToFunctionMap.insert(std::make_pair(CallSite, F)).second &&
+             "Pair was already in CallSiteToFunctionMap");
+      FunctionToCallSitesMap[F].insert(CallSite);
+    }
+
+    // Returns the Function of the stub if a stub was erased, or NULL if there
+    // was no stub.  This function uses the call-site->function map to find a
+    // relevant function, but asserts that only stubs and not other call sites
+    // will be passed in.
+    Function *EraseStub(const MutexGuard &locked, void *Stub) {
+      CallSiteToFunctionMapTy::iterator C2F_I =
+        CallSiteToFunctionMap.find(Stub);
+      if (C2F_I == CallSiteToFunctionMap.end()) {
+        // Not a stub.
+        return NULL;
+      }
+
+      Function *const F = C2F_I->second;
+#ifndef NDEBUG
+      void *RealStub = FunctionToStubMap.lookup(F);
+      assert(RealStub == Stub &&
+             "Call-site that wasn't a stub pass in to EraseStub");
+#endif
+      FunctionToStubMap.erase(F);
+      CallSiteToFunctionMap.erase(C2F_I);
+
+      // Remove the stub from the function->call-sites map, and remove the whole
+      // entry from the map if that was the last call site.
+      FunctionToCallSitesMapTy::iterator F2C_I = FunctionToCallSitesMap.find(F);
+      assert(F2C_I != FunctionToCallSitesMap.end() &&
+             "FunctionToCallSitesMap broken");
+      assert(F2C_I->second.erase(Stub) &&
+             "FunctionToCallSitesMap broken");
+      if (F2C_I->second.empty())
+        FunctionToCallSitesMap.erase(F2C_I);
+
+      return F;
+    }
+
+    void EraseAllCallSites(const MutexGuard &locked, Function *F) {
+      assert(locked.holds(TheJIT->lock));
+      FunctionToCallSitesMapTy::iterator F2C = FunctionToCallSitesMap.find(F);
+      if (F2C == FunctionToCallSitesMap.end())
+        return;
+      for (SmallPtrSet<void*, 1>::const_iterator I = F2C->second.begin(),
+             E = F2C->second.end(); I != E; ++I) {
+        assert(CallSiteToFunctionMap.erase(*I) == 1 &&
+               "Missing call site->function mapping");
+      }
+      FunctionToCallSitesMap.erase(F2C);
     }
   };
 
@@ -100,7 +168,7 @@ namespace {
   /// have not yet been compiled.
   class JITResolver {
     typedef JITResolverState::FunctionToStubMapTy FunctionToStubMapTy;
-    typedef JITResolverState::StubToFunctionMapTy StubToFunctionMapTy;
+    typedef JITResolverState::CallSiteToFunctionMapTy CallSiteToFunctionMapTy;
     typedef JITResolverState::GlobalToIndirectSymMapTy GlobalToIndirectSymMapTy;
 
     /// LazyResolverFn - The target lazy resolver function that we actually
@@ -154,7 +222,7 @@ namespace {
     void *AddCallbackAtLocation(Function *F, void *Location) {
       MutexGuard locked(TheJIT->lock);
       /// Get the target-specific JIT resolver function.
-      state.getStubToFunctionMap(locked)[Location] = F;
+      state.AddCallSite(locked, Location, F);
       return (void*)(intptr_t)LazyResolverFn;
     }
     
@@ -183,8 +251,7 @@ void *JITResolver::getFunctionStubIfAvailable(Function *F) {
   MutexGuard locked(TheJIT->lock);
 
   // If we already have a stub for this function, recycle it.
-  void *&Stub = state.getFunctionToStubMap(locked)[F];
-  return Stub;
+  return state.getFunctionToStubMap(locked).lookup(F);
 }
 
 /// getFunctionStub - This returns a pointer to a function stub, creating
@@ -230,7 +297,7 @@ void *JITResolver::getFunctionStub(Function *F) {
 
   // Finally, keep track of the stub-to-Function mapping so that the
   // JITCompilerFn knows which function to compile!
-  state.getStubToFunctionMap(locked)[Stub] = F;
+  state.AddCallSite(locked, Stub, F);
 
   // If we are JIT'ing non-lazily but need to call a function that does not
   // exist yet, add it to the JIT's work list so that we can fill in the stub
@@ -291,10 +358,11 @@ void JITResolver::getRelocatableGVs(SmallVectorImpl<GlobalValue*> &GVs,
                                     SmallVectorImpl<void*> &Ptrs) {
   MutexGuard locked(TheJIT->lock);
   
-  FunctionToStubMapTy &FM = state.getFunctionToStubMap(locked);
+  const FunctionToStubMapTy &FM = state.getFunctionToStubMap(locked);
   GlobalToIndirectSymMapTy &GM = state.getGlobalToIndirectSymMap(locked);
   
-  for (FunctionToStubMapTy::iterator i = FM.begin(), e = FM.end(); i != e; ++i){
+  for (FunctionToStubMapTy::const_iterator i = FM.begin(), e = FM.end();
+       i != e; ++i){
     Function *F = i->first;
     if (F->isDeclaration() && F->hasExternalLinkage()) {
       GVs.push_back(i->first);
@@ -310,20 +378,15 @@ void JITResolver::getRelocatableGVs(SmallVectorImpl<GlobalValue*> &GVs,
 
 GlobalValue *JITResolver::invalidateStub(void *Stub) {
   MutexGuard locked(TheJIT->lock);
-  
-  FunctionToStubMapTy &FM = state.getFunctionToStubMap(locked);
-  StubToFunctionMapTy &SM = state.getStubToFunctionMap(locked);
+
   GlobalToIndirectSymMapTy &GM = state.getGlobalToIndirectSymMap(locked);
-  
+
   // Look up the cheap way first, to see if it's a function stub we are
   // invalidating.  If so, remove it from both the forward and reverse maps.
-  if (SM.find(Stub) != SM.end()) {
-    Function *F = SM[Stub];
-    SM.erase(Stub);
-    FM.erase(F);
+  if (Function *F = state.EraseStub(locked, Stub)) {
     return F;
   }
-  
+
   // Otherwise, it might be an indirect symbol stub.  Find it and remove it.
   for (GlobalToIndirectSymMapTy::iterator i = GM.begin(), e = GM.end();
        i != e; ++i) {
@@ -361,14 +424,12 @@ void *JITResolver::JITCompilerFn(void *Stub) {
     // JIT lock to be unlocked.
     MutexGuard locked(TheJIT->lock);
 
-    // The address given to us for the stub may not be exactly right, it might be
-    // a little bit after the stub.  As such, use upper_bound to find it.
-    StubToFunctionMapTy::iterator I =
-      JR.state.getStubToFunctionMap(locked).upper_bound(Stub);
-    assert(I != JR.state.getStubToFunctionMap(locked).begin() &&
-           "This is not a known stub!");
-    F = (--I)->second;
-    ActualPtr = I->first;
+    // The address given to us for the stub may not be exactly right, it might
+    // be a little bit after the stub.  As such, use upper_bound to find it.
+    pair<void*, Function*> I =
+      JR.state.LookupFunctionFromCallSite(locked, Stub);
+    F = I.second;
+    ActualPtr = I.first;
   }
 
   // If we have already code generated the function, just return the address.
@@ -383,25 +444,21 @@ void *JITResolver::JITCompilerFn(void *Stub) {
                         + F->getName() + "' when lazy compiles are disabled!");
     }
   
-    // We might like to remove the stub from the StubToFunction map.
-    // We can't do that! Multiple threads could be stuck, waiting to acquire the
-    // lock above. As soon as the 1st function finishes compiling the function,
-    // the next one will be released, and needs to be able to find the function
-    // it needs to call.
-    //JR.state.getStubToFunctionMap(locked).erase(I);
-
     DEBUG(errs() << "JIT: Lazily resolving function '" << F->getName()
           << "' In stub ptr = " << Stub << " actual ptr = "
           << ActualPtr << "\n");
 
     Result = TheJIT->getPointerToFunction(F);
   }
-  
-  // Reacquire the lock to erase the stub in the map.
+
+  // Reacquire the lock to update the GOT map.
   MutexGuard locked(TheJIT->lock);
 
-  // We don't need to reuse this stub in the future, as F is now compiled.
-  JR.state.getFunctionToStubMap(locked).erase(F);
+  // We might like to remove the call site from the CallSiteToFunction map, but
+  // we can't do that! Multiple threads could be stuck, waiting to acquire the
+  // lock above. As soon as the 1st function finishes compiling the function,
+  // the next one will be released, and needs to be able to find the function it
+  // needs to call.
 
   // FIXME: We could rewrite all references to this stub if we knew them.
 
@@ -492,6 +549,13 @@ namespace {
     /// finishFunction.
     JITEvent_EmittedFunctionDetails EmissionDetails;
 
+    struct EmittedCode {
+      void *FunctionBody;
+      void *ExceptionTable;
+      EmittedCode() : FunctionBody(0), ExceptionTable(0) {}
+    };
+    DenseMap<const Function *, EmittedCode> EmittedFunctions;
+
     // CurFnStubUses - For a given Function, a vector of stubs that it
     // references.  This facilitates the JIT detecting that a stub is no
     // longer used, so that it may be deallocated.
@@ -955,7 +1019,8 @@ void JITEmitter::startFunction(MachineFunction &F) {
   BufferBegin = CurBufferPtr = MemMgr->startFunctionBody(F.getFunction(),
                                                          ActualSize);
   BufferEnd = BufferBegin+ActualSize;
-  
+  EmittedFunctions[F.getFunction()].FunctionBody = BufferBegin;
+
   // Ensure the constant pool/jump table info is at least 4-byte aligned.
   emitAlignment(16);
 
@@ -1145,6 +1210,7 @@ bool JITEmitter::finishFunction(MachineFunction &F) {
     BufferBegin = CurBufferPtr = MemMgr->startExceptionTable(F.getFunction(),
                                                              ActualSize);
     BufferEnd = BufferBegin+ActualSize;
+    EmittedFunctions[F.getFunction()].ExceptionTable = BufferBegin;
     uint8_t *EhStart;
     uint8_t *FrameRegister = DE->EmitDwarfTable(F, *this, FnStart, FnEnd,
                                                 EhStart);
@@ -1188,7 +1254,13 @@ void JITEmitter::retryWithMoreMemory(MachineFunction &F) {
 /// deallocateMemForFunction - Deallocate all memory for the specified
 /// function body.  Also drop any references the function has to stubs.
 void JITEmitter::deallocateMemForFunction(const Function *F) {
-  MemMgr->deallocateMemForFunction(F);
+  DenseMap<const Function *, EmittedCode>::iterator Emitted =
+    EmittedFunctions.find(F);
+  if (Emitted != EmittedFunctions.end()) {
+    MemMgr->deallocateFunctionBody(Emitted->second.FunctionBody);
+    MemMgr->deallocateExceptionTable(Emitted->second.ExceptionTable);
+    EmittedFunctions.erase(Emitted);
+  }
 
   // TODO: Do we need to unregister exception handling information from libgcc
   // here?
diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
index 474843f..3796624 100644
--- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
+++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
@@ -297,9 +297,6 @@ namespace {
 
     uint8_t *GOTBase;     // Target Specific reserved memory
     void *DlsymTable;     // Stub external symbol information
-
-    std::map<const Function*, MemoryRangeHeader*> FunctionBlocks;
-    std::map<const Function*, MemoryRangeHeader*> TableBlocks;
   public:
     DefaultJITMemoryManager();
     ~DefaultJITMemoryManager();
@@ -414,7 +411,6 @@ namespace {
              "Mismatched function start/end!");
 
       uintptr_t BlockSize = FunctionEnd - (uint8_t *)CurBlock;
-      FunctionBlocks[F] = CurBlock;
 
       // Release the memory at the end of this block that isn't needed.
       FreeMemoryList =CurBlock->TrimAllocationToSize(FreeMemoryList, BlockSize);
@@ -464,7 +460,6 @@ namespace {
              "Mismatched table start/end!");
       
       uintptr_t BlockSize = TableEnd - (uint8_t *)CurBlock;
-      TableBlocks[F] = CurBlock;
 
       // Release the memory at the end of this block that isn't needed.
       FreeMemoryList =CurBlock->TrimAllocationToSize(FreeMemoryList, BlockSize);
@@ -478,15 +473,9 @@ namespace {
       return DlsymTable;
     }
     
-    /// deallocateMemForFunction - Deallocate all memory for the specified
-    /// function body.
-    void deallocateMemForFunction(const Function *F) {
-      std::map<const Function*, MemoryRangeHeader*>::iterator
-        I = FunctionBlocks.find(F);
-      if (I == FunctionBlocks.end()) return;
-      
+    void deallocateBlock(void *Block) {
       // Find the block that is allocated for this function.
-      MemoryRangeHeader *MemRange = I->second;
+      MemoryRangeHeader *MemRange = static_cast<MemoryRangeHeader*>(Block) - 1;
       assert(MemRange->ThisAllocated && "Block isn't allocated!");
 
       // Fill the buffer with garbage!
@@ -496,27 +485,18 @@ namespace {
 
       // Free the memory.
       FreeMemoryList = MemRange->FreeBlock(FreeMemoryList);
-      
-      // Finally, remove this entry from FunctionBlocks.
-      FunctionBlocks.erase(I);
-      
-      I = TableBlocks.find(F);
-      if (I == TableBlocks.end()) return;
-      
-      // Find the block that is allocated for this function.
-      MemRange = I->second;
-      assert(MemRange->ThisAllocated && "Block isn't allocated!");
+    }
 
-      // Fill the buffer with garbage!
-      if (PoisonMemory) {
-        memset(MemRange+1, 0xCD, MemRange->BlockSize-sizeof(*MemRange));
-      }
+    /// deallocateFunctionBody - Deallocate all memory for the specified
+    /// function body.
+    void deallocateFunctionBody(void *Body) {
+      if (Body) deallocateBlock(Body);
+    }
 
-      // Free the memory.
-      FreeMemoryList = MemRange->FreeBlock(FreeMemoryList);
-      
-      // Finally, remove this entry from TableBlocks.
-      TableBlocks.erase(I);
+    /// deallocateExceptionTable - Deallocate memory for the specified
+    /// exception table.
+    void deallocateExceptionTable(void *ET) {
+      if (ET) deallocateBlock(ET);
     }
 
     /// setMemoryWritable - When code generation is in progress,
diff --git a/lib/ExecutionEngine/JIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/JIT/OProfileJITEventListener.cpp
index 69398be..00c4af7 100644
--- a/lib/ExecutionEngine/JIT/OProfileJITEventListener.cpp
+++ b/lib/ExecutionEngine/JIT/OProfileJITEventListener.cpp
@@ -69,16 +69,16 @@ OProfileJITEventListener::~OProfileJITEventListener() {
 }
 
 class FilenameCache {
-  // Holds the filename of each CompileUnit, so that we can pass the
+  // Holds the filename of each Scope, so that we can pass the
   // pointer into oprofile.  These char*s are freed in the destructor.
   DenseMap<MDNode*, char*> Filenames;
 
  public:
-  const char *getFilename(MDNode *CompileUnit) {
-    char *&Filename = Filenames[CompileUnit];
+  const char *getFilename(MDNode *Scope) {
+    char *&Filename = Filenames[Scope];
     if (Filename == NULL) {
-      DICompileUnit CU(CompileUnit);
-      Filename = strdup(CU.getFilename());
+      DIScope S(Scope);
+      Filename = strdup(S.getFilename());
     }
     return Filename;
   }
@@ -97,7 +97,7 @@ static debug_line_info LineStartToOProfileFormat(
   Result.vma = Address;
   const DebugLocTuple &tuple = MF.getDebugLocTuple(Loc);
   Result.lineno = tuple.Line;
-  Result.filename = Filenames.getFilename(tuple.CompileUnit);
+  Result.filename = Filenames.getFilename(tuple.Scope);
   DEBUG(errs() << "Mapping " << reinterpret_cast<void*>(Result.vma) << " to "
                << Result.filename << ":" << Result.lineno << "\n");
   return Result;
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index e56e968..e939f37 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -123,23 +123,27 @@ void MCAsmStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
   OS << " = ";
   Value->print(OS, &MAI);
   OS << '\n';
+
+  // FIXME: Lift context changes into super class.
+  // FIXME: Set associated section.
+  Symbol->setValue(Value);
 }
 
-void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol, 
+void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
                                         SymbolAttr Attribute) {
   switch (Attribute) {
-  case Global: OS << ".globl"; break;
-  case Hidden: OS << ".hidden"; break;
+  case Global:         OS << ".globl";           break;
+  case Hidden:         OS << ".hidden";          break;
   case IndirectSymbol: OS << ".indirect_symbol"; break;
-  case Internal: OS << ".internal"; break;
-  case LazyReference: OS << ".lazy_reference"; break;
-  case NoDeadStrip: OS << ".no_dead_strip"; break;
-  case PrivateExtern: OS << ".private_extern"; break;
-  case Protected: OS << ".protected"; break;
-  case Reference: OS << ".reference"; break;
-  case Weak: OS << ".weak"; break;
+  case Internal:       OS << ".internal";        break;
+  case LazyReference:  OS << ".lazy_reference";  break;
+  case NoDeadStrip:    OS << ".no_dead_strip";   break;
+  case PrivateExtern:  OS << ".private_extern";  break;
+  case Protected:      OS << ".protected";       break;
+  case Reference:      OS << ".reference";       break;
+  case Weak:           OS << ".weak";            break;
   case WeakDefinition: OS << ".weak_definition"; break;
-  case WeakReference: OS << ".weak_reference"; break;
+  case WeakReference:  OS << ".weak_reference";  break;
   }
 
   OS << ' ';
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 0afdf98..4f39f1e 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -9,7 +9,10 @@
 
 #define DEBUG_TYPE "assembler"
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Target/TargetMachOWriterInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallString.h"
@@ -48,7 +51,7 @@ class MachObjectWriter {
     Header_Magic32 = 0xFEEDFACE,
     Header_Magic64 = 0xFEEDFACF
   };
-  
+
   static const unsigned Header32Size = 28;
   static const unsigned Header64Size = 32;
   static const unsigned SegmentLoadCommand32Size = 56;
@@ -127,7 +130,7 @@ class MachObjectWriter {
   bool IsLSB;
 
 public:
-  MachObjectWriter(raw_ostream &_OS, bool _IsLSB = true) 
+  MachObjectWriter(raw_ostream &_OS, bool _IsLSB = true)
     : OS(_OS), IsLSB(_IsLSB) {
   }
 
@@ -170,10 +173,10 @@ public:
 
   void WriteZeros(unsigned N) {
     const char Zeros[16] = { 0 };
-    
+
     for (unsigned i = 0, e = N / 16; i != e; ++i)
       OS << StringRef(Zeros, 16);
-    
+
     OS << StringRef(Zeros, N % 16);
   }
 
@@ -184,7 +187,7 @@ public:
   }
 
   /// @}
-  
+
   void WriteHeader32(unsigned NumLoadCommands, unsigned LoadCommandsSize,
                      bool SubsectionsViaSymbols) {
     uint32_t Flags = 0;
@@ -384,7 +387,7 @@ public:
     Write32(MSD.StringIndex);
     Write8(Type);
     Write8(MSD.SectionIndex);
-    
+
     // The Mach-O streamer uses the lowest 16-bits of the flags for the 'desc'
     // value.
     Write16(Flags);
@@ -397,6 +400,7 @@ public:
   };
   void ComputeScatteredRelocationInfo(MCAssembler &Asm,
                                       MCSectionData::Fixup &Fixup,
+                                      const MCValue &Target,
                              DenseMap<const MCSymbol*,MCSymbolData*> &SymbolMap,
                                      std::vector<MachRelocationEntry> &Relocs) {
     uint32_t Address = Fixup.Fragment->getOffset() + Fixup.Offset;
@@ -404,13 +408,12 @@ public:
     unsigned Type = RIT_Vanilla;
 
     // See <reloc.h>.
-
-    const MCSymbol *A = Fixup.Value.getSymA();
+    const MCSymbol *A = Target.getSymA();
     MCSymbolData *SD = SymbolMap.lookup(A);
     uint32_t Value = SD->getFragment()->getAddress() + SD->getOffset();
     uint32_t Value2 = 0;
 
-    if (const MCSymbol *B = Fixup.Value.getSymB()) {
+    if (const MCSymbol *B = Target.getSymB()) {
       Type = RIT_LocalDifference;
 
       MCSymbolData *SD = SymbolMap.lookup(B);
@@ -421,7 +424,7 @@ public:
     assert((1U << Log2Size) == Fixup.Size && "Invalid fixup size!");
 
     // The value which goes in the fixup is current value of the expression.
-    Fixup.FixedValue = Value - Value2 + Fixup.Value.getConstant();
+    Fixup.FixedValue = Value - Value2 + Target.getConstant();
 
     MachRelocationEntry MRE;
     MRE.Word0 = ((Address   <<  0) |
@@ -450,14 +453,18 @@ public:
                              MCSectionData::Fixup &Fixup,
                              DenseMap<const MCSymbol*,MCSymbolData*> &SymbolMap,
                              std::vector<MachRelocationEntry> &Relocs) {
-    // If this is a local symbol plus an offset or a difference, then we need a
+    MCValue Target;
+    if (!Fixup.Value->EvaluateAsRelocatable(Target))
+      llvm_report_error("expected relocatable expression");
+
+    // If this is a difference or a local symbol plus an offset, then we need a
     // scattered relocation entry.
-    if (Fixup.Value.getSymB()) // a - b
-      return ComputeScatteredRelocationInfo(Asm, Fixup, SymbolMap, Relocs);
-    if (Fixup.Value.getSymA() && Fixup.Value.getConstant())
-      if (!Fixup.Value.getSymA()->isUndefined())
-        return ComputeScatteredRelocationInfo(Asm, Fixup, SymbolMap, Relocs);
-        
+    if (Target.getSymB() ||
+        (Target.getSymA() && !Target.getSymA()->isUndefined() &&
+         Target.getConstant()))
+      return ComputeScatteredRelocationInfo(Asm, Fixup, Target,
+                                            SymbolMap, Relocs);
+
     // See <reloc.h>.
     uint32_t Address = Fixup.Fragment->getOffset() + Fixup.Offset;
     uint32_t Value = 0;
@@ -466,15 +473,15 @@ public:
     unsigned IsExtern = 0;
     unsigned Type = 0;
 
-    if (Fixup.Value.isAbsolute()) { // constant
+    if (Target.isAbsolute()) { // constant
       // SymbolNum of 0 indicates the absolute section.
       Type = RIT_Vanilla;
       Value = 0;
       llvm_unreachable("FIXME: Not yet implemented!");
     } else {
-      const MCSymbol *Symbol = Fixup.Value.getSymA();
+      const MCSymbol *Symbol = Target.getSymA();
       MCSymbolData *SD = SymbolMap.lookup(Symbol);
-      
+
       if (Symbol->isUndefined()) {
         IsExtern = 1;
         Index = SD->getIndex();
@@ -495,7 +502,7 @@ public:
     }
 
     // The value which goes in the fixup is current value of the expression.
-    Fixup.FixedValue = Value + Fixup.Value.getConstant();
+    Fixup.FixedValue = Value + Target.getConstant();
 
     unsigned Log2Size = Log2_32(Fixup.Size);
     assert((1U << Log2Size) == Fixup.Size && "Invalid fixup size!");
@@ -510,7 +517,7 @@ public:
                  (Type      << 28));
     Relocs.push_back(MRE);
   }
-  
+
   void BindIndirectSymbols(MCAssembler &Asm,
                            DenseMap<const MCSymbol*,MCSymbolData*> &SymbolMap) {
     // This is the point where 'as' creates actual symbols for indirect symbols
@@ -703,7 +710,7 @@ public:
     if (NumSymbols)
       ComputeSymbolTable(Asm, StringTable, LocalSymbolData, ExternalSymbolData,
                          UndefinedSymbolData);
-  
+
     // The section data starts after the header, the segment load command (and
     // section headers) and the symbol table.
     unsigned NumLoadCommands = 1;
@@ -733,7 +740,7 @@ public:
 
       SectionDataSize = std::max(SectionDataSize,
                                  SD.getAddress() + SD.getSize());
-      SectionDataFileSize = std::max(SectionDataFileSize, 
+      SectionDataFileSize = std::max(SectionDataFileSize,
                                      SD.getAddress() + SD.getFileSize());
     }
 
@@ -748,9 +755,9 @@ public:
                   Asm.getSubsectionsViaSymbols());
     WriteSegmentLoadCommand32(NumSections, VMSize,
                               SectionDataStart, SectionDataSize);
-  
+
     // ... and then the section headers.
-    // 
+    //
     // We also compute the section relocations while we do this. Note that
     // compute relocation info will also update the fixup to have the correct
     // value; this will be overwrite the appropriate data in the fragment when
@@ -774,7 +781,7 @@ public:
       WriteSection32(SD, SectionStart, RelocTableEnd, NumRelocs);
       RelocTableEnd += NumRelocs * RelocationInfoSize;
     }
-    
+
     // Write the symbol table load command, if used.
     if (NumSymbols) {
       unsigned FirstLocalSymbol = 0;
@@ -923,7 +930,7 @@ MCSectionData::LookupFixup(const MCFragment *Fragment, uint64_t Offset) const {
 
   return 0;
 }
-                                                       
+
 /* *** */
 
 MCSymbolData::MCSymbolData() : Symbol(0) {}
@@ -960,7 +967,7 @@ void MCAssembler::LayoutSection(MCSectionData &SD) {
     switch (F.getKind()) {
     case MCFragment::FT_Align: {
       MCAlignFragment &AF = cast<MCAlignFragment>(F);
-      
+
       uint64_t Size = OffsetToAlignment(Address, AF.getAlignment());
       if (Size > AF.getMaxBytesToEmit())
         AF.setFileSize(0);
@@ -978,8 +985,12 @@ void MCAssembler::LayoutSection(MCSectionData &SD) {
 
       F.setFileSize(F.getMaxFileSize());
 
+      MCValue Target;
+      if (!FF.getValue().EvaluateAsRelocatable(Target))
+        llvm_report_error("expected relocatable expression");
+
       // If the fill value is constant, thats it.
-      if (FF.getValue().isAbsolute())
+      if (Target.isAbsolute())
         break;
 
       // Otherwise, add fixups for the values.
@@ -994,19 +1005,23 @@ void MCAssembler::LayoutSection(MCSectionData &SD) {
     case MCFragment::FT_Org: {
       MCOrgFragment &OF = cast<MCOrgFragment>(F);
 
-      if (!OF.getOffset().isAbsolute())
+      MCValue Target;
+      if (!OF.getOffset().EvaluateAsRelocatable(Target))
+        llvm_report_error("expected relocatable expression");
+
+      if (!Target.isAbsolute())
         llvm_unreachable("FIXME: Not yet implemented!");
-      uint64_t OrgOffset = OF.getOffset().getConstant();
+      uint64_t OrgOffset = Target.getConstant();
       uint64_t Offset = Address - SD.getAddress();
 
       // FIXME: We need a way to communicate this error.
       if (OrgOffset < Offset)
-        llvm_report_error("invalid .org offset '" + Twine(OrgOffset) + 
+        llvm_report_error("invalid .org offset '" + Twine(OrgOffset) +
                           "' (at offset '" + Twine(Offset) + "'");
-        
+
       F.setFileSize(OrgOffset - Offset);
       break;
-    }      
+    }
 
     case MCFragment::FT_ZeroFill: {
       MCZeroFillFragment &ZFF = cast<MCZeroFillFragment>(F);
@@ -1038,7 +1053,7 @@ static void WriteFileData(raw_ostream &OS, const MCFragment &F,
                           MachObjectWriter &MOW) {
   uint64_t Start = OS.tell();
   (void) Start;
-    
+
   ++EmittedFragments;
 
   // FIXME: Embed in fragments instead?
@@ -1051,8 +1066,8 @@ static void WriteFileData(raw_ostream &OS, const MCFragment &F,
     // multiple .align directives to enforce the semantics it wants), but is
     // severe enough that we want to report it. How to handle this?
     if (Count * AF.getValueSize() != AF.getFileSize())
-      llvm_report_error("undefined .align directive, value size '" + 
-                        Twine(AF.getValueSize()) + 
+      llvm_report_error("undefined .align directive, value size '" +
+                        Twine(AF.getValueSize()) +
                         "' is not a divisor of padding size '" +
                         Twine(AF.getFileSize()) + "'");
 
@@ -1077,10 +1092,15 @@ static void WriteFileData(raw_ostream &OS, const MCFragment &F,
     MCFillFragment &FF = cast<MCFillFragment>(F);
 
     int64_t Value = 0;
-    if (FF.getValue().isAbsolute())
-      Value = FF.getValue().getConstant();
+
+    MCValue Target;
+    if (!FF.getValue().EvaluateAsRelocatable(Target))
+      llvm_report_error("expected relocatable expression");
+
+    if (Target.isAbsolute())
+      Value = Target.getConstant();
     for (uint64_t i = 0, e = FF.getCount(); i != e; ++i) {
-      if (!FF.getValue().isAbsolute()) {
+      if (!Target.isAbsolute()) {
         // Find the fixup.
         //
         // FIXME: Find a better way to write in the fixes.
@@ -1101,7 +1121,7 @@ static void WriteFileData(raw_ostream &OS, const MCFragment &F,
     }
     break;
   }
-    
+
   case MCFragment::FT_Org: {
     MCOrgFragment &OF = cast<MCOrgFragment>(F);
 
@@ -1131,7 +1151,7 @@ static void WriteFileData(raw_ostream &OS, const MCSectionData &SD,
 
   uint64_t Start = OS.tell();
   (void) Start;
-      
+
   for (MCSectionData::const_iterator it = SD.begin(),
          ie = SD.end(); it != ie; ++it)
     WriteFileData(OS, *it, MOW);
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index f36564a..09479c5 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -8,10 +8,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCContext.h"
-
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
 using namespace llvm;
 
 MCContext::MCContext() {
@@ -38,6 +39,13 @@ MCSymbol *MCContext::GetOrCreateSymbol(const StringRef &Name) {
   return Entry = new (*this) MCSymbol(Name, false);
 }
 
+MCSymbol *MCContext::GetOrCreateSymbol(const Twine &Name) {
+  SmallString<128> NameSV;
+  Name.toVector(NameSV);
+  return GetOrCreateSymbol(NameSV.str());
+}
+
+
 MCSymbol *MCContext::CreateTemporarySymbol(const StringRef &Name) {
   // If unnamed, just create a symbol.
   if (Name.empty())
@@ -52,20 +60,3 @@ MCSymbol *MCContext::CreateTemporarySymbol(const StringRef &Name) {
 MCSymbol *MCContext::LookupSymbol(const StringRef &Name) const {
   return Symbols.lookup(Name);
 }
-
-void MCContext::ClearSymbolValue(const MCSymbol *Sym) {
-  SymbolValues.erase(Sym);
-}
-
-void MCContext::SetSymbolValue(const MCSymbol *Sym, const MCValue &Value) {
-  SymbolValues[Sym] = Value;
-}
-
-const MCValue *MCContext::GetSymbolValue(const MCSymbol *Sym) const {
-  DenseMap<const MCSymbol*, MCValue>::iterator it = SymbolValues.find(Sym);
-
-  if (it == SymbolValues.end())
-    return 0;
-
-  return &it->second;
-}
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index 0f3e053..c950ff2 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -141,10 +141,10 @@ const MCSymbolRefExpr *MCSymbolRefExpr::Create(const StringRef &Name,
 
 /* *** */
 
-bool MCExpr::EvaluateAsAbsolute(MCContext &Ctx, int64_t &Res) const {
+bool MCExpr::EvaluateAsAbsolute(int64_t &Res) const {
   MCValue Value;
   
-  if (!EvaluateAsRelocatable(Ctx, Value) || !Value.isAbsolute())
+  if (!EvaluateAsRelocatable(Value) || !Value.isAbsolute())
     return false;
 
   Res = Value.getConstant();
@@ -173,7 +173,7 @@ static bool EvaluateSymbolicAdd(const MCValue &LHS, const MCSymbol *RHS_A,
   return true;
 }
 
-bool MCExpr::EvaluateAsRelocatable(MCContext &Ctx, MCValue &Res) const {
+bool MCExpr::EvaluateAsRelocatable(MCValue &Res) const {
   switch (getKind()) {
   case Constant:
     Res = MCValue::get(cast<MCConstantExpr>(this)->getValue());
@@ -181,10 +181,12 @@ bool MCExpr::EvaluateAsRelocatable(MCContext &Ctx, MCValue &Res) const {
 
   case SymbolRef: {
     const MCSymbol &Sym = cast<MCSymbolRefExpr>(this)->getSymbol();
-    if (const MCValue *Value = Ctx.GetSymbolValue(&Sym))
-      Res = *Value;
-    else
-      Res = MCValue::get(&Sym, 0, 0);
+
+    // Evaluate recursively if this is a variable.
+    if (Sym.isVariable())
+      return Sym.getValue()->EvaluateAsRelocatable(Res);
+
+    Res = MCValue::get(&Sym, 0, 0);
     return true;
   }
 
@@ -192,7 +194,7 @@ bool MCExpr::EvaluateAsRelocatable(MCContext &Ctx, MCValue &Res) const {
     const MCUnaryExpr *AUE = cast<MCUnaryExpr>(this);
     MCValue Value;
 
-    if (!AUE->getSubExpr()->EvaluateAsRelocatable(Ctx, Value))
+    if (!AUE->getSubExpr()->EvaluateAsRelocatable(Value))
       return false;
 
     switch (AUE->getOpcode()) {
@@ -225,8 +227,8 @@ bool MCExpr::EvaluateAsRelocatable(MCContext &Ctx, MCValue &Res) const {
     const MCBinaryExpr *ABE = cast<MCBinaryExpr>(this);
     MCValue LHSValue, RHSValue;
     
-    if (!ABE->getLHS()->EvaluateAsRelocatable(Ctx, LHSValue) ||
-        !ABE->getRHS()->EvaluateAsRelocatable(Ctx, RHSValue))
+    if (!ABE->getLHS()->EvaluateAsRelocatable(LHSValue) ||
+        !ABE->getRHS()->EvaluateAsRelocatable(RHSValue))
       return false;
 
     // We only support a few operations on non-constant expressions, handle
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index e04bd1f..189f072 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -198,7 +198,9 @@ void MCMachOStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
   assert((Symbol->isUndefined() || Symbol->isAbsolute()) &&
          "Cannot define a symbol twice!");
 
-  llvm_unreachable("FIXME: Not yet implemented!");
+  // FIXME: Lift context changes into super class.
+  // FIXME: Set associated section.
+  Symbol->setValue(Value);
 }
 
 void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
@@ -321,12 +323,7 @@ void MCMachOStreamer::EmitBytes(const StringRef &Data) {
 }
 
 void MCMachOStreamer::EmitValue(const MCExpr *Value, unsigned Size) {
-  MCValue RelocValue;
-
-  if (!AddValueSymbols(Value)->EvaluateAsRelocatable(getContext(), RelocValue))
-    return llvm_report_error("expected relocatable expression");
-
-  new MCFillFragment(RelocValue, Size, 1, CurSectionData);
+  new MCFillFragment(*AddValueSymbols(Value), Size, 1, CurSectionData);
 }
 
 void MCMachOStreamer::EmitValueToAlignment(unsigned ByteAlignment,
@@ -344,13 +341,7 @@ void MCMachOStreamer::EmitValueToAlignment(unsigned ByteAlignment,
 
 void MCMachOStreamer::EmitValueToOffset(const MCExpr *Offset,
                                         unsigned char Value) {
-  MCValue RelocOffset;
-
-  if (!AddValueSymbols(Offset)->EvaluateAsRelocatable(getContext(),
-                                                      RelocOffset))
-    return llvm_report_error("expected relocatable expression");
-
-  new MCOrgFragment(RelocOffset, Value, CurSectionData);
+  new MCOrgFragment(*Offset, Value, CurSectionData);
 }
 
 void MCMachOStreamer::EmitInstruction(const MCInst &Inst) {
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index e431d27..7e42e2d 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -48,6 +48,7 @@ namespace llvm {
     unsigned int arithmeticOK;
   };
 
+  const fltSemantics APFloat::IEEEhalf = { 15, -14, 11, true };
   const fltSemantics APFloat::IEEEsingle = { 127, -126, 24, true };
   const fltSemantics APFloat::IEEEdouble = { 1023, -1022, 53, true };
   const fltSemantics APFloat::IEEEquad = { 16383, -16382, 113, true };
@@ -2812,6 +2813,35 @@ APFloat::convertFloatAPFloatToAPInt() const
                     (mysignificand & 0x7fffff)));
 }
 
+APInt
+APFloat::convertHalfAPFloatToAPInt() const
+{
+  assert(semantics == (const llvm::fltSemantics*)&IEEEhalf);
+  assert (partCount()==1);
+
+  uint32_t myexponent, mysignificand;
+
+  if (category==fcNormal) {
+    myexponent = exponent+15; //bias
+    mysignificand = (uint32_t)*significandParts();
+    if (myexponent == 1 && !(mysignificand & 0x400))
+      myexponent = 0;   // denormal
+  } else if (category==fcZero) {
+    myexponent = 0;
+    mysignificand = 0;
+  } else if (category==fcInfinity) {
+    myexponent = 0x1f;
+    mysignificand = 0;
+  } else {
+    assert(category == fcNaN && "Unknown category!");
+    myexponent = 0x1f;
+    mysignificand = (uint32_t)*significandParts();
+  }
+
+  return APInt(16, (((sign&1) << 15) | ((myexponent&0x1f) << 10) |
+                    (mysignificand & 0x3ff)));
+}
+
 // This function creates an APInt that is just a bit map of the floating
 // point constant as it would appear in memory.  It is not a conversion,
 // and treating the result as a normal integer is unlikely to be useful.
@@ -2819,6 +2849,9 @@ APFloat::convertFloatAPFloatToAPInt() const
 APInt
 APFloat::bitcastToAPInt() const
 {
+  if (semantics == (const llvm::fltSemantics*)&IEEEhalf)
+    return convertHalfAPFloatToAPInt();
+
   if (semantics == (const llvm::fltSemantics*)&IEEEsingle)
     return convertFloatAPFloatToAPInt();
 
@@ -3051,6 +3084,39 @@ APFloat::initFromFloatAPInt(const APInt & api)
   }
 }
 
+void
+APFloat::initFromHalfAPInt(const APInt & api)
+{
+  assert(api.getBitWidth()==16);
+  uint32_t i = (uint32_t)*api.getRawData();
+  uint32_t myexponent = (i >> 10) & 0x1f;
+  uint32_t mysignificand = i & 0x3ff;
+
+  initialize(&APFloat::IEEEhalf);
+  assert(partCount()==1);
+
+  sign = i >> 15;
+  if (myexponent==0 && mysignificand==0) {
+    // exponent, significand meaningless
+    category = fcZero;
+  } else if (myexponent==0x1f && mysignificand==0) {
+    // exponent, significand meaningless
+    category = fcInfinity;
+  } else if (myexponent==0x1f && mysignificand!=0) {
+    // sign, exponent, significand meaningless
+    category = fcNaN;
+    *significandParts() = mysignificand;
+  } else {
+    category = fcNormal;
+    exponent = myexponent - 15;  //bias
+    *significandParts() = mysignificand;
+    if (myexponent==0)    // denormal
+      exponent = -14;
+    else
+      *significandParts() |= 0x400; // integer bit
+  }
+}
+
 /// Treat api as containing the bits of a floating point number.  Currently
 /// we infer the floating point type from the size of the APInt.  The
 /// isIEEE argument distinguishes between PPC128 and IEEE128 (not meaningful
@@ -3058,7 +3124,9 @@ APFloat::initFromFloatAPInt(const APInt & api)
 void
 APFloat::initFromAPInt(const APInt& api, bool isIEEE)
 {
-  if (api.getBitWidth() == 32)
+  if (api.getBitWidth() == 16)
+    return initFromHalfAPInt(api);
+  else if (api.getBitWidth() == 32)
     return initFromFloatAPInt(api);
   else if (api.getBitWidth()==64)
     return initFromDoubleAPInt(api);
diff --git a/lib/Support/StringExtras.cpp b/lib/Support/StringExtras.cpp
index 1618086..c72f121 100644
--- a/lib/Support/StringExtras.cpp
+++ b/lib/Support/StringExtras.cpp
@@ -56,59 +56,3 @@ void llvm::SplitString(const std::string &Source,
     S2 = getToken(S, Delimiters);
   }
 }
-
-
-
-/// UnescapeString - Modify the argument string, turning two character sequences
-/// @verbatim
-/// like '\\' 'n' into '\n'.  This handles: \e \a \b \f \n \r \t \v \' \ and
-/// \num (where num is a 1-3 byte octal value).
-/// @endverbatim
-void llvm::UnescapeString(std::string &Str) {
-  for (unsigned i = 0; i != Str.size(); ++i) {
-    if (Str[i] == '\\' && i != Str.size()-1) {
-      switch (Str[i+1]) {
-      default: continue;  // Don't execute the code after the switch.
-      case 'a': Str[i] = '\a'; break;
-      case 'b': Str[i] = '\b'; break;
-      case 'e': Str[i] = 27; break;
-      case 'f': Str[i] = '\f'; break;
-      case 'n': Str[i] = '\n'; break;
-      case 'r': Str[i] = '\r'; break;
-      case 't': Str[i] = '\t'; break;
-      case 'v': Str[i] = '\v'; break;
-      case '"': Str[i] = '\"'; break;
-      case '\'': Str[i] = '\''; break;
-      case '\\': Str[i] = '\\'; break;
-      }
-      // Nuke the second character.
-      Str.erase(Str.begin()+i+1);
-    }
-  }
-}
-
-/// EscapeString - Modify the argument string, turning '\\' and anything that
-/// doesn't satisfy std::isprint into an escape sequence.
-void llvm::EscapeString(std::string &Str) {
-  for (unsigned i = 0; i != Str.size(); ++i) {
-    if (Str[i] == '\\') {
-      ++i;
-      Str.insert(Str.begin()+i, '\\');
-    } else if (Str[i] == '\t') {
-      Str[i++] = '\\';
-      Str.insert(Str.begin()+i, 't');
-    } else if (Str[i] == '"') {
-      Str.insert(Str.begin()+i++, '\\');
-    } else if (Str[i] == '\n') {
-      Str[i++] = '\\';
-      Str.insert(Str.begin()+i, 'n');
-    } else if (!std::isprint(Str[i])) {
-      // Always expand to a 3-digit octal escape.
-      unsigned Char = Str[i];
-      Str[i++] = '\\';
-      Str.insert(Str.begin()+i++, '0'+((Char/64) & 7));
-      Str.insert(Str.begin()+i++, '0'+((Char/8)  & 7));
-      Str.insert(Str.begin()+i  , '0'+( Char     & 7));
-    }
-  }
-}
diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp
index 040308b..a729d3d 100644
--- a/lib/Support/StringMap.cpp
+++ b/lib/Support/StringMap.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringExtras.h"
 #include <cassert>
 using namespace llvm;
 
@@ -46,20 +47,6 @@ void StringMapImpl::init(unsigned InitSize) {
 }
 
 
-/// HashString - Compute a hash code for the specified string.
-///
-static unsigned HashString(const char *Start, const char *End) {
-  // Bernstein hash function.
-  unsigned int Result = 0;
-  // TODO: investigate whether a modified bernstein hash function performs
-  // better: http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
-  //   X*33+c -> X*33^c
-  while (Start != End)
-    Result = Result * 33 + *Start++;
-  Result = Result + (Result >> 5);
-  return Result;
-}
-
 /// LookupBucketFor - Look up the bucket that the specified string should end
 /// up in.  If it already exists as a key in the map, the Item pointer for the
 /// specified bucket will be non-null.  Otherwise, it will be null.  In either
@@ -71,7 +58,7 @@ unsigned StringMapImpl::LookupBucketFor(const StringRef &Name) {
     init(16);
     HTSize = NumBuckets;
   }
-  unsigned FullHashValue = HashString(Name.begin(), Name.end());
+  unsigned FullHashValue = HashString(Name);
   unsigned BucketNo = FullHashValue & (HTSize-1);
   
   unsigned ProbeAmt = 1;
@@ -126,7 +113,7 @@ unsigned StringMapImpl::LookupBucketFor(const StringRef &Name) {
 int StringMapImpl::FindKey(const StringRef &Key) const {
   unsigned HTSize = NumBuckets;
   if (HTSize == 0) return -1;  // Really empty table?
-  unsigned FullHashValue = HashString(Key.begin(), Key.end());
+  unsigned FullHashValue = HashString(Key);
   unsigned BucketNo = FullHashValue & (HTSize-1);
   
   unsigned ProbeAmt = 1;
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index 6f805da..26a1a4e 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -96,6 +96,7 @@ const char *Triple::getOSTypeName(OSType Kind) {
   case OpenBSD: return "openbsd";
   case Solaris: return "solaris";
   case Win32: return "win32";
+  case Haiku: return "haiku";
   }
 
   return "<invalid>";
@@ -276,6 +277,8 @@ void Triple::Parse() const {
     OS = Solaris;
   else if (OSName.startswith("win32"))
     OS = Win32;
+  else if (OSName.startswith("haiku"))
+  	OS = Haiku;
   else
     OS = UnknownOS;
 
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index 0a82cc1..31451cc 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -168,6 +168,40 @@ raw_ostream &raw_ostream::write_hex(unsigned long long N) {
   return write(CurPtr, EndPtr-CurPtr);
 }
 
+raw_ostream &raw_ostream::write_escaped(StringRef Str) {
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    unsigned char c = Str[i];
+
+    switch (c) {
+    case '\\':
+      *this << '\\' << '\\';
+      break;
+    case '\t':
+      *this << '\\' << 't';
+      break;
+    case '\n':
+      *this << '\\' << 'n';
+      break;
+    case '"':
+      *this << '\\' << '"';
+      break;
+    default:
+      if (std::isprint(c)) {
+        *this << c;
+        break;
+      }
+
+      // Always expand to a 3-character octal escape.
+      *this << '\\';
+      *this << char('0' + ((c >> 6) & 7));
+      *this << char('0' + ((c >> 3) & 7));
+      *this << char('0' + ((c >> 0) & 7));
+    }
+  }
+
+  return *this;
+}
+
 raw_ostream &raw_ostream::operator<<(const void *P) {
   *this << '0' << 'x';
 
diff --git a/lib/System/Unix/Program.inc b/lib/System/Unix/Program.inc
index 56dea25..c52f3a8 100644
--- a/lib/System/Unix/Program.inc
+++ b/lib/System/Unix/Program.inc
@@ -244,7 +244,7 @@ Program::Wait(unsigned secondsToWait,
   int status;
   uint64_t pid = reinterpret_cast<uint64_t>(Data_);
   pid_t child = static_cast<pid_t>(pid);
-  while (wait(&status) != child)
+  while (waitpid(pid, &status, 0) != child)
     if (secondsToWait && errno == EINTR) {
       // Kill the child.
       kill(child, SIGKILL);
diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h
index 1839153..c603708 100644
--- a/lib/Target/ARM/ARMAddressingModes.h
+++ b/lib/Target/ARM/ARMAddressingModes.h
@@ -15,7 +15,6 @@
 #define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
 
 #include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include <cassert>
 
@@ -38,7 +37,7 @@ namespace ARM_AM {
 
   static inline const char *getShiftOpcStr(ShiftOpc Op) {
     switch (Op) {
-    default: llvm_unreachable("Unknown shift opc!");
+    default: assert(0 && "Unknown shift opc!");
     case ARM_AM::asr: return "asr";
     case ARM_AM::lsl: return "lsl";
     case ARM_AM::lsr: return "lsr";
@@ -71,7 +70,7 @@ namespace ARM_AM {
 
   static inline const char *getAMSubModeStr(AMSubMode Mode) {
     switch (Mode) {
-    default: llvm_unreachable("Unknown addressing sub-mode!");
+    default: assert(0 && "Unknown addressing sub-mode!");
     case ARM_AM::ia: return "ia";
     case ARM_AM::ib: return "ib";
     case ARM_AM::da: return "da";
@@ -81,7 +80,7 @@ namespace ARM_AM {
 
   static inline const char *getAMSubModeAltStr(AMSubMode Mode, bool isLD) {
     switch (Mode) {
-    default: llvm_unreachable("Unknown addressing sub-mode!");
+    default: assert(0 && "Unknown addressing sub-mode!");
     case ARM_AM::ia: return isLD ? "fd" : "ea";
     case ARM_AM::ib: return isLD ? "ed" : "fa";
     case ARM_AM::da: return isLD ? "fa" : "ed";
@@ -342,6 +341,66 @@ namespace ARM_AM {
     return -1;
   }
 
+  static inline unsigned getT2SOImmValRotate(unsigned V) {
+    if ((V & ~255U) == 0) return 0;
+    // Use CTZ to compute the rotate amount.
+    unsigned RotAmt = CountTrailingZeros_32(V);
+    return (32 - RotAmt) & 31;
+  }
+
+  static inline bool isT2SOImmTwoPartVal (unsigned Imm) {
+    unsigned V = Imm;
+    // Passing values can be any combination of splat values and shifter
+    // values. If this can be handled with a single shifter or splat, bail
+    // out. Those should be handled directly, not with a two-part val.
+    if (getT2SOImmValSplatVal(V) != -1)
+      return false;
+    V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If this can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Likewise, try masking out a splat value first.
+    V = Imm;
+    if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
+      V &= ~0xff00ff00U;
+    else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
+      V &= ~0x00ff00ffU;
+    // If what's left can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Otherwise, do not accept.
+    return false;
+  }
+
+  static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
+    assert (isT2SOImmTwoPartVal(Imm) &&
+            "Immedate cannot be encoded as two part immediate!");
+    // Try a shifter operand as one part
+    unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm;
+    // If the rest is encodable as an immediate, then return it.
+    if (getT2SOImmVal(V) != -1) return V;
+
+    // Try masking out a splat value first.
+    if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
+      return Imm & 0xff00ff00U;
+
+    // The other splat is all that's left as an option.
+    assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1);
+    return Imm & 0x00ff00ffU;
+  }
+
+  static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
+    // Mask out the first hunk
+    Imm ^= getT2SOImmTwoPartFirst(Imm);
+    // Return what's left
+    assert (getT2SOImmVal(Imm) != -1 &&
+            "Unable to encode second part of T2 two part SO immediate");
+    return Imm;
+  }
+
 
   //===--------------------------------------------------------------------===//
   // Addressing Mode #2
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 42ef183..00e7531 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -36,8 +36,18 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/CommandLine.h"
 using namespace llvm;
 
+static cl::opt<bool>
+ScavengeFrameIndexVals("arm-virtual-frame-index-vals", cl::Hidden,
+          cl::init(false),
+          cl::desc("Resolve frame index values via scavenging in PEI"));
+
+static cl::opt<bool>
+ReuseFrameIndexVals("arm-reuse-frame-index-vals", cl::Hidden, cl::init(false),
+          cl::desc("Reuse repeated frame index values"));
+
 unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum,
                                                    bool *isSPVFP) {
   if (isSPVFP)
@@ -740,8 +750,7 @@ unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
   case ARM::R1:
     return ARM::R0;
   case ARM::R3:
-    // FIXME!
-    return STI.isThumb1Only() ? 0 : ARM::R2;
+    return ARM::R2;
   case ARM::R5:
     return ARM::R4;
   case ARM::R7:
@@ -830,8 +839,7 @@ unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg,
   case ARM::R0:
     return ARM::R1;
   case ARM::R2:
-    // FIXME!
-    return STI.isThumb1Only() ? 0 : ARM::R3;
+    return ARM::R3;
   case ARM::R4:
     return ARM::R5;
   case ARM::R6:
@@ -937,6 +945,11 @@ requiresRegisterScavenging(const MachineFunction &MF) const {
   return true;
 }
 
+bool ARMBaseRegisterInfo::
+requiresFrameIndexScavenging(const MachineFunction &MF) const {
+  return ScavengeFrameIndexVals;
+}
+
 // hasReservedCallFrame - Under normal circumstances, when a frame pointer is
 // not required, we reserve argument space for call sites in the function
 // immediately on entry to the current function. This eliminates the need for
@@ -1077,14 +1090,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
           (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4) &&
          "This code isn't needed if offset already handled!");
 
-  // Insert a set of r12 with the full address: r12 = sp + offset
-  // If the offset we have is too large to fit into the instruction, we need
-  // to form it with a series of ADDri's.  Do this by taking 8-bit chunks
-  // out of 'Offset'.
-  unsigned ScratchReg = findScratchRegister(RS, ARM::GPRRegisterClass, AFI);
-  if (ScratchReg == 0)
-    // No register is "free". Scavenge a register.
-    ScratchReg = RS->scavengeRegister(ARM::GPRRegisterClass, II, SPAdj);
+  unsigned ScratchReg = 0;
   int PIdx = MI.findFirstPredOperandIdx();
   ARMCC::CondCodes Pred = (PIdx == -1)
     ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
@@ -1093,6 +1099,19 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     // Must be addrmode4.
     MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false);
   else {
+    if (!ScavengeFrameIndexVals) {
+      // Insert a set of r12 with the full address: r12 = sp + offset
+      // If the offset we have is too large to fit into the instruction, we need
+      // to form it with a series of ADDri's.  Do this by taking 8-bit chunks
+      // out of 'Offset'.
+      ScratchReg = findScratchRegister(RS, ARM::GPRRegisterClass, AFI);
+      if (ScratchReg == 0)
+        // No register is "free". Scavenge a register.
+        ScratchReg = RS->scavengeRegister(ARM::GPRRegisterClass, II, SPAdj);
+    } else {
+      ScratchReg = MF.getRegInfo().createVirtualRegister(ARM::GPRRegisterClass);
+      *Value = Offset;
+    }
     if (!AFI->isThumbFunction())
       emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
                               Offset, Pred, PredReg, TII);
@@ -1102,8 +1121,10 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                              Offset, Pred, PredReg, TII);
     }
     MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
+    if (!ReuseFrameIndexVals || !ScavengeFrameIndexVals)
+      ScratchReg = 0;
   }
-  return 0;
+  return ScratchReg;
 }
 
 /// Move iterator pass the next bunch of callee save load / store ops for
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index da703fb..f7d38e5 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -122,6 +122,8 @@ public:
 
   virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
 
+  virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
+
   virtual bool hasReservedCallFrame(MachineFunction &MF) const;
 
   virtual void eliminateCallFramePseudoInstr(MachineFunction &MF,
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index b0bd04b..c995ff2 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -28,9 +28,11 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
+#include <algorithm>
 using namespace llvm;
 
 STATISTIC(NumCPEs,       "Number of constpool entries");
@@ -70,6 +72,10 @@ namespace {
     /// to a return, unreachable, or unconditional branch).
     std::vector<MachineBasicBlock*> WaterList;
 
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+    SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
     typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
 
     /// CPUser - One user of a constant pool, keeping the machine instruction
@@ -175,9 +181,7 @@ namespace {
     void AdjustBBOffsetsAfter(MachineBasicBlock *BB, int delta);
     bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI);
     int LookForExistingCPEntry(CPUser& U, unsigned UserOffset);
-    bool LookForWater(CPUser&U, unsigned UserOffset,
-                      MachineBasicBlock *&NewMBB);
-    MachineBasicBlock *AcceptWater(water_iterator IP);
+    bool LookForWater(CPUser&U, unsigned UserOffset, water_iterator &WaterIter);
     void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset,
                         MachineBasicBlock *&NewMBB);
     bool HandleConstantPoolUser(MachineFunction &MF, unsigned CPUserIndex);
@@ -297,6 +301,10 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) {
     if (CPChange && ++NoCPIters > 30)
       llvm_unreachable("Constant Island pass failed to converge!");
     DEBUG(dumpBBs());
+    
+    // Clear NewWaterList now.  If we split a block for branches, it should
+    // appear as "new water" for the next iteration of constant pool placement.
+    NewWaterList.clear();
 
     bool BRChange = false;
     for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
@@ -629,7 +637,7 @@ void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
 
 
 /// Split the basic block containing MI into two blocks, which are joined by
-/// an unconditional branch.  Update datastructures and renumber blocks to
+/// an unconditional branch.  Update data structures and renumber blocks to
 /// account for this change and returns the newly created block.
 MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
   MachineBasicBlock *OrigBB = MI->getParent();
@@ -691,6 +699,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
     WaterList.insert(next(IP), NewBB);
   else
     WaterList.insert(IP, OrigBB);
+  NewWaterList.insert(OrigBB);
 
   // Figure out how large the first NewMBB is.  (It cannot
   // contain a constpool_entry or tablejump.)
@@ -941,30 +950,16 @@ static inline unsigned getUnconditionalBrDisp(int Opc) {
   return ((1<<23)-1)*4;
 }
 
-/// AcceptWater - Small amount of common code factored out of the following.
-///
-MachineBasicBlock *ARMConstantIslands::AcceptWater(water_iterator IP) {
-  DEBUG(errs() << "found water in range\n");
-  MachineBasicBlock *WaterBB = *IP;
-  // Remove the original WaterList entry; we want subsequent
-  // insertions in this vicinity to go after the one we're
-  // about to insert.  This considerably reduces the number
-  // of times we have to move the same CPE more than once.
-  WaterList.erase(IP);
-  // CPE goes before following block (NewMBB).
-  return next(MachineFunction::iterator(WaterBB));
-}
-
-/// LookForWater - look for an existing entry in the WaterList in which
+/// LookForWater - Look for an existing entry in the WaterList in which
 /// we can place the CPE referenced from U so it's within range of U's MI.
-/// Returns true if found, false if not.  If it returns true, NewMBB
+/// Returns true if found, false if not.  If it returns true, WaterIter
 /// is set to the WaterList entry.  For Thumb, prefer water that will not
 /// introduce padding to water that will.  To ensure that this pass
 /// terminates, the CPE location for a particular CPUser is only allowed to
 /// move to a lower address, so search backward from the end of the list and
 /// prefer the first water that is in range.
 bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
-                                      MachineBasicBlock *&NewMBB) {
+                                      water_iterator &WaterIter) {
   if (WaterList.empty())
     return false;
 
@@ -973,9 +968,17 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
   for (water_iterator IP = prior(WaterList.end()),
          B = WaterList.begin();; --IP) {
     MachineBasicBlock* WaterBB = *IP;
-    // Check if water is in range and at a lower address than the current one.
-    if (WaterBB->getNumber() < U.HighWaterMark->getNumber() &&
-        WaterIsInRange(UserOffset, WaterBB, U)) {
+    // Check if water is in range and is either at a lower address than the
+    // current "high water mark" or a new water block that was created since
+    // the previous iteration by inserting an unconditional branch.  In the
+    // latter case, we want to allow resetting the high water mark back to
+    // this new water since we haven't seen it before.  Inserting branches
+    // should be relatively uncommon and when it does happen, we want to be
+    // sure to take advantage of it for all the CPEs near that block, so that
+    // we don't insert more branches than necessary.
+    if (WaterIsInRange(UserOffset, WaterBB, U) &&
+        (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+         NewWaterList.count(WaterBB))) {
       unsigned WBBId = WaterBB->getNumber();
       if (isThumb &&
           (BBOffsets[WBBId] + BBSizes[WBBId])%4 != 0) {
@@ -986,7 +989,7 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
           IPThatWouldPad = IP;
         }
       } else {
-        NewMBB = AcceptWater(IP);
+        WaterIter = IP;
         return true;
       }
     }
@@ -994,7 +997,7 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
       break;
   }
   if (FoundWaterThatWouldPad) {
-    NewMBB = AcceptWater(IPThatWouldPad);
+    WaterIter = IPThatWouldPad;
     return true;
   }
   return false;
@@ -1107,7 +1110,6 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF,
   MachineInstr *CPEMI  = U.CPEMI;
   unsigned CPI = CPEMI->getOperand(1).getIndex();
   unsigned Size = CPEMI->getOperand(2).getImm();
-  MachineBasicBlock *NewMBB;
   // Compute this only once, it's expensive.  The 4 or 8 is the value the
   // hardware keeps in the PC.
   unsigned UserOffset = GetOffsetOf(UserMI) + (isThumb ? 4 : 8);
@@ -1123,14 +1125,50 @@ bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF,
   unsigned ID = AFI->createConstPoolEntryUId();
 
   // Look for water where we can place this CPE.
-  if (!LookForWater(U, UserOffset, NewMBB)) {
+  MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *NewMBB;
+  water_iterator IP;
+  if (LookForWater(U, UserOffset, IP)) {
+    DEBUG(errs() << "found water in range\n");
+    MachineBasicBlock *WaterBB = *IP;
+
+    // If the original WaterList entry was "new water" on this iteration,
+    // propagate that to the new island.  This is just keeping NewWaterList
+    // updated to match the WaterList, which will be updated below.
+    if (NewWaterList.count(WaterBB)) {
+      NewWaterList.erase(WaterBB);
+      NewWaterList.insert(NewIsland);
+    }
+    // The new CPE goes before the following block (NewMBB).
+    NewMBB = next(MachineFunction::iterator(WaterBB));
+
+  } else {
     // No water found.
     DEBUG(errs() << "No water found\n");
     CreateNewWater(CPUserIndex, UserOffset, NewMBB);
+
+    // SplitBlockBeforeInstr adds to WaterList, which is important when it is
+    // called while handling branches so that the water will be seen on the
+    // next iteration for constant pools, but in this context, we don't want
+    // it.  Check for this so it will be removed from the WaterList.
+    // Also remove any entry from NewWaterList.
+    MachineBasicBlock *WaterBB = prior(MachineFunction::iterator(NewMBB));
+    IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
+    if (IP != WaterList.end())
+      NewWaterList.erase(WaterBB);
+
+    // We are adding new water.  Update NewWaterList.
+    NewWaterList.insert(NewIsland);
   }
 
+  // Remove the original WaterList entry; we want subsequent insertions in
+  // this vicinity to go after the one we're about to insert.  This
+  // considerably reduces the number of times we have to move the same CPE
+  // more than once and is also important to ensure the algorithm terminates.
+  if (IP != WaterList.end())
+    WaterList.erase(IP);
+
   // Okay, we know we can put an island before NewMBB now, do it!
-  MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock();
   MF.insert(NewMBB, NewIsland);
 
   // Update internal data structures to account for the newly inserted MBB.
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index c39de0a..5c1835b 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1287,7 +1287,7 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDValue Op,
       assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
       unsigned Width = 32 - Srl_imm;
       int LSB = Srl_imm - Shl_imm;
-      if ((LSB + Width) > 32)
+      if (LSB < 0)
         return NULL;
       SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
       SDValue Ops[] = { Op.getOperand(0).getOperand(0),
@@ -1427,6 +1427,43 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
       }
     }
     break;
+  case ISD::AND: {
+    // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits
+    // of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits
+    // are entirely contributed by c2 and lower 16-bits are entirely contributed
+    // by x. That's equal to (or (and x, 0xffff), (and c1, 0xffff0000)).
+    // Select it to: "movt x, ((c1 & 0xffff) >> 16)
+    EVT VT = Op.getValueType();
+    if (VT != MVT::i32)
+      break;
+    unsigned Opc = (Subtarget->isThumb() && Subtarget->hasThumb2())
+      ? ARM::t2MOVTi16
+      : (Subtarget->hasV6T2Ops() ? ARM::MOVTi16 : 0);
+    if (!Opc)
+      break;
+    SDValue N0 = Op.getOperand(0), N1 = Op.getOperand(1);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    if (!N1C)
+      break;
+    if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
+      SDValue N2 = N0.getOperand(1);
+      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+      if (!N2C)
+        break;
+      unsigned N1CVal = N1C->getZExtValue();
+      unsigned N2CVal = N2C->getZExtValue();
+      if ((N1CVal & 0xffff0000U) == (N2CVal & 0xffff0000U) &&
+          (N1CVal & 0xffffU) == 0xffffU &&
+          (N2CVal & 0xffffU) == 0x0U) {
+        SDValue Imm16 = CurDAG->getTargetConstant((N2CVal & 0xFFFF0000U) >> 16,
+                                                  MVT::i32);
+        SDValue Ops[] = { N0.getOperand(0), Imm16,
+                          getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+        return CurDAG->getMachineNode(Opc, dl, VT, Ops, 4);
+      }
+    }
+    break;
+  }
   case ARMISD::FMRRD:
     return CurDAG->getMachineNode(ARM::FMRRD, dl, MVT::i32, MVT::i32,
                                   Op.getOperand(0), getAL(CurDAG),
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 426cecb..6a264fd 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -25,9 +25,10 @@
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
+#include "llvm/GlobalValue.h"
 #include "llvm/Instruction.h"
 #include "llvm/Intrinsics.h"
-#include "llvm/GlobalValue.h"
+#include "llvm/Type.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -2360,8 +2361,11 @@ static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT,
   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
          "Only possible block sizes for VREV are: 16, 32, 64");
 
-  unsigned NumElts = VT.getVectorNumElements();
   unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
   unsigned BlockElts = M[0] + 1;
 
   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
@@ -2378,6 +2382,10 @@ static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT,
 
 static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT,
                        unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
   unsigned NumElts = VT.getVectorNumElements();
   WhichResult = (M[0] == 0 ? 0 : 1);
   for (unsigned i = 0; i < NumElts; i += 2) {
@@ -2390,6 +2398,10 @@ static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT,
 
 static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
                        unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
   unsigned NumElts = VT.getVectorNumElements();
   WhichResult = (M[0] == 0 ? 0 : 1);
   for (unsigned i = 0; i != NumElts; ++i) {
@@ -2398,7 +2410,7 @@ static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
   }
 
   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
-  if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32)
+  if (VT.is64BitVector() && EltSz == 32)
     return false;
 
   return true;
@@ -2406,6 +2418,10 @@ static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
 
 static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT,
                        unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
   unsigned NumElts = VT.getVectorNumElements();
   WhichResult = (M[0] == 0 ? 0 : 1);
   unsigned Idx = WhichResult * NumElts / 2;
@@ -2417,7 +2433,7 @@ static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT,
   }
 
   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
-  if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32)
+  if (VT.is64BitVector() && EltSz == 32)
     return false;
 
   return true;
@@ -2695,18 +2711,10 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
   SDValue Vec = Op.getOperand(0);
   SDValue Lane = Op.getOperand(1);
-
-  // FIXME: This is invalid for 8 and 16-bit elements - the information about
-  // sign / zero extension is lost!
-  Op = DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
-  Op = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Op, DAG.getValueType(VT));
-
-  if (VT.bitsLT(MVT::i32))
-    Op = DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
-  else if (VT.bitsGT(MVT::i32))
-    Op = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op);
-
-  return Op;
+  assert(VT == MVT::i32 &&
+         Vec.getValueType().getVectorElementType().getSizeInBits() < 32 &&
+         "unexpected type for custom-lowering vector extract");
+  return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
 }
 
 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
@@ -3029,7 +3037,6 @@ static SDValue PerformSUBCombine(SDNode *N,
   return SDValue();
 }
 
-
 /// PerformFMRRDCombine - Target-specific dag combine xforms for ARMISD::FMRRD.
 static SDValue PerformFMRRDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 3d19f23..8225fd7 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -1220,6 +1220,10 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
             string asm, string cstr, list<dag> pattern>
   : NeonI<oops, iops, AddrMode6, IndexModeNone, itin, asm, cstr, pattern> {
   let Inst{31-24} = 0b11110100;
+  let Inst{23} = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8} = op11_8;
+  let Inst{7-4} = op7_4;
 }
 
 class NDataI<dag oops, dag iops, InstrItinClass itin,
@@ -1258,15 +1262,26 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
   let Inst{4} = op4;
 }
 
+// NEON Vector Duplicate (scalar).
+// Inst{19-16} is specified by subclasses.
+class N2VDup<bits<2> op24_23, bits<2> op21_20, bits<5> op11_7, bit op6, bit op4,
+             dag oops, dag iops, InstrItinClass itin,
+             string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, itin, asm, cstr, pattern> {
+  let Inst{24-23} = op24_23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-7} = op11_7;
+  let Inst{6} = op6;
+  let Inst{4} = op4;
+}
+
 // NEON 2 vector register with immediate.
-class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-             bit op6, bit op4,
+class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
              dag oops, dag iops, InstrItinClass itin,
              string asm, string cstr, list<dag> pattern>
   : NDataI<oops, iops, itin, asm, cstr, pattern> {
   let Inst{24} = op24;
   let Inst{23} = op23;
-  let Inst{21-16} = op21_16;
   let Inst{11-8} = op11_8;
   let Inst{7} = op7;
   let Inst{6} = op6;
@@ -1286,6 +1301,20 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
   let Inst{4} = op4;
 }
 
+// NEON 3 vector register with immediate.  This is only used for VEXT where
+// op11_8 represents the starting byte index of the extracted result in the
+// concatenation of the operands and is left unspecified.
+class N3VImm<bit op24, bit op23, bits<2> op21_20, bit op6, bit op4,
+             dag oops, dag iops, InstrItinClass itin,
+             string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, itin, asm, cstr, pattern> {
+  let Inst{24} = op24;
+  let Inst{23} = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{6} = op6;
+  let Inst{4} = op4;
+}
+
 // NEON VMOVs between scalar and core registers.
 class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
                dag oops, dag iops, Format f, InstrItinClass itin,
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 4c92891..dd4123b 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -22,7 +22,6 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Support/CommandLine.h"
 using namespace llvm;
 
 ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 6ec78bc..384b98c 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -980,6 +980,9 @@ def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm),
   let Inst{25} = 1;
 }
 
+def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>,
+      Requires<[IsARM, HasV6T2]>;
+
 let Uses = [CPSR] in
 def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, IIC_iMOVsi,
                  "mov", " $dst, $src, rrx",
@@ -1580,10 +1583,17 @@ def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS),
 def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS),
              (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
                     (so_imm2part_2 imm:$RHS))>;
+def : ARMPat<(add GPR:$LHS, so_imm2part:$RHS),
+             (ADDri (ADDri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+                    (so_imm2part_2 imm:$RHS))>;
+def : ARMPat<(sub GPR:$LHS, so_imm2part:$RHS),
+             (SUBri (SUBri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+                    (so_imm2part_2 imm:$RHS))>;
 
 // 32-bit immediate using movw + movt.
-// This is a single pseudo instruction to make it re-materializable. Remove
-// when we can do generalized remat.
+// This is a single pseudo instruction, the benefit is that it can be remat'd
+// as a single unit instead of having to handle reg inputs.
+// FIXME: Remove this when we can do generalized remat.
 let isReMaterializable = 1 in
 def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi,
                      "movw", " $dst, ${src:lo16}\n\tmovt${p} $dst, ${src:hi16}",
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index b34aff7..822950c 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -456,17 +456,17 @@ class VST2LN<bits<4> op11_8, string OpcodeStr>
           !strconcat(OpcodeStr, "\t\\{$src1[$lane],$src2[$lane]\\}, $addr"),
           "", []>;
 
-def VST2LNd8  : VST2LN<0b0000, "vst2.8">;
-def VST2LNd16 : VST2LN<0b0100, "vst2.16">;
-def VST2LNd32 : VST2LN<0b1000, "vst2.32">;
+def VST2LNd8  : VST2LN<0b0001, "vst2.8">;
+def VST2LNd16 : VST2LN<0b0101, "vst2.16">;
+def VST2LNd32 : VST2LN<0b1001, "vst2.32">;
 
 // vst2 to double-spaced even registers.
-def VST2LNq16a: VST2LN<0b0100, "vst2.16">;
-def VST2LNq32a: VST2LN<0b1000, "vst2.32">;
+def VST2LNq16a: VST2LN<0b0101, "vst2.16">;
+def VST2LNq32a: VST2LN<0b1001, "vst2.32">;
 
 // vst2 to double-spaced odd registers.
-def VST2LNq16b: VST2LN<0b0100, "vst2.16">;
-def VST2LNq32b: VST2LN<0b1000, "vst2.32">;
+def VST2LNq16b: VST2LN<0b0101, "vst2.16">;
+def VST2LNq32b: VST2LN<0b1001, "vst2.32">;
 
 //   VST3LN   : Vector Store (single 3-element structure from one lane)
 class VST3LN<bits<4> op11_8, string OpcodeStr>
@@ -623,12 +623,12 @@ class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
         (ins QPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
         [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>;
 
-// Long 2-register intrinsics.  (This is currently only used for VMOVL and is
-// derived from N2VImm instead of N2V because of the way the size is encoded.)
-class N2VLInt<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-              bit op6, bit op4, InstrItinClass itin, string OpcodeStr,
+// Long 2-register intrinsics (currently only used for VMOVL).
+class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr,
               ValueType TyQ, ValueType TyD, Intrinsic IntOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4, (outs QPR:$dst),
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$dst),
         (ins DPR:$src), itin, !strconcat(OpcodeStr, "\t$dst, $src"), "",
         [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>;
 
@@ -1016,36 +1016,33 @@ class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
 
 // Shift by immediate,
 // both double- and quad-register.
-class N2VDSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-             bit op4, InstrItinClass itin, string OpcodeStr,
-             ValueType Ty, SDNode OpNode>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
+class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             InstrItinClass itin, string OpcodeStr, ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
            (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), itin,
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>;
-class N2VQSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-             bit op4, InstrItinClass itin, string OpcodeStr,
-             ValueType Ty, SDNode OpNode>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
+class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             InstrItinClass itin, string OpcodeStr, ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
            (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin,
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>;
 
 // Long shift by immediate.
-class N2VLSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-             bit op6, bit op4, string OpcodeStr, ValueType ResTy,
-             ValueType OpTy, SDNode OpNode>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4,
+class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             string OpcodeStr, ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
            (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VSHLiD,
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src),
                                           (i32 imm:$SIMM))))]>;
 
 // Narrow shift by immediate.
-class N2VNSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-             bit op6, bit op4, InstrItinClass itin, string OpcodeStr,
+class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             InstrItinClass itin, string OpcodeStr,
              ValueType ResTy, ValueType OpTy, SDNode OpNode>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, op6, op4,
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
            (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin,
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src),
@@ -1053,53 +1050,49 @@ class N2VNSh<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
 
 // Shift right by immediate and accumulate,
 // both double- and quad-register.
-class N2VDShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-                bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
-           (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
-           IIC_VPALiD, 
+class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst),
+           (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VPALiD, 
            !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
            [(set DPR:$dst, (Ty (add DPR:$src1,
                                 (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>;
-class N2VQShAdd<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-                bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
-           (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
-           IIC_VPALiD, 
+class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst),
+           (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VPALiD, 
            !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
            [(set QPR:$dst, (Ty (add QPR:$src1,
                                 (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>;
 
 // Shift by immediate and insert,
 // both double- and quad-register.
-class N2VDShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-                bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
-           (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
-           IIC_VSHLiD, 
+class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst),
+           (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VSHLiD, 
            !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
            [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>;
-class N2VQShIns<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-                bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
-           (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
-           IIC_VSHLiQ, 
+class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst),
+           (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VSHLiQ, 
            !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
            [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>;
 
 // Convert, with fractional bits immediate,
 // both double- and quad-register.
-class N2VCvtD<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-              bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
+class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, ValueType ResTy, ValueType OpTy,
               Intrinsic IntOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 0, op4,
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
            (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VUNAD, 
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>;
-class N2VCvtQ<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
-              bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
+class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, ValueType ResTy, ValueType OpTy,
               Intrinsic IntOp>
-  : N2VImm<op24, op23, op21_16, op11_8, op7, 1, op4,
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
            (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), IIC_VUNAQ, 
            !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
            [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>;
@@ -1175,14 +1168,14 @@ multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
 
 // Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL).
 //   source operand element sizes of 16, 32 and 64 bits:
-multiclass N2VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
-                       bit op4, string OpcodeStr, Intrinsic IntOp> {
-  def v8i16 : N2VLInt<op24, op23, 0b001000, op11_8, op7, op6, op4,
-                      IIC_VQUNAiD, !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
-  def v4i32 : N2VLInt<op24, op23, 0b010000, op11_8, op7, op6, op4,
-                      IIC_VQUNAiD, !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
-  def v2i64 : N2VLInt<op24, op23, 0b100000, op11_8, op7, op6, op4,
-                      IIC_VQUNAiD, !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
+multiclass N2VLInt_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4,
+                       string OpcodeStr, Intrinsic IntOp> {
+  def v8i16 : N2VLInt<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                      !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
+  def v4i32 : N2VLInt<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                      !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
+  def v2i64 : N2VLInt<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                      !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
 }
 
 
@@ -1381,7 +1374,7 @@ multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
 multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
                         string OpcodeStr, Intrinsic IntOp>
   : N3VLInt3_HS<op24, op23, op11_8, op4, OpcodeStr, IntOp> {
-  def v8i16 : N3VLInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
+  def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D,
                        !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
 }
 
@@ -1461,24 +1454,38 @@ multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
 multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
                       InstrItinClass itin, string OpcodeStr, SDNode OpNode> {
   // 64-bit vector types.
-  def v8i8  : N2VDSh<op24, op23, 0b001000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "8"), v8i8, OpNode>;
-  def v4i16 : N2VDSh<op24, op23, 0b010000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "16"), v4i16, OpNode>;
-  def v2i32 : N2VDSh<op24, op23, 0b100000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "32"), v2i32, OpNode>;
-  def v1i64 : N2VDSh<op24, op23, 0b000000, op11_8, 1, op4, itin,
+  def v8i8  : N2VDSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "8"), v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "16"), v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "32"), v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, itin,
                      !strconcat(OpcodeStr, "64"), v1i64, OpNode>;
+                             // imm6 = xxxxxx
 
   // 128-bit vector types.
-  def v16i8 : N2VQSh<op24, op23, 0b001000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "8"), v16i8, OpNode>;
-  def v8i16 : N2VQSh<op24, op23, 0b010000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "16"), v8i16, OpNode>;
-  def v4i32 : N2VQSh<op24, op23, 0b100000, op11_8, 0, op4, itin,
-                     !strconcat(OpcodeStr, "32"), v4i32, OpNode>;
-  def v2i64 : N2VQSh<op24, op23, 0b000000, op11_8, 1, op4, itin,
+  def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "8"), v16i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "16"), v8i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, itin,
+                     !strconcat(OpcodeStr, "32"), v4i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, itin,
                      !strconcat(OpcodeStr, "64"), v2i64, OpNode>;
+                             // imm6 = xxxxxx
 }
 
 
@@ -1487,24 +1494,38 @@ multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
 multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
                          string OpcodeStr, SDNode ShOp> {
   // 64-bit vector types.
-  def v8i8  : N2VDShAdd<op24, op23, 0b001000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "8"), v8i8, ShOp>;
-  def v4i16 : N2VDShAdd<op24, op23, 0b010000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
-  def v2i32 : N2VDShAdd<op24, op23, 0b100000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
-  def v1i64 : N2VDShAdd<op24, op23, 0b000000, op11_8, 1, op4,
+  def v8i8  : N2VDShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "8"), v8i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "16"), v4i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "32"), v2i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4,
                         !strconcat(OpcodeStr, "64"), v1i64, ShOp>;
+                             // imm6 = xxxxxx
 
   // 128-bit vector types.
-  def v16i8 : N2VQShAdd<op24, op23, 0b001000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "8"), v16i8, ShOp>;
-  def v8i16 : N2VQShAdd<op24, op23, 0b010000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "16"), v8i16, ShOp>;
-  def v4i32 : N2VQShAdd<op24, op23, 0b100000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "32"), v4i32, ShOp>;
-  def v2i64 : N2VQShAdd<op24, op23, 0b000000, op11_8, 1, op4,
+  def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "8"), v16i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "16"), v8i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "32"), v4i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4,
                         !strconcat(OpcodeStr, "64"), v2i64, ShOp>;
+                             // imm6 = xxxxxx
 }
 
 
@@ -1513,24 +1534,75 @@ multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
 multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
                          string OpcodeStr, SDNode ShOp> {
   // 64-bit vector types.
-  def v8i8  : N2VDShIns<op24, op23, 0b001000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "8"), v8i8, ShOp>;
-  def v4i16 : N2VDShIns<op24, op23, 0b010000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
-  def v2i32 : N2VDShIns<op24, op23, 0b100000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
-  def v1i64 : N2VDShIns<op24, op23, 0b000000, op11_8, 1, op4,
+  def v8i8  : N2VDShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "8"), v8i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "16"), v4i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "32"), v2i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4,
                         !strconcat(OpcodeStr, "64"), v1i64, ShOp>;
+                             // imm6 = xxxxxx
 
   // 128-bit vector types.
-  def v16i8 : N2VQShIns<op24, op23, 0b001000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "8"), v16i8, ShOp>;
-  def v8i16 : N2VQShIns<op24, op23, 0b010000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "16"), v8i16, ShOp>;
-  def v4i32 : N2VQShIns<op24, op23, 0b100000, op11_8, 0, op4,
-                        !strconcat(OpcodeStr, "32"), v4i32, ShOp>;
-  def v2i64 : N2VQShIns<op24, op23, 0b000000, op11_8, 1, op4,
+  def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "8"), v16i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "16"), v8i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4,
+                        !strconcat(OpcodeStr, "32"), v4i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4,
                         !strconcat(OpcodeStr, "64"), v2i64, ShOp>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift Long operations,
+//   element sizes of 8, 16, 32 bits:
+multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, string OpcodeStr, SDNode OpNode> {
+  def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+                 !strconcat(OpcodeStr, "8"), v8i16, v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i32 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+                  !strconcat(OpcodeStr, "16"), v4i32, v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i64 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+                  !strconcat(OpcodeStr, "32"), v2i64, v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+}
+
+// Neon Shift Narrow operations,
+//   element sizes of 16, 32, 64 bits:
+multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, InstrItinClass itin, string OpcodeStr,
+                      SDNode OpNode> {
+  def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                    !strconcat(OpcodeStr, "16"), v8i8, v8i16, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     !strconcat(OpcodeStr, "32"), v4i16, v4i32, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     !strconcat(OpcodeStr, "64"), v2i32, v2i64, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -1903,8 +1975,8 @@ defm VABDLs   : N3VLInt_QHS<0,1,0b0111,0, IIC_VBINi4Q, "vabdl.s", int_arm_neon_v
 defm VABDLu   : N3VLInt_QHS<1,1,0b0111,0, IIC_VBINi4Q, "vabdl.u", int_arm_neon_vabdlu, 0>;
 
 //   VABA     : Vector Absolute Difference and Accumulate
-defm VABAs    : N3VInt3_QHS<0,1,0b0101,0, "vaba.s", int_arm_neon_vabas>;
-defm VABAu    : N3VInt3_QHS<1,1,0b0101,0, "vaba.u", int_arm_neon_vabau>;
+defm VABAs    : N3VInt3_QHS<0,0,0b0111,1, "vaba.s", int_arm_neon_vabas>;
+defm VABAu    : N3VInt3_QHS<1,0,0b0111,1, "vaba.u", int_arm_neon_vabau>;
 
 //   VABAL    : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
 defm VABALs   : N3VLInt3_QHS<0,1,0b0101,0, "vabal.s", int_arm_neon_vabals>;
@@ -2044,34 +2116,25 @@ defm VSHRs    : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr.s", NEONvshrs>;
 defm VSHRu    : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr.u", NEONvshru>;
 
 //   VSHLL    : Vector Shift Left Long
-def  VSHLLs8  : N2VLSh<0, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.s8",
-                       v8i16, v8i8, NEONvshlls>;
-def  VSHLLs16 : N2VLSh<0, 1, 0b010000, 0b1010, 0, 0, 1, "vshll.s16",
-                       v4i32, v4i16, NEONvshlls>;
-def  VSHLLs32 : N2VLSh<0, 1, 0b100000, 0b1010, 0, 0, 1, "vshll.s32",
-                       v2i64, v2i32, NEONvshlls>;
-def  VSHLLu8  : N2VLSh<1, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.u8",
-                       v8i16, v8i8, NEONvshllu>;
-def  VSHLLu16 : N2VLSh<1, 1, 0b010000, 0b1010, 0, 0, 1, "vshll.u16",
-                       v4i32, v4i16, NEONvshllu>;
-def  VSHLLu32 : N2VLSh<1, 1, 0b100000, 0b1010, 0, 0, 1, "vshll.u32",
-                       v2i64, v2i32, NEONvshllu>;
+defm VSHLLs   : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll.s", NEONvshlls>;
+defm VSHLLu   : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll.u", NEONvshllu>;
 
 //   VSHLL    : Vector Shift Left Long (with maximum shift count)
-def  VSHLLi8  : N2VLSh<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll.i8",
-                       v8i16, v8i8, NEONvshlli>;
-def  VSHLLi16 : N2VLSh<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll.i16",
-                       v4i32, v4i16, NEONvshlli>;
-def  VSHLLi32 : N2VLSh<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll.i32",
-                       v2i64, v2i32, NEONvshlli>;
+class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+                bit op6, bit op4, string OpcodeStr, ValueType ResTy,
+                ValueType OpTy, SDNode OpNode>
+  : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, ResTy, OpTy, OpNode> {
+  let Inst{21-16} = op21_16;
+}
+def  VSHLLi8  : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll.i8",
+                          v8i16, v8i8, NEONvshlli>;
+def  VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll.i16",
+                          v4i32, v4i16, NEONvshlli>;
+def  VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll.i32",
+                          v2i64, v2i32, NEONvshlli>;
 
 //   VSHRN    : Vector Shift Right and Narrow
-def  VSHRN16  : N2VNSh<0, 1, 0b001000, 0b1000, 0, 0, 1, 
-                       IIC_VSHLiD, "vshrn.i16", v8i8, v8i16, NEONvshrn>;
-def  VSHRN32  : N2VNSh<0, 1, 0b010000, 0b1000, 0, 0, 1,
-                       IIC_VSHLiD, "vshrn.i32", v4i16, v4i32, NEONvshrn>;
-def  VSHRN64  : N2VNSh<0, 1, 0b100000, 0b1000, 0, 0, 1,
-                       IIC_VSHLiD, "vshrn.i64", v2i32, v2i64, NEONvshrn>;
+defm VSHRN    : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn.i", NEONvshrn>;
 
 //   VRSHL    : Vector Rounding Shift
 defm VRSHLs   : N3VInt_QHSD<0,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
@@ -2083,12 +2146,8 @@ defm VRSHRs   : N2VSh_QHSD<0, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr.s", NEONvrshrs>;
 defm VRSHRu   : N2VSh_QHSD<1, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr.u", NEONvrshru>;
 
 //   VRSHRN   : Vector Rounding Shift Right and Narrow
-def  VRSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 1, 1,
-                       IIC_VSHLi4D, "vrshrn.i16", v8i8, v8i16, NEONvrshrn>;
-def  VRSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 1, 1, 
-                       IIC_VSHLi4D, "vrshrn.i32", v4i16, v4i32, NEONvrshrn>;
-def  VRSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 1, 1,
-                       IIC_VSHLi4D, "vrshrn.i64", v2i32, v2i64, NEONvrshrn>;
+defm VRSHRN   : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn.i",
+                           NEONvrshrn>;
 
 //   VQSHL    : Vector Saturating Shift
 defm VQSHLs   : N3VInt_QHSD<0,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
@@ -2102,26 +2161,14 @@ defm VQSHLui  : N2VSh_QHSD<1, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl.u", NEONvqshlu>;
 defm VQSHLsu  : N2VSh_QHSD<1, 1, 0b0110, 1, IIC_VSHLi4D, "vqshlu.s", NEONvqshlsu>;
 
 //   VQSHRN   : Vector Saturating Shift Right and Narrow
-def VQSHRNs16 : N2VNSh<0, 1, 0b001000, 0b1001, 0, 0, 1, 
-                       IIC_VSHLi4D, "vqshrn.s16", v8i8, v8i16, NEONvqshrns>;
-def VQSHRNs32 : N2VNSh<0, 1, 0b010000, 0b1001, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrn.s32", v4i16, v4i32, NEONvqshrns>;
-def VQSHRNs64 : N2VNSh<0, 1, 0b100000, 0b1001, 0, 0, 1, 
-                       IIC_VSHLi4D, "vqshrn.s64", v2i32, v2i64, NEONvqshrns>;
-def VQSHRNu16 : N2VNSh<1, 1, 0b001000, 0b1001, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrn.u16", v8i8, v8i16, NEONvqshrnu>;
-def VQSHRNu32 : N2VNSh<1, 1, 0b010000, 0b1001, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrn.u32", v4i16, v4i32, NEONvqshrnu>;
-def VQSHRNu64 : N2VNSh<1, 1, 0b100000, 0b1001, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrn.u64", v2i32, v2i64, NEONvqshrnu>;
+defm VQSHRNs  : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn.s",
+                           NEONvqshrns>;
+defm VQSHRNu  : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn.u",
+                           NEONvqshrnu>;
 
 //   VQSHRUN  : Vector Saturating Shift Right and Narrow (Unsigned)
-def VQSHRUN16 : N2VNSh<1, 1, 0b001000, 0b1000, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrun.s16", v8i8, v8i16, NEONvqshrnsu>;
-def VQSHRUN32 : N2VNSh<1, 1, 0b010000, 0b1000, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrun.s32", v4i16, v4i32, NEONvqshrnsu>;
-def VQSHRUN64 : N2VNSh<1, 1, 0b100000, 0b1000, 0, 0, 1,
-                       IIC_VSHLi4D, "vqshrun.s64", v2i32, v2i64, NEONvqshrnsu>;
+defm VQSHRUN  : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun.s",
+                           NEONvqshrnsu>;
 
 //   VQRSHL   : Vector Saturating Rounding Shift
 defm VQRSHLs  : N3VInt_QHSD<0, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
@@ -2130,26 +2177,14 @@ defm VQRSHLu  : N3VInt_QHSD<1, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi
                             IIC_VSHLi4Q, "vqrshl.u", int_arm_neon_vqrshiftu, 0>;
 
 //   VQRSHRN  : Vector Saturating Rounding Shift Right and Narrow
-def VQRSHRNs16: N2VNSh<0, 1, 0b001000, 0b1001, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrn.s16", v8i8, v8i16, NEONvqrshrns>;
-def VQRSHRNs32: N2VNSh<0, 1, 0b010000, 0b1001, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrn.s32", v4i16, v4i32, NEONvqrshrns>;
-def VQRSHRNs64: N2VNSh<0, 1, 0b100000, 0b1001, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrn.s64", v2i32, v2i64, NEONvqrshrns>;
-def VQRSHRNu16: N2VNSh<1, 1, 0b001000, 0b1001, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrn.u16", v8i8, v8i16, NEONvqrshrnu>;
-def VQRSHRNu32: N2VNSh<1, 1, 0b010000, 0b1001, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrn.u32", v4i16, v4i32, NEONvqrshrnu>;
-def VQRSHRNu64: N2VNSh<1, 1, 0b100000, 0b1001, 0, 1, 1, 
-                       IIC_VSHLi4D, "vqrshrn.u64", v2i32, v2i64, NEONvqrshrnu>;
+defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn.s",
+                           NEONvqrshrns>;
+defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn.u",
+                           NEONvqrshrnu>;
 
 //   VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
-def VQRSHRUN16: N2VNSh<1, 1, 0b001000, 0b1000, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrun.s16", v8i8, v8i16, NEONvqrshrnsu>;
-def VQRSHRUN32: N2VNSh<1, 1, 0b010000, 0b1000, 0, 1, 1, 
-                       IIC_VSHLi4D, "vqrshrun.s32", v4i16, v4i32, NEONvqrshrnsu>;
-def VQRSHRUN64: N2VNSh<1, 1, 0b100000, 0b1000, 0, 1, 1,
-                       IIC_VSHLi4D, "vqrshrun.s64", v2i32, v2i64, NEONvqrshrnsu>;
+defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun.s",
+                           NEONvqrshrnsu>;
 
 //   VSRA     : Vector Shift Right and Accumulate
 defm VSRAs    : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra.s", NEONvshrs>;
@@ -2491,27 +2526,28 @@ def  VDUPfq   : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src),
 
 //   VDUP     : Vector Duplicate Lane (from scalar to all elements)
 
-class VDUPLND<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, ValueType Ty>
-  : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0,
+class VDUPLND<string OpcodeStr, ValueType Ty>
+  : N2VDup<0b11, 0b11, 0b11000, 0, 0,
         (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
         !strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
         [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>;
 
-class VDUPLNQ<bits<2> op19_18, bits<2> op17_16, string OpcodeStr,
-              ValueType ResTy, ValueType OpTy>
-  : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0,
+class VDUPLNQ<string OpcodeStr, ValueType ResTy, ValueType OpTy>
+  : N2VDup<0b11, 0b11, 0b11000, 1, 0,
         (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
         !strconcat(OpcodeStr, "\t$dst, $src[$lane]"), "",
         [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), imm:$lane)))]>;
 
-def VDUPLN8d  : VDUPLND<0b00, 0b01, "vdup.8", v8i8>;
-def VDUPLN16d : VDUPLND<0b00, 0b10, "vdup.16", v4i16>;
-def VDUPLN32d : VDUPLND<0b01, 0b00, "vdup.32", v2i32>;
-def VDUPLNfd  : VDUPLND<0b01, 0b00, "vdup.32", v2f32>;
-def VDUPLN8q  : VDUPLNQ<0b00, 0b01, "vdup.8", v16i8, v8i8>;
-def VDUPLN16q : VDUPLNQ<0b00, 0b10, "vdup.16", v8i16, v4i16>;
-def VDUPLN32q : VDUPLNQ<0b01, 0b00, "vdup.32", v4i32, v2i32>;
-def VDUPLNfq  : VDUPLNQ<0b01, 0b00, "vdup.32", v4f32, v2f32>;
+// Inst{19-16} is partially specified depending on the element size.
+
+def VDUPLN8d  : VDUPLND<"vdup.8", v8i8> { let Inst{16} = 1; }
+def VDUPLN16d : VDUPLND<"vdup.16", v4i16> { let Inst{17-16} = 0b10; }
+def VDUPLN32d : VDUPLND<"vdup.32", v2i32> { let Inst{18-16} = 0b100; }
+def VDUPLNfd  : VDUPLND<"vdup.32", v2f32> { let Inst{18-16} = 0b100; }
+def VDUPLN8q  : VDUPLNQ<"vdup.8", v16i8, v8i8> { let Inst{16} = 1; }
+def VDUPLN16q : VDUPLNQ<"vdup.16", v8i16, v4i16> { let Inst{17-16} = 0b10; }
+def VDUPLN32q : VDUPLNQ<"vdup.32", v4i32, v2i32> { let Inst{18-16} = 0b100; }
+def VDUPLNfq  : VDUPLNQ<"vdup.32", v4f32, v2f32> { let Inst{18-16} = 0b100; }
 
 def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
           (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
@@ -2530,15 +2566,19 @@ def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
                                    (DSubReg_i32_reg imm:$lane))),
                            (SubReg_i32_lane imm:$lane)))>;
 
-def VDUPfdf   : N2V<0b11, 0b11, 0b01, 0b00, 0b11000, 0, 0,
-                    (outs DPR:$dst), (ins SPR:$src),
-                    IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
-                    [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>;
+def  VDUPfdf  : N2VDup<0b11, 0b11, 0b11000, 0, 0,
+                       (outs DPR:$dst), (ins SPR:$src),
+                       IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
+                       [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]> {
+  let Inst{18-16} = 0b100;
+}
 
-def VDUPfqf   : N2V<0b11, 0b11, 0b01, 0b00, 0b11000, 1, 0,
-                    (outs QPR:$dst), (ins SPR:$src),
-                    IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
-                    [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>;
+def  VDUPfqf  : N2VDup<0b11, 0b11, 0b11000, 1, 0,
+                       (outs QPR:$dst), (ins SPR:$src),
+                       IIC_VMOVD, "vdup.32\t$dst, ${src:lane}", "",
+                       [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]> {
+  let Inst{18-16} = 0b100;
+}
 
 def : Pat<(v2i64 (NEONvduplane (v2i64 QPR:$src), imm:$lane)),
           (INSERT_SUBREG QPR:$src, 
@@ -2560,8 +2600,8 @@ defm VQMOVNu  : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD, "vqmovn.u",
 defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD, "vqmovun.s",
                             int_arm_neon_vqmovnsu>;
 //   VMOVL    : Vector Lengthening Move
-defm VMOVLs   : N2VLInt_QHS<0,1,0b1010,0,0,1, "vmovl.s", int_arm_neon_vmovls>;
-defm VMOVLu   : N2VLInt_QHS<1,1,0b1010,0,0,1, "vmovl.u", int_arm_neon_vmovlu>;
+defm VMOVLs   : N2VLInt_QHS<0b01,0b10100,0,1, "vmovl.s", int_arm_neon_vmovls>;
+defm VMOVLu   : N2VLInt_QHS<0b11,0b10100,0,1, "vmovl.u", int_arm_neon_vmovlu>;
 
 // Vector Conversions.
 
@@ -2585,24 +2625,22 @@ def  VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt.f32.u32",
                      v4f32, v4i32, uint_to_fp>;
 
 //   VCVT     : Vector Convert Between Floating-Point and Fixed-Point.
-// Note: Some of the opcode bits in the following VCVT instructions need to
-// be encoded based on the immed values.
-def VCVTf2xsd : N2VCvtD<0, 1, 0b000000, 0b1111, 0, 1, "vcvt.s32.f32",
+def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt.s32.f32",
                         v2i32, v2f32, int_arm_neon_vcvtfp2fxs>;
-def VCVTf2xud : N2VCvtD<1, 1, 0b000000, 0b1111, 0, 1, "vcvt.u32.f32",
+def VCVTf2xud : N2VCvtD<1, 1, 0b1111, 0, 1, "vcvt.u32.f32",
                         v2i32, v2f32, int_arm_neon_vcvtfp2fxu>;
-def VCVTxs2fd : N2VCvtD<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32",
+def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt.f32.s32",
                         v2f32, v2i32, int_arm_neon_vcvtfxs2fp>;
-def VCVTxu2fd : N2VCvtD<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
+def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt.f32.u32",
                         v2f32, v2i32, int_arm_neon_vcvtfxu2fp>;
 
-def VCVTf2xsq : N2VCvtQ<0, 1, 0b000000, 0b1111, 0, 1, "vcvt.s32.f32",
+def VCVTf2xsq : N2VCvtQ<0, 1, 0b1111, 0, 1, "vcvt.s32.f32",
                         v4i32, v4f32, int_arm_neon_vcvtfp2fxs>;
-def VCVTf2xuq : N2VCvtQ<1, 1, 0b000000, 0b1111, 0, 1, "vcvt.u32.f32",
+def VCVTf2xuq : N2VCvtQ<1, 1, 0b1111, 0, 1, "vcvt.u32.f32",
                         v4i32, v4f32, int_arm_neon_vcvtfp2fxu>;
-def VCVTxs2fq : N2VCvtQ<0, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.s32",
+def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt.f32.s32",
                         v4f32, v4i32, int_arm_neon_vcvtfxs2fp>;
-def VCVTxu2fq : N2VCvtQ<1, 1, 0b000000, 0b1110, 0, 1, "vcvt.f32.u32",
+def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt.f32.u32",
                         v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
 
 // Vector Reverse.
@@ -2670,18 +2708,18 @@ def VREV16q8  : VREV16Q<0b00, "vrev16.8", v16i8>;
 //   VEXT     : Vector Extract
 
 class VEXTd<string OpcodeStr, ValueType Ty>
-  : N3V<0,1,0b11,0b0000,0,0, (outs DPR:$dst),
-        (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD,
-        !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
-        [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs),
-                                      (Ty DPR:$rhs), imm:$index)))]>;
+  : N3VImm<0,1,0b11,0,0, (outs DPR:$dst),
+           (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD,
+           !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
+           [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs),
+                                         (Ty DPR:$rhs), imm:$index)))]>;
 
 class VEXTq<string OpcodeStr, ValueType Ty>
-  : N3V<0,1,0b11,0b0000,1,0, (outs QPR:$dst),
-        (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ,
-        !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
-        [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs),
-                                      (Ty QPR:$rhs), imm:$index)))]>;
+  : N3VImm<0,1,0b11,1,0, (outs QPR:$dst),
+           (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ,
+           !strconcat(OpcodeStr, "\t$dst, $lhs, $rhs, $index"), "",
+           [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs),
+                                         (Ty QPR:$rhs), imm:$index)))]>;
 
 def VEXTd8  : VEXTd<"vext.8",  v8i8>;
 def VEXTd16 : VEXTd<"vext.16", v4i16>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 0750dcc..2b6fa98 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -69,6 +69,25 @@ def t2_so_imm_neg : Operand<i32>,
   return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1;
 }], t2_so_imm_neg_XFORM>;
 
+// Break t2_so_imm's up into two pieces.  This handles immediates with up to 16
+// bits set in them.  This uses t2_so_imm2part to match and t2_so_imm2part_[12]
+// to get the first/second pieces.
+def t2_so_imm2part : Operand<i32>,
+                  PatLeaf<(imm), [{
+      return ARM_AM::isT2SOImmTwoPartVal((unsigned)N->getZExtValue());
+    }]> {
+}
+
+def t2_so_imm2part_1 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getT2SOImmTwoPartFirst((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+def t2_so_imm2part_2 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getT2SOImmTwoPartSecond((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
 /// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31].
 def imm1_31 : PatLeaf<(i32 imm), [{
   return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 32;
@@ -666,6 +685,8 @@ def t2MOVTi16 : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm), IIC_iMOVi,
                     [(set GPR:$dst,
                           (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]>;
 
+def : T2Pat<(or GPR:$src, 0xffff0000), (t2MOVTi16 GPR:$src, 0xffff)>;
+
 //===----------------------------------------------------------------------===//
 //  Extend Instructions.
 //
@@ -1129,6 +1150,20 @@ def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
 // Non-Instruction Patterns
 //
 
+// Two piece so_imms.
+def : T2Pat<(or GPR:$LHS, t2_so_imm2part:$RHS),
+             (t2ORRri (t2ORRri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
+                    (t2_so_imm2part_2 imm:$RHS))>;
+def : T2Pat<(xor GPR:$LHS, t2_so_imm2part:$RHS),
+             (t2EORri (t2EORri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
+                    (t2_so_imm2part_2 imm:$RHS))>;
+def : T2Pat<(add GPR:$LHS, t2_so_imm2part:$RHS),
+             (t2ADDri (t2ADDri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
+                    (t2_so_imm2part_2 imm:$RHS))>;
+def : T2Pat<(sub GPR:$LHS, t2_so_imm2part:$RHS),
+             (t2SUBri (t2SUBri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
+                    (t2_so_imm2part_2 imm:$RHS))>;
+
 // ConstantPool, GlobalAddress, and JumpTable
 def : T2Pat<(ARMWrapper  tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>;
 def : T2Pat<(ARMWrapper  tconstpool  :$dst), (t2LEApcrel tconstpool  :$dst)>;
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index d2ec9ee..c9b9e84 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -974,6 +974,9 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
     if (Advance) {
       ++Position;
       ++MBBI;
+      if (MBBI == E)
+        // Reach the end of the block, try merging the memory instructions.
+        TryMerge = true;
     } else
       TryMerge = true;
 
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 20a7355..e0be784 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -222,12 +222,9 @@ def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
     iterator allocation_order_begin(const MachineFunction &MF) const;
     iterator allocation_order_end(const MachineFunction &MF) const;
   }];
-  // FIXME: We are reserving r3 in Thumb mode in case the PEI needs to use it
-  // to generate large stack offset. Make it available once we have register
-  // scavenging.
   let MethodBodies = [{
     static const unsigned THUMB_tGPR_AO[] = {
-      ARM::R0, ARM::R1, ARM::R2,
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
       ARM::R4, ARM::R5, ARM::R6, ARM::R7 };
 
     // FP is R7, only low registers available.
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index cf1ee3f..5af95c3 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -27,11 +27,11 @@ UseNEONFP("arm-use-neon-fp",
           cl::init(false), cl::Hidden);
 
 ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
-                           bool isThumb)
+                           bool isT)
   : ARMArchVersion(V4T)
   , ARMFPUType(None)
   , UseNEONForSinglePrecisionFP(UseNEONFP)
-  , IsThumb(isThumb)
+  , IsThumb(isT)
   , ThumbMode(Thumb1)
   , PostRAScheduler(false)
   , IsR9Reserved(ReserveR9)
@@ -98,9 +98,13 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
   if (isTargetDarwin())
     IsR9Reserved = ReserveR9 | (ARMArchVersion < V6);
 
+  if (!isThumb() || hasThumb2())
+    PostRAScheduler = true;
+
   // Set CPU specific features.
   if (CPUString == "cortex-a8") {
-    PostRAScheduler = true;
+    // On Cortex-a8, it's faster to perform some single-precision FP
+    // operations with NEON instructions.
     if (UseNEONFP.getPosition() == 0)
       UseNEONForSinglePrecisionFP = true;
   }
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 7098fd4..7478159 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -126,9 +126,13 @@ protected:
 
   const std::string & getCPUString() const { return CPUString; }
   
-  /// enablePostRAScheduler - From TargetSubtarget, return true to
-  /// enable post-RA scheduler.
-  bool enablePostRAScheduler() const { return PostRAScheduler; }
+  /// enablePostRAScheduler - True at 'More' optimization except
+  /// for Thumb1.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtarget::AntiDepBreakMode& mode) const {
+    mode = TargetSubtarget::ANTIDEP_NONE;
+    return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
+  }
 
   /// getInstrItins - Return the instruction itineraies based on subtarget
   /// selection.
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 32ddc20..c1da6ce 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -16,7 +16,6 @@
 #include "ARM.h"
 #include "llvm/PassManager.h"
 #include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegistry.h"
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 7438ea9..403f96c 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -56,6 +56,14 @@ private:
 
   bool ParseDirectiveWord(unsigned Size, SMLoc L);
 
+  bool ParseDirectiveThumb(SMLoc L);
+
+  bool ParseDirectiveThumbFunc(SMLoc L);
+
+  bool ParseDirectiveCode(SMLoc L);
+
+  bool ParseDirectiveSyntax(SMLoc L);
+
   // TODO - For now hacked versions of the next two are in here in this file to
   // allow some parser testing until the table gen versions are implemented.
 
@@ -230,8 +238,8 @@ bool ARMAsmParser::ParseRegister(ARMOperand &Op) {
   return false;
 }
 
-// Try to parse a register list.  The first token must be a '{' when called
-// for now.
+// Parse a register list, return false if successful else return true or an 
+// error.  The first token must be a '{' when called.
 bool ARMAsmParser::ParseRegisterList(ARMOperand &Op) {
   assert(getLexer().getTok().is(AsmToken::LCurly) &&
          "Token is not an Left Curly Brace");
@@ -277,7 +285,8 @@ bool ARMAsmParser::ParseRegisterList(ARMOperand &Op) {
   return false;
 }
 
-// Try to parse an arm memory expression.  It must start with a '[' token.
+// Parse an arm memory expression, return false if successful else return true
+// or an error.  The first token must be a '[' when called.
 // TODO Only preindexing and postindexing addressing are started, unindexed
 // with option, etc are still to do.
 bool ARMAsmParser::ParseMemory(ARMOperand &Op) {
@@ -374,50 +383,55 @@ bool ARMAsmParser::ParseMemory(ARMOperand &Op) {
     Writeback = true;
     getLexer().Lex(); // Eat right bracket token.
 
-    const AsmToken &CommaTok = getLexer().getTok();
-    if (CommaTok.isNot(AsmToken::Comma))
-      return Error(CommaTok.getLoc(), "',' expected");
-    getLexer().Lex(); // Eat comma token.
-
-    const AsmToken &NextTok = getLexer().getTok();
-    if (NextTok.is(AsmToken::Plus))
-      getLexer().Lex(); // Eat plus token.
-    else if (NextTok.is(AsmToken::Minus)) {
-      Negative = true;
-      getLexer().Lex(); // Eat minus token
-    }
-
-    // See if there is a register following the "[Rn]," we have so far.
-    const AsmToken &OffsetRegTok = getLexer().getTok();
-    int OffsetRegNum = MatchRegisterName(OffsetRegTok.getString());
+    int OffsetRegNum = 0;
     bool OffsetRegShifted = false;
     enum ShiftType ShiftType;
     const MCExpr *ShiftAmount;
     const MCExpr *Offset;
-    if (OffsetRegNum != -1) {
-      OffsetIsReg = true;
-      getLexer().Lex(); // Eat identifier token for the offset register.
-      // Look for a comma then a shift
-      const AsmToken &Tok = getLexer().getTok();
-      if (Tok.is(AsmToken::Comma)) {
-        getLexer().Lex(); // Eat comma token.
 
-        const AsmToken &Tok = getLexer().getTok();
-        if (ParseShift(&ShiftType, ShiftAmount))
-          return Error(Tok.getLoc(), "shift expected");
-        OffsetRegShifted = true;
+    const AsmToken &NextTok = getLexer().getTok();
+    if (NextTok.isNot(AsmToken::EndOfStatement)) {
+      if (NextTok.isNot(AsmToken::Comma))
+	return Error(NextTok.getLoc(), "',' expected");
+      getLexer().Lex(); // Eat comma token.
+
+      const AsmToken &PlusMinusTok = getLexer().getTok();
+      if (PlusMinusTok.is(AsmToken::Plus))
+	getLexer().Lex(); // Eat plus token.
+      else if (PlusMinusTok.is(AsmToken::Minus)) {
+	Negative = true;
+	getLexer().Lex(); // Eat minus token
       }
-    }
-    else { // "[Rn]," we have so far was not followed by "Rm"
-      // Look for #offset following the "[Rn],"
-      const AsmToken &HashTok = getLexer().getTok();
-      if (HashTok.isNot(AsmToken::Hash))
-        return Error(HashTok.getLoc(), "'#' expected");
-      getLexer().Lex(); // Eat hash token.
 
-      if (getParser().ParseExpression(Offset))
-       return true;
+      // See if there is a register following the "[Rn]," we have so far.
+      const AsmToken &OffsetRegTok = getLexer().getTok();
+      OffsetRegNum = MatchRegisterName(OffsetRegTok.getString());
+      if (OffsetRegNum != -1) {
+	OffsetIsReg = true;
+	getLexer().Lex(); // Eat identifier token for the offset register.
+	// Look for a comma then a shift
+	const AsmToken &Tok = getLexer().getTok();
+	if (Tok.is(AsmToken::Comma)) {
+	  getLexer().Lex(); // Eat comma token.
+
+	  const AsmToken &Tok = getLexer().getTok();
+	  if (ParseShift(&ShiftType, ShiftAmount))
+	    return Error(Tok.getLoc(), "shift expected");
+	  OffsetRegShifted = true;
+	}
+      }
+      else { // "[Rn]," we have so far was not followed by "Rm"
+	// Look for #offset following the "[Rn],"
+	const AsmToken &HashTok = getLexer().getTok();
+	if (HashTok.isNot(AsmToken::Hash))
+	  return Error(HashTok.getLoc(), "'#' expected");
+	getLexer().Lex(); // Eat hash token.
+
+	if (getParser().ParseExpression(Offset))
+	 return true;
+      }
     }
+
     Op = ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
                                OffsetRegShifted, ShiftType, ShiftAmount,
                                Preindexed, Postindexed, Negative, Writeback);
@@ -465,7 +479,7 @@ bool ARMAsmParser::ParseShift(ShiftType *St, const MCExpr *&ShiftAmount) {
   return false;
 }
 
-// A hack to allow some testing
+// A hack to allow some testing, to be replaced by a real table gen version.
 int ARMAsmParser::MatchRegisterName(const StringRef &Name) {
   if (Name == "r0" || Name == "R0")
     return 0;
@@ -504,7 +518,7 @@ int ARMAsmParser::MatchRegisterName(const StringRef &Name) {
   return -1;
 }
 
-// A hack to allow some testing
+// A hack to allow some testing, to be replaced by a real table gen version.
 bool ARMAsmParser::MatchInstruction(SmallVectorImpl<ARMOperand> &Operands,
                                     MCInst &Inst) {
   struct ARMOperand Op0 = Operands[0];
@@ -516,40 +530,58 @@ bool ARMAsmParser::MatchInstruction(SmallVectorImpl<ARMOperand> &Operands,
       Mnemonic == "ldmfd" ||
       Mnemonic == "ldr" ||
       Mnemonic == "mov" ||
-      Mnemonic == "sub")
+      Mnemonic == "sub" ||
+      Mnemonic == "bl" ||
+      Mnemonic == "push" ||
+      Mnemonic == "blx" ||
+      Mnemonic == "pop") {
+    // Hard-coded to a valid instruction, till we have a real matcher.
+    Inst = MCInst();
+    Inst.setOpcode(ARM::MOVr);
+    Inst.addOperand(MCOperand::CreateReg(2));
+    Inst.addOperand(MCOperand::CreateReg(2));
+    Inst.addOperand(MCOperand::CreateImm(0));
+    Inst.addOperand(MCOperand::CreateImm(0));
+    Inst.addOperand(MCOperand::CreateReg(0));
     return false;
+  }
 
   return true;
 }
 
-// TODO - this is a work in progress
+// Parse a arm instruction operand.  For now this parses the operand regardless
+// of the mnemonic.
 bool ARMAsmParser::ParseOperand(ARMOperand &Op) {
   switch (getLexer().getKind()) {
   case AsmToken::Identifier:
     if (!ParseRegister(Op))
       return false;
-    // TODO parse other operands that start with an identifier like labels
-    return Error(getLexer().getTok().getLoc(), "labels not yet supported");
+    // This was not a register so parse other operands that start with an
+    // identifier (like labels) as expressions and create them as immediates.
+    const MCExpr *IdVal;
+    if (getParser().ParseExpression(IdVal))
+      return true;
+    Op = ARMOperand::CreateImm(IdVal);
+    return false;
   case AsmToken::LBrac:
-    if (!ParseMemory(Op))
-      return false;
+    return ParseMemory(Op);
   case AsmToken::LCurly:
-    if (!ParseRegisterList(Op))
-      return false;
+    return ParseRegisterList(Op);
   case AsmToken::Hash:
     // #42 -> immediate.
     // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate
     getLexer().Lex();
-    const MCExpr *Val;
-    if (getParser().ParseExpression(Val))
+    const MCExpr *ImmVal;
+    if (getParser().ParseExpression(ImmVal))
       return true;
-    Op = ARMOperand::CreateImm(Val);
+    Op = ARMOperand::CreateImm(ImmVal);
     return false;
   default:
     return Error(getLexer().getTok().getLoc(), "unexpected token in operand");
   }
 }
 
+// Parse an arm instruction mnemonic followed by its operands.
 bool ARMAsmParser::ParseInstruction(const StringRef &Name, MCInst &Inst) {
   SmallVector<ARMOperand, 7> Operands;
 
@@ -579,10 +611,19 @@ bool ARMAsmParser::ParseInstruction(const StringRef &Name, MCInst &Inst) {
   return true;
 }
 
+/// ParseDirective parses the arm specific directives
 bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getIdentifier();
   if (IDVal == ".word")
     return ParseDirectiveWord(4, DirectiveID.getLoc());
+  else if (IDVal == ".thumb")
+    return ParseDirectiveThumb(DirectiveID.getLoc());
+  else if (IDVal == ".thumb_func")
+    return ParseDirectiveThumbFunc(DirectiveID.getLoc());
+  else if (IDVal == ".code")
+    return ParseDirectiveCode(DirectiveID.getLoc());
+  else if (IDVal == ".syntax")
+    return ParseDirectiveSyntax(DirectiveID.getLoc());
   return true;
 }
 
@@ -611,6 +652,93 @@ bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
   return false;
 }
 
+/// ParseDirectiveThumb
+///  ::= .thumb
+bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(L, "unexpected token in directive");
+  getLexer().Lex();
+
+  // TODO: set thumb mode
+  // TODO: tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// ParseDirectiveThumbFunc
+///  ::= .thumbfunc symbol_name
+bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) {
+  const AsmToken &Tok = getLexer().getTok();
+  if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
+    return Error(L, "unexpected token in .syntax directive");
+  StringRef SymbolName = getLexer().getTok().getIdentifier();
+  getLexer().Lex(); // Consume the identifier token.
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(L, "unexpected token in directive");
+  getLexer().Lex();
+
+  // TODO: mark symbol as a thumb symbol
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// ParseDirectiveSyntax
+///  ::= .syntax unified | divided
+bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) {
+  const AsmToken &Tok = getLexer().getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return Error(L, "unexpected token in .syntax directive");
+  const StringRef &Mode = Tok.getString();
+  bool unified_syntax;
+  if (Mode == "unified" || Mode == "UNIFIED") {
+    getLexer().Lex();
+    unified_syntax = true;
+  }
+  else if (Mode == "divided" || Mode == "DIVIDED") {
+    getLexer().Lex();
+    unified_syntax = false;
+  }
+  else
+    return Error(L, "unrecognized syntax mode in .syntax directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getTok().getLoc(), "unexpected token in directive");
+  getLexer().Lex();
+
+  // TODO tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// ParseDirectiveCode
+///  ::= .code 16 | 32
+bool ARMAsmParser::ParseDirectiveCode(SMLoc L) {
+  const AsmToken &Tok = getLexer().getTok();
+  if (Tok.isNot(AsmToken::Integer))
+    return Error(L, "unexpected token in .code directive");
+  int64_t Val = getLexer().getTok().getIntVal();
+  bool thumb_mode;
+  if (Val == 16) {
+    getLexer().Lex();
+    thumb_mode = true;
+  }
+  else if (Val == 32) {
+    getLexer().Lex();
+    thumb_mode = false;
+  }
+  else
+    return Error(L, "invalid operand to .code directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getTok().getLoc(), "unexpected token in directive");
+  getLexer().Lex();
+
+  // TODO tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
 // Force static initialization.
 extern "C" void LLVMInitializeARMAsmParser() {
   RegisterAsmParser<ARMAsmParser> X(TheARMTarget);
diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
index 546731b..8719e4c 100644
--- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
@@ -1,3 +1,5 @@
+//===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===//
+//
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
@@ -13,21 +15,25 @@
 #define DEBUG_TYPE "asm-printer"
 #include "ARM.h"
 #include "ARMBuildAttrs.h"
-#include "ARMTargetMachine.h"
 #include "ARMAddressingModes.h"
 #include "ARMConstantPoolValue.h"
+#include "ARMInstPrinter.h"
 #include "ARMMachineFunctionInfo.h"
+#include "ARMMCInstLower.h"
+#include "ARMTargetMachine.h"
 #include "llvm/Constants.h"
 #include "llvm/Module.h"
 #include "llvm/Assembly/Writer.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/DwarfWriter.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -38,18 +44,22 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/Mangler.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/FormattedStream.h"
 #include <cctype>
 using namespace llvm;
 
 STATISTIC(EmittedInsts, "Number of machine instrs printed");
 
+static cl::opt<bool>
+EnableMCInst("enable-arm-mcinst-printer", cl::Hidden,
+            cl::desc("enable experimental asmprinter gunk in the arm backend"));
+
 namespace {
-  class VISIBILITY_HIDDEN ARMAsmPrinter : public AsmPrinter {
+  class ARMAsmPrinter : public AsmPrinter {
 
     /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
     /// make the right decision when printing asm code for different targets.
@@ -63,34 +73,23 @@ namespace {
     /// MachineFunction.
     const MachineConstantPool *MCP;
 
-    /// We name each basic block in a Function with a unique number, so
-    /// that we can consistently refer to them later. This is cleared
-    /// at the beginning of each call to runOnMachineFunction().
-    ///
-    typedef std::map<const Value *, unsigned> ValueMapTy;
-    ValueMapTy NumberForBB;
-
-    /// GVNonLazyPtrs - Keeps the set of GlobalValues that require
-    /// non-lazy-pointers for indirect access.
-    StringMap<std::string> GVNonLazyPtrs;
-
-    /// HiddenGVNonLazyPtrs - Keeps the set of GlobalValues with hidden
-    /// visibility that require non-lazy-pointers for indirect access.
-    StringMap<std::string> HiddenGVNonLazyPtrs;
-
-    /// True if asm printer is printing a series of CONSTPOOL_ENTRY.
-    bool InCPMode;
   public:
     explicit ARMAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
                            const MCAsmInfo *T, bool V)
-      : AsmPrinter(O, TM, T, V), AFI(NULL), MCP(NULL),
-        InCPMode(false) {
+      : AsmPrinter(O, TM, T, V), AFI(NULL), MCP(NULL) {
       Subtarget = &TM.getSubtarget<ARMSubtarget>();
     }
 
     virtual const char *getPassName() const {
       return "ARM Assembly Printer";
     }
+    
+    void printMCInst(const MCInst *MI) {
+      ARMInstPrinter(O, *MAI, VerboseAsm).printInstruction(MI);
+    }
+    
+    void printInstructionThroughMCStreamer(const MachineInstr *MI);
+    
 
     void printOperand(const MachineInstr *MI, int OpNum,
                       const char *Modifier = 0);
@@ -149,8 +148,8 @@ namespace {
 
     void printMachineInstruction(const MachineInstr *MI);
     bool runOnMachineFunction(MachineFunction &F);
-    bool doFinalization(Module &M);
     void EmitStartOfAsmFile(Module &M);
+    void EmitEndOfAsmFile(Module &M);
 
     /// EmitMachineConstantPoolValue - Print a machine constantpool value to
     /// the .s file.
@@ -173,12 +172,19 @@ namespace {
           Name = Mang->getMangledName(GV);
         else {
           // FIXME: Remove this when Darwin transition to @GOT like syntax.
-          std::string SymName = Mang->getMangledName(GV);
           Name = Mang->getMangledName(GV, "$non_lazy_ptr", true);
-          if (GV->hasHiddenVisibility())
-            HiddenGVNonLazyPtrs[SymName] = Name;
-          else
-            GVNonLazyPtrs[SymName] = Name;
+          MCSymbol *Sym = OutContext.GetOrCreateSymbol(StringRef(Name));
+          
+          MachineModuleInfoMachO &MMIMachO =
+            MMI->getObjFileInfo<MachineModuleInfoMachO>();
+          const MCSymbol *&StubSym =
+            GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(Sym) :
+                                        MMIMachO.getGVStubEntry(Sym);
+          if (StubSym == 0) {
+            SmallString<128> NameStr;
+            Mang->getNameWithPrefix(NameStr, GV, false);
+            StubSym = OutContext.GetOrCreateSymbol(NameStr.str());
+          }
         }
       } else
         Name = Mang->makeNameProper(ACPV->getSymbol());
@@ -260,7 +266,6 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     if (Subtarget->isTargetDarwin())
       O << "\t" << CurrentFnName;
     O << "\n";
-    InCPMode = false;
   } else {
     EmitAlignment(FnAlign, F);
   }
@@ -283,14 +288,13 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
        I != E; ++I) {
     // Print a label for the basic block.
-    if (I != MF.begin()) {
+    if (I != MF.begin())
       EmitBasicBlockStart(I);
-    }
+
+    // Print the assembly for the instruction.
     for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
-         II != E; ++II) {
-      // Print the assembly for the instruction.
+         II != E; ++II)
       printMachineInstruction(II);
-    }
   }
 
   if (MAI->hasDotTypeDotSizeDirective())
@@ -306,25 +310,25 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
                                  const char *Modifier) {
   const MachineOperand &MO = MI->getOperand(OpNum);
   switch (MO.getType()) {
+  default:
+    assert(0 && "<unknown operand type>");
   case MachineOperand::MO_Register: {
     unsigned Reg = MO.getReg();
-    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
-      if (Modifier && strcmp(Modifier, "dregpair") == 0) {
-        unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
-        unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
-        O << '{'
-          << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi)
-          << '}';
-      } else if (Modifier && strcmp(Modifier, "lane") == 0) {
-        unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
-        unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1,
-                                                 &ARM::DPR_VFP2RegClass);
-        O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
-      } else {
-        O << getRegisterName(Reg);
-      }
-    } else
-      llvm_unreachable("not implemented");
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    if (Modifier && strcmp(Modifier, "dregpair") == 0) {
+      unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
+      unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
+      O << '{'
+        << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi)
+        << '}';
+    } else if (Modifier && strcmp(Modifier, "lane") == 0) {
+      unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
+      unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1,
+                                               &ARM::DPR_VFP2RegClass);
+      O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
+    } else {
+      O << getRegisterName(Reg);
+    }
     break;
   }
   case MachineOperand::MO_Immediate: {
@@ -372,8 +376,6 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
       << '_' << MO.getIndex();
     break;
-  default:
-    O << "<unknown operand type>"; abort (); break;
   }
 }
 
@@ -1027,22 +1029,19 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 void ARMAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
   ++EmittedInsts;
 
-  int Opc = MI->getOpcode();
-  switch (Opc) {
-  case ARM::CONSTPOOL_ENTRY:
-    if (!InCPMode && AFI->isThumbFunction()) {
-      EmitAlignment(2);
-      InCPMode = true;
-    }
-    break;
-  default: {
-    if (InCPMode && AFI->isThumbFunction())
-      InCPMode = false;
-  }}
-
   // Call the autogenerated instruction printer routines.
   processDebugLoc(MI, true);
-  printInstruction(MI);
+  
+  if (EnableMCInst) {
+    printInstructionThroughMCStreamer(MI);
+  } else {
+    int Opc = MI->getOpcode();
+    if (Opc == ARM::CONSTPOOL_ENTRY)
+      EmitAlignment(2);
+    
+    printInstruction(MI);
+  }
+  
   if (VerboseAsm && !MI->getDebugLoc().isUnknown())
     EmitComments(*MI);
   O << '\n';
@@ -1256,34 +1255,40 @@ void ARMAsmPrinter::PrintGlobalVariable(const GlobalVariable* GVar) {
 }
 
 
-bool ARMAsmPrinter::doFinalization(Module &M) {
+void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
   if (Subtarget->isTargetDarwin()) {
     // All darwin targets use mach-o.
     TargetLoweringObjectFileMachO &TLOFMacho =
       static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
+    MachineModuleInfoMachO &MMIMacho =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
 
     O << '\n';
 
     // Output non-lazy-pointers for external and common global variables.
-    if (!GVNonLazyPtrs.empty()) {
+    MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList();
+    
+    if (!Stubs.empty()) {
       // Switch with ".non_lazy_symbol_pointer" directive.
       OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
       EmitAlignment(2);
-      for (StringMap<std::string>::iterator I = GVNonLazyPtrs.begin(),
-           E = GVNonLazyPtrs.end(); I != E; ++I) {
-        O << I->second << ":\n";
-        O << "\t.indirect_symbol " << I->getKeyData() << "\n";
-        O << "\t.long\t0\n";
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        Stubs[i].first->print(O, MAI);
+        O << ":\n\t.indirect_symbol ";
+        Stubs[i].second->print(O, MAI);
+        O << "\n\t.long\t0\n";
       }
     }
 
-    if (!HiddenGVNonLazyPtrs.empty()) {
+    Stubs = MMIMacho.GetHiddenGVStubList();
+    if (!Stubs.empty()) {
       OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
       EmitAlignment(2);
-      for (StringMap<std::string>::iterator I = HiddenGVNonLazyPtrs.begin(),
-             E = HiddenGVNonLazyPtrs.end(); I != E; ++I) {
-        O << I->second << ":\n";
-        O << "\t.long " << I->getKeyData() << "\n";
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        Stubs[i].first->print(O, MAI);
+        O << ":\n\t.long ";
+        Stubs[i].second->print(O, MAI);
+        O << "\n";
       }
     }
 
@@ -1292,14 +1297,179 @@ bool ARMAsmPrinter::doFinalization(Module &M) {
     // implementation of multiple entry points).  If this doesn't occur, the
     // linker can safely perform dead code stripping.  Since LLVM never
     // generates code that does this, it is always safe to set.
-    O << "\t.subsections_via_symbols\n";
+    OutStreamer.EmitAssemblerFlag(MCStreamer::SubsectionsViaSymbols);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+
+void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) {
+  ARMMCInstLower MCInstLowering(OutContext, *Mang, *this);
+  switch (MI->getOpcode()) {
+  case ARM::t2MOVi32imm:
+    assert(0 && "Should be lowered by thumb2it pass");
+  default: break;
+  case TargetInstrInfo::DBG_LABEL:
+  case TargetInstrInfo::EH_LABEL:
+  case TargetInstrInfo::GC_LABEL:
+    printLabel(MI);
+    return;
+  case TargetInstrInfo::KILL:
+    return;
+  case TargetInstrInfo::INLINEASM:
+    O << '\t';
+    printInlineAsm(MI);
+    return;
+  case TargetInstrInfo::IMPLICIT_DEF:
+    printImplicitDef(MI);
+    return;
+  case ARM::PICADD: { // FIXME: Remove asm string from td file.
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     add r0, pc, r0
+    // This adds the address of LPC0 to r0.
+    
+    // Emit the label.
+    // FIXME: MOVE TO SHARED PLACE.
+    unsigned Id = (unsigned)MI->getOperand(2).getImm();
+    const char *Prefix = MAI->getPrivateGlobalPrefix();
+    MCSymbol *Label =OutContext.GetOrCreateSymbol(Twine(Prefix)+"PC"+Twine(Id));
+    OutStreamer.EmitLabel(Label);
+    
+    
+    // Form and emit tha dd.
+    MCInst AddInst;
+    AddInst.setOpcode(ARM::ADDrr);
+    AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    AddInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+    printMCInst(&AddInst);
+    return;
+  }
+  case ARM::CONSTPOOL_ENTRY: { // FIXME: Remove asm string from td file.
+    /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
+    /// in the function.  The first operand is the ID# for this instruction, the
+    /// second is the index into the MachineConstantPool that this is, the third
+    /// is the size in bytes of this constant pool entry.
+    unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+    unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
+
+    EmitAlignment(2);
+
+    const char *Prefix = MAI->getPrivateGlobalPrefix();
+    MCSymbol *Label = OutContext.GetOrCreateSymbol(Twine(Prefix)+"CPI"+
+                                                   Twine(getFunctionNumber())+
+                                                   "_"+ Twine(LabelId));
+    OutStreamer.EmitLabel(Label);
+
+    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+    if (MCPE.isMachineConstantPoolEntry())
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    else
+      EmitGlobalConstant(MCPE.Val.ConstVal);
+    
+    return;
   }
+  case ARM::MOVi2pieces: { // FIXME: Remove asmstring from td file.
+    // This is a hack that lowers as a two instruction sequence.
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned ImmVal = (unsigned)MI->getOperand(1).getImm();
+
+    unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
+    unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+    
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVi);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));
+      TmpInst.addOperand(MCOperand::CreateImm(SOImmValV1));
+      
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+
+      TmpInst.addOperand(MCOperand::CreateReg(0));          // cc_out
+      printMCInst(&TmpInst);
+      O << '\n';
+    }
 
-  return AsmPrinter::doFinalization(M);
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::ORRri);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));     // dstreg
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));     // inreg
+      TmpInst.addOperand(MCOperand::CreateImm(SOImmValV2)); // so_imm
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+      
+      TmpInst.addOperand(MCOperand::CreateReg(0));          // cc_out
+      printMCInst(&TmpInst);
+    }
+    return; 
+  }
+  case ARM::MOVi32imm: { // FIXME: Remove asmstring from td file.
+    // This is a hack that lowers as a two instruction sequence.
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned ImmVal = (unsigned)MI->getOperand(1).getImm();
+    
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVi16);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // dstreg
+      TmpInst.addOperand(MCOperand::CreateImm(ImmVal & 65535)); // lower16(imm)
+      
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+      
+      printMCInst(&TmpInst);
+      O << '\n';
+    }
+    
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVTi16);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // dstreg
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // srcreg
+      TmpInst.addOperand(MCOperand::CreateImm(ImmVal >> 16));   // upper16(imm)
+      
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+      
+      printMCInst(&TmpInst);
+    }
+    
+    return;
+  }
+  }
+      
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  
+  printMCInst(&TmpInst);
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+static MCInstPrinter *createARMMCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             raw_ostream &O) {
+  if (SyntaxVariant == 0)
+    return new ARMInstPrinter(O, MAI, false);
+  return 0;
 }
 
 // Force static initialization.
 extern "C" void LLVMInitializeARMAsmPrinter() {
   RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget);
   RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget);
+
+  TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
 }
+
diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp
new file mode 100644
index 0000000..f422798
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp
@@ -0,0 +1,358 @@
+//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "ARM.h" // FIXME: FACTOR ENUMS BETTER.
+#include "ARMInstPrinter.h"
+#include "ARMAddressingModes.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Include the auto-generated portion of the assembly writer.
+#define MachineInstr MCInst
+#define ARMAsmPrinter ARMInstPrinter  // FIXME: REMOVE.
+#define NO_ASM_WRITER_BOILERPLATE
+#include "ARMGenAsmWriter.inc"
+#undef MachineInstr
+#undef ARMAsmPrinter
+
+void ARMInstPrinter::printInst(const MCInst *MI) { printInstruction(MI); }
+
+void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  const char *Modifier) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    if (Modifier && strcmp(Modifier, "dregpair") == 0) {
+      // FIXME: Breaks e.g. ARM/vmul.ll.
+      assert(0);
+      /*
+      unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
+      unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
+      O << '{'
+      << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi)
+      << '}';*/
+    } else if (Modifier && strcmp(Modifier, "lane") == 0) {
+      assert(0);
+      /*
+      unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
+      unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1,
+                                               &ARM::DPR_VFP2RegClass);
+      O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
+       */
+    } else {
+      O << getRegisterName(Reg);
+    }
+  } else if (Op.isImm()) {
+    assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+    O << '#' << Op.getImm();
+  } else {
+    assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
+                       const MCAsmInfo *MAI) {
+  // Break it up into two parts that make up a shifter immediate.
+  V = ARM_AM::getSOImmVal(V);
+  assert(V != -1 && "Not a valid so_imm value!");
+  
+  unsigned Imm = ARM_AM::getSOImmValImm(V);
+  unsigned Rot = ARM_AM::getSOImmValRot(V);
+  
+  // Print low-level immediate formation info, per
+  // A5.1.3: "Data-processing operands - Immediate".
+  if (Rot) {
+    O << "#" << Imm << ", " << Rot;
+    // Pretty printed version.
+    if (VerboseAsm)
+      O << ' ' << MAI->getCommentString()
+      << ' ' << (int)ARM_AM::rotr32(Imm, Rot);
+  } else {
+    O << "#" << Imm;
+  }
+}
+
+
+/// printSOImmOperand - SOImm is 4-bit rotate amount in bits 8-11 with 8-bit
+/// immediate in bits 0-7.
+void ARMInstPrinter::printSOImmOperand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isImm() && "Not a valid so_imm value!");
+  printSOImm(O, MO.getImm(), VerboseAsm, &MAI);
+}
+
+/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov'
+/// followed by an 'orr' to materialize.
+void ARMInstPrinter::printSOImm2PartOperand(const MCInst *MI, unsigned OpNum) {
+  // FIXME: REMOVE this method.
+  abort();
+}
+
+// so_reg is a 4-operand unit corresponding to register forms of the A5.1
+// "Addressing Mode 1 - Data-processing operands" forms.  This includes:
+//    REG 0   0           - e.g. R5
+//    REG REG 0,SH_OPC    - e.g. R5, ROR R3
+//    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
+void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+  
+  O << getRegisterName(MO1.getReg());
+  
+  // Print the shift opc.
+  O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm()))
+    << ' ';
+  
+  if (MO2.getReg()) {
+    O << getRegisterName(MO2.getReg());
+    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+  } else {
+    O << "#" << ARM_AM::getSORegOffset(MO3.getImm());
+  }
+}
+
+
+void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+  
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op);
+    return;
+  }
+  
+  O << "[" << getRegisterName(MO1.getReg());
+  
+  if (!MO2.getReg()) {
+    if (ARM_AM::getAM2Offset(MO3.getImm()))  // Don't print +0.
+      O << ", #"
+      << (char)ARM_AM::getAM2Op(MO3.getImm())
+      << ARM_AM::getAM2Offset(MO3.getImm());
+    O << "]";
+    return;
+  }
+  
+  O << ", "
+  << (char)ARM_AM::getAM2Op(MO3.getImm())
+  << getRegisterName(MO2.getReg());
+  
+  if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
+    O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
+    << " #" << ShImm;
+  O << "]";
+}  
+
+void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  
+  if (!MO1.getReg()) {
+    unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
+    assert(ImmOffs && "Malformed indexed load / store!");
+    O << '#' << (char)ARM_AM::getAM2Op(MO2.getImm()) << ImmOffs;
+    return;
+  }
+  
+  O << (char)ARM_AM::getAM2Op(MO2.getImm()) << getRegisterName(MO1.getReg());
+  
+  if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
+    O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm()))
+    << " #" << ShImm;
+}
+
+void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+  
+  O << '[' << getRegisterName(MO1.getReg());
+  
+  if (MO2.getReg()) {
+    O << ", " << (char)ARM_AM::getAM3Op(MO3.getImm())
+      << getRegisterName(MO2.getReg()) << ']';
+    return;
+  }
+  
+  if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()))
+    O << ", #"
+    << (char)ARM_AM::getAM3Op(MO3.getImm())
+    << ImmOffs;
+  O << ']';
+}
+
+void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  
+  if (MO1.getReg()) {
+    O << (char)ARM_AM::getAM3Op(MO2.getImm())
+    << getRegisterName(MO1.getReg());
+    return;
+  }
+  
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
+  assert(ImmOffs && "Malformed indexed load / store!");
+  O << "#"
+  << (char)ARM_AM::getAM3Op(MO2.getImm())
+  << ImmOffs;
+}
+
+
+void ARMInstPrinter::printAddrMode4Operand(const MCInst *MI, unsigned OpNum,
+                                           const char *Modifier) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
+  if (Modifier && strcmp(Modifier, "submode") == 0) {
+    if (MO1.getReg() == ARM::SP) {
+      // FIXME
+      bool isLDM = (MI->getOpcode() == ARM::LDM ||
+                    MI->getOpcode() == ARM::LDM_RET ||
+                    MI->getOpcode() == ARM::t2LDM ||
+                    MI->getOpcode() == ARM::t2LDM_RET);
+      O << ARM_AM::getAMSubModeAltStr(Mode, isLDM);
+    } else
+      O << ARM_AM::getAMSubModeStr(Mode);
+  } else if (Modifier && strcmp(Modifier, "wide") == 0) {
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
+    if (Mode == ARM_AM::ia)
+      O << ".w";
+  } else {
+    printOperand(MI, OpNum);
+    if (ARM_AM::getAM4WBFlag(MO2.getImm()))
+      O << "!";
+  }
+}
+
+void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                                           const char *Modifier) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum);
+    return;
+  }
+  
+  if (Modifier && strcmp(Modifier, "submode") == 0) {
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MO2.getImm());
+    if (MO1.getReg() == ARM::SP) {
+      bool isFLDM = (MI->getOpcode() == ARM::FLDMD ||
+                     MI->getOpcode() == ARM::FLDMS);
+      O << ARM_AM::getAMSubModeAltStr(Mode, isFLDM);
+    } else
+      O << ARM_AM::getAMSubModeStr(Mode);
+    return;
+  } else if (Modifier && strcmp(Modifier, "base") == 0) {
+    // Used for FSTM{D|S} and LSTM{D|S} operations.
+    O << getRegisterName(MO1.getReg());
+    if (ARM_AM::getAM5WBFlag(MO2.getImm()))
+      O << "!";
+    return;
+  }
+  
+  O << "[" << getRegisterName(MO1.getReg());
+  
+  if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
+    O << ", #"
+      << (char)ARM_AM::getAM5Op(MO2.getImm())
+      << ImmOffs*4;
+  }
+  O << "]";
+}
+
+void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+  
+  // FIXME: No support yet for specifying alignment.
+  O << '[' << getRegisterName(MO1.getReg()) << ']';
+  
+  if (ARM_AM::getAM6WBFlag(MO3.getImm())) {
+    if (MO2.getReg() == 0)
+      O << '!';
+    else
+      O << ", " << getRegisterName(MO2.getReg());
+  }
+}
+
+void ARMInstPrinter::printAddrModePCOperand(const MCInst *MI, unsigned OpNum,
+                                            const char *Modifier) {
+  assert(0 && "FIXME: Implement printAddrModePCOperand");
+}
+
+void ARMInstPrinter::printBitfieldInvMaskImmOperand (const MCInst *MI,
+                                                     unsigned OpNum) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  uint32_t v = ~MO.getImm();
+  int32_t lsb = CountTrailingZeros_32(v);
+  int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb;
+  assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
+  O << '#' << lsb << ", #" << width;
+}
+
+void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum) {
+  O << "{";
+  // Always skip the first operand, it's the optional (and implicit writeback).
+  for (unsigned i = OpNum+1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i != OpNum+1) O << ", ";
+    O << getRegisterName(MI->getOperand(i).getReg());
+  }
+  O << "}";
+}
+
+void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  if (CC != ARMCC::AL)
+    O << ARMCondCodeToString(CC);
+}
+
+void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum){
+  if (MI->getOperand(OpNum).getReg()) {
+    assert(MI->getOperand(OpNum).getReg() == ARM::CPSR &&
+           "Expect ARM CPSR register!");
+    O << 's';
+  }
+}
+
+
+
+void ARMInstPrinter::printCPInstOperand(const MCInst *MI, unsigned OpNum,
+                                        const char *Modifier) {
+  // FIXME: remove this.
+  abort();
+}
+
+void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum) {
+  O << MI->getOperand(OpNum).getImm();
+}
+
+
+void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum) {
+  // FIXME: remove this.
+  abort();
+}
diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h
new file mode 100644
index 0000000..4925137
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h
@@ -0,0 +1,89 @@
+//===-- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMINSTPRINTER_H
+#define ARMINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+  class MCOperand;
+  
+class ARMInstPrinter : public MCInstPrinter {
+  bool VerboseAsm;
+public:
+  ARMInstPrinter(raw_ostream &O, const MCAsmInfo &MAI, bool verboseAsm)
+    : MCInstPrinter(O, MAI), VerboseAsm(verboseAsm) {}
+
+  virtual void printInst(const MCInst *MI);
+  
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI);
+  static const char *getRegisterName(unsigned RegNo);
+
+
+  void printOperand(const MCInst *MI, unsigned OpNo,
+                    const char *Modifier = 0);
+    
+  void printSOImmOperand(const MCInst *MI, unsigned OpNum);
+  void printSOImm2PartOperand(const MCInst *MI, unsigned OpNum);
+  
+  void printSORegOperand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode2Operand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode3Operand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode4Operand(const MCInst *MI, unsigned OpNum,
+                             const char *Modifier = 0);
+  void printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                             const char *Modifier = 0);
+  void printAddrMode6Operand(const MCInst *MI, unsigned OpNum);
+  void printAddrModePCOperand(const MCInst *MI, unsigned OpNum,
+                              const char *Modifier = 0);
+    
+  void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum);
+  
+  void printThumbITMask(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeRI5Operand(const MCInst *MI, unsigned OpNum,
+                                    unsigned Scale) {}
+  void printThumbAddrModeS1Operand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeS2Operand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeS4Operand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum) {}
+  
+  void printT2SOOperand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm12Operand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum) {}
+  
+  void printPredicateOperand(const MCInst *MI, unsigned OpNum);
+  void printSBitModifierOperand(const MCInst *MI, unsigned OpNum);
+  void printRegisterList(const MCInst *MI, unsigned OpNum);
+  void printCPInstOperand(const MCInst *MI, unsigned OpNum,
+                          const char *Modifier);
+  void printJTBlockOperand(const MCInst *MI, unsigned OpNum) {}
+  void printJT2BlockOperand(const MCInst *MI, unsigned OpNum) {}
+  void printTBAddrMode(const MCInst *MI, unsigned OpNum) {}
+  void printNoHashImmediate(const MCInst *MI, unsigned OpNum);
+
+  void printPCLabel(const MCInst *MI, unsigned OpNum);  
+  // FIXME: Implement.
+  void PrintSpecial(const MCInst *MI, const char *Kind) {}
+};
+  
+}
+
+#endif
diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp
new file mode 100644
index 0000000..757164e
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp
@@ -0,0 +1,166 @@
+//===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower ARM MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCInstLower.h"
+//#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+//#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+
+#if 0
+const ARMSubtarget &ARMMCInstLower::getSubtarget() const {
+  return AsmPrinter.getSubtarget();
+}
+
+MachineModuleInfoMachO &ARMMCInstLower::getMachOMMI() const {
+  assert(getSubtarget().isTargetDarwin() &&"Can only get MachO info on darwin");
+  return AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoMachO>(); 
+}
+#endif
+
+MCSymbol *ARMMCInstLower::
+GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  const GlobalValue *GV = MO.getGlobal();
+  
+  SmallString<128> Name;
+  Mang.getNameWithPrefix(Name, GV, false);
+  
+  // FIXME: HANDLE PLT references how??
+  switch (MO.getTargetFlags()) {
+  default: assert(0 && "Unknown target flag on GV operand");
+  case 0: break;
+  }
+  
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *ARMMCInstLower::
+GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  SmallString<128> Name;
+  Name += Printer.MAI->getGlobalPrefix();
+  Name += MO.getSymbolName();
+  
+  // FIXME: HANDLE PLT references how??
+  switch (MO.getTargetFlags()) {
+  default: assert(0 && "Unknown target flag on GV operand");
+  case 0: break;
+  }
+  
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+
+
+MCSymbol *ARMMCInstLower::
+GetJumpTableSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
+    << Printer.getFunctionNumber() << '_' << MO.getIndex();
+  
+#if 0
+  switch (MO.getTargetFlags()) {
+    default: llvm_unreachable("Unknown target flag on GV operand");
+  }
+#endif
+  
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *ARMMCInstLower::
+GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
+    << Printer.getFunctionNumber() << '_' << MO.getIndex();
+  
+#if 0
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  }
+#endif
+  
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+  
+MCOperand ARMMCInstLower::
+LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+  
+#if 0
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  }
+#endif
+  
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+
+void ARMMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+  
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      assert(0 && "unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                       Printer.GetMBBSymbol(MO.getMBB()->getNumber()), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    }
+    
+    OutMI.addOperand(MCOp);
+  }
+  
+}
diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h
new file mode 100644
index 0000000..383d30d
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h
@@ -0,0 +1,56 @@
+//===-- ARMMCInstLower.h - Lower MachineInstr to MCInst -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_MCINSTLOWER_H
+#define ARM_MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class AsmPrinter;
+  class MCAsmInfo;
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MCSymbol;
+  class MachineInstr;
+  class MachineModuleInfoMachO;
+  class MachineOperand;
+  class Mangler;
+  //class ARMSubtarget;
+  
+/// ARMMCInstLower - This class is used to lower an MachineInstr into an MCInst.
+class VISIBILITY_HIDDEN ARMMCInstLower {
+  MCContext &Ctx;
+  Mangler &Mang;
+  AsmPrinter &Printer;
+
+  //const ARMSubtarget &getSubtarget() const;
+public:
+  ARMMCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer)
+    : Ctx(ctx), Mang(mang), Printer(printer) {}
+  
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  //MCSymbol *GetPICBaseSymbol() const;
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+  
+/*
+private:
+  MachineModuleInfoMachO &getMachOMMI() const;
+ */
+};
+
+}
+
+#endif
diff --git a/lib/Target/ARM/AsmPrinter/CMakeLists.txt b/lib/Target/ARM/AsmPrinter/CMakeLists.txt
index a67fc84..4e299f8 100644
--- a/lib/Target/ARM/AsmPrinter/CMakeLists.txt
+++ b/lib/Target/ARM/AsmPrinter/CMakeLists.txt
@@ -2,5 +2,7 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
 
 add_llvm_library(LLVMARMAsmPrinter
   ARMAsmPrinter.cpp
+  ARMInstPrinter.cpp
+  ARMMCInstLower.cpp
   )
-add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen)
-\ No newline at end of file
+add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen)
diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt
index a961a57..e7770b2 100644
--- a/lib/Target/ARM/README-Thumb.txt
+++ b/lib/Target/ARM/README-Thumb.txt
@@ -196,14 +196,6 @@ This is especially bad when dynamic alloca is used. The all fixed size stack
 objects are referenced off the frame pointer with negative offsets. See
 oggenc for an example.
 
-//===---------------------------------------------------------------------===//
-
-We are reserving R3 as a scratch register under thumb mode. So if it is live in
-to the function, we save / restore R3 to / from R12. Until register scavenging
-is done, we should save R3 to a high callee saved reg at emitPrologue time
-(when hasFP is true or stack size is large) and restore R3 from that register
-instead. This allows us to at least get rid of the save to r12 everytime it is
-used.
 
 //===---------------------------------------------------------------------===//
 
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 3c896da..6207177 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -32,7 +32,6 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
@@ -394,31 +393,48 @@ rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
   return 0;
 }
 
-/// saveScavengerRegister - Save the register so it can be used by the
+/// saveScavengerRegister - Spill the register so it can be used by the
 /// register scavenger. Return true.
-bool Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
-                                               MachineBasicBlock::iterator I,
-                                               const TargetRegisterClass *RC,
-                                               unsigned Reg) const {
+bool
+Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          MachineBasicBlock::iterator &UseMI,
+                                          const TargetRegisterClass *RC,
+                                          unsigned Reg) const {
   // Thumb1 can't use the emergency spill slot on the stack because
   // ldr/str immediate offsets must be positive, and if we're referencing
   // off the frame pointer (if, for example, there are alloca() calls in
   // the function, the offset will be negative. Use R12 instead since that's
   // a call clobbered register that we know won't be used in Thumb1 mode.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  BuildMI(MBB, I, DL, TII.get(ARM::tMOVtgpr2gpr)).
+    addReg(ARM::R12, RegState::Define).addReg(Reg, RegState::Kill);
+
+  // The UseMI is where we would like to restore the register. If there's
+  // interference with R12 before then, however, we'll need to restore it
+  // before that instead and adjust the UseMI.
+  bool done = false;
+  for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
+    // If this instruction affects R12, adjust our restore point.
+    for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = II->getOperand(i);
+      if (!MO.isReg() || MO.isUndef() || !MO.getReg() ||
+          TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+        continue;
+      if (MO.getReg() == ARM::R12) {
+        UseMI = II;
+        done = true;
+        break;
+      }
+    }
+  }
+  // Restore the register from R12
+  BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVgpr2tgpr)).
+    addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill);
 
-  TII.copyRegToReg(MBB, I, ARM::R12, Reg, ARM::GPRRegisterClass, RC);
   return true;
 }
 
-/// restoreScavengerRegister - restore a registers saved by
-// saveScavengerRegister().
-void Thumb1RegisterInfo::restoreScavengerRegister(MachineBasicBlock &MBB,
-                                               MachineBasicBlock::iterator I,
-                                               const TargetRegisterClass *RC,
-                                               unsigned Reg) const {
-  TII.copyRegToReg(MBB, I, Reg, ARM::R12, RC, ARM::GPRRegisterClass);
-}
-
 unsigned
 Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                         int SPAdj, int *Value,
@@ -828,7 +844,6 @@ void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF,
 
   if (VARegSaveSize) {
     // Epilogue for vararg functions: pop LR to R3 and branch off it.
-    // FIXME: Verify this is still ok when R3 is no longer being reserved.
     AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
       .addReg(0) // No write back.
       .addReg(ARM::R3, RegState::Define);
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index bb7a619..570a5bc 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -57,12 +57,9 @@ public:
 
   bool saveScavengerRegister(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
+                             MachineBasicBlock::iterator &UseMI,
                              const TargetRegisterClass *RC,
                              unsigned Reg) const;
-  void restoreScavengerRegister(MachineBasicBlock &MBB,
-                                MachineBasicBlock::iterator I,
-                                const TargetRegisterClass *RC,
-                                unsigned Reg) const;
   unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
                                int SPAdj, int *Value = NULL,
                                RegScavenger *RS = NULL) const;
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 98b5cbd..427c0bb 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -107,8 +107,12 @@ bool Thumb2ITBlockPass::InsertITBlocks(MachineBasicBlock &MBB) {
     // Finalize IT mask.
     ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
     unsigned Mask = 0, Pos = 3;
-    while (MBBI != E && Pos) {
+    // Branches, including tricky ones like LDM_RET, need to end an IT
+    // block so check the instruction we just put in the block.
+    while (MBBI != E && Pos &&
+           (!MI->getDesc().isBranch() && !MI->getDesc().isReturn())) {
       MachineInstr *NMI = &*MBBI;
+      MI = NMI;
       DebugLoc ndl = NMI->getDebugLoc();
       unsigned NPredReg = 0;
       ARMCC::CondCodes NCC = getPredicate(NMI, NPredReg);
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp
index 6c4c15d..f217e0e 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -32,7 +32,6 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
diff --git a/lib/Target/Blackfin/Blackfin.td b/lib/Target/Blackfin/Blackfin.td
index b904638..cd90962 100644
--- a/lib/Target/Blackfin/Blackfin.td
+++ b/lib/Target/Blackfin/Blackfin.td
@@ -74,6 +74,7 @@ def WA_IND_CALL : SubtargetFeature<"ind-call-anomaly", "wa_ind_call", "true",
 
 include "BlackfinRegisterInfo.td"
 include "BlackfinCallingConv.td"
+include "BlackfinIntrinsics.td"
 include "BlackfinInstrInfo.td"
 
 def BlackfinInstrInfo : InstrInfo {}
diff --git a/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp b/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp
new file mode 100644
index 0000000..544dc68
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp
@@ -0,0 +1,53 @@
+//===- BlackfinIntrinsicInfo.cpp - Intrinsic Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of TargetIntrinsicInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinIntrinsicInfo.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstring>
+
+using namespace llvm;
+
+namespace bfinIntrinsic {
+
+  enum ID {
+    last_non_bfin_intrinsic = Intrinsic::num_intrinsics-1,
+#define GET_INTRINSIC_ENUM_VALUES
+#include "BlackfinGenIntrinsics.inc"
+#undef GET_INTRINSIC_ENUM_VALUES
+    , num_bfin_intrinsics
+  };
+
+}
+
+const char *BlackfinIntrinsicInfo::getName(unsigned IntrID) const {
+  static const char *const names[] = {
+#define GET_INTRINSIC_NAME_TABLE
+#include "BlackfinGenIntrinsics.inc"
+#undef GET_INTRINSIC_NAME_TABLE
+  };
+
+  if (IntrID < Intrinsic::num_intrinsics)
+    return 0;
+  assert(IntrID < bfinIntrinsic::num_bfin_intrinsics && "Invalid intrinsic ID");
+
+  return names[IntrID - Intrinsic::num_intrinsics];
+}
+
+unsigned
+BlackfinIntrinsicInfo::lookupName(const char *Name, unsigned Len) const {
+#define GET_FUNCTION_RECOGNIZER
+#include "BlackfinGenIntrinsics.inc"
+#undef GET_FUNCTION_RECOGNIZER
+  return 0;
+}
diff --git a/lib/Target/Blackfin/BlackfinIntrinsicInfo.h b/lib/Target/Blackfin/BlackfinIntrinsicInfo.h
new file mode 100644
index 0000000..3b59a60
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinIntrinsicInfo.h
@@ -0,0 +1,28 @@
+//===- BlackfinIntrinsicInfo.h - Blackfin Intrinsic Information -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of TargetIntrinsicInfo.
+//
+//===----------------------------------------------------------------------===//
+#ifndef BLACKFININTRINSICS_H
+#define BLACKFININTRINSICS_H
+
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+namespace llvm {
+
+  class BlackfinIntrinsicInfo : public TargetIntrinsicInfo {
+  public:
+    const char *getName(unsigned IntrID) const;
+    unsigned lookupName(const char *Name, unsigned Len) const;
+  };
+
+}
+
+#endif
diff --git a/lib/Target/Blackfin/BlackfinIntrinsics.td b/lib/Target/Blackfin/BlackfinIntrinsics.td
new file mode 100644
index 0000000..bf02cfe
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinIntrinsics.td
@@ -0,0 +1,34 @@
+//===- BlackfinIntrinsics.td - Defines Blackfin intrinsics -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the blackfin-specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "bfin", isTarget = 1 in {
+
+//===----------------------------------------------------------------------===//
+// Core synchronisation etc.
+//
+// These intrinsics have sideeffects. Each represent a single instruction, but
+// workarounds are sometimes required depending on the cpu.
+
+// Execute csync instruction with workarounds
+def int_bfin_csync : GCCBuiltin<"__builtin_bfin_csync">,
+        Intrinsic<[llvm_void_ty]>;
+
+// Execute ssync instruction with workarounds
+def int_bfin_ssync : GCCBuiltin<"__builtin_bfin_ssync">,
+        Intrinsic<[llvm_void_ty]>;
+
+// Execute idle instruction with workarounds
+def int_bfin_idle : GCCBuiltin<"__builtin_bfin_idle">,
+        Intrinsic<[llvm_void_ty]>;
+
+}
diff --git a/lib/Target/Blackfin/BlackfinTargetMachine.h b/lib/Target/Blackfin/BlackfinTargetMachine.h
index 73ed314..a14052b 100644
--- a/lib/Target/Blackfin/BlackfinTargetMachine.h
+++ b/lib/Target/Blackfin/BlackfinTargetMachine.h
@@ -20,6 +20,7 @@
 #include "BlackfinInstrInfo.h"
 #include "BlackfinSubtarget.h"
 #include "BlackfinISelLowering.h"
+#include "BlackfinIntrinsicInfo.h"
 
 namespace llvm {
 
@@ -29,6 +30,7 @@ namespace llvm {
     BlackfinTargetLowering TLInfo;
     BlackfinInstrInfo InstrInfo;
     TargetFrameInfo FrameInfo;
+    BlackfinIntrinsicInfo IntrinsicInfo;
   public:
     BlackfinTargetMachine(const Target &T, const std::string &TT,
                           const std::string &FS);
@@ -47,6 +49,9 @@ namespace llvm {
     virtual const TargetData *getTargetData() const { return &DataLayout; }
     virtual bool addInstSelector(PassManagerBase &PM,
                                  CodeGenOpt::Level OptLevel);
+    const TargetIntrinsicInfo *getIntrinsicInfo() const {
+      return &IntrinsicInfo;
+    }
   };
 
 } // end namespace llvm
diff --git a/lib/Target/Blackfin/CMakeLists.txt b/lib/Target/Blackfin/CMakeLists.txt
index 6c3b244..deb005d 100644
--- a/lib/Target/Blackfin/CMakeLists.txt
+++ b/lib/Target/Blackfin/CMakeLists.txt
@@ -9,9 +9,11 @@ tablegen(BlackfinGenAsmWriter.inc -gen-asm-writer)
 tablegen(BlackfinGenDAGISel.inc -gen-dag-isel)
 tablegen(BlackfinGenSubtarget.inc -gen-subtarget)
 tablegen(BlackfinGenCallingConv.inc -gen-callingconv)
+tablegen(BlackfinGenIntrinsics.inc -gen-tgt-intrinsic)
 
 add_llvm_target(BlackfinCodeGen
   BlackfinInstrInfo.cpp
+  BlackfinIntrinsicInfo.cpp
   BlackfinISelDAGToDAG.cpp
   BlackfinISelLowering.cpp
   BlackfinMCAsmInfo.cpp
diff --git a/lib/Target/Blackfin/Makefile b/lib/Target/Blackfin/Makefile
index c0c1bce..c68760b 100644
--- a/lib/Target/Blackfin/Makefile
+++ b/lib/Target/Blackfin/Makefile
@@ -15,7 +15,7 @@ BUILT_SOURCES = BlackfinGenRegisterInfo.h.inc BlackfinGenRegisterNames.inc \
                 BlackfinGenRegisterInfo.inc BlackfinGenInstrNames.inc \
                 BlackfinGenInstrInfo.inc BlackfinGenAsmWriter.inc \
                 BlackfinGenDAGISel.inc BlackfinGenSubtarget.inc \
-		BlackfinGenCallingConv.inc
+		BlackfinGenCallingConv.inc BlackfinGenIntrinsics.inc
 
 DIRS = AsmPrinter TargetInfo
 
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
index fe63edf..cbf769b 100644
--- a/lib/Target/CBackend/CBackend.cpp
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -302,7 +302,6 @@ namespace {
     void visitInlineAsm(CallInst &I);
     bool visitBuiltinCall(CallInst &I, Intrinsic::ID ID, bool &WroteCallee);
 
-    void visitMallocInst(MallocInst &I);
     void visitAllocaInst(AllocaInst &I);
     void visitFreeInst  (FreeInst   &I);
     void visitLoadInst  (LoadInst   &I);
@@ -3405,10 +3404,6 @@ void CWriter::visitInlineAsm(CallInst &CI) {
   Out << ")";
 }
 
-void CWriter::visitMallocInst(MallocInst &I) {
-  llvm_unreachable("lowerallocations pass didn't work!");
-}
-
 void CWriter::visitAllocaInst(AllocaInst &I) {
   Out << '(';
   printType(Out, I.getType());
@@ -3690,7 +3685,7 @@ bool CTargetMachine::addPassesToEmitWholeFile(PassManager &PM,
   if (FileType != TargetMachine::AssemblyFile) return true;
 
   PM.add(createGCLoweringPass());
-  PM.add(createLowerAllocationsPass(true));
+  PM.add(createLowerAllocationsPass());
   PM.add(createLowerInvokePass());
   PM.add(createCFGSimplificationPass());   // clean up after lower invoke.
   PM.add(new CBackendNameAllUsedStructsAndMergeFunctions());
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 14ad451..45c2a7b 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -1258,20 +1258,6 @@ namespace {
       Out << "\");";
       break;
     }
-    case Instruction::Malloc: {
-      const MallocInst* mallocI = cast<MallocInst>(I);
-      Out << "MallocInst* " << iName << " = new MallocInst("
-          << getCppName(mallocI->getAllocatedType()) << ", ";
-      if (mallocI->isArrayAllocation())
-        Out << opNames[0] << ", " ;
-      Out << "\"";
-      printEscapedString(mallocI->getName());
-      Out << "\", " << bbname << ");";
-      if (mallocI->getAlignment())
-        nl(Out) << iName << "->setAlignment("
-            << mallocI->getAlignment() << ");";
-      break;
-    }
     case Instruction::Free: {
       Out << "FreeInst* " << iName << " = new FreeInst("
           << getCppName(I->getOperand(0)) << ", " << bbname << ");";
diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp
index 26d637b..cf08a97 100644
--- a/lib/Target/MSIL/MSILWriter.cpp
+++ b/lib/Target/MSIL/MSILWriter.cpp
@@ -1191,9 +1191,6 @@ void MSILWriter::printInstruction(const Instruction* Inst) {
   case Instruction::Alloca:
     printAllocaInstruction(cast<AllocaInst>(Inst));
     break;
-  case Instruction::Malloc:
-    llvm_unreachable("LowerAllocationsPass used");
-    break;
   case Instruction::Free:
     llvm_unreachable("LowerAllocationsPass used");
     break;
@@ -1702,7 +1699,7 @@ bool MSILTarget::addPassesToEmitWholeFile(PassManager &PM,
   if (FileType != TargetMachine::AssemblyFile) return true;
   MSILWriter* Writer = new MSILWriter(o);
   PM.add(createGCLoweringPass());
-  PM.add(createLowerAllocationsPass(true));
+  PM.add(createLowerAllocationsPass());
   // FIXME: Handle switch trougth native IL instruction "switch"
   PM.add(createLowerSwitchPass());
   PM.add(createCFGSimplificationPass());
diff --git a/lib/Target/MSP430/AsmPrinter/CMakeLists.txt b/lib/Target/MSP430/AsmPrinter/CMakeLists.txt
index 6e66887..f1eb885 100644
--- a/lib/Target/MSP430/AsmPrinter/CMakeLists.txt
+++ b/lib/Target/MSP430/AsmPrinter/CMakeLists.txt
@@ -1,6 +1,8 @@
 include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
 
 add_llvm_library(LLVMMSP430AsmPrinter
+  MSP430InstPrinter.cpp
   MSP430AsmPrinter.cpp
+  MSP430MCInstLower.cpp
   )
 add_dependencies(LLVMMSP430AsmPrinter MSP430CodeGenTable_gen)
diff --git a/lib/Target/MSP430/AsmPrinter/MSP430AsmPrinter.cpp b/lib/Target/MSP430/AsmPrinter/MSP430AsmPrinter.cpp
index 852019f..ace358e 100644
--- a/lib/Target/MSP430/AsmPrinter/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/AsmPrinter/MSP430AsmPrinter.cpp
@@ -15,7 +15,9 @@
 #define DEBUG_TYPE "asm-printer"
 #include "MSP430.h"
 #include "MSP430InstrInfo.h"
+#include "MSP430InstPrinter.h"
 #include "MSP430MCAsmInfo.h"
+#include "MSP430MCInstLower.h"
 #include "MSP430TargetMachine.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
@@ -26,12 +28,14 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetRegistry.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/Mangler.h"
@@ -41,6 +45,10 @@ using namespace llvm;
 
 STATISTIC(EmittedInsts, "Number of machine instrs printed");
 
+static cl::opt<bool>
+EnableMCInst("enable-msp430-mcinst-printer", cl::Hidden,
+             cl::desc("enable experimental mcinst gunk in the msp430 backend"));
+
 namespace {
   class VISIBILITY_HIDDEN MSP430AsmPrinter : public AsmPrinter {
   public:
@@ -52,8 +60,14 @@ namespace {
       return "MSP430 Assembly Printer";
     }
 
+    void printMCInst(const MCInst *MI) {
+      MSP430InstPrinter(O, *MAI).printInstruction(MI);
+    }
     void printOperand(const MachineInstr *MI, int OpNum,
                       const char* Modifier = 0);
+    void printPCRelImmOperand(const MachineInstr *MI, int OpNum) {
+      printOperand(MI, OpNum);
+    }
     void printSrcMemOperand(const MachineInstr *MI, int OpNum,
                             const char* Modifier = 0);
     void printCCOperand(const MachineInstr *MI, int OpNum);
@@ -67,6 +81,7 @@ namespace {
     bool PrintAsmMemoryOperand(const MachineInstr *MI,
                                unsigned OpNo, unsigned AsmVariant,
                                const char *ExtraCode);
+    void printInstructionThroughMCStreamer(const MachineInstr *MI);
 
     void emitFunctionHeader(const MachineFunction &MF);
     bool runOnMachineFunction(MachineFunction &F);
@@ -148,7 +163,11 @@ void MSP430AsmPrinter::printMachineInstruction(const MachineInstr *MI) {
   processDebugLoc(MI, true);
 
   // Call the autogenerated instruction printer routines.
-  printInstruction(MI);
+  if (EnableMCInst) {
+    printInstructionThroughMCStreamer(MI);
+  } else {
+    printInstruction(MI);
+  }
 
   if (VerboseAsm && !MI->getDebugLoc().isUnknown())
     EmitComments(*MI);
@@ -231,22 +250,22 @@ void MSP430AsmPrinter::printCCOperand(const MachineInstr *MI, int OpNum) {
   default:
    llvm_unreachable("Unsupported CC code");
    break;
-  case MSP430::COND_E:
+  case MSP430CC::COND_E:
    O << "eq";
    break;
-  case MSP430::COND_NE:
+  case MSP430CC::COND_NE:
    O << "ne";
    break;
-  case MSP430::COND_HS:
+  case MSP430CC::COND_HS:
    O << "hs";
    break;
-  case MSP430::COND_LO:
+  case MSP430CC::COND_LO:
    O << "lo";
    break;
-  case MSP430::COND_GE:
+  case MSP430CC::COND_GE:
    O << "ge";
    break;
-  case MSP430::COND_L:
+  case MSP430CC::COND_L:
    O << 'l';
    break;
   }
@@ -275,6 +294,36 @@ bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
   return false;
 }
 
+//===----------------------------------------------------------------------===//
+void MSP430AsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI)
+{
+
+  MSP430MCInstLower MCInstLowering(OutContext, *Mang, *this);
+
+  switch (MI->getOpcode()) {
+  case TargetInstrInfo::DBG_LABEL:
+  case TargetInstrInfo::EH_LABEL:
+  case TargetInstrInfo::GC_LABEL:
+    printLabel(MI);
+    return;
+  case TargetInstrInfo::KILL:
+    return;
+  case TargetInstrInfo::INLINEASM:
+    O << '\t';
+    printInlineAsm(MI);
+    return;
+  case TargetInstrInfo::IMPLICIT_DEF:
+    printImplicitDef(MI);
+    return;
+  default: break;
+  }
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+
+  printMCInst(&TmpInst);
+}
+
 // Force static initialization.
 extern "C" void LLVMInitializeMSP430AsmPrinter() {
   RegisterAsmPrinter<MSP430AsmPrinter> X(TheMSP430Target);
diff --git a/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.cpp b/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.cpp
new file mode 100644
index 0000000..a3ecc67
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.cpp
@@ -0,0 +1,116 @@
+//===-- MSP430InstPrinter.cpp - Convert MSP430 MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430InstPrinter.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+
+// Include the auto-generated portion of the assembly writer.
+#define MachineInstr MCInst
+#define MSP430AsmPrinter MSP430InstPrinter  // FIXME: REMOVE.
+#define NO_ASM_WRITER_BOILERPLATE
+#include "MSP430GenAsmWriter.inc"
+#undef MachineInstr
+#undef MSP430AsmPrinter
+
+void MSP430InstPrinter::printInst(const MCInst *MI) {
+  printInstruction(MI);
+}
+
+void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << Op.getImm();
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     const char *Modifier) {
+  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    O << getRegisterName(Op.getReg());
+  } else if (Op.isImm()) {
+    O << '#' << Op.getImm();
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << '#';
+    Op.getExpr()->print(O, &MAI);
+  }
+}
+
+void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
+                                           const char *Modifier) {
+  const MCOperand &Base = MI->getOperand(OpNo);
+  const MCOperand &Disp = MI->getOperand(OpNo+1);
+
+  // FIXME: move global to displacement field!
+  if (Base.isExpr()) {
+    O << '&';
+    Base.getExpr()->print(O, &MAI);
+  } else if (Disp.isImm() && !Base.isReg())
+    printOperand(MI, OpNo);
+  else if (Base.isReg()) {
+    if (Disp.getImm()) {
+      O << Disp.getImm() << '(';
+      printOperand(MI, OpNo);
+      O << ')';
+    } else {
+      O << '@';
+      printOperand(MI, OpNo);
+    }
+  } else {
+    Base.dump();
+    Disp.dump();
+    llvm_unreachable("Unsupported memory operand");
+  }
+}
+
+void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo) {
+  unsigned CC = MI->getOperand(OpNo).getImm();
+
+  switch (CC) {
+  default:
+   llvm_unreachable("Unsupported CC code");
+   break;
+  case MSP430CC::COND_E:
+   O << "eq";
+   break;
+  case MSP430CC::COND_NE:
+   O << "ne";
+   break;
+  case MSP430CC::COND_HS:
+   O << "hs";
+   break;
+  case MSP430CC::COND_LO:
+   O << "lo";
+   break;
+  case MSP430CC::COND_GE:
+   O << "ge";
+   break;
+  case MSP430CC::COND_L:
+   O << 'l';
+   break;
+  }
+}
diff --git a/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.h
new file mode 100644
index 0000000..2fac800
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.h
@@ -0,0 +1,46 @@
+//===-- MSP430InstPrinter.h - Convert MSP430 MCInst to assembly syntax ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430INSTPRINTER_H
+#define MSP430INSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm
+{
+
+  class MCOperand;
+
+  class MSP430InstPrinter : public MCInstPrinter {
+  public:
+    MSP430InstPrinter(raw_ostream &O, const MCAsmInfo &MAI) :
+      MCInstPrinter(O, MAI){
+    }
+
+    virtual void printInst(const MCInst *MI);
+
+    // Autogenerated by tblgen.
+    void printInstruction(const MCInst *MI);
+    static const char *getRegisterName(unsigned RegNo);
+
+    void printOperand(const MCInst *MI, unsigned OpNo,
+                      const char *Modifier = 0);
+    void printPCRelImmOperand(const MCInst *MI, unsigned OpNo);
+    void printSrcMemOperand(const MCInst *MI, unsigned OpNo,
+                            const char *Modifier = 0);
+    void printCCOperand(const MCInst *MI, unsigned OpNo);
+
+  };
+}
+
+#endif
diff --git a/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp
new file mode 100644
index 0000000..f505b23
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp
@@ -0,0 +1,147 @@
+//===-- MSP430MCInstLower.cpp - Convert MSP430 MachineInstr to an MCInst---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower MSP430 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCInstLower.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+MCSymbol *MSP430MCInstLower::
+GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  const GlobalValue *GV = MO.getGlobal();
+
+  SmallString<128> Name;
+  Mang.getNameWithPrefix(Name, GV, false);
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable(0 && "Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  SmallString<128> Name;
+  Name += Printer.MAI->getGlobalPrefix();
+  Name += MO.getSymbolName();
+
+  switch (MO.getTargetFlags()) {
+  default: assert(0 && "Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetJumpTableSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCOperand MSP430MCInstLower::
+LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      assert(0 && "unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                         Printer.GetMBBSymbol(MO.getMBB()->getNumber()), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.h b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.h
new file mode 100644
index 0000000..a2b99ae
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.h
@@ -0,0 +1,49 @@
+//===-- MSP430MCInstLower.h - Lower MachineInstr to MCInst ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430_MCINSTLOWER_H
+#define MSP430_MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class AsmPrinter;
+  class MCAsmInfo;
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MCSymbol;
+  class MachineInstr;
+  class MachineModuleInfoMachO;
+  class MachineOperand;
+  class Mangler;
+
+  /// MSP430MCInstLower - This class is used to lower an MachineInstr
+  /// into an MCInst.
+class VISIBILITY_HIDDEN MSP430MCInstLower {
+  MCContext &Ctx;
+  Mangler &Mang;
+
+  AsmPrinter &Printer;
+public:
+  MSP430MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer)
+    : Ctx(ctx), Mang(mang), Printer(printer) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/MSP430/MSP430.h b/lib/Target/MSP430/MSP430.h
index d9f5f86..1ff178d 100644
--- a/lib/Target/MSP430/MSP430.h
+++ b/lib/Target/MSP430/MSP430.h
@@ -17,6 +17,20 @@
 
 #include "llvm/Target/TargetMachine.h"
 
+namespace MSP430CC {
+  // MSP430 specific condition code.
+  enum CondCodes {
+    COND_E  = 0,  // aka COND_Z
+    COND_NE = 1,  // aka COND_NZ
+    COND_HS = 2,  // aka COND_C
+    COND_LO = 3,  // aka COND_NC
+    COND_GE = 4,
+    COND_L  = 5,
+
+    COND_INVALID = -1
+  };
+}
+
 namespace llvm {
   class MSP430TargetMachine;
   class FunctionPass;
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 4195a88..b7d9282 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -34,6 +35,14 @@
 
 using namespace llvm;
 
+#ifndef NDEBUG
+static cl::opt<bool>
+ViewRMWDAGs("view-msp430-rmw-dags", cl::Hidden,
+          cl::desc("Pop up a window to show isel dags after RMW preprocess"));
+#else
+static const bool ViewRMWDAGs = false;
+#endif
+
 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
 
 /// MSP430DAGToDAGISel - MSP430 specific code to select MSP430 machine
@@ -56,6 +65,9 @@ namespace {
       return "MSP430 DAG->DAG Pattern Instruction Selection";
     }
 
+    bool IsLegalAndProfitableToFold(SDNode *N, SDNode *U,
+                                    SDNode *Root) const;
+
     virtual bool
     SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
                                  std::vector<SDValue> &OutOps);
@@ -64,6 +76,7 @@ namespace {
   #include "MSP430GenDAGISel.inc"
 
   private:
+    DenseMap<SDNode*, SDNode*> RMWStores;
     void PreprocessForRMW();
     SDNode *Select(SDValue Op);
     bool SelectAddr(SDValue Op, SDValue Addr, SDValue &Base, SDValue &Disp);
@@ -130,7 +143,6 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue Op, SDValue Addr,
   return true;
 }
 
-
 bool MSP430DAGToDAGISel::
 SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
                              std::vector<SDValue> &OutOps) {
@@ -148,31 +160,54 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
   return false;
 }
 
+bool MSP430DAGToDAGISel::IsLegalAndProfitableToFold(SDNode *N, SDNode *U,
+                                                    SDNode *Root) const {
+  if (OptLevel == CodeGenOpt::None) return false;
+
+  /// RMW preprocessing creates the following code:
+  ///         [Load1]
+  ///         ^     ^
+  ///        /      |
+  ///       /       |
+  ///       [Load2] |
+  ///       ^    ^  |
+  ///       |    |  |
+  ///       |     \-|
+  ///       |       |
+  ///       |     [Op]
+  ///       |       ^
+  ///       |       |
+  ///       \      /
+  ///        \    /
+  ///       [Store]
+  ///
+  /// The path Store => Load2 => Load1 is via chain. Note that in general it is
+  /// not allowed to fold Load1 into Op (and Store) since it will creates a
+  /// cycle. However, this is perfectly legal for the loads moved below the
+  /// TokenFactor by PreprocessForRMW. Query the map Store => Load1 (created
+  /// during preprocessing) to determine whether it's legal to introduce such
+  /// "cycle" for a moment.
+  DenseMap<SDNode*, SDNode*>::iterator I = RMWStores.find(Root);
+  if (I != RMWStores.end() && I->second == N)
+    return true;
+
+  // Proceed to 'generic' cycle finder code
+  return SelectionDAGISel::IsLegalAndProfitableToFold(N, U, Root);
+}
+
+
 /// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand
 /// and move load below the TokenFactor. Replace store's chain operand with
 /// load's chain result.
-/// Shamelessly stolen from X86.
 static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load,
                                  SDValue Store, SDValue TF) {
   SmallVector<SDValue, 4> Ops;
-  bool isRMW = false;
-  SDValue TF0, TF1, NewTF;
   for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i)
-    if (Load.getNode() == TF.getOperand(i).getNode()) {
-      TF0 = Load.getOperand(0);
-      Ops.push_back(TF0);
-    } else {
-      TF1 = TF.getOperand(i);
-      Ops.push_back(TF1);
-      if (LoadSDNode* LD = dyn_cast<LoadSDNode>(TF1))
-        isRMW = !LD->isVolatile();
-    }
-
-  if (isRMW && TF1.getOperand(0).getNode() == TF0.getNode())
-    NewTF = TF0;
-  else
-    NewTF = CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size());
-
+    if (Load.getNode() == TF.getOperand(i).getNode())
+      Ops.push_back(Load.getOperand(0));
+    else
+      Ops.push_back(TF.getOperand(i));
+  SDValue NewTF = CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size());
   SDValue NewLoad = CurDAG->UpdateNodeOperands(Load, NewTF,
                                                Load.getOperand(1),
                                                Load.getOperand(2));
@@ -180,12 +215,43 @@ static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load,
                              Store.getOperand(2), Store.getOperand(3));
 }
 
-/// isRMWLoad - Return true if N is a load that's part of RMW sub-DAG. The chain
-/// produced by the load must only be used by the store's chain operand,
-/// otherwise this may produce a cycle in the DAG.
-/// Shamelessly stolen from X86. FIXME: Should we make this function common?
-static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
-                      SDValue &Load) {
+/// MoveBelowTokenFactor2 - Replace TokenFactor operand with load's chain operand
+/// and move load below the TokenFactor. Replace store's chain operand with
+/// load's chain result. This a version which sinks two loads below token factor.
+/// Look into PreprocessForRMW comments for explanation of transform.
+static void MoveBelowTokenFactor2(SelectionDAG *CurDAG,
+                                  SDValue Load1, SDValue Load2,
+                                  SDValue Store, SDValue TF) {
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i) {
+    SDNode* N = TF.getOperand(i).getNode();
+    if (Load2.getNode() == N)
+      Ops.push_back(Load2.getOperand(0));
+    else if (Load1.getNode() != N)
+      Ops.push_back(TF.getOperand(i));
+  }
+
+  SDValue NewTF = SDValue(CurDAG->MorphNodeTo(TF.getNode(),
+                                  TF.getOpcode(),
+                                  TF.getNode()->getVTList(),
+                                  &Ops[0], Ops.size()), TF.getResNo());
+  SDValue NewLoad2 = CurDAG->UpdateNodeOperands(Load2, NewTF,
+                                                Load2.getOperand(1),
+                                                Load2.getOperand(2));
+
+  SDValue NewLoad1 = CurDAG->UpdateNodeOperands(Load1, NewLoad2.getValue(1),
+                                                Load1.getOperand(1),
+                                                Load1.getOperand(2));
+
+  CurDAG->UpdateNodeOperands(Store,
+                             NewLoad1.getValue(1),
+                             Store.getOperand(1),
+                             Store.getOperand(2), Store.getOperand(3));
+}
+
+/// isAllowedToSink - return true if N a load which can be moved below token
+/// factor. Basically, the load should be non-volatile and has single use.
+static bool isLoadAllowedToSink(SDValue N, SDValue Chain) {
   if (N.getOpcode() == ISD::BIT_CONVERT)
     N = N.getOperand(0);
 
@@ -199,10 +265,19 @@ static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
   if (ExtType != ISD::NON_EXTLOAD && ExtType != ISD::EXTLOAD)
     return false;
 
-  if (N.hasOneUse() &&
-      LD->hasNUsesOfValue(1, 1) &&
-      N.getOperand(1) == Address &&
-      LD->isOperandOf(Chain.getNode())) {
+  return (N.hasOneUse() &&
+          LD->hasNUsesOfValue(1, 1) &&
+          LD->isOperandOf(Chain.getNode()));
+}
+
+
+/// isRMWLoad - Return true if N is a load that's part of RMW sub-DAG.
+/// The chain produced by the load must only be used by the store's chain
+/// operand, otherwise this may produce a cycle in the DAG.
+static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
+                      SDValue &Load) {
+  if (isLoadAllowedToSink(N, Chain) &&
+      N.getOperand(1) == Address) {
     Load = N;
     return true;
   }
@@ -210,57 +285,140 @@ static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
 }
 
 /// PreprocessForRMW - Preprocess the DAG to make instruction selection better.
-/// Shamelessly stolen from X86.
+/// This is only run if not in -O0 mode.
+/// This allows the instruction selector to pick more read-modify-write
+/// instructions. This is a common case:
+///
+///     [Load chain]
+///         ^
+///         |
+///       [Load]
+///       ^    ^
+///       |    |
+///      /      \-
+///     /         |
+/// [TokenFactor] [Op]
+///     ^          ^
+///     |          |
+///      \        /
+///       \      /
+///       [Store]
+///
+/// The fact the store's chain operand != load's chain will prevent the
+/// (store (op (load))) instruction from being selected. We can transform it to:
+///
+///     [Load chain]
+///         ^
+///         |
+///    [TokenFactor]
+///         ^
+///         |
+///       [Load]
+///       ^    ^
+///       |    |
+///       |     \-
+///       |       |
+///       |     [Op]
+///       |       ^
+///       |       |
+///       \      /
+///        \    /
+///       [Store]
+///
+/// We also recognize the case where second operand of Op is load as well and
+/// move it below token factor as well creating DAG as follows:
+///
+///       [Load chain]
+///            ^
+///            |
+///      [TokenFactor]
+///            ^
+///            |
+///         [Load1]
+///         ^     ^
+///        /      |
+///       /       |
+///       [Load2] |
+///       ^    ^  |
+///       |    |  |
+///       |     \-|
+///       |       |
+///       |     [Op]
+///       |       ^
+///       |       |
+///       \      /
+///        \    /
+///       [Store]
+///
+/// This allows selection of mem-mem instructions. Yay!
+
 void MSP430DAGToDAGISel::PreprocessForRMW() {
   for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
          E = CurDAG->allnodes_end(); I != E; ++I) {
     if (!ISD::isNON_TRUNCStore(I))
       continue;
-
     SDValue Chain = I->getOperand(0);
+
     if (Chain.getNode()->getOpcode() != ISD::TokenFactor)
       continue;
 
-    SDValue N1 = I->getOperand(1); // Value to store
-    SDValue N2 = I->getOperand(2); // Address of store
-
-    if (!N1.hasOneUse())
+    SDValue N1 = I->getOperand(1);
+    SDValue N2 = I->getOperand(2);
+    if ((N1.getValueType().isFloatingPoint() &&
+         !N1.getValueType().isVector()) ||
+        !N1.hasOneUse())
       continue;
 
-    bool RModW = false;
-    SDValue Load;
+    unsigned RModW = 0;
+    SDValue Load1, Load2;
     unsigned Opcode = N1.getNode()->getOpcode();
     switch (Opcode) {
-      case ISD::ADD:
-      case ISD::AND:
-      case ISD::OR:
-      case ISD::XOR:
-      case ISD::ADDC:
-      case ISD::ADDE: {
-        SDValue N10 = N1.getOperand(0);
-        SDValue N11 = N1.getOperand(1);
-        RModW = isRMWLoad(N10, Chain, N2, Load);
-
-        if (!RModW && isRMWLoad(N11, Chain, N2, Load)) {
-          // Swap the operands, making the RMW load the first operand seems
-          // to help selection and prevent token chain loops.
-          N1 = CurDAG->UpdateNodeOperands(N1, N11, N10);
-          RModW = true;
-        }
-       break;
+    case ISD::ADD:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+    case ISD::ADDC:
+    case ISD::ADDE: {
+      SDValue N10 = N1.getOperand(0);
+      SDValue N11 = N1.getOperand(1);
+      if (isRMWLoad(N10, Chain, N2, Load1)) {
+        if (isLoadAllowedToSink(N11, Chain)) {
+          Load2 = N11;
+          RModW = 2;
+        } else
+          RModW = 1;
+      } else if (isRMWLoad(N11, Chain, N2, Load1)) {
+        if (isLoadAllowedToSink(N10, Chain)) {
+          Load2 = N10;
+          RModW = 2;
+        } else
+          RModW = 1;
       }
-      case ISD::SUB:
-      case ISD::SUBC:
-      case ISD::SUBE: {
-        SDValue N10 = N1.getOperand(0);
-        RModW = isRMWLoad(N10, Chain, N2, Load);
-        break;
+      break;
+    }
+    case ISD::SUB:
+    case ISD::SUBC:
+    case ISD::SUBE: {
+      SDValue N10 = N1.getOperand(0);
+      SDValue N11 = N1.getOperand(1);
+      if (isRMWLoad(N10, Chain, N2, Load1)) {
+        if (isLoadAllowedToSink(N11, Chain)) {
+          Load2 = N11;
+          RModW = 2;
+        } else
+          RModW = 1;
       }
+      break;
+    }
     }
 
-    if (RModW) {
-      MoveBelowTokenFactor(CurDAG, Load, SDValue(I, 0), Chain);
-      ++NumLoadMoved;
+    NumLoadMoved += RModW;
+    if (RModW == 1)
+      MoveBelowTokenFactor(CurDAG, Load1, SDValue(I, 0), Chain);
+    else if (RModW == 2) {
+      MoveBelowTokenFactor2(CurDAG, Load1, Load2, SDValue(I, 0), Chain);
+      SDNode* Store = I;
+      RMWStores[Store] = Load2.getNode();
     }
   }
 }
@@ -268,8 +426,15 @@ void MSP430DAGToDAGISel::PreprocessForRMW() {
 /// InstructionSelect - This callback is invoked by
 /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
 void MSP430DAGToDAGISel::InstructionSelect() {
+  std::string BlockName;
+  if (ViewRMWDAGs)
+    BlockName = MF->getFunction()->getNameStr() + ":" +
+                BB->getBasicBlock()->getNameStr();
+
   PreprocessForRMW();
 
+  if (ViewRMWDAGs) CurDAG->viewGraph("RMW preprocessed:" + BlockName);
+
   DEBUG(errs() << "Selection DAG after RMW preprocessing:\n");
   DEBUG(CurDAG->dump());
 
@@ -282,6 +447,7 @@ void MSP430DAGToDAGISel::InstructionSelect() {
   DEBUG(errs() << "===== Instruction selection ends:\n");
 
   CurDAG->RemoveDeadNodes();
+  RMWStores.clear();
 }
 
 SDNode *MSP430DAGToDAGISel::Select(SDValue Op) {
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index b56f069..34e6d2c 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -567,44 +567,45 @@ SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
   return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result);;
 }
 
-static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, unsigned &TargetCC,
+static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC,
                        ISD::CondCode CC,
                        DebugLoc dl, SelectionDAG &DAG) {
   // FIXME: Handle bittests someday
   assert(!LHS.getValueType().isFloatingPoint() && "We don't handle FP yet");
 
   // FIXME: Handle jump negative someday
-  TargetCC = MSP430::COND_INVALID;
+  MSP430CC::CondCodes TCC = MSP430CC::COND_INVALID;
   switch (CC) {
   default: llvm_unreachable("Invalid integer condition!");
   case ISD::SETEQ:
-    TargetCC = MSP430::COND_E;  // aka COND_Z
+    TCC = MSP430CC::COND_E;     // aka COND_Z
     break;
   case ISD::SETNE:
-    TargetCC = MSP430::COND_NE; // aka COND_NZ
+    TCC = MSP430CC::COND_NE;    // aka COND_NZ
     break;
   case ISD::SETULE:
     std::swap(LHS, RHS);        // FALLTHROUGH
   case ISD::SETUGE:
-    TargetCC = MSP430::COND_HS; // aka COND_C
+    TCC = MSP430CC::COND_HS;    // aka COND_C
     break;
   case ISD::SETUGT:
     std::swap(LHS, RHS);        // FALLTHROUGH
   case ISD::SETULT:
-    TargetCC = MSP430::COND_LO; // aka COND_NC
+    TCC = MSP430CC::COND_LO;    // aka COND_NC
     break;
   case ISD::SETLE:
     std::swap(LHS, RHS);        // FALLTHROUGH
   case ISD::SETGE:
-    TargetCC = MSP430::COND_GE;
+    TCC = MSP430CC::COND_GE;
     break;
   case ISD::SETGT:
     std::swap(LHS, RHS);        // FALLTHROUGH
   case ISD::SETLT:
-    TargetCC = MSP430::COND_L;
+    TCC = MSP430CC::COND_L;
     break;
   }
 
+  TargetCC = DAG.getConstant(TCC, MVT::i8);
   return DAG.getNode(MSP430ISD::CMP, dl, MVT::Flag, LHS, RHS);
 }
 
@@ -617,13 +618,11 @@ SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
   SDValue Dest  = Op.getOperand(4);
   DebugLoc dl   = Op.getDebugLoc();
 
-  unsigned TargetCC = MSP430::COND_INVALID;
+  SDValue TargetCC;
   SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
 
   return DAG.getNode(MSP430ISD::BR_CC, dl, Op.getValueType(),
-                     Chain,
-                     Dest, DAG.getConstant(TargetCC, MVT::i8),
-                     Flag);
+                     Chain, Dest, TargetCC, Flag);
 }
 
 SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
@@ -634,14 +633,14 @@ SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
   DebugLoc dl    = Op.getDebugLoc();
 
-  unsigned TargetCC = MSP430::COND_INVALID;
+  SDValue TargetCC;
   SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
 
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
   SmallVector<SDValue, 4> Ops;
   Ops.push_back(TrueV);
   Ops.push_back(FalseV);
-  Ops.push_back(DAG.getConstant(TargetCC, MVT::i8));
+  Ops.push_back(TargetCC);
   Ops.push_back(Flag);
 
   return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, &Ops[0], Ops.size());
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index 37fbb6d..a6d9638 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -151,6 +151,163 @@ MSP430InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
   return true;
 }
 
+unsigned MSP430InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->getOpcode() != MSP430::JMP &&
+        I->getOpcode() != MSP430::JCC)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+bool MSP430InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid Xbranch condition!");
+
+  MSP430CC::CondCodes CC = static_cast<MSP430CC::CondCodes>(Cond[0].getImm());
+
+  switch (CC) {
+  default:
+    assert(0 && "Invalid branch condition!");
+    break;
+  case MSP430CC::COND_E:
+    CC = MSP430CC::COND_NE;
+    break;
+  case MSP430CC::COND_NE:
+    CC = MSP430CC::COND_E;
+    break;
+  case MSP430CC::COND_L:
+    CC = MSP430CC::COND_GE;
+    break;
+  case MSP430CC::COND_GE:
+    CC = MSP430CC::COND_L;
+    break;
+  case MSP430CC::COND_HS:
+    CC = MSP430CC::COND_LO;
+    break;
+  case MSP430CC::COND_LO:
+    CC = MSP430CC::COND_HS;
+    break;
+  }
+
+  Cond[0].setImm(CC);
+  return false;
+}
+
+bool MSP430InstrInfo::BlockHasNoFallThrough(const MachineBasicBlock &MBB)const{
+  if (MBB.empty()) return false;
+
+  switch (MBB.back().getOpcode()) {
+  case MSP430::RET:   // Return.
+  case MSP430::JMP:   // Uncond branch.
+    return true;
+  default: return false;
+  }
+}
+
+bool MSP430InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (!TID.isTerminator()) return false;
+
+  // Conditional branch is a special case.
+  if (TID.isBranch() && !TID.isBarrier())
+    return true;
+  if (!TID.isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isUnpredicatedTerminator(I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!I->getDesc().isBranch())
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == MSP430::JMP) {
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (next(I) != MBB.end())
+        next(I)->eraseFromParent();
+      Cond.clear();
+      FBB = 0;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = 0;
+        I->eraseFromParent();
+        I = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditinal destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    // Handle conditional branches.
+    assert(I->getOpcode() == MSP430::JCC && "Invalid conditional branch");
+    MSP430CC::CondCodes BranchCode =
+      static_cast<MSP430CC::CondCodes>(I->getOperand(1).getImm());
+    if (BranchCode == MSP430CC::COND_INVALID)
+      return true;  // Can't handle weird stuff.
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+
+    // Handle subsequent conditional branches. Only handle the case where all
+    // conditional branches branch to the same destination.
+    assert(Cond.size() == 1);
+    assert(TBB);
+
+    // Only handle the case where all conditional branches branch to
+    // the same destination.
+    if (TBB != I->getOperand(0).getMBB())
+      return true;
+
+    MSP430CC::CondCodes OldBranchCode = (MSP430CC::CondCodes)Cond[0].getImm();
+    // If the conditions are the same, we can leave them alone.
+    if (OldBranchCode == BranchCode)
+      continue;
+
+    return true;
+  }
+
+  return false;
+}
+
 unsigned
 MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                               MachineBasicBlock *FBB,
@@ -172,7 +329,13 @@ MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
 
   // Conditional branch.
   unsigned Count = 0;
-  llvm_unreachable("Implement conditional branches!");
+  BuildMI(&MBB, dl, get(MSP430::JCC)).addMBB(TBB).addImm(Cond[0].getImm());
+  ++Count;
 
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, dl, get(MSP430::JMP)).addMBB(FBB);
+    ++Count;
+  }
   return Count;
 }
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index e07aaca..35e35db 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -21,20 +21,6 @@ namespace llvm {
 
 class MSP430TargetMachine;
 
-namespace MSP430 {
-  // MSP430 specific condition code.
-  enum CondCode {
-    COND_E  = 0,  // aka COND_Z
-    COND_NE = 1,  // aka COND_NZ
-    COND_HS = 2,  // aka COND_C
-    COND_LO = 3,  // aka COND_NC
-    COND_GE = 4,
-    COND_L  = 5,
-
-    COND_INVALID
-  };
-}
-
 class MSP430InstrInfo : public TargetInstrInfoImpl {
   const MSP430RegisterInfo RI;
   MSP430TargetMachine &TM;
@@ -73,9 +59,19 @@ public:
                                            MachineBasicBlock::iterator MI,
                                  const std::vector<CalleeSavedInfo> &CSI) const;
 
-  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                                MachineBasicBlock *FBB,
-                             const SmallVectorImpl<MachineOperand> &Cond) const;
+  // Branch folding goodness
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+  bool BlockHasNoFallThrough(const MachineBasicBlock &MBB) const;
+  bool isUnpredicatedTerminator(const MachineInstr *MI) const;
+  bool AnalyzeBranch(MachineBasicBlock &MBB,
+                     MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond) const;
 
 };
 
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index f7e0d2b..e202175 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -71,7 +71,9 @@ def memdst : Operand<i16> {
 }
 
 // Branch targets have OtherVT type.
-def brtarget : Operand<OtherVT>;
+def brtarget : Operand<OtherVT> {
+  let PrintMethod = "printPCRelImmOperand";
+}
 
 // Operand for printing out a condition code.
 def cc : Operand<i8> {
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 5e21f8e..da54507 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -32,7 +32,7 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T,
   LLVMTargetMachine(T, TT),
   Subtarget(TT, FS),
   // FIXME: Check TargetData string.
-  DataLayout("e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8"),
+  DataLayout("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32"),
   InstrInfo(*this), TLInfo(*this),
   FrameInfo(TargetFrameInfo::StackGrowsDown, 2, -2) { }
 
diff --git a/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp
index 3f415af..ea0f494 100644
--- a/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp
+++ b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp
@@ -12,8 +12,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "PIC16ABINames.h"
 #include "PIC16AsmPrinter.h"
-#include "MCSectionPIC16.h"
+#include "PIC16Section.h"
 #include "PIC16MCAsmInfo.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
@@ -39,7 +40,7 @@ PIC16AsmPrinter::PIC16AsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
 : AsmPrinter(O, TM, T, V), DbgInfo(O, T) {
   PTLI = static_cast<PIC16TargetLowering*>(TM.getTargetLowering());
   PMAI = static_cast<const PIC16MCAsmInfo*>(T);
-  PTOF = (PIC16TargetObjectFile*)&PTLI->getObjFileLowering();
+  PTOF = (PIC16TargetObjectFile *)&PTLI->getObjFileLowering();
 }
 
 bool PIC16AsmPrinter::printMachineInstruction(const MachineInstr *MI) {
@@ -52,6 +53,45 @@ bool PIC16AsmPrinter::printMachineInstruction(const MachineInstr *MI) {
   return true;
 }
 
+static int getFunctionColor(const Function *F) {
+  if (F->hasSection()) {
+    std::string Sectn = F->getSection();
+    std::string StrToFind = "Overlay=";
+    std::string::size_type Pos = Sectn.find(StrToFind);
+
+    // Retreive the color number if the key is found.
+    if (Pos != std::string::npos) {
+      Pos += StrToFind.length();
+      std::string Color = "";
+      char c = Sectn.at(Pos);
+      // A Color can only consist of digits.
+      while (c >= '0' && c<= '9') {
+        Color.append(1,c);
+        Pos++;
+        if (Pos >= Sectn.length())
+          break;
+        c = Sectn.at(Pos);
+      }
+      return atoi(Color.c_str());
+    }
+  }
+
+  // Color was not set for function, so return -1.
+  return -1;
+}
+
+// Color the Auto section of the given function. 
+void PIC16AsmPrinter::ColorAutoSection(const Function *F) {
+  std::string SectionName = PAN::getAutosSectionName(CurrentFnName);
+  PIC16Section* Section = PTOF->findPIC16Section(SectionName);
+  if (Section != NULL) {
+    int Color = getFunctionColor(F);
+    if (Color >= 0)
+      Section->setColor(Color);
+  }
+}
+
+
 /// runOnMachineFunction - This emits the frame section, autos section and 
 /// assembly for each instruction. Also takes care of function begin debug
 /// directive and file begin debug directive (if required) for the function.
@@ -67,17 +107,18 @@ bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   const Function *F = MF.getFunction();
   CurrentFnName = Mang->getMangledName(F);
 
+  // Put the color information from function to its auto section.
+  ColorAutoSection(F);
+
   // Emit the function frame (args and temps).
   EmitFunctionFrame(MF);
 
   DbgInfo.BeginFunction(MF);
 
-  // Emit the autos section of function.
-  EmitAutos(CurrentFnName);
-
   // Now emit the instructions of function in its code section.
-  const MCSection *fCodeSection = 
-    getObjFileLowering().getSectionForFunction(CurrentFnName);
+  const MCSection *fCodeSection 
+    = getObjFileLowering().SectionForCode(CurrentFnName);
+
   // Start the Code Section.
   O <<  "\n";
   OutStreamer.SwitchSection(fCodeSection);
@@ -229,12 +270,26 @@ bool PIC16AsmPrinter::doInitialization(Module &M) {
 
   // Set the section names for all globals.
   for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-       I != E; ++I)
-    if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) {
+       I != E; ++I) {
+
+    // Record External Var Decls.
+    if (I->isDeclaration()) {
+      ExternalVarDecls.push_back(I);
+      continue;
+    }
+
+    // Record Exteranl Var Defs.
+    if (I->hasExternalLinkage() || I->hasCommonLinkage()) {
+      ExternalVarDefs.push_back(I);
+    }
+
+    // Sectionify actual data.
+    if (!I->hasAvailableExternallyLinkage()) {
       const MCSection *S = getObjFileLowering().SectionForGlobal(I, Mang, TM);
       
-      I->setSection(((const MCSectionPIC16*)S)->getName());
+      I->setSection(((const PIC16Section *)S)->getName());
     }
+  }
 
   DbgInfo.BeginModule(M);
   EmitFunctionDecls(M);
@@ -243,6 +298,7 @@ bool PIC16AsmPrinter::doInitialization(Module &M) {
   EmitIData(M);
   EmitUData(M);
   EmitRomData(M);
+  EmitUserSections(M);
   return Result;
 }
 
@@ -287,7 +343,7 @@ void PIC16AsmPrinter::EmitFunctionDecls(Module &M) {
 
 // Emit variables imported from other Modules.
 void PIC16AsmPrinter::EmitUndefinedVars(Module &M) {
-  std::vector<const GlobalVariable*> Items = PTOF->ExternalVarDecls->Items;
+  std::vector<const GlobalVariable*> Items = ExternalVarDecls;
   if (!Items.size()) return;
 
   O << "\n" << MAI->getCommentString() << "Imported Variables - BEGIN" << "\n";
@@ -299,7 +355,7 @@ void PIC16AsmPrinter::EmitUndefinedVars(Module &M) {
 
 // Emit variables defined in this module and are available to other modules.
 void PIC16AsmPrinter::EmitDefinedVars(Module &M) {
-  std::vector<const GlobalVariable*> Items = PTOF->ExternalVarDefs->Items;
+  std::vector<const GlobalVariable*> Items = ExternalVarDefs;
   if (!Items.size()) return;
 
   O << "\n" << MAI->getCommentString() << "Exported Variables - BEGIN" << "\n";
@@ -311,25 +367,12 @@ void PIC16AsmPrinter::EmitDefinedVars(Module &M) {
 
 // Emit initialized data placed in ROM.
 void PIC16AsmPrinter::EmitRomData(Module &M) {
-  // Print ROM Data section.
-  const std::vector<PIC16Section*> &ROSections = PTOF->ROSections;
-  for (unsigned i = 0; i < ROSections.size(); i++) {
-    const std::vector<const GlobalVariable*> &Items = ROSections[i]->Items;
-    if (!Items.size()) continue;
-    O << "\n";
-    OutStreamer.SwitchSection(PTOF->ROSections[i]->S_);
-    for (unsigned j = 0; j < Items.size(); j++) {
-      O << Mang->getMangledName(Items[j]);
-      Constant *C = Items[j]->getInitializer();
-      int AddrSpace = Items[j]->getType()->getAddressSpace();
-      EmitGlobalConstant(C, AddrSpace);
-    }
-  }
+  EmitSingleSection(PTOF->ROMDATASection());
 }
 
 bool PIC16AsmPrinter::doFinalization(Module &M) {
+  EmitAllAutos(M);
   printLibcallDecls();
-  EmitRemainingAutos();
   DbgInfo.EndModule(M);
   O << "\n\t" << "END\n";
   return AsmPrinter::doFinalization(M);
@@ -342,8 +385,10 @@ void PIC16AsmPrinter::EmitFunctionFrame(MachineFunction &MF) {
   // Emit the data section name.
   O << "\n"; 
   
-  const MCSection *fPDataSection =
-    getObjFileLowering().getSectionForFunctionFrame(CurrentFnName);
+  PIC16Section *fPDataSection = const_cast<PIC16Section *>(getObjFileLowering().
+                                SectionForFrame(CurrentFnName));
+ 
+  fPDataSection->setColor(getFunctionColor(F)); 
   OutStreamer.SwitchSection(fPDataSection);
   
   // Emit function frame label
@@ -379,106 +424,86 @@ void PIC16AsmPrinter::EmitFunctionFrame(MachineFunction &MF) {
     O << PAN::getTempdataLabel(CurrentFnName) << " RES  " << TempSize << '\n';
 }
 
-void PIC16AsmPrinter::EmitIData(Module &M) {
 
-  // Print all IDATA sections.
-  const std::vector<PIC16Section*> &IDATASections = PTOF->IDATASections;
-  for (unsigned i = 0; i < IDATASections.size(); i++) {
-    O << "\n";
-    if (IDATASections[i]->S_->getName().find("llvm.") != std::string::npos)
-      continue;
-    OutStreamer.SwitchSection(IDATASections[i]->S_);
-    std::vector<const GlobalVariable*> Items = IDATASections[i]->Items;
+void PIC16AsmPrinter::EmitInitializedDataSection(const PIC16Section *S) {
+  /// Emit Section header.
+  OutStreamer.SwitchSection(S);
+
+    std::vector<const GlobalVariable*> Items = S->Items;
     for (unsigned j = 0; j < Items.size(); j++) {
       std::string Name = Mang->getMangledName(Items[j]);
       Constant *C = Items[j]->getInitializer();
       int AddrSpace = Items[j]->getType()->getAddressSpace();
       O << Name;
       EmitGlobalConstant(C, AddrSpace);
-    }
-  }
+   }
 }
 
-void PIC16AsmPrinter::EmitUData(Module &M) {
-  const TargetData *TD = TM.getTargetData();
+// Print all IDATA sections.
+void PIC16AsmPrinter::EmitIData(Module &M) {
+  EmitSectionList (M, PTOF->IDATASections());
+}
 
-  // Print all BSS sections.
-  const std::vector<PIC16Section*> &BSSSections = PTOF->BSSSections;
-  for (unsigned i = 0; i < BSSSections.size(); i++) {
-    O << "\n";
-    OutStreamer.SwitchSection(BSSSections[i]->S_);
-    std::vector<const GlobalVariable*> Items = BSSSections[i]->Items;
+void PIC16AsmPrinter::
+EmitUninitializedDataSection(const PIC16Section *S) {
+    const TargetData *TD = TM.getTargetData();
+    OutStreamer.SwitchSection(S);
+    std::vector<const GlobalVariable*> Items = S->Items;
     for (unsigned j = 0; j < Items.size(); j++) {
       std::string Name = Mang->getMangledName(Items[j]);
       Constant *C = Items[j]->getInitializer();
       const Type *Ty = C->getType();
       unsigned Size = TD->getTypeAllocSize(Ty);
-
       O << Name << " RES " << Size << "\n";
     }
-  }
 }
 
-void PIC16AsmPrinter::EmitAutos(std::string FunctName) {
-  // Section names for all globals are already set.
-  const TargetData *TD = TM.getTargetData();
+// Print all UDATA sections.
+void PIC16AsmPrinter::EmitUData(Module &M) {
+  EmitSectionList (M, PTOF->UDATASections());
+}
 
-  // Now print Autos section for this function.
-  std::string SectionName = PAN::getAutosSectionName(FunctName);
-  const std::vector<PIC16Section*> &AutosSections = PTOF->AutosSections;
-  for (unsigned i = 0; i < AutosSections.size(); i++) {
-    O << "\n";
-    if (AutosSections[i]->S_->getName() == SectionName) { 
-      // Set the printing status to true
-      AutosSections[i]->setPrintedStatus(true);
-      OutStreamer.SwitchSection(AutosSections[i]->S_);
-      const std::vector<const GlobalVariable*> &Items = AutosSections[i]->Items;
-      for (unsigned j = 0; j < Items.size(); j++) {
-        std::string VarName = Mang->getMangledName(Items[j]);
-        Constant *C = Items[j]->getInitializer();
-        const Type *Ty = C->getType();
-        unsigned Size = TD->getTypeAllocSize(Ty);
-        // Emit memory reserve directive.
-        O << VarName << "  RES  " << Size << "\n";
-      }
-      break;
-    }
-  }
+// Print all USER sections.
+void PIC16AsmPrinter::EmitUserSections(Module &M) {
+  EmitSectionList (M, PTOF->USERSections());
 }
 
-// Print autos that were not printed during the code printing of functions.
-// As the functions might themselves would have got deleted by the optimizer.
-void PIC16AsmPrinter::EmitRemainingAutos() {
-  const TargetData *TD = TM.getTargetData();
+// Print all AUTO sections.
+void PIC16AsmPrinter::EmitAllAutos(Module &M) {
+  EmitSectionList (M, PTOF->AUTOSections());
+}
 
-  // Now print Autos section for this function.
-  std::vector <PIC16Section *>AutosSections = PTOF->AutosSections;
-  for (unsigned i = 0; i < AutosSections.size(); i++) {
-    
-    // if the section is already printed then don't print again
-    if (AutosSections[i]->isPrinted()) 
-      continue;
+extern "C" void LLVMInitializePIC16AsmPrinter() { 
+  RegisterAsmPrinter<PIC16AsmPrinter> X(ThePIC16Target);
+}
 
-    // Set status as printed
-    AutosSections[i]->setPrintedStatus(true);
+// Emit one data section using correct section emitter based on section type.
+void PIC16AsmPrinter::EmitSingleSection(const PIC16Section *S) {
+  if (S == NULL) return;
 
-    O << "\n";
-    OutStreamer.SwitchSection(AutosSections[i]->S_);
-    const std::vector<const GlobalVariable*> &Items = AutosSections[i]->Items;
-    for (unsigned j = 0; j < Items.size(); j++) {
-      std::string VarName = Mang->getMangledName(Items[j]);
-      Constant *C = Items[j]->getInitializer();
-      const Type *Ty = C->getType();
-      unsigned Size = TD->getTypeAllocSize(Ty);
-      // Emit memory reserve directive.
-      O << VarName << "  RES  " << Size << "\n";
-    }
+  switch (S->getType()) {
+    default: llvm_unreachable ("unknow user section type");
+    case UDATA:
+    case UDATA_SHR:
+    case UDATA_OVR:
+      EmitUninitializedDataSection(S);
+      break;
+    case IDATA:
+    case ROMDATA:
+      EmitInitializedDataSection(S);
+      break;
   }
 }
 
-
-extern "C" void LLVMInitializePIC16AsmPrinter() { 
-  RegisterAsmPrinter<PIC16AsmPrinter> X(ThePIC16Target);
+// Emit a list of sections.
+void PIC16AsmPrinter::
+EmitSectionList(Module &M, const std::vector<PIC16Section *> &SList) {
+  for (unsigned i = 0; i < SList.size(); i++) {
+    // Exclude llvm specific metadata sections.
+    if (SList[i]->getName().find("llvm.") != std::string::npos)
+      continue;
+    O << "\n";
+    EmitSingleSection(SList[i]);
+  }
 }
 
-
diff --git a/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.h b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.h
index 2dd4600..b13d9ce 100644
--- a/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.h
+++ b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.h
@@ -53,11 +53,17 @@ namespace llvm {
     void EmitDefinedVars (Module &M);
     void EmitIData (Module &M);
     void EmitUData (Module &M);
-    void EmitAutos (std::string FunctName);
-    void EmitRemainingAutos ();
+    void EmitAllAutos (Module &M);
     void EmitRomData (Module &M);
+    void EmitUserSections (Module &M);
     void EmitFunctionFrame(MachineFunction &MF);
     void printLibcallDecls();
+    void EmitUninitializedDataSection(const PIC16Section *S);
+    void EmitInitializedDataSection(const PIC16Section *S);
+    void EmitSingleSection(const PIC16Section *S);
+    void EmitSectionList(Module &M, 
+                         const std::vector< PIC16Section *> &SList);
+    void ColorAutoSection(const Function *F);
   protected:
     bool doInitialization(Module &M);
     bool doFinalization(Module &M);
@@ -74,6 +80,8 @@ namespace llvm {
     PIC16DbgInfo DbgInfo;
     const PIC16MCAsmInfo *PMAI;
     std::list<const char *> LibcallDecls; // List of extern decls.
+    std::vector<const GlobalVariable *> ExternalVarDecls;
+    std::vector<const GlobalVariable *> ExternalVarDefs;
   };
 } // end of namespace
 
diff --git a/lib/Target/PIC16/CMakeLists.txt b/lib/Target/PIC16/CMakeLists.txt
index 0ee88f9..208b067 100644
--- a/lib/Target/PIC16/CMakeLists.txt
+++ b/lib/Target/PIC16/CMakeLists.txt
@@ -18,6 +18,7 @@ add_llvm_target(PIC16
   PIC16MemSelOpt.cpp
   PIC16MCAsmInfo.cpp
   PIC16RegisterInfo.cpp
+  PIC16Section.cpp
   PIC16Subtarget.cpp
   PIC16TargetMachine.cpp
   PIC16TargetObjectFile.cpp
diff --git a/lib/Target/PIC16/Makefile b/lib/Target/PIC16/Makefile
index f913675..4382eba7 100644
--- a/lib/Target/PIC16/Makefile
+++ b/lib/Target/PIC16/Makefile
@@ -17,7 +17,7 @@ BUILT_SOURCES = PIC16GenRegisterInfo.h.inc PIC16GenRegisterNames.inc \
 		PIC16GenDAGISel.inc PIC16GenCallingConv.inc \
 		PIC16GenSubtarget.inc
 
-DIRS = AsmPrinter TargetInfo
+DIRS = AsmPrinter TargetInfo PIC16Passes
 
 include $(LEVEL)/Makefile.common
 
diff --git a/lib/Target/PIC16/PIC16.h b/lib/Target/PIC16/PIC16.h
index 8a3704d..e46c9b2 100644
--- a/lib/Target/PIC16/PIC16.h
+++ b/lib/Target/PIC16/PIC16.h
@@ -42,266 +42,15 @@ namespace PIC16CC {
     UGE
   };
 }
-  // A Central class to manage all ABI naming conventions.
-  // PAN - [P]ic16 [A]BI [N]ames
-  class PAN {
-    public:
-    // Map the name of the symbol to its section name.
-    // Current ABI:
-    // -----------------------------------------------------
-    // ALL Names are prefixed with the symobl '@'.
-    // ------------------------------------------------------
-    // Global variables do not have any '.' in their names.
-    // These are maily function names and global variable names.
-    // Example - @foo,  @i
-    // -------------------------------------------------------
-    // Functions and auto variables.
-    // Names are mangled as <prefix><funcname>.<tag>.<varname>
-    // Where <prefix> is '@' and <tag> is any one of
-    // the following
-    // .auto. - an automatic var of a function.
-    // .temp. - temproray data of a function.
-    // .ret.  - return value label for a function.
-    // .frame. - Frame label for a function where retval, args
-    //           and temps are stored.
-    // .args. - Label used to pass arguments to a direct call.
-    // Example - Function name:   @foo
-    //           Its frame:       @foo.frame.
-    //           Its retval:      @foo.ret.
-    //           Its local vars:  @foo.auto.a
-    //           Its temp data:   @foo.temp.
-    //           Its arg passing: @foo.args.
-    //----------------------------------------------
-    // Libcall - compiler generated libcall names must start with .lib.
-    //           This id will be used to emit extern decls for libcalls.
-    // Example - libcall name:   @.lib.sra.i8
-    //           To pass args:   @.lib.sra.i8.args.
-    //           To return val:  @.lib.sra.i8.ret.
-    //----------------------------------------------
-    // SECTION Names
-    // uninitialized globals - @udata.<num>.#
-    // initialized globals - @idata.<num>.#
-    // Function frame - @<func>.frame_section.
-    // Function autos - @<func>.autos_section.
-    // Declarations - Enclosed in comments. No section for them.
-    //----------------------------------------------------------
-    
-    // Tags used to mangle different names. 
-    enum TAGS {
-      PREFIX_SYMBOL,
-      GLOBAL,
-      STATIC_LOCAL,
-      AUTOS_LABEL,
-      FRAME_LABEL,
-      RET_LABEL,
-      ARGS_LABEL,
-      TEMPS_LABEL,
-      
-      LIBCALL,
-      
-      FRAME_SECTION,
-      AUTOS_SECTION,
-      CODE_SECTION
-    };
-
-    // Textual names of the tags.
-    inline static const char *getTagName(TAGS tag) {
-      switch (tag) {
-      default: return "";
-      case PREFIX_SYMBOL:    return "@";
-      case AUTOS_LABEL:       return ".auto.";
-      case FRAME_LABEL:       return ".frame.";
-      case TEMPS_LABEL:       return ".temp.";
-      case ARGS_LABEL:       return ".args.";
-      case RET_LABEL:       return ".ret.";
-      case LIBCALL:       return ".lib.";
-      case FRAME_SECTION:       return ".frame_section.";
-      case AUTOS_SECTION:       return ".autos_section.";
-      case CODE_SECTION:       return ".code_section.";
-      }
-    }
-
-    // Get tag type for the Symbol.
-    inline static TAGS getSymbolTag(const std::string &Sym) {
-      if (Sym.find(getTagName(TEMPS_LABEL)) != std::string::npos)
-        return TEMPS_LABEL;
-
-      if (Sym.find(getTagName(FRAME_LABEL)) != std::string::npos)
-        return FRAME_LABEL;
-
-      if (Sym.find(getTagName(RET_LABEL)) != std::string::npos)
-        return RET_LABEL;
-
-      if (Sym.find(getTagName(ARGS_LABEL)) != std::string::npos)
-        return ARGS_LABEL;
-
-      if (Sym.find(getTagName(AUTOS_LABEL)) != std::string::npos)
-        return AUTOS_LABEL;
-
-      if (Sym.find(getTagName(LIBCALL)) != std::string::npos)
-        return LIBCALL;
-
-      // It does not have any Tag. So its a true global or static local.
-      if (Sym.find(".") == std::string::npos) 
-        return GLOBAL;
-      
-      // If a . is there, then it may be static local.
-      // We should mangle these as well in clang.
-      if (Sym.find(".") != std::string::npos) 
-        return STATIC_LOCAL;
- 
-      assert (0 && "Could not determine Symbol's tag");
-      return PREFIX_SYMBOL; // Silence warning when assertions are turned off.
-    }
-
-    // addPrefix - add prefix symbol to a name if there isn't one already.
-    inline static std::string addPrefix (const std::string &Name) {
-      std::string prefix = getTagName (PREFIX_SYMBOL);
-
-      // If this name already has a prefix, nothing to do.
-      if (Name.compare(0, prefix.size(), prefix) == 0)
-        return Name;
-
-      return prefix + Name;
-    }
-
-    // Get mangled func name from a mangled sym name.
-    // In all cases func name is the first component before a '.'.
-    static inline std::string getFuncNameForSym(const std::string &Sym1) {
-      assert (getSymbolTag(Sym1) != GLOBAL && "not belongs to a function");
-
-      std::string Sym = addPrefix(Sym1);
-
-      // Position of the . after func name. That's where func name ends.
-      size_t func_name_end = Sym.find ('.');
-
-      return Sym.substr (0, func_name_end);
-    }
-
-    // Get Frame start label for a func.
-    static std::string getFrameLabel(const std::string &Func) {
-      std::string Func1 = addPrefix(Func);
-      std::string tag = getTagName(FRAME_LABEL);
-      return Func1 + tag;
-    }
-
-    static std::string getRetvalLabel(const std::string &Func) {
-      std::string Func1 = addPrefix(Func);
-      std::string tag = getTagName(RET_LABEL);
-      return Func1 + tag;
-    }
-
-    static std::string getArgsLabel(const std::string &Func) {
-      std::string Func1 = addPrefix(Func);
-      std::string tag = getTagName(ARGS_LABEL);
-      return Func1 + tag;
-    }
-
-    static std::string getTempdataLabel(const std::string &Func) {
-      std::string Func1 = addPrefix(Func);
-      std::string tag = getTagName(TEMPS_LABEL);
-      return Func1 + tag;
-    }
 
-    static std::string getFrameSectionName(const std::string &Func) {
-      std::string Func1 = addPrefix(Func);
-      std::string tag = getTagName(FRAME_SECTION);
-      return Func1 + tag + "# UDATA_OVR";
-    }
-
-    static std::string getAutosSectionName(const std::string &Func) {
-      std::string Func1 = addPrefix(Func);
-      std::string tag = getTagName(AUTOS_SECTION);
-      return Func1 + tag + "# UDATA_OVR";
-    }
-
-    static std::string getCodeSectionName(const std::string &Func) {
-      std::string Func1 = addPrefix(Func);
-      std::string tag = getTagName(CODE_SECTION);
-      return Func1 + tag + "# CODE";
-    }
-
-    // udata, romdata and idata section names are generated by a given number.
-    // @udata.<num>.# 
-    static std::string getUdataSectionName(unsigned num, 
-                                           std::string prefix = "") {
-       std::ostringstream o;
-       o << getTagName(PREFIX_SYMBOL) << prefix << "udata." << num 
-         << ".# UDATA"; 
-       return o.str(); 
-    }
-
-    static std::string getRomdataSectionName(unsigned num,
-                                             std::string prefix = "") {
-       std::ostringstream o;
-       o << getTagName(PREFIX_SYMBOL) << prefix << "romdata." << num 
-         << ".# ROMDATA";
-       return o.str();
-    }
-
-    static std::string getIdataSectionName(unsigned num,
-                                           std::string prefix = "") {
-       std::ostringstream o;
-       o << getTagName(PREFIX_SYMBOL) << prefix << "idata." << num 
-         << ".# IDATA"; 
-       return o.str(); 
-    }
-
-    inline static bool isLocalName (const std::string &Name) {
-      if (getSymbolTag(Name) == AUTOS_LABEL)
-        return true;
-
-      return false;
-    }
-
-    inline static bool isMemIntrinsic (const std::string &Name) {
-      if (Name.compare("@memcpy") == 0 || Name.compare("@memset") == 0 ||
-          Name.compare("@memmove") == 0) {
-        return true;
-      }
-      
-      return false;
-    }
-
-    inline static bool isLocalToFunc (std::string &Func, std::string &Var) {
-      if (! isLocalName(Var)) return false;
-
-      std::string Func1 = addPrefix(Func);
-      // Extract func name of the varilable.
-      const std::string &fname = getFuncNameForSym(Var);
-
-      if (fname.compare(Func1) == 0)
-        return true;
-
-      return false;
-    }
-
-
-    // Get the section for the given external symbol names.
-    // This tries to find the type (Tag) of the symbol from its mangled name
-    // and return appropriate section name for it.
-    static inline std::string getSectionNameForSym(const std::string &Sym1) {
-      std::string Sym = addPrefix(Sym1);
-
-      std::string SectionName;
- 
-      std::string Fname = getFuncNameForSym (Sym);
-      TAGS id = getSymbolTag (Sym);
-
-      switch (id) {
-        default : assert (0 && "Could not determine external symbol type");
-        case FRAME_LABEL:
-        case RET_LABEL:
-        case TEMPS_LABEL:
-        case ARGS_LABEL:  {
-          return getFrameSectionName(Fname);
-        }
-        case AUTOS_LABEL: {
-          return getAutosSectionName(Fname);
-        }
-      }
-    }
-  }; // class PAN.
+  enum PIC16SectionType {
+      CODE,
+      UDATA,
+      IDATA,
+      ROMDATA,
+      UDATA_OVR,
+      UDATA_SHR
+    };
 
 
   // External symbol names require memory to live till the program end.
diff --git a/lib/Target/PIC16/PIC16ABINames.h b/lib/Target/PIC16/PIC16ABINames.h
new file mode 100644
index 0000000..7f4c2f1
--- /dev/null
+++ b/lib/Target/PIC16/PIC16ABINames.h
@@ -0,0 +1,325 @@
+//===-- PIC16ABINames.h - PIC16 Naming conventios for ABI----- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the functions to manage ABI Naming conventions for PIC16. 
+// 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_PIC16ABINAMES_H
+#define LLVM_TARGET_PIC16ABINAMES_H
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <sstream>
+#include <cstring>
+#include <string>
+
+namespace llvm {
+  class PIC16TargetMachine;
+  class FunctionPass;
+  class MachineCodeEmitter;
+  class formatted_raw_ostream;
+
+  // A Central class to manage all ABI naming conventions.
+  // PAN - [P]ic16 [A]BI [N]ames
+  class PAN {
+    public:
+    // Map the name of the symbol to its section name.
+    // Current ABI:
+    // -----------------------------------------------------
+    // ALL Names are prefixed with the symobl '@'.
+    // ------------------------------------------------------
+    // Global variables do not have any '.' in their names.
+    // These are maily function names and global variable names.
+    // Example - @foo,  @i
+    // Static local variables - @<func>.<var>
+    // -------------------------------------------------------
+    // Functions and auto variables.
+    // Names are mangled as <prefix><funcname>.<tag>.<varname>
+    // Where <prefix> is '@' and <tag> is any one of
+    // the following
+    // .auto. - an automatic var of a function.
+    // .temp. - temproray data of a function.
+    // .ret.  - return value label for a function.
+    // .frame. - Frame label for a function where retval, args
+    //           and temps are stored.
+    // .args. - Label used to pass arguments to a direct call.
+    // Example - Function name:   @foo
+    //           Its frame:       @foo.frame.
+    //           Its retval:      @foo.ret.
+    //           Its local vars:  @foo.auto.a
+    //           Its temp data:   @foo.temp.
+    //           Its arg passing: @foo.args.
+    //----------------------------------------------
+    // Libcall - compiler generated libcall names must start with .lib.
+    //           This id will be used to emit extern decls for libcalls.
+    // Example - libcall name:   @.lib.sra.i8
+    //           To pass args:   @.lib.sra.i8.args.
+    //           To return val:  @.lib.sra.i8.ret.
+    //----------------------------------------------
+    // SECTION Names
+    // uninitialized globals - @udata.<num>.#
+    // initialized globals - @idata.<num>.#
+    // Program memory data - @romdata.#
+    // Variables with user defined section name - <user_defined_section>
+    // Variables with user defined address - @<var>.user_section.<address>.#
+    // Function frame - @<func>.frame_section.
+    // Function autos - @<func>.autos_section.
+    // Overlay sections - @<color>.##
+    // Declarations - Enclosed in comments. No section for them.
+    //----------------------------------------------------------
+    
+    // Tags used to mangle different names. 
+    enum TAGS {
+      PREFIX_SYMBOL,
+      GLOBAL,
+      STATIC_LOCAL,
+      AUTOS_LABEL,
+      FRAME_LABEL,
+      RET_LABEL,
+      ARGS_LABEL,
+      TEMPS_LABEL,
+      
+      LIBCALL,
+      
+      FRAME_SECTION,
+      AUTOS_SECTION,
+      CODE_SECTION,
+      USER_SECTION
+    };
+
+    // Textual names of the tags.
+    inline static const char *getTagName(TAGS tag) {
+      switch (tag) {
+      default: return "";
+      case PREFIX_SYMBOL:    return "@";
+      case AUTOS_LABEL:       return ".auto.";
+      case FRAME_LABEL:       return ".frame.";
+      case TEMPS_LABEL:       return ".temp.";
+      case ARGS_LABEL:       return ".args.";
+      case RET_LABEL:       return ".ret.";
+      case LIBCALL:       return ".lib.";
+      case FRAME_SECTION:       return ".frame_section.";
+      case AUTOS_SECTION:       return ".autos_section.";
+      case CODE_SECTION:       return ".code_section.";
+      case USER_SECTION:       return ".user_section.";
+      }
+    }
+
+    // Get tag type for the Symbol.
+    inline static TAGS getSymbolTag(const std::string &Sym) {
+      if (Sym.find(getTagName(TEMPS_LABEL)) != std::string::npos)
+        return TEMPS_LABEL;
+
+      if (Sym.find(getTagName(FRAME_LABEL)) != std::string::npos)
+        return FRAME_LABEL;
+
+      if (Sym.find(getTagName(RET_LABEL)) != std::string::npos)
+        return RET_LABEL;
+
+      if (Sym.find(getTagName(ARGS_LABEL)) != std::string::npos)
+        return ARGS_LABEL;
+
+      if (Sym.find(getTagName(AUTOS_LABEL)) != std::string::npos)
+        return AUTOS_LABEL;
+
+      if (Sym.find(getTagName(LIBCALL)) != std::string::npos)
+        return LIBCALL;
+
+      // It does not have any Tag. So its a true global or static local.
+      if (Sym.find(".") == std::string::npos) 
+        return GLOBAL;
+      
+      // If a . is there, then it may be static local.
+      // We should mangle these as well in clang.
+      if (Sym.find(".") != std::string::npos) 
+        return STATIC_LOCAL;
+ 
+      assert (0 && "Could not determine Symbol's tag");
+      return PREFIX_SYMBOL; // Silence warning when assertions are turned off.
+    }
+
+    // addPrefix - add prefix symbol to a name if there isn't one already.
+    inline static std::string addPrefix (const std::string &Name) {
+      std::string prefix = getTagName (PREFIX_SYMBOL);
+
+      // If this name already has a prefix, nothing to do.
+      if (Name.compare(0, prefix.size(), prefix) == 0)
+        return Name;
+
+      return prefix + Name;
+    }
+
+    // Get mangled func name from a mangled sym name.
+    // In all cases func name is the first component before a '.'.
+    static inline std::string getFuncNameForSym(const std::string &Sym1) {
+      assert (getSymbolTag(Sym1) != GLOBAL && "not belongs to a function");
+
+      std::string Sym = addPrefix(Sym1);
+
+      // Position of the . after func name. That's where func name ends.
+      size_t func_name_end = Sym.find ('.');
+
+      return Sym.substr (0, func_name_end);
+    }
+
+    // Get Frame start label for a func.
+    static std::string getFrameLabel(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(FRAME_LABEL);
+      return Func1 + tag;
+    }
+
+    static std::string getRetvalLabel(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(RET_LABEL);
+      return Func1 + tag;
+    }
+
+    static std::string getArgsLabel(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(ARGS_LABEL);
+      return Func1 + tag;
+    }
+
+    static std::string getTempdataLabel(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(TEMPS_LABEL);
+      return Func1 + tag;
+    }
+
+    static std::string getFrameSectionName(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(FRAME_SECTION);
+      return Func1 + tag + "#";
+    }
+
+    static std::string getAutosSectionName(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(AUTOS_SECTION);
+      return Func1 + tag + "#";
+    }
+
+    static std::string getCodeSectionName(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(CODE_SECTION);
+      return Func1 + tag + "#";
+    }
+
+    static std::string getUserSectionName(const std::string &Name) {
+      std::string sname = addPrefix(Name);;
+      std::string tag = getTagName(USER_SECTION);
+      return sname + tag + "#";
+    }
+
+    // udata, romdata and idata section names are generated by a given number.
+    // @udata.<num>.# 
+    static std::string getUdataSectionName(unsigned num, 
+                                           std::string prefix = "") {
+       std::ostringstream o;
+       o << getTagName(PREFIX_SYMBOL) << prefix << "udata." << num 
+         << ".#"; 
+       return o.str(); 
+    }
+
+    static std::string getRomdataSectionName() {
+      return "romdata.#";
+    }
+
+    static std::string getRomdataSectionName(unsigned num,
+                                             std::string prefix = "") {
+       std::ostringstream o;
+       o << getTagName(PREFIX_SYMBOL) << prefix << "romdata." << num 
+         << ".#";
+       return o.str();
+    }
+
+    static std::string getIdataSectionName(unsigned num,
+                                           std::string prefix = "") {
+       std::ostringstream o;
+       o << getTagName(PREFIX_SYMBOL) << prefix << "idata." << num 
+         << ".#"; 
+       return o.str(); 
+    }
+
+    inline static bool isLocalName (const std::string &Name) {
+      if (getSymbolTag(Name) == AUTOS_LABEL)
+        return true;
+
+      return false;
+    }
+
+    inline static bool isMemIntrinsic (const std::string &Name) {
+      if (Name.compare("@memcpy") == 0 || Name.compare("@memset") == 0 ||
+          Name.compare("@memmove") == 0) {
+        return true;
+      }
+      
+      return false;
+    }
+
+    inline static bool isLocalToFunc (std::string &Func, std::string &Var) {
+      if (! isLocalName(Var)) return false;
+
+      std::string Func1 = addPrefix(Func);
+      // Extract func name of the varilable.
+      const std::string &fname = getFuncNameForSym(Var);
+
+      if (fname.compare(Func1) == 0)
+        return true;
+
+      return false;
+    }
+
+
+    // Get the section for the given external symbol names.
+    // This tries to find the type (Tag) of the symbol from its mangled name
+    // and return appropriate section name for it.
+    static inline std::string getSectionNameForSym(const std::string &Sym1) {
+      std::string Sym = addPrefix(Sym1);
+
+      std::string SectionName;
+ 
+      std::string Fname = getFuncNameForSym (Sym);
+      TAGS id = getSymbolTag (Sym);
+
+      switch (id) {
+        default : assert (0 && "Could not determine external symbol type");
+        case FRAME_LABEL:
+        case RET_LABEL:
+        case TEMPS_LABEL:
+        case ARGS_LABEL:  {
+          return getFrameSectionName(Fname);
+        }
+        case AUTOS_LABEL: {
+          return getAutosSectionName(Fname);
+        }
+      }
+    }
+
+    /// Return Overlay Name for the section.
+    /// The ABI Convention is: @<Color>.##.<section_tag>
+    /// The section_tag is retrieved from the SectName parameter and
+    /// and Color is passed in parameter.
+    static inline std::string  getOverlayName(std::string SectName, int Color) {
+      // FIXME: Only autos_section and frame_section are colored.
+      // So check and assert if the passed SectName does not have AUTOS_SECTION
+      // or FRAME_SECTION tag in it.
+      std::ostringstream o;
+      o << getTagName(PREFIX_SYMBOL) << Color << ".##" 
+        << SectName.substr(SectName.find("."));
+
+      return o.str();
+    } 
+  }; // class PAN.
+} // end namespace llvm;
+
+#endif
diff --git a/lib/Target/PIC16/PIC16DebugInfo.cpp b/lib/Target/PIC16/PIC16DebugInfo.cpp
index 961caed..0ed44d2 100644
--- a/lib/Target/PIC16/PIC16DebugInfo.cpp
+++ b/lib/Target/PIC16/PIC16DebugInfo.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PIC16.h"
+#include "PIC16ABINames.h"
 #include "PIC16DebugInfo.h" 
 #include "llvm/GlobalVariable.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -30,10 +31,10 @@ void PIC16DbgInfo::PopulateDebugInfo (DIType Ty, unsigned short &TypeNo,
                                       std::string &TagName) {
   if (Ty.isBasicType())
     PopulateBasicTypeInfo (Ty, TypeNo);
-  else if (Ty.isDerivedType())
-    PopulateDerivedTypeInfo (Ty, TypeNo, HasAux, Aux, TagName);
   else if (Ty.isCompositeType())
     PopulateCompositeTypeInfo (Ty, TypeNo, HasAux, Aux, TagName);
+  else if (Ty.isDerivedType())
+    PopulateDerivedTypeInfo (Ty, TypeNo, HasAux, Aux, TagName);
   else {
     TypeNo = PIC16Dbg::T_NULL;
     HasAux = false;
@@ -190,7 +191,7 @@ unsigned PIC16DbgInfo::GetTypeDebugNumber(std::string &type)  {
 ///
 short PIC16DbgInfo::getStorageClass(DIGlobalVariable DIGV) {
   short ClassNo;
-  if (PAN::isLocalName(DIGV.getGlobal()->getName())) {
+  if (PAN::isLocalName(DIGV.getName())) {
     // Generating C_AUTO here fails due to error in linker. Change it once
     // linker is fixed.
     ClassNo = PIC16Dbg::C_STAT;
@@ -446,7 +447,8 @@ void PIC16DbgInfo::EmitVarDebugInfo(Module &M) {
     bool HasAux = false;
     int Aux[PIC16Dbg::AuxSize] = { 0 };
     std::string TagName = "";
-    std::string VarName = MAI->getGlobalPrefix()+DIGV.getGlobal()->getNameStr();
+    std::string VarName = DIGV.getName();
+    VarName = MAI->getGlobalPrefix() + VarName;
     PopulateDebugInfo(Ty, TypeNo, HasAux, Aux, TagName);
     // Emit debug info only if type information is availaible.
     if (TypeNo != PIC16Dbg::T_NULL) {
diff --git a/lib/Target/PIC16/PIC16ISelLowering.cpp b/lib/Target/PIC16/PIC16ISelLowering.cpp
index bf986b1..635befe 100644
--- a/lib/Target/PIC16/PIC16ISelLowering.cpp
+++ b/lib/Target/PIC16/PIC16ISelLowering.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "pic16-lower"
+#include "PIC16ABINames.h"
 #include "PIC16ISelLowering.h"
 #include "PIC16TargetObjectFile.h"
 #include "PIC16TargetMachine.h"
diff --git a/lib/Target/PIC16/PIC16InstrInfo.cpp b/lib/Target/PIC16/PIC16InstrInfo.cpp
index cb0c41b..87bd3d9 100644
--- a/lib/Target/PIC16/PIC16InstrInfo.cpp
+++ b/lib/Target/PIC16/PIC16InstrInfo.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PIC16.h"
+#include "PIC16ABINames.h"
 #include "PIC16InstrInfo.h"
 #include "PIC16TargetMachine.h"
 #include "PIC16GenInstrInfo.inc"
diff --git a/lib/Target/PIC16/PIC16MCAsmInfo.cpp b/lib/Target/PIC16/PIC16MCAsmInfo.cpp
index a17d1a8..827315e 100644
--- a/lib/Target/PIC16/PIC16MCAsmInfo.cpp
+++ b/lib/Target/PIC16/PIC16MCAsmInfo.cpp
@@ -16,6 +16,7 @@
 // FIXME: Layering violation to get enums and static function, should be moved
 // to separate headers.
 #include "PIC16.h"
+#include "PIC16ABINames.h"
 #include "PIC16ISelLowering.h"
 using namespace llvm;
 
diff --git a/lib/Target/PIC16/PIC16MemSelOpt.cpp b/lib/Target/PIC16/PIC16MemSelOpt.cpp
index c9ebb57..a97dc35 100644
--- a/lib/Target/PIC16/PIC16MemSelOpt.cpp
+++ b/lib/Target/PIC16/PIC16MemSelOpt.cpp
@@ -21,6 +21,7 @@
 
 #define DEBUG_TYPE "pic16-codegen"
 #include "PIC16.h"
+#include "PIC16ABINames.h"
 #include "PIC16InstrInfo.h"
 #include "PIC16MCAsmInfo.h"
 #include "PIC16TargetMachine.h"
diff --git a/lib/Target/PIC16/PIC16Passes/Makefile b/lib/Target/PIC16/PIC16Passes/Makefile
new file mode 100644
index 0000000..cbb34b3
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Passes/Makefile
@@ -0,0 +1,17 @@
+##===- lib/Target/PIC16/PIC16Passes/Makefile -----------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source 
+# License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+TARGET = PIC16
+LIBRARYNAME = LLVMpic16passes
+BUILD_ARCHIVE = 1
+
+
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp
new file mode 100644
index 0000000..197c398
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp
@@ -0,0 +1,181 @@
+//===-- PIC16Overlay.cpp - Implementation for PIC16 Frame Overlay===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 Frame Overlay implementation.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/Value.h"
+#include "PIC16Overlay.h"
+#include "llvm/Function.h"
+#include <cstdlib>
+#include <sstream>
+using namespace llvm;
+
+namespace llvm {
+  char PIC16FrameOverlay::ID = 0;
+  ModulePass *createPIC16OverlayPass() { return new PIC16FrameOverlay(); }
+}
+
+void PIC16FrameOverlay::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<CallGraph>();
+}
+
+void PIC16FrameOverlay::DFSTraverse(CallGraphNode *CGN, unsigned Depth) {
+  // Do not set any color for external calling node.
+  if (Depth != 0 && CGN->getFunction()) {
+    unsigned Color = getColor(CGN->getFunction());
+
+    // Handle indirectly called functions
+    if (Color >= PIC16Overlay::StartIndirectCallColor || 
+        Depth >= PIC16Overlay::StartIndirectCallColor) {
+      // All functions called from an indirectly called function are given
+      // an unique color.
+      if (Color < PIC16Overlay::StartIndirectCallColor &&
+          Depth >= PIC16Overlay::StartIndirectCallColor)
+        setColor(CGN->getFunction(), Depth);
+
+      for (unsigned int i = 0; i < CGN->size(); i++)
+        DFSTraverse((*CGN)[i], ++IndirectCallColor);
+      return;
+    }
+    // Just return if the node already has a color greater than the current 
+    // depth. A node must be colored with the maximum depth that it has.
+    if (Color >= Depth)
+      return;
+    
+    Depth = ModifyDepthForInterrupt(CGN, Depth);  
+    setColor(CGN->getFunction(), Depth);
+  }
+  
+  // Color all children of this node with color depth+1.
+  for (unsigned int i = 0; i < CGN->size(); i++)
+    DFSTraverse((*CGN)[i], Depth+1);
+}
+
+unsigned PIC16FrameOverlay::ModifyDepthForInterrupt(CallGraphNode *CGN,
+                                                    unsigned Depth) {
+  Function *Fn = CGN->getFunction();
+
+  // Return original Depth if function or section for function do not exist.
+  if (!Fn || !Fn->hasSection())
+    return Depth;
+
+  // Return original Depth if this function is not marked as interrupt.
+  if (Fn->getSection().find("interrupt") == string::npos)
+    return Depth;
+
+  Depth = Depth + InterruptDepth;
+  return Depth;
+}
+
+void PIC16FrameOverlay::setColor(Function *Fn, unsigned Color) {
+  std::string Section = "";
+  if (Fn->hasSection())
+    Section = Fn->getSection();
+
+  size_t Pos = Section.find(OverlayStr);
+
+  // Convert Color to string.
+  std::stringstream ss;
+  ss << Color;
+  std::string ColorString = ss.str();
+
+  // If color is already set then reset it with the new value. Else append 
+  // the Color string to section.
+  if (Pos != std::string::npos) {
+    Pos += OverlayStr.length();
+    char c = Section.at(Pos);
+    unsigned OldColorLength = 0;  
+    while (c >= '0' && c<= '9') {
+      OldColorLength++;    
+      if (Pos < Section.length() - 1)
+        Pos++;
+      else
+        break;
+      c = Section.at(Pos);
+    }
+    // Replace old color with new one.
+    Section.replace(Pos-OldColorLength +1, OldColorLength, ColorString); 
+  }
+  else {
+    // Append Color information to section string.
+    if (Fn->hasSection())
+      Section.append(" ");
+    Section.append(OverlayStr + ColorString);
+  }
+  Fn->setSection(Section);
+}
+
+unsigned PIC16FrameOverlay::getColor(Function *Fn) {
+  int Color = 0;
+  if (!Fn->hasSection())
+    return 0;
+
+  std::string Section = Fn->getSection();
+  size_t Pos = Section.find(OverlayStr);
+  
+  // Return 0 if Color is not set.
+  if (Pos == std::string::npos)
+    return 0;
+
+  // Set Pos to after "Overlay=".
+  Pos += OverlayStr.length();
+  char c = Section.at(Pos);
+  std::string ColorString = "";
+
+  // Find the string representing Color. A Color can only consist of digits.
+  while (c >= '0' && c<= '9') { 
+    ColorString.append(1,c);
+    if (Pos < Section.length() - 1)
+      Pos++;
+    else
+      break;
+    c = Section.at(Pos);
+  }
+  Color = atoi(ColorString.c_str());
+  
+  return Color;    
+}
+
+bool PIC16FrameOverlay::runOnModule(Module &M) {
+  CallGraph &CG = getAnalysis<CallGraph>();
+  CallGraphNode *ECN = CG.getExternalCallingNode();
+
+  MarkIndirectlyCalledFunctions(M); 
+  // Since External Calling Node is the base function, do a depth first 
+  // traversal of CallGraph with ECN as root. Each node with be marked with 
+  // a color that is max(color(callers)) + 1.
+  if(ECN) {
+    DFSTraverse(ECN, 0);
+  }
+  return false;
+}
+
+void PIC16FrameOverlay::MarkIndirectlyCalledFunctions(Module &M) {
+  // If the use of a function is not a call instruction then this
+  // function might be called indirectly. In that case give it
+  // an unique color.
+  for (Module::iterator MI = M.begin(), E = M.end(); MI != E; ++MI) {
+    for (Value::use_iterator I = MI->use_begin(), E = MI->use_end(); I != E;
+         ++I) {
+      if ((!isa<CallInst>(I) && !isa<InvokeInst>(I))
+          || !CallSite(cast<Instruction>(I)).isCallee(I)) {
+        setColor(MI, ++IndirectCallColor);
+        break;
+      }
+    }
+  }
+}
diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h
new file mode 100644
index 0000000..d70c4e7
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h
@@ -0,0 +1,55 @@
+//===-- PIC16FrameOverlay.h - Interface for PIC16 Frame Overlay -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 Overlay infrastructure.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16FRAMEOVERLAY_H
+#define PIC16FRAMEOVERLAY_H
+ 
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Pass.h"
+#include "llvm/CallGraphSCCPass.h"
+
+using std::string;
+using namespace llvm;
+
+namespace  llvm {
+  namespace PIC16Overlay {
+    enum OverlayConsts {
+      StartInterruptColor = 200,
+      StartIndirectCallColor = 300
+    }; 
+  }
+  class PIC16FrameOverlay : public ModulePass {
+    std::string OverlayStr;
+    unsigned InterruptDepth;
+    unsigned IndirectCallColor;
+  public:
+    static char ID; // Class identification 
+    PIC16FrameOverlay() : ModulePass(&ID) {
+      OverlayStr = "Overlay=";
+      InterruptDepth = PIC16Overlay::StartInterruptColor;
+      IndirectCallColor = PIC16Overlay::StartIndirectCallColor;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const; 
+    virtual bool runOnModule(Module &M);
+
+  private: 
+    unsigned getColor(Function *Fn);
+    void setColor(Function *Fn, unsigned Color);
+    unsigned ModifyDepthForInterrupt(CallGraphNode *CGN, unsigned Depth);
+    void MarkIndirectlyCalledFunctions(Module &M);
+    void DFSTraverse(CallGraphNode *CGN, unsigned Depth);
+  };
+}  // End of  namespace
+
+#endif
diff --git a/lib/Target/PIC16/PIC16Section.cpp b/lib/Target/PIC16/PIC16Section.cpp
new file mode 100644
index 0000000..a96ebb8
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Section.cpp
@@ -0,0 +1,96 @@
+//===-- PIC16Section.cpp - PIC16 Section ----------- --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16.h"
+#include "PIC16ABINames.h"
+#include "PIC16Section.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+
+// This is the only way to create a PIC16Section. Sections created here
+// do not need to be explicitly deleted as they are managed by auto_ptrs.
+PIC16Section *PIC16Section::Create(const StringRef &Name,
+                                   PIC16SectionType Ty,
+                                   const std::string &Address, 
+                                   int Color, MCContext &Ctx) {
+
+  /// Determine the internal SectionKind info.
+  /// Users of PIC16Section class should not need to know the internal
+  /// SectionKind. They should work only with PIC16SectionType.
+  ///
+  /// PIC16 Terminology for section kinds is as below.
+  /// UDATA - BSS
+  /// IDATA - initialized data (equiv to Metadata) 
+  /// ROMDATA - ReadOnly.
+  /// UDATA_OVR - Sections that can be overlaid. Section of such type is
+  ///             used to contain function autos an frame. We can think of
+  ///             it as equiv to llvm ThreadBSS)
+  /// UDATA_SHR - Shared RAM. Memory area that is mapped to all banks.
+
+  SectionKind K;
+  switch (Ty) {
+    default: llvm_unreachable ("can not create unknown section type");
+    case UDATA_OVR: {
+      K = SectionKind::getThreadBSS();
+      break;
+    }
+    case UDATA_SHR:
+    case UDATA: {
+      K = SectionKind::getBSS();
+      break;
+    }
+    case ROMDATA:
+    case IDATA: {
+      K = SectionKind::getMetadata();
+      break;
+    }
+    case CODE: {
+      K = SectionKind::getText();
+      break;
+    }
+      
+  }
+
+  // Create the Section.
+  PIC16Section *S = new (Ctx) PIC16Section(Name, K, Address, Color);
+  S->T = Ty;
+  return S;
+}
+
+// A generic way to print all types of sections.
+void PIC16Section::PrintSwitchToSection(const MCAsmInfo &MAI,
+                                          raw_ostream &OS) const {
+ 
+  // If the section is overlaid(i.e. it has a color), print overlay name for 
+  // it. Otherwise print its normal name.
+  if (Color != -1)
+    OS << PAN::getOverlayName(getName(), Color) << '\t';
+  else
+    OS << getName() << '\t';
+
+  // Print type.
+  switch (getType()) {
+  default : llvm_unreachable ("unknown section type"); 
+  case UDATA: OS << "UDATA"; break;
+  case IDATA: OS << "IDATA"; break;
+  case ROMDATA: OS << "ROMDATA"; break;
+  case UDATA_SHR: OS << "UDATA_SHR"; break;
+  case UDATA_OVR: OS << "UDATA_OVR"; break;
+  case CODE: OS << "CODE"; break;
+  }
+
+  OS << '\t';
+
+  // Print Address.
+  OS << Address;
+
+  OS << '\n';
+}
diff --git a/lib/Target/PIC16/PIC16Section.h b/lib/Target/PIC16/PIC16Section.h
new file mode 100644
index 0000000..3a8bbfb
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Section.h
@@ -0,0 +1,92 @@
+//===- PIC16Section.h - PIC16-specific section representation -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PIC16Section class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PIC16SECTION_H
+#define LLVM_PIC16SECTION_H
+
+#include "llvm/MC/MCSection.h"
+#include "llvm/GlobalVariable.h"
+#include <vector>
+
+namespace llvm {
+  /// PIC16Section - Represents a physical section in PIC16 COFF.
+  /// Contains data objects.
+  ///
+  class PIC16Section : public MCSection {
+    /// PIC16 Sections does not really use the SectionKind class to
+    /// to distinguish between various types of sections. PIC16 maintain
+    /// its own Section Type info. See the PIC16SectionType enum in PIC16.h 
+    /// for various section types.
+    PIC16SectionType T;
+
+    /// Name of the section to uniquely identify it.
+    std::string Name;
+
+    /// User can specify an address at which a section should be placed. 
+    /// Negative value here means user hasn't specified any. 
+    std::string Address; 
+
+    /// Overlay information - Sections with same color can be overlaid on
+    /// one another.
+    int Color; 
+
+    /// Total size of all data objects contained here.
+    unsigned Size;
+    
+    PIC16Section(const StringRef &name, SectionKind K, const std::string &addr, 
+                 int color)
+      : MCSection(K), Name(name), Address(addr), Color(color) {
+    }
+    
+  public:
+    /// Return the name of the section.
+    const std::string &getName() const { return Name; }
+
+    /// Return the Address of the section.
+    const std::string &getAddress() const { return Address; }
+
+    /// Return the Color of the section.
+    int getColor() const { return Color; }
+    void setColor(int color) { Color = color; }
+
+    /// Return the size of the section.
+    unsigned getSize() const { return Size; }
+    void setSize(unsigned size) { Size = size; }
+
+    /// Conatined data objects.
+    std::vector<const GlobalVariable *>Items;
+
+    /// Check section type. 
+    bool isUDATA_Type() const { return T == UDATA; }
+    bool isIDATA_Type() const { return T == IDATA; }
+    bool isROMDATA_Type() const { return T == ROMDATA; }
+    bool isUDATA_OVR_Type() const { return T == UDATA_OVR; }
+    bool isUDATA_SHR_Type() const { return T == UDATA_SHR; }
+    bool isCODE_Type() const { return T == CODE; }
+
+    PIC16SectionType getType() const { return T; }
+
+    /// This would be the only way to create a section. 
+    static PIC16Section *Create(const StringRef &Name, PIC16SectionType Ty, 
+                                const std::string &Address, int Color, 
+                                MCContext &Ctx);
+    
+    /// Override this as PIC16 has its own way of printing switching
+    /// to a section.
+    virtual void PrintSwitchToSection(const MCAsmInfo &MAI,
+                                      raw_ostream &OS) const;
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PIC16/PIC16TargetObjectFile.cpp b/lib/Target/PIC16/PIC16TargetObjectFile.cpp
index a2a4c09..7eedf7f 100644
--- a/lib/Target/PIC16/PIC16TargetObjectFile.cpp
+++ b/lib/Target/PIC16/PIC16TargetObjectFile.cpp
@@ -8,9 +8,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "PIC16TargetObjectFile.h"
-#include "MCSectionPIC16.h"
 #include "PIC16ISelLowering.h"
 #include "PIC16TargetMachine.h"
+#include "PIC16Section.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Module.h"
 #include "llvm/MC/MCSection.h"
@@ -19,75 +19,118 @@
 using namespace llvm;
 
 
-MCSectionPIC16 *MCSectionPIC16::Create(const StringRef &Name, SectionKind K, 
-                                       int Address, int Color, MCContext &Ctx) {
-  return new (Ctx) MCSectionPIC16(Name, K, Address, Color);
+PIC16TargetObjectFile::PIC16TargetObjectFile() {
 }
 
+PIC16TargetObjectFile::~PIC16TargetObjectFile() {
+}
 
-void MCSectionPIC16::PrintSwitchToSection(const MCAsmInfo &MAI,
-                                          raw_ostream &OS) const {
-  OS << getName() << '\n';
+/// Find a pic16 section. Return null if not found. Do not create one.
+PIC16Section *PIC16TargetObjectFile::
+findPIC16Section(const std::string &Name) {
+  /// Return if we have an already existing one.
+  PIC16Section *Entry = SectionsByName[Name];
+  if (Entry)
+    return Entry;
+
+  return NULL;
 }
 
 
+/// Find a pic16 section. If not found, create one.
+PIC16Section *PIC16TargetObjectFile::
+getPIC16Section(const std::string &Name, PIC16SectionType Ty, 
+                const std::string &Address, int Color) const {
 
+  /// Return if we have an already existing one.
+  PIC16Section *&Entry = SectionsByName[Name];
+  if (Entry)
+    return Entry;
 
-PIC16TargetObjectFile::PIC16TargetObjectFile()
-  : ExternalVarDecls(0), ExternalVarDefs(0) {
+
+  Entry = PIC16Section::Create(Name, Ty, Address, Color, getContext());
+  return Entry;
 }
 
-const MCSectionPIC16 *PIC16TargetObjectFile::
-getPIC16Section(const char *Name, SectionKind Kind, 
-                int Address, int Color) const {
-  MCSectionPIC16 *&Entry = SectionsByName[Name];
+/// Find a standard pic16 data section. If not found, create one and keep
+/// track of it by adding it to appropriate std section list.
+PIC16Section *PIC16TargetObjectFile::
+getPIC16DataSection(const std::string &Name, PIC16SectionType Ty, 
+                    const std::string &Address, int Color) const {
+
+  /// Return if we have an already existing one.
+  PIC16Section *&Entry = SectionsByName[Name];
   if (Entry)
     return Entry;
 
-  return Entry = MCSectionPIC16::Create(Name, Kind, Address, Color, 
-                                        getContext());
-}
 
+  /// Else create a new one and add it to appropriate section list.
+  Entry = PIC16Section::Create(Name, Ty, Address, Color, getContext());
 
-void PIC16TargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &tm){
-  TargetLoweringObjectFile::Initialize(Ctx, tm);
-  TM = &tm;
-  
-  BSSSection = getPIC16Section("udata.# UDATA", MCSectionPIC16::UDATA_Kind());
-  ReadOnlySection = getPIC16Section("romdata.# ROMDATA", 
-                                    MCSectionPIC16::ROMDATA_Kind());
-  DataSection = getPIC16Section("idata.# IDATA", MCSectionPIC16::IDATA_Kind());
-  
-  // Need because otherwise a .text symbol is emitted by DwarfWriter
-  // in BeginModule, and gpasm cribbs for that .text symbol.
-  TextSection = getPIC16Section("", SectionKind::getText());
+  switch (Ty) {
+  default: llvm_unreachable ("unknow standard section type.");
+  case UDATA: UDATASections_.push_back(Entry); break;
+  case IDATA: IDATASections_.push_back(Entry); break;
+  case ROMDATA: ROMDATASection_ = Entry; break;
+  }
 
-  ROSections.push_back(new PIC16Section((MCSectionPIC16*)ReadOnlySection));
-  
-  // FIXME: I don't know what the classification of these sections really is.
-  // These aren't really objects belonging to any section. Just emit them
-  // in AsmPrinter and remove this code from here. 
-  ExternalVarDecls = new PIC16Section(getPIC16Section("ExternalVarDecls",
-                                      SectionKind::getMetadata()));
-  ExternalVarDefs = new PIC16Section(getPIC16Section("ExternalVarDefs",
-                                      SectionKind::getMetadata()));
+  return Entry;
 }
+    
 
-const MCSection *PIC16TargetObjectFile::
-getSectionForFunction(const std::string &FnName) const {
-  std::string T = PAN::getCodeSectionName(FnName);
-  return getPIC16Section(T.c_str(), SectionKind::getText());
+/// Find a standard pic16 autos section. If not found, create one and keep
+/// track of it by adding it to appropriate std section list.
+PIC16Section *PIC16TargetObjectFile::
+getPIC16AutoSection(const std::string &Name, PIC16SectionType Ty, 
+                    const std::string &Address, int Color) const {
+
+  /// Return if we have an already existing one.
+  PIC16Section *&Entry = SectionsByName[Name];
+  if (Entry)
+    return Entry;
+
+
+  /// Else create a new one and add it to appropriate section list.
+  Entry = PIC16Section::Create(Name, Ty, Address, Color, getContext());
+
+  assert (Ty == UDATA_OVR && "incorrect section type for autos");
+  AUTOSections_.push_back(Entry);
+
+  return Entry;
 }
+    
+/// Find a pic16 user section. If not found, create one and keep
+/// track of it by adding it to appropriate std section list.
+PIC16Section *PIC16TargetObjectFile::
+getPIC16UserSection(const std::string &Name, PIC16SectionType Ty, 
+                    const std::string &Address, int Color) const {
+
+  /// Return if we have an already existing one.
+  PIC16Section *&Entry = SectionsByName[Name];
+  if (Entry)
+    return Entry;
 
 
-const MCSection *PIC16TargetObjectFile::
-getSectionForFunctionFrame(const std::string &FnName) const {
-  std::string T = PAN::getFrameSectionName(FnName);
-  return getPIC16Section(T.c_str(), SectionKind::getDataRel());
+  /// Else create a new one and add it to appropriate section list.
+  Entry = PIC16Section::Create(Name, Ty, Address, Color, getContext());
+
+  USERSections_.push_back(Entry);
+
+  return Entry;
 }
 
+/// Do some standard initialization.
+void PIC16TargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &tm){
+  TargetLoweringObjectFile::Initialize(Ctx, tm);
+  TM = &tm;
+  
+  ROMDATASection_ = NULL;
+}
+
+/// allocateUDATA - Allocate a un-initialized global to an existing or new UDATA
+/// section and return that section.
 const MCSection *
-PIC16TargetObjectFile::getBSSSectionForGlobal(const GlobalVariable *GV) const {
+PIC16TargetObjectFile::allocateUDATA(const GlobalVariable *GV) const {
   assert(GV->hasInitializer() && "This global doesn't need space");
   Constant *C = GV->getInitializer();
   assert(C->isNullValue() && "Unitialized globals has non-zero initializer");
@@ -97,41 +140,37 @@ PIC16TargetObjectFile::getBSSSectionForGlobal(const GlobalVariable *GV) const {
   const Type *Ty = C->getType(); 
   unsigned ValSize = TD->getTypeAllocSize(Ty);
  
-  // Go through all BSS Sections and assign this variable
+  // Go through all UDATA Sections and assign this variable
   // to the first available section having enough space.
-  PIC16Section *FoundBSS = NULL;
-  for (unsigned i = 0; i < BSSSections.size(); i++) {
-    if (DataBankSize - BSSSections[i]->Size >= ValSize) {
-      FoundBSS = BSSSections[i];
+  PIC16Section *Found = NULL;
+  for (unsigned i = 0; i < UDATASections_.size(); i++) {
+    if (DataBankSize - UDATASections_[i]->getSize() >= ValSize) {
+      Found = UDATASections_[i];
       break;
     }
   }
 
-  // No BSS section spacious enough was found. Crate a new one.
-  if (!FoundBSS) {
-    std::string name = PAN::getUdataSectionName(BSSSections.size());
-    const MCSectionPIC16 *NewSection
-      = getPIC16Section(name.c_str(), MCSectionPIC16::UDATA_Kind());
-
-    FoundBSS = new PIC16Section(NewSection);
-
-    // Add this newly created BSS section to the list of BSSSections.
-    BSSSections.push_back(FoundBSS);
+  // No UDATA section spacious enough was found. Crate a new one.
+  if (!Found) {
+    std::string name = PAN::getUdataSectionName(UDATASections_.size());
+    Found = getPIC16DataSection(name.c_str(), UDATA);
   }
   
-  // Insert the GV into this BSS.
-  FoundBSS->Items.push_back(GV);
-  FoundBSS->Size += ValSize;
-  return FoundBSS->S_;
+  // Insert the GV into this UDATA section.
+  Found->Items.push_back(GV);
+  Found->setSize(Found->getSize() + ValSize);
+  return Found;
 } 
 
+/// allocateIDATA - allocate an initialized global into an existing
+/// or new section and return that section.
 const MCSection *
-PIC16TargetObjectFile::getIDATASectionForGlobal(const GlobalVariable *GV) const{
+PIC16TargetObjectFile::allocateIDATA(const GlobalVariable *GV) const{
   assert(GV->hasInitializer() && "This global doesn't need space");
   Constant *C = GV->getInitializer();
   assert(!C->isNullValue() && "initialized globals has zero initializer");
   assert(GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE &&
-         "can split initialized RAM data only");
+         "can allocate initialized RAM data only");
 
   // Find how much space this global needs.
   const TargetData *TD = TM->getTargetData();
@@ -140,64 +179,47 @@ PIC16TargetObjectFile::getIDATASectionForGlobal(const GlobalVariable *GV) const{
  
   // Go through all IDATA Sections and assign this variable
   // to the first available section having enough space.
-  PIC16Section *FoundIDATA = NULL;
-  for (unsigned i = 0; i < IDATASections.size(); i++) {
-    if (DataBankSize - IDATASections[i]->Size >= ValSize) {
-      FoundIDATA = IDATASections[i]; 
+  PIC16Section *Found = NULL;
+  for (unsigned i = 0; i < IDATASections_.size(); i++) {
+    if (DataBankSize - IDATASections_[i]->getSize() >= ValSize) {
+      Found = IDATASections_[i]; 
       break;
     }
   }
 
   // No IDATA section spacious enough was found. Crate a new one.
-  if (!FoundIDATA) {
-    std::string name = PAN::getIdataSectionName(IDATASections.size());
-    const MCSectionPIC16 *NewSection =
-      getPIC16Section(name.c_str(), MCSectionPIC16::IDATA_Kind());
-
-    FoundIDATA = new PIC16Section(NewSection);
-
-    // Add this newly created IDATA section to the list of IDATASections.
-    IDATASections.push_back(FoundIDATA);
+  if (!Found) {
+    std::string name = PAN::getIdataSectionName(IDATASections_.size());
+    Found = getPIC16DataSection(name.c_str(), IDATA);
   }
   
   // Insert the GV into this IDATA.
-  FoundIDATA->Items.push_back(GV);
-  FoundIDATA->Size += ValSize;
-  return FoundIDATA->S_;
+  Found->Items.push_back(GV);
+  Found->setSize(Found->getSize() + ValSize);
+  return Found;
 } 
 
-// Get the section for an automatic variable of a function.
-// For PIC16 they are globals only with mangled names.
+// Allocate a program memory variable into ROMDATA section.
 const MCSection *
-PIC16TargetObjectFile::getSectionForAuto(const GlobalVariable *GV) const {
+PIC16TargetObjectFile::allocateROMDATA(const GlobalVariable *GV) const {
 
-  const std::string name = PAN::getSectionNameForSym(GV->getName());
+  std::string name = PAN::getRomdataSectionName();
+  PIC16Section *S = getPIC16DataSection(name.c_str(), ROMDATA);
 
-  // Go through all Auto Sections and assign this variable
-  // to the appropriate section.
-  PIC16Section *FoundAutoSec = NULL;
-  for (unsigned i = 0; i < AutosSections.size(); i++) {
-    if (AutosSections[i]->S_->getName() == name) {
-      FoundAutoSec = AutosSections[i];
-      break;
-    }
-  }
-
-  // No Auto section was found. Crate a new one.
-  if (!FoundAutoSec) {
-    const MCSectionPIC16 *NewSection =
-      getPIC16Section(name.c_str(), MCSectionPIC16::UDATA_OVR_Kind());
-
-    FoundAutoSec = new PIC16Section(NewSection);
+  S->Items.push_back(GV);
+  return S;
+}
 
-    // Add this newly created autos section to the list of AutosSections.
-    AutosSections.push_back(FoundAutoSec);
-  }
+// Get the section for an automatic variable of a function.
+// For PIC16 they are globals only with mangled names.
+const MCSection *
+PIC16TargetObjectFile::allocateAUTO(const GlobalVariable *GV) const {
 
-  // Insert the auto into this section.
-  FoundAutoSec->Items.push_back(GV);
+  const std::string name = PAN::getSectionNameForSym(GV->getName());
+  PIC16Section *S = getPIC16AutoSection(name.c_str());
 
-  return FoundAutoSec->S_;
+  S->Items.push_back(GV);
+  return S;
 }
 
 
@@ -214,56 +236,35 @@ PIC16TargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV1,
   if (!GV)
     return TargetLoweringObjectFile::SelectSectionForGlobal(GV1, Kind, Mang,TM);
 
-  // Record External Var Decls.
-  if (GV->isDeclaration()) {
-    ExternalVarDecls->Items.push_back(GV);
-    return ExternalVarDecls->S_;
-  }
-    
   assert(GV->hasInitializer() && "A def without initializer?");
 
   // First, if this is an automatic variable for a function, get the section
   // name for it and return.
   std::string name = GV->getName();
   if (PAN::isLocalName(name))
-    return getSectionForAuto(GV);
-
-  // Record Exteranl Var Defs.
-  if (GV->hasExternalLinkage() || GV->hasCommonLinkage())
-    ExternalVarDefs->Items.push_back(GV);
+    return allocateAUTO(GV);
 
   // See if this is an uninitialized global.
   const Constant *C = GV->getInitializer();
   if (C->isNullValue()) 
-    return getBSSSectionForGlobal(GV); 
+    return allocateUDATA(GV);
 
   // If this is initialized data in RAM. Put it in the correct IDATA section.
   if (GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE) 
-    return getIDATASectionForGlobal(GV);
+    return allocateIDATA(GV);
 
   // This is initialized data in rom, put it in the readonly section.
   if (GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE) 
-    return getROSectionForGlobal(GV);
+    return allocateROMDATA(GV);
 
   // Else let the default implementation take care of it.
   return TargetLoweringObjectFile::SelectSectionForGlobal(GV, Kind, Mang,TM);
 }
 
-PIC16TargetObjectFile::~PIC16TargetObjectFile() {
-  for (unsigned i = 0; i < BSSSections.size(); i++)
-    delete BSSSections[i]; 
-  for (unsigned i = 0; i < IDATASections.size(); i++)
-    delete IDATASections[i]; 
-  for (unsigned i = 0; i < AutosSections.size(); i++)
-    delete AutosSections[i]; 
-  for (unsigned i = 0; i < ROSections.size(); i++)
-    delete ROSections[i];
-  delete ExternalVarDecls;
-  delete ExternalVarDefs;
-}
 
 
-/// getSpecialCasedSectionGlobals - Allow the target to completely override
+
+/// getExplicitSectionGlobal - Allow the target to completely override
 /// section assignment of a global.
 const MCSection *PIC16TargetObjectFile::
 getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind, 
@@ -274,167 +275,83 @@ getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
     std::string SectName = GVar->getSection();
     // If address for a variable is specified, get the address and create
     // section.
+    // FIXME: move this attribute checking in PAN.
     std::string AddrStr = "Address=";
     if (SectName.compare(0, AddrStr.length(), AddrStr) == 0) {
       std::string SectAddr = SectName.substr(AddrStr.length());
-      return CreateSectionForGlobal(GVar, Mang, SectAddr);
+      return allocateAtGivenAddress(GVar, SectAddr);
     }
      
     // Create the section specified with section attribute. 
-    return CreateSectionForGlobal(GVar, Mang);
+    return allocateInGivenSection(GVar);
   }
 
-  return getPIC16Section(GV->getSection().c_str(), Kind);
+  return getPIC16DataSection(GV->getSection().c_str(), UDATA);
+}
+
+// Interface used by AsmPrinter to get a code section for a function.
+const PIC16Section *
+PIC16TargetObjectFile::SectionForCode(const std::string &FnName) const {
+  const std::string &sec_name = PAN::getCodeSectionName(FnName);
+  return getPIC16Section(sec_name, CODE);
+}
+
+// Interface used by AsmPrinter to get a frame section for a function.
+const PIC16Section *
+PIC16TargetObjectFile::SectionForFrame(const std::string &FnName) const {
+  const std::string &sec_name = PAN::getFrameSectionName(FnName);
+  return getPIC16Section(sec_name, UDATA_OVR);
 }
 
-// Create a new section for global variable. If Addr is given then create
-// section at that address else create by name.
+// Allocate a global var in existing or new section of given name.
 const MCSection *
-PIC16TargetObjectFile::CreateSectionForGlobal(const GlobalVariable *GV,
-                                              Mangler *Mang,
-                                              const std::string &Addr) const {
+PIC16TargetObjectFile::allocateInGivenSection(const GlobalVariable *GV) const {
+  // Determine the type of section that we need to create.
+  PIC16SectionType SecTy;
+
   // See if this is an uninitialized global.
   const Constant *C = GV->getInitializer();
   if (C->isNullValue())
-    return CreateBSSSectionForGlobal(GV, Addr);
-
+    SecTy = UDATA;
   // If this is initialized data in RAM. Put it in the correct IDATA section.
-  if (GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE)
-    return CreateIDATASectionForGlobal(GV, Addr);
-
+  else if (GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE)
+    SecTy = IDATA;
   // This is initialized data in rom, put it in the readonly section.
-  if (GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE) 
-    return CreateROSectionForGlobal(GV, Addr);
-
-  // Else let the default implementation take care of it.
-  return TargetLoweringObjectFile::SectionForGlobal(GV, Mang, *TM);
+  else if (GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE) 
+    SecTy = ROMDATA;
+  else
+    llvm_unreachable ("Could not determine section type for global");
+
+  PIC16Section *S = getPIC16UserSection(GV->getSection().c_str(), SecTy);
+  S->Items.push_back(GV);
+  return S;
 }
 
-// Create uninitialized section for a variable.
+// Allocate a global var in a new absolute sections at given address.
 const MCSection *
-PIC16TargetObjectFile::CreateBSSSectionForGlobal(const GlobalVariable *GV,
-                                                 std::string Addr) const {
-  assert(GV->hasInitializer() && "This global doesn't need space");
-  assert(GV->getInitializer()->isNullValue() &&
-         "Unitialized global has non-zero initializer");
-  std::string Name;
-  // If address is given then create a section at that address else create a
-  // section by section name specified in GV.
-  PIC16Section *FoundBSS = NULL;
-  if (Addr.empty()) { 
-    Name = GV->getSection() + " UDATA";
-    for (unsigned i = 0; i < BSSSections.size(); i++) {
-      if (BSSSections[i]->S_->getName() == Name) {
-        FoundBSS = BSSSections[i];
-        break;
-      }
-    }
-  } else {
-    std::string Prefix = GV->getNameStr() + "." + Addr + ".";
-    Name = PAN::getUdataSectionName(BSSSections.size(), Prefix) + " " + Addr;
-  }
-  
-  PIC16Section *NewBSS = FoundBSS;
-  if (NewBSS == NULL) {
-    const MCSectionPIC16 *NewSection =
-      getPIC16Section(Name.c_str(), MCSectionPIC16::UDATA_Kind());
-    NewBSS = new PIC16Section(NewSection);
-    BSSSections.push_back(NewBSS);
-  }
+PIC16TargetObjectFile::allocateAtGivenAddress(const GlobalVariable *GV,
+                                               const std::string &Addr) const {
+  // Determine the type of section that we need to create.
+  PIC16SectionType SecTy;
 
-  // Insert the GV into this BSS.
-  NewBSS->Items.push_back(GV);
-
-  // We do not want to put any  GV without explicit section into this section
-  // so set its size to DatabankSize.
-  NewBSS->Size = DataBankSize;
-  return NewBSS->S_;
-}
-
-// Get rom section for a variable. Currently there can be only one rom section
-// unless a variable explicitly requests a section.
-const MCSection *
-PIC16TargetObjectFile::getROSectionForGlobal(const GlobalVariable *GV) const {
-  ROSections[0]->Items.push_back(GV);
-  return ROSections[0]->S_;
-}
-
-// Create initialized data section for a variable.
-const MCSection *
-PIC16TargetObjectFile::CreateIDATASectionForGlobal(const GlobalVariable *GV,
-                                                   std::string Addr) const {
-  assert(GV->hasInitializer() && "This global doesn't need space");
-  assert(!GV->getInitializer()->isNullValue() &&
-         "initialized global has zero initializer");
-  assert(GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE &&
-         "can be used for initialized RAM data only");
-
-  std::string Name;
-  // If address is given then create a section at that address else create a
-  // section by section name specified in GV.
-  PIC16Section *FoundIDATASec = NULL;
-  if (Addr.empty()) {
-    Name = GV->getSection() + " IDATA";
-    for (unsigned i = 0; i < IDATASections.size(); i++) {
-      if (IDATASections[i]->S_->getName() == Name) {
-        FoundIDATASec = IDATASections[i];
-        break;
-      }
-    }
-  } else {
-    std::string Prefix = GV->getNameStr() + "." + Addr + ".";
-    Name = PAN::getIdataSectionName(IDATASections.size(), Prefix) + " " + Addr;
-  }
-
-  PIC16Section *NewIDATASec = FoundIDATASec;
-  if (NewIDATASec == NULL) {
-    const MCSectionPIC16 *NewSection =
-      getPIC16Section(Name.c_str(), MCSectionPIC16::IDATA_Kind());
-    NewIDATASec = new PIC16Section(NewSection);
-    IDATASections.push_back(NewIDATASec);
-  }
-  // Insert the GV into this IDATA Section.
-  NewIDATASec->Items.push_back(GV);
-  // We do not want to put any  GV without explicit section into this section 
-  // so set its size to DatabankSize.
-  NewIDATASec->Size = DataBankSize;
-  return NewIDATASec->S_;
+  // See if this is an uninitialized global.
+  const Constant *C = GV->getInitializer();
+  if (C->isNullValue())
+    SecTy = UDATA;
+  // If this is initialized data in RAM. Put it in the correct IDATA section.
+  else if (GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE)
+    SecTy = IDATA;
+  // This is initialized data in rom, put it in the readonly section.
+  else if (GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE) 
+    SecTy = ROMDATA;
+  else
+    llvm_unreachable ("Could not determine section type for global");
+
+  std::string Prefix = GV->getNameStr() + "." + Addr + ".";
+  std::string SName = PAN::getUserSectionName(Prefix);
+  PIC16Section *S = getPIC16UserSection(SName.c_str(), SecTy, Addr.c_str());
+  S->Items.push_back(GV);
+  return S;
 }
 
-// Create a section in rom for a variable.
-const MCSection *
-PIC16TargetObjectFile::CreateROSectionForGlobal(const GlobalVariable *GV,
-                                                std::string Addr) const {
-  assert(GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE &&
-         "can be used for ROM data only");
-
-  std::string Name;
-  // If address is given then create a section at that address else create a
-  // section by section name specified in GV.
-  PIC16Section *FoundROSec = NULL;
-  if (Addr.empty()) {
-    Name = GV->getSection() + " ROMDATA";
-    for (unsigned i = 1; i < ROSections.size(); i++) {
-      if (ROSections[i]->S_->getName() == Name) {
-        FoundROSec = ROSections[i];
-        break;
-      }
-    }
-  } else {
-    std::string Prefix = GV->getNameStr() + "." + Addr + ".";
-    Name = PAN::getRomdataSectionName(ROSections.size(), Prefix) + " " + Addr;
-  }
-
-  PIC16Section *NewRomSec = FoundROSec;
-  if (NewRomSec == NULL) {
-    const MCSectionPIC16 *NewSection =
-      getPIC16Section(Name.c_str(), MCSectionPIC16::ROMDATA_Kind());
-    NewRomSec = new PIC16Section(NewSection);
-    ROSections.push_back(NewRomSec);
-  }
-
-  // Insert the GV into this ROM Section.
-  NewRomSec->Items.push_back(GV);
-  return NewRomSec->S_;
-}
 
diff --git a/lib/Target/PIC16/PIC16TargetObjectFile.h b/lib/Target/PIC16/PIC16TargetObjectFile.h
index 75f6cce..ca07bed 100644
--- a/lib/Target/PIC16/PIC16TargetObjectFile.h
+++ b/lib/Target/PIC16/PIC16TargetObjectFile.h
@@ -10,6 +10,8 @@
 #ifndef LLVM_TARGET_PIC16_TARGETOBJECTFILE_H
 #define LLVM_TARGET_PIC16_TARGETOBJECTFILE_H
 
+#include "PIC16.h"
+#include "PIC16ABINames.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/ADT/StringMap.h"
 #include <vector>
@@ -19,7 +21,7 @@ namespace llvm {
   class GlobalVariable;
   class Module;
   class PIC16TargetMachine;
-  class MCSectionPIC16;
+  class PIC16Section;
   
   enum { DataBankSize = 80 };
 
@@ -29,91 +31,128 @@ namespace llvm {
   /// again and printing only those that match the current section. 
   /// Keeping values inside the sections make printing a section much easier.
   ///
-  /// FIXME: MOVE ALL THIS STUFF TO MCSectionPIC16.
+  /// FIXME: MOVE ALL THIS STUFF TO PIC16Section.
   ///
-  struct PIC16Section {
-    const MCSectionPIC16 *S_; // Connection to actual Section.
-    unsigned Size;  // Total size of the objects contained.
-    bool SectionPrinted;
-    std::vector<const GlobalVariable*> Items;
-    
-    PIC16Section(const MCSectionPIC16 *s) {
-      S_ = s;
-      Size = 0;
-      SectionPrinted = false;
-    }
-    bool isPrinted() const { return SectionPrinted; }
-    void setPrintedStatus(bool status) { SectionPrinted = status; } 
-  };
-  
+
+  /// PIC16TargetObjectFile - PIC16 Object file. Contains data and code
+  /// sections. 
+  // PIC16 Object File has two types of sections.
+  // 1. Standard Sections
+  //    1.1 un-initialized global data 
+  //    1.2 initialized global data
+  //    1.3 program memory data
+  //    1.4 local variables of functions.
+  // 2. User defined sections
+  //    2.1 Objects placed in a specific section. (By _Section() macro)
+  //    2.2 Objects placed at a specific address. (By _Address() macro)
   class PIC16TargetObjectFile : public TargetLoweringObjectFile {
     /// SectionsByName - Bindings of names to allocated sections.
-    mutable StringMap<MCSectionPIC16*> SectionsByName;
+    mutable StringMap<PIC16Section*> SectionsByName;
 
     const TargetMachine *TM;
     
-    const MCSectionPIC16 *getPIC16Section(const char *Name,
-                                          SectionKind K, 
-                                          int Address = -1, 
-                                          int Color = -1) const;
-  public:
-    mutable std::vector<PIC16Section*> BSSSections;
-    mutable std::vector<PIC16Section*> IDATASections;
-    mutable std::vector<PIC16Section*> AutosSections;
-    mutable std::vector<PIC16Section*> ROSections;
-    mutable PIC16Section *ExternalVarDecls;
-    mutable PIC16Section *ExternalVarDefs;
+    /// Lists of sections.
+    /// Standard Data Sections.
+    mutable std::vector<PIC16Section *> UDATASections_;
+    mutable std::vector<PIC16Section *> IDATASections_;
+    mutable PIC16Section * ROMDATASection_;
+
+    /// Standard Auto Sections.
+    mutable std::vector<PIC16Section *> AUTOSections_;
+ 
+    /// User specified sections.
+    mutable std::vector<PIC16Section *> USERSections_;
+
+    
+    /// Find or Create a PIC16 Section, without adding it to any
+    /// section list.
+    PIC16Section *getPIC16Section(const std::string &Name,
+                                   PIC16SectionType Ty, 
+                                   const std::string &Address = "", 
+                                   int Color = -1) const;
+
+    /// Convenience functions. These wrappers also take care of adding 
+    /// the newly created section to the appropriate sections list.
+
+    /// Find or Create PIC16 Standard Data Section.
+    PIC16Section *getPIC16DataSection(const std::string &Name,
+                                       PIC16SectionType Ty, 
+                                       const std::string &Address = "", 
+                                       int Color = -1) const;
+
+    /// Find or Create PIC16 Standard Auto Section.
+    PIC16Section *getPIC16AutoSection(const std::string &Name,
+                                       PIC16SectionType Ty = UDATA_OVR,
+                                       const std::string &Address = "", 
+                                       int Color = -1) const;
+
+    /// Find or Create PIC16 Standard Auto Section.
+    PIC16Section *getPIC16UserSection(const std::string &Name,
+                                       PIC16SectionType Ty, 
+                                       const std::string &Address = "", 
+                                       int Color = -1) const;
+
+    /// Allocate Un-initialized data to a standard UDATA section. 
+    const MCSection *allocateUDATA(const GlobalVariable *GV) const;
+
+    /// Allocate Initialized data to a standard IDATA section. 
+    const MCSection *allocateIDATA(const GlobalVariable *GV) const;
 
+    /// Allocate ROM data to the standard ROMDATA section. 
+    const MCSection *allocateROMDATA(const GlobalVariable *GV) const;
+
+    /// Allocate an AUTO variable to an AUTO section.
+    const MCSection *allocateAUTO(const GlobalVariable *GV) const;
+    
+    /// Allocate DATA in user specified section.
+    const MCSection *allocateInGivenSection(const GlobalVariable *GV) const;
+
+    /// Allocate DATA at user specified address.
+    const MCSection *allocateAtGivenAddress(const GlobalVariable *GV,
+                                            const std::string &Addr) const;
+   
+    public:
     PIC16TargetObjectFile();
     ~PIC16TargetObjectFile();
-    
     void Initialize(MCContext &Ctx, const TargetMachine &TM);
 
-    
+    /// Return the section with the given Name. Null if not found.
+    PIC16Section *findPIC16Section(const std::string &Name);
+
+    /// Override section allocations for user specified sections.
     virtual const MCSection *
     getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind, 
                              Mangler *Mang, const TargetMachine &TM) const;
     
+    /// Select sections for Data and Auto variables(globals).
     virtual const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
                                                     SectionKind Kind,
                                                     Mangler *Mang,
                                                     const TargetMachine&) const;
 
-    const MCSection *getSectionForFunction(const std::string &FnName) const;
-    const MCSection *getSectionForFunctionFrame(const std::string &FnName)const;
-    
-    
-  private:
-    std::string getSectionNameForSym(const std::string &Sym) const;
-
-    const MCSection *getBSSSectionForGlobal(const GlobalVariable *GV) const;
-    const MCSection *getIDATASectionForGlobal(const GlobalVariable *GV) const;
-    const MCSection *getSectionForAuto(const GlobalVariable *GV) const;
-    const MCSection *CreateBSSSectionForGlobal(const GlobalVariable *GV,
-                                               std::string Addr = "") const;
-    const MCSection *CreateIDATASectionForGlobal(const GlobalVariable *GV,
-                                                 std::string Addr = "") const;
-    const MCSection *getROSectionForGlobal(const GlobalVariable *GV) const;
-    const MCSection *CreateROSectionForGlobal(const GlobalVariable *GV,
-                                              std::string Addr = "") const;
-    const MCSection *CreateSectionForGlobal(const GlobalVariable *GV,
-                                            Mangler *Mang,
-                                            const std::string &Addr = "") const;
-  public:
-    void SetSectionForGVs(Module &M);
-    const std::vector<PIC16Section*> &getBSSSections() const {
-      return BSSSections;
+
+    /// Return a code section for a function.
+    const PIC16Section *SectionForCode (const std::string &FnName) const;
+
+    /// Return a frame section for a function.
+    const PIC16Section *SectionForFrame (const std::string &FnName) const;
+
+    /// Accessors for various section lists.
+    const std::vector<PIC16Section *> &UDATASections() const {
+      return UDATASections_;
     }
-    const std::vector<PIC16Section*> &getIDATASections() const {
-      return IDATASections;
+    const std::vector<PIC16Section *> &IDATASections() const {
+      return IDATASections_;
     }
-    const std::vector<PIC16Section*> &getAutosSections() const {
-      return AutosSections;
+    const PIC16Section *ROMDATASection() const {
+      return ROMDATASection_;
     }
-    const std::vector<PIC16Section*> &getROSections() const {
-      return ROSections;
+    const std::vector<PIC16Section *> &AUTOSections() const {
+      return AUTOSections_;
+    }
+    const std::vector<PIC16Section *> &USERSections() const {
+      return USERSections_;
     }
-    
   };
 } // end namespace llvm
 
diff --git a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp
index a0fba86..dc6b852 100644
--- a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp
@@ -1129,7 +1129,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
   // implementation of multiple entry points).  If this doesn't occur, the
   // linker can safely perform dead code stripping.  Since LLVM never generates
   // code that does this, it is always safe to set.
-  O << "\t.subsections_via_symbols\n";
+  OutStreamer.EmitAssemblerFlag(MCStreamer::SubsectionsViaSymbols);
 
   return AsmPrinter::doFinalization(M);
 }
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 89ea9d0..2740387 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -1594,23 +1594,8 @@ int int_char(char m) {if(m>7) return 0; return m;}
 
 //===---------------------------------------------------------------------===//
 
-Instcombine should replace the load with a constant in:
-
-  static const char x[4] = {'a', 'b', 'c', 'd'};
-  
-  unsigned int y(void) {
-    return *(unsigned int *)x;
-  }
-
-It currently only does this transformation when the size of the constant 
-is the same as the size of the integer (so, try x[5]) and the last byte 
-is a null (making it a C string). There's no need for these restrictions.
-
-//===---------------------------------------------------------------------===//
-
-InstCombine's "turn load from constant into constant" optimization should be
-more aggressive in the presence of bitcasts.  For example, because of unions,
-this code:
+libanalysis is not aggressively folding vector bitcasts.  For example, the
+constant expressions generated when compiling this code:
 
 union vec2d {
     double e[2];
@@ -1624,23 +1609,29 @@ vec2d foo () {
     return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v };
 }
 
-Compiles into:
+in X86-32 end up being:
 
-@a = internal constant %0 { [2 x double] 
-           [double 1.000000e+00, double 2.000000e+00] }, align 16
-@b = internal constant %0 { [2 x double]
-           [double 3.000000e+00, double 4.000000e+00] }, align 16
-...
-define void @foo(%struct.vec2d* noalias nocapture sret %agg.result) nounwind {
+define void @foo(%union.vec2d* noalias nocapture sret %agg.result) nounwind ssp {
 entry:
-	%0 = load <2 x double>* getelementptr (%struct.vec2d* 
-           bitcast (%0* @a to %struct.vec2d*), i32 0, i32 0), align 16
-	%1 = load <2 x double>* getelementptr (%struct.vec2d* 
-           bitcast (%0* @b to %struct.vec2d*), i32 0, i32 0), align 16
+  %agg.result.0 = getelementptr %union.vec2d* %agg.result, i32 0, i32 0 ; <<2 x double>*> [#uses=1]
+  store <2 x double> fadd (<2 x double> bitcast (<1 x i128> <i128 85070591730234615870450834276742070272> to <2 x double>), <2 x double> fmul (<2 x double> bitcast (<1 x i128> <i128 85153668479971173112514077617450647552> to <2 x double>), <2 x double> <double 5.000000e+00, double 5.000000e+00>)), <2 x double>* %agg.result.0, align 16
+  ret void
+}
 
+and in X86-64 mode:
 
-Instcombine should be able to optimize away the loads (and thus the globals).
+define %0 @foo() nounwind readnone ssp {
+entry:
+  %mrv5 = insertvalue %0 undef, double extractelement (<2 x double> fadd (<2 x double> bitcast (<1 x i128> <i128 85070591730234615870450834276742070272> to <2 x double>), <2 x double> fmul (<2 x double> bitcast (<1 x i128> <i128 85153668479971173112514077617450647552> to <2 x double>), <2 x double> bitcast (<1 x i128> <i128 85174437667405312423031577302488055808> to <2 x double>))), i32 0), 0 ; <%0> [#uses=1]
+  %mrv6 = insertvalue %0 %mrv5, double extractelement (<2 x double> fadd (<2 x double> bitcast (<1 x i128> <i128 85070591730234615870450834276742070272> to <2 x double>), <2 x double> fmul (<2 x double> bitcast (<1 x i128> <i128 85153668479971173112514077617450647552> to <2 x double>), <2 x double> bitcast (<1 x i128> <i128 85174437667405312423031577302488055808> to <2 x double>))), i32 1), 1 ; <%0> [#uses=1]
+  ret %0 %mrv6
+}
 
-See also PR4973
+//===---------------------------------------------------------------------===//
+
+IPSCCP is propagating elements of first class aggregates, but is not propagating
+the entire aggregate itself.  This leads it to miss opportunities, for example
+in test/Transforms/SCCP/ipsccp-basic.ll:test5b.
 
 //===---------------------------------------------------------------------===//
+
diff --git a/lib/Target/TargetIntrinsicInfo.cpp b/lib/Target/TargetIntrinsicInfo.cpp
index d8da08e..e049a1d 100644
--- a/lib/Target/TargetIntrinsicInfo.cpp
+++ b/lib/Target/TargetIntrinsicInfo.cpp
@@ -12,11 +12,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/StringMap.h"
 using namespace llvm;
 
-TargetIntrinsicInfo::TargetIntrinsicInfo(const char **desc, unsigned count)
-  : Intrinsics(desc), NumIntrinsics(count) {
+TargetIntrinsicInfo::TargetIntrinsicInfo() {
 }
 
 TargetIntrinsicInfo::~TargetIntrinsicInfo() {
 }
+
+unsigned TargetIntrinsicInfo::getIntrinsicID(Function *F) const {
+  const ValueName *ValName = F->getValueName();
+  if (!ValName)
+    return 0;
+  return lookupName(ValName->getKeyData(), ValName->getKeyLength());
+}
diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
index bc70ffe..8ec5b62 100644
--- a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
@@ -57,9 +57,7 @@ void X86ATTInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo) {
   }
 }
 
-void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
-                                     const char *Modifier) {
-  assert(Modifier == 0 && "Modifiers should not be used");
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo) {
   
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isReg()) {
diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h
index 5f28fa4..3180618 100644
--- a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h
@@ -32,8 +32,7 @@ public:
   static const char *getRegisterName(unsigned RegNo);
 
 
-  void printOperand(const MCInst *MI, unsigned OpNo,
-                    const char *Modifier = 0);
+  void printOperand(const MCInst *MI, unsigned OpNo);
   void printMemReference(const MCInst *MI, unsigned Op);
   void printLeaMemReference(const MCInst *MI, unsigned Op);
   void printSSECC(const MCInst *MI, unsigned Op);
diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
index 2a0290d..ae8e6d3 100644
--- a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
+++ b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
@@ -225,7 +225,7 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO) {
     
     std::string Name = Mang->getMangledName(GV, Suffix, Suffix[0] != '\0');
     if (Subtarget->isTargetCygMing()) {
-      X86COFFMachineModuleInfo &COFFMMI = 
+      X86COFFMachineModuleInfo &COFFMMI =
         MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
       COFFMMI.DecorateCygMingName(Name, GV, *TM.getTargetData());
     }
@@ -288,12 +288,12 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO) {
     std::string Name = Mang->makeNameProper(MO.getSymbolName());
     if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
       Name += "$stub";
-      MCSymbol *Sym = OutContext.GetOrCreateSymbol(Name);
+      MCSymbol *Sym = OutContext.GetOrCreateSymbol(StringRef(Name));
       const MCSymbol *&StubSym =
         MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
       if (StubSym == 0) {
         Name.erase(Name.end()-5, Name.end());
-        StubSym = OutContext.GetOrCreateSymbol(Name);
+        StubSym = OutContext.GetOrCreateSymbol(StringRef(Name));
       }
     }
     
@@ -870,49 +870,55 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
     // implementation of multiple entry points).  If this doesn't occur, the
     // linker can safely perform dead code stripping.  Since LLVM never
     // generates code that does this, it is always safe to set.
-    O << "\t.subsections_via_symbols\n";
-  }  
-  
-  if (Subtarget->isTargetCOFF()) {
-    // Necessary for dllexport support
-    std::vector<std::string> DLLExportedFns, DLLExportedGlobals;
+    OutStreamer.EmitAssemblerFlag(MCStreamer::SubsectionsViaSymbols);
+  }
 
-    X86COFFMachineModuleInfo &COFFMMI = 
+  if (Subtarget->isTargetCOFF()) {
+    X86COFFMachineModuleInfo &COFFMMI =
       MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
-    TargetLoweringObjectFileCOFF &TLOFCOFF = 
-      static_cast<TargetLoweringObjectFileCOFF&>(getObjFileLowering());
 
-    for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
-      if (I->hasDLLExportLinkage())
-        DLLExportedFns.push_back(Mang->getMangledName(I));
-    
-    for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
-         I != E; ++I)
-      if (I->hasDLLExportLinkage())
-        DLLExportedGlobals.push_back(Mang->getMangledName(I));
-    
-    if (Subtarget->isTargetCygMing()) {
-      // Emit type information for external functions
-      for (X86COFFMachineModuleInfo::stub_iterator I = COFFMMI.stub_begin(),
+    // Emit type information for external functions
+    for (X86COFFMachineModuleInfo::stub_iterator I = COFFMMI.stub_begin(),
            E = COFFMMI.stub_end(); I != E; ++I) {
-        O << "\t.def\t " << I->getKeyData()
+      O << "\t.def\t " << I->getKeyData()
         << ";\t.scl\t" << COFF::C_EXT
         << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
         << ";\t.endef\n";
-      }
     }
-  
-    // Output linker support code for dllexported globals on windows.
-    if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
-      OutStreamer.SwitchSection(TLOFCOFF.getCOFFSection(".section .drectve",
-                                                        true,
+
+    if (Subtarget->isTargetCygMing()) {
+      // Necessary for dllexport support
+      std::vector<std::string> DLLExportedFns, DLLExportedGlobals;
+
+      TargetLoweringObjectFileCOFF &TLOFCOFF =
+        static_cast<TargetLoweringObjectFileCOFF&>(getObjFileLowering());
+
+      for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
+        if (I->hasDLLExportLinkage()) {
+          std::string Name = Mang->getMangledName(I);
+          COFFMMI.DecorateCygMingName(Name, I, *TM.getTargetData());
+          DLLExportedFns.push_back(Name);
+        }
+
+      for (Module::const_global_iterator I = M.global_begin(),
+             E = M.global_end(); I != E; ++I)
+        if (I->hasDLLExportLinkage()) {
+          std::string Name = Mang->getMangledName(I);
+          COFFMMI.DecorateCygMingName(Name, I, *TM.getTargetData());
+          DLLExportedGlobals.push_back(Mang->getMangledName(I));
+        }
+
+      // Output linker support code for dllexported globals on windows.
+      if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
+        OutStreamer.SwitchSection(TLOFCOFF.getCOFFSection(".section .drectve",
+                                                          true,
                                                    SectionKind::getMetadata()));
-    
-      for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i)
-        O << "\t.ascii \" -export:" << DLLExportedGlobals[i] << ",data\"\n";
-    
-      for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i)
-        O << "\t.ascii \" -export:" << DLLExportedFns[i] << "\"\n";
+        for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i)
+          O << "\t.ascii \" -export:" << DLLExportedGlobals[i] << ",data\"\n";
+
+        for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i)
+          O << "\t.ascii \" -export:" << DLLExportedFns[i] << "\"\n";
+      }
     }
   }
 }
diff --git a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp
index 5ccddf5..d498c57 100644
--- a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp
+++ b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp
@@ -38,10 +38,8 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
 
 
 MCSymbol *X86MCInstLower::GetPICBaseSymbol() const {
-  SmallString<60> Name;
-  raw_svector_ostream(Name) << AsmPrinter.MAI->getPrivateGlobalPrefix()
-    << AsmPrinter.getFunctionNumber() << "$pb";
-  return Ctx.GetOrCreateSymbol(Name.str());
+  return Ctx.GetOrCreateSymbol(Twine(AsmPrinter.MAI->getPrivateGlobalPrefix())+
+                               Twine(AsmPrinter.getFunctionNumber())+"$pb");
 }
 
 
@@ -308,6 +306,8 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
       MI->dump();
       llvm_unreachable("unknown operand type");
     case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
       MCOp = MCOperand::CreateReg(MO.getReg());
       break;
     case MachineOperand::MO_Immediate:
@@ -449,10 +449,9 @@ void X86AsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) {
     //   MYGLOBAL + (. - PICBASE)
     // However, we can't generate a ".", so just emit a new label here and refer
     // to it.  We know that this operand flag occurs at most once per function.
-    SmallString<64> Name;
-    raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix()
-      << "picbaseref" << getFunctionNumber();
-    MCSymbol *DotSym = OutContext.GetOrCreateSymbol(Name.str());
+    const char *Prefix = MAI->getPrivateGlobalPrefix();
+    MCSymbol *DotSym = OutContext.GetOrCreateSymbol(Twine(Prefix)+"picbaseref"+
+                                                    Twine(getFunctionNumber()));
     OutStreamer.EmitLabel(DotSym);
     
     // Now that we have emitted the label, lower the complex operand expression.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index fadc818..e5e7bc8 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2286,6 +2286,8 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
   case ISD::SETNE:   return X86::COND_NE;
   case ISD::SETUO:   return X86::COND_P;
   case ISD::SETO:    return X86::COND_NP;
+  case ISD::SETOEQ:
+  case ISD::SETUNE:  return X86::COND_INVALID;
   }
 }
 
@@ -2389,6 +2391,56 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
   return ::isPSHUFLWMask(M, N->getValueType(0));
 }
 
+/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PALIGNR.
+static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                          bool hasSSSE3) {
+  int i, e = VT.getVectorNumElements();
+  
+  // Do not handle v2i64 / v2f64 shuffles with palignr.
+  if (e < 4 || !hasSSSE3)
+    return false;
+  
+  for (i = 0; i != e; ++i)
+    if (Mask[i] >= 0)
+      break;
+  
+  // All undef, not a palignr.
+  if (i == e)
+    return false;
+
+  // Determine if it's ok to perform a palignr with only the LHS, since we
+  // don't have access to the actual shuffle elements to see if RHS is undef.
+  bool Unary = Mask[i] < (int)e;
+  bool NeedsUnary = false;
+
+  int s = Mask[i] - i;
+  
+  // Check the rest of the elements to see if they are consecutive.
+  for (++i; i != e; ++i) {
+    int m = Mask[i];
+    if (m < 0) 
+      continue;
+    
+    Unary = Unary && (m < (int)e);
+    NeedsUnary = NeedsUnary || (m < s);
+
+    if (NeedsUnary && !Unary)
+      return false;
+    if (Unary && m != ((s+i) & (e-1)))
+      return false;
+    if (!Unary && m != (s+i))
+      return false;
+  }
+  return true;
+}
+
+bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isPALIGNRMask(M, N->getValueType(0), true);
+}
+
 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to SHUFP*.
 static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
@@ -2733,8 +2785,7 @@ bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
 }
 
 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
-/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
-/// instructions.
+/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
 unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   int NumOperands = SVOp->getValueType(0).getVectorNumElements();
@@ -2753,8 +2804,7 @@ unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
 }
 
 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
-/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
-/// instructions.
+/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
 unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   unsigned Mask = 0;
@@ -2770,8 +2820,7 @@ unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
 }
 
 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
-/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
-/// instructions.
+/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
 unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   unsigned Mask = 0;
@@ -2786,6 +2835,23 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
   return Mask;
 }
 
+/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
+unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VVT = N->getValueType(0);
+  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
+  int Val = 0;
+
+  unsigned i, e;
+  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
+    Val = SVOp->getMaskElt(i);
+    if (Val >= 0)
+      break;
+  }
+  return (Val - i) * EltSize;
+}
+
 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
 /// constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
@@ -5502,6 +5568,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
 
   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+  if (X86CC == X86::COND_INVALID)
+    return SDValue();
 
   SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
@@ -5650,8 +5718,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
   SDValue CC;
 
-  if (Cond.getOpcode() == ISD::SETCC)
-    Cond = LowerSETCC(Cond, DAG);
+  if (Cond.getOpcode() == ISD::SETCC) {
+    SDValue NewCond = LowerSETCC(Cond, DAG);
+    if (NewCond.getNode())
+      Cond = NewCond;
+  }
 
   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   // setting operand in place of the X86ISD::SETCC.
@@ -5724,8 +5795,11 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
   SDValue CC;
 
-  if (Cond.getOpcode() == ISD::SETCC)
-    Cond = LowerSETCC(Cond, DAG);
+  if (Cond.getOpcode() == ISD::SETCC) {
+    SDValue NewCond = LowerSETCC(Cond, DAG);
+    if (NewCond.getNode())
+      Cond = NewCond;
+  }
 #if 0
   // FIXME: LowerXALUO doesn't handle these!!
   else if (Cond.getOpcode() == X86ISD::ADD  ||
@@ -6274,6 +6348,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     SDValue LHS = Op.getOperand(1);
     SDValue RHS = Op.getOperand(2);
     unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
+    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
     SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                 DAG.getConstant(X86CC, MVT::i8), Cond);
@@ -7274,7 +7349,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   if (VT.getSizeInBits() == 64)
     return false;
 
-  // FIXME: pshufb, blends, palignr, shifts.
+  // FIXME: pshufb, blends, shifts.
   return (VT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
           isMOVLMask(M, VT) ||
@@ -7282,6 +7357,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
           isPSHUFDMask(M, VT) ||
           isPSHUFHWMask(M, VT) ||
           isPSHUFLWMask(M, VT) ||
+          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
           isUNPCKLMask(M, VT) ||
           isUNPCKHMask(M, VT) ||
           isUNPCKL_v_undef_Mask(M, VT) ||
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 2f7b8ba..66a9107 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -323,21 +323,27 @@ namespace llvm {
     /// specifies a shuffle of elements that is suitable for input to MOVDDUP.
     bool isMOVDDUPMask(ShuffleVectorSDNode *N);
 
+    /// isPALIGNRMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a shuffle of elements that is suitable for input to PALIGNR.
+    bool isPALIGNRMask(ShuffleVectorSDNode *N);
+
     /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
     /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
     /// instructions.
     unsigned getShuffleSHUFImmediate(SDNode *N);
 
     /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
-    /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
-    /// instructions.
+    /// the specified VECTOR_SHUFFLE mask with PSHUFHW instruction.
     unsigned getShufflePSHUFHWImmediate(SDNode *N);
 
-    /// getShufflePSHUFKWImmediate - Return the appropriate immediate to shuffle
-    /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
-    /// instructions.
+    /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+    /// the specified VECTOR_SHUFFLE mask with PSHUFLW instruction.
     unsigned getShufflePSHUFLWImmediate(SDNode *N);
 
+    /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
+    /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
+    unsigned getShufflePALIGNRImmediate(SDNode *N);
+
     /// isZeroNode - Returns true if Elt is a constant zero or a floating point
     /// constant +0.0.
     bool isZeroNode(SDValue Elt);
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td
index ef19823..c1b7b8f 100644
--- a/lib/Target/X86/X86Instr64bit.td
+++ b/lib/Target/X86/X86Instr64bit.td
@@ -368,19 +368,15 @@ def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
 // Use movzbl instead of movzbq when the destination is a register; it's
 // equivalent due to implicit zero-extending, and it has a smaller encoding.
 def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
-                   "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
-                   [(set GR64:$dst, (zext GR8:$src))]>, TB;
+                   "", [(set GR64:$dst, (zext GR8:$src))]>, TB;
 def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
-                   "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
-                   [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB;
+                   "", [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB;
 // Use movzwl instead of movzwq when the destination is a register; it's
 // equivalent due to implicit zero-extending, and it has a smaller encoding.
 def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
-                   "movz{wl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
-                   [(set GR64:$dst, (zext GR16:$src))]>, TB;
+                   "", [(set GR64:$dst, (zext GR16:$src))]>, TB;
 def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
-                   "movz{wl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
-                   [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB;
+                   "", [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB;
 
 // There's no movzlq instruction, but movl can be used for this purpose, using
 // implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
@@ -390,11 +386,9 @@ def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
 // necessarily all zero. In such cases, we fall back to these explicit zext
 // instructions.
 def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
-                    "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
-                    [(set GR64:$dst, (zext GR32:$src))]>;
+                    "", [(set GR64:$dst, (zext GR32:$src))]>;
 def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
-                    "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
-                    [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
+                    "", [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
 
 // Any instruction that defines a 32-bit result leaves the high half of the
 // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
@@ -1455,8 +1449,7 @@ def : Pat<(i64 0),
 // Materialize i64 constant where top 32-bits are zero.
 let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
 def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
-                        "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
-                        [(set GR64:$dst, i64immZExt32:$src)]>;
+                        "", [(set GR64:$dst, i64immZExt32:$src)]>;
 
 //===----------------------------------------------------------------------===//
 // Thread Local Storage Instructions
@@ -1515,6 +1508,7 @@ def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$ptr,GR64:$val)
 }
 
 // Optimized codegen when the non-memory output is not used.
+let Defs = [EFLAGS] in {
 // FIXME: Use normal add / sub instructions and add lock prefix dynamically.
 def LOCK_ADD64mr : RI<0x03, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
                       "lock\n\t"
@@ -1544,7 +1538,7 @@ def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst),
 def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
                       "lock\n\t"
                       "dec{q}\t$dst", []>, LOCK;
-
+}
 // Atomic exchange, and, or, xor
 let Constraints = "$val = $dst", Defs = [EFLAGS],
                   usesCustomDAGSchedInserter = 1 in {
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index abdb313..2f14bb0 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -224,10 +224,10 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
 
 class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern>
-      : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSSE3]>;
+      : Ii8<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSSE3]>;
 class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern>
-      : I<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSSE3]>;
+      : Ii8<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSSE3]>;
 
 // SSE4.1 Instruction Templates:
 // 
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 30b57d8..16b2af7 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -3392,11 +3392,9 @@ def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
 // of the register here. This has a smaller encoding and avoids a
 // partial-register update.
 def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
-                   "movs{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
-                   [(set GR16:$dst, (sext GR8:$src))]>, TB;
+                   "", [(set GR16:$dst, (sext GR8:$src))]>, TB;
 def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
-                   "movs{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
-                   [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB;
+                   "", [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB;
 def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
                    "movs{bl|x}\t{$src, $dst|$dst, $src}",
                    [(set GR32:$dst, (sext GR8:$src))]>, TB;
@@ -3414,11 +3412,9 @@ def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
 // of the register here. This has a smaller encoding and avoids a
 // partial-register update.
 def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
-                   "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
-                   [(set GR16:$dst, (zext GR8:$src))]>, TB;
+                   "", [(set GR16:$dst, (zext GR8:$src))]>, TB;
 def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
-                   "movz{bl|x}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
-                   [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB;
+                   "", [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB;
 def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
                    "movz{bl|x}\t{$src, $dst|$dst, $src}",
                    [(set GR32:$dst, (zext GR8:$src))]>, TB;
@@ -3474,9 +3470,8 @@ def MOV8r0   : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins),
                  [(set GR8:$dst, 0)]>;
 // Use xorl instead of xorw since we don't care about the high 16 bits,
 // it's smaller, and it avoids a partial-register update.
-def MOV16r0  : I<0x31, MRMInitReg,  (outs GR16:$dst), (ins),
-                 "xor{l}\t${dst:subreg32}, ${dst:subreg32}",
-                 [(set GR16:$dst, 0)]>;
+def MOV16r0  : I<0x31, MRMInitReg, (outs GR16:$dst), (ins),
+                 "", [(set GR16:$dst, 0)]>;
 def MOV32r0  : I<0x31, MRMInitReg,  (outs GR32:$dst), (ins),
                  "xor{l}\t$dst, $dst",
                  [(set GR32:$dst, 0)]>;
@@ -3598,6 +3593,7 @@ def LXADD8  : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins i8mem:$ptr, GR8:$val),
 
 // Optimized codegen when the non-memory output is not used.
 // FIXME: Use normal add / sub instructions and add lock prefix dynamically.
+let Defs = [EFLAGS] in {
 def LOCK_ADD8mr  : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
                     "lock\n\t"
                     "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
@@ -3667,6 +3663,7 @@ def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst),
 def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst),
                     "lock\n\t"
                     "dec{l}\t$dst", []>, LOCK;
+}
 
 // Atomic exchange, and, or, xor
 let Constraints = "$val = $dst", Defs = [EFLAGS],
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 96fc932..f4e97c9 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -197,6 +197,12 @@ def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{
   return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
 }]>;
 
+// SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to
+// a PALIGNR imm.
+def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{
+  return getI8Imm(X86::getShufflePALIGNRImmediate(N));
+}]>;
+
 def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
                        (vector_shuffle node:$lhs, node:$rhs), [{
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
@@ -283,6 +289,11 @@ def pshuflw : PatFrag<(ops node:$lhs, node:$rhs),
   return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N));
 }], SHUFFLE_get_pshuflw_imm>;
 
+def palign : PatFrag<(ops node:$lhs, node:$rhs),
+                     (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isPALIGNRMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_palign_imm>;
+
 //===----------------------------------------------------------------------===//
 // SSE scalar FP Instructions
 //===----------------------------------------------------------------------===//
@@ -2062,6 +2073,7 @@ defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
 defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;
 
 // Shuffle and unpack instructions
+let AddedComplexity = 5 in {
 def PSHUFDri : PDIi8<0x70, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
                      "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -2073,6 +2085,7 @@ def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
                      [(set VR128:$dst, (v4i32 (pshufd:$src2
                                              (bc_v4i32(memopv2i64 addr:$src1)),
                                              (undef))))]>;
+}                                             
 
 // SSE2 with ImmT == Imm8 and XS prefix.
 def PSHUFHWri : Ii8<0x70, MRMSrcReg,
@@ -2839,6 +2852,26 @@ let Constraints = "$src1 = $dst" in {
                               imm:$src3))]>, OpSize;
 }
 
+// palignr patterns.
+let AddedComplexity = 5 in {
+def : Pat<(v4i32 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+def : Pat<(v4f32 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+def : Pat<(v8i16 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+def : Pat<(v16i8 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+}      
+
 def : Pat<(X86pshufb VR128:$src, VR128:$mask),
           (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>;
 def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index fb76aeb..9525f04 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -382,7 +382,6 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS,
   , HasFMA4(false)
   , IsBTMemSlow(false)
   , DarwinVers(0)
-  , IsLinux(false)
   , stackAlignment(8)
   // FIXME: this is a known good value for Yonah. How about others?
   , MaxInlineSizeThreshold(128)
@@ -434,7 +433,6 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS,
     } else if (TT.find("linux") != std::string::npos) {
       // Linux doesn't imply ELF, but we don't currently support anything else.
       TargetType = isELF;
-      IsLinux = true;
     } else if (TT.find("cygwin") != std::string::npos) {
       TargetType = isCygwin;
     } else if (TT.find("mingw") != std::string::npos) {
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index a2e368d..e64b854 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -82,9 +82,6 @@ protected:
   /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc.
   unsigned char DarwinVers; // Is any darwin-x86 platform.
 
-  /// isLinux - true if this is a "linux" platform.
-  bool IsLinux;
-
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned stackAlignment;
@@ -195,11 +192,7 @@ public:
   /// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard,
   /// 10 = Snow Leopard, etc.
   unsigned getDarwinVers() const { return DarwinVers; }
-  
-  /// isLinux - Return true if the target is "Linux".
-  bool isLinux() const { return IsLinux; }
-
-  
+    
   /// ClassifyGlobalReference - Classify a global variable reference for the
   /// current subtarget according to how we should reference it in a non-pcrel
   /// context.
@@ -222,6 +215,14 @@ public:
   /// indicating the number of scheduling cycles of backscheduling that
   /// should be attempted.
   unsigned getSpecialAddressLatency() const;
+
+  /// enablePostRAScheduler - X86 target is enabling post-alloc scheduling
+  /// at 'More' optimization level.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtarget::AntiDepBreakMode& mode) const {
+    mode = TargetSubtarget::ANTIDEP_CRITICAL;
+    return OptLevel >= CodeGenOpt::Default;
+  }
 };
 
 } // End llvm namespace
diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp
index 8000d0d..91534a7 100644
--- a/lib/Transforms/Hello/Hello.cpp
+++ b/lib/Transforms/Hello/Hello.cpp
@@ -30,9 +30,8 @@ namespace {
 
     virtual bool runOnFunction(Function &F) {
       HelloCounter++;
-      std::string fname = F.getName();
-      EscapeString(fname);
-      errs() << "Hello: " << fname << "\n";
+      errs() << "Hello: ";
+      errs().write_escaped(F.getName()) << '\n';
       return false;
     }
   };
@@ -49,9 +48,8 @@ namespace {
 
     virtual bool runOnFunction(Function &F) {
       HelloCounter++;
-      std::string fname = F.getName();
-      EscapeString(fname);
-      errs() << "Hello: " << fname << "\n";
+      errs() << "Hello: ";
+      errs().write_escaped(F.getName()) << '\n';
       return false;
     }
 
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index ec0f1e1..5c28801 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -9,7 +9,6 @@ add_llvm_library(LLVMipo
   GlobalOpt.cpp
   IPConstantPropagation.cpp
   IPO.cpp
-  IndMemRemoval.cpp
   InlineAlways.cpp
   InlineSimple.cpp
   Inliner.cpp
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 7edaa7f..0701b94 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -153,7 +153,7 @@ bool FunctionAttrs::AddReadAttrs(const std::vector<CallGraphNode *> &SCC) {
         // Writes memory.  Just give up.
         return false;
 
-      if (isa<MallocInst>(I))
+      if (isMalloc(I))
         // malloc claims not to write memory!  PR3754.
         return false;
 
@@ -267,11 +267,8 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F,
 
         // Check whether the pointer came from an allocation.
         case Instruction::Alloca:
-        case Instruction::Malloc:
           break;
         case Instruction::Call:
-          if (isMalloc(RVI))
-            break;
         case Instruction::Invoke: {
           CallSite CS(RVI);
           if (CS.paramHasAttr(0, Attribute::NoAlias))
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 09f9e7c..8f4e8b3 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -149,8 +149,6 @@ bool GlobalDCE::runOnModule(Module &M) {
   // Make sure that all memory is released
   AliveGlobals.clear();
 
-  // Remove dead metadata.
-  Changed |= M.getContext().RemoveDeadMetadata();
   return Changed;
 }
 
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index a44386e..9ced2e8 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -822,140 +822,18 @@ static void ConstantPropUsersOf(Value *V, LLVMContext &Context) {
 /// malloc, there is no reason to actually DO the malloc.  Instead, turn the
 /// malloc into a global, and any loads of GV as uses of the new global.
 static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
-                                                     MallocInst *MI,
-                                                     LLVMContext &Context) {
-  DEBUG(errs() << "PROMOTING MALLOC GLOBAL: " << *GV << "  MALLOC = " << *MI);
-  ConstantInt *NElements = cast<ConstantInt>(MI->getArraySize());
-
-  if (NElements->getZExtValue() != 1) {
-    // If we have an array allocation, transform it to a single element
-    // allocation to make the code below simpler.
-    Type *NewTy = ArrayType::get(MI->getAllocatedType(),
-                                 NElements->getZExtValue());
-    MallocInst *NewMI =
-      new MallocInst(NewTy, Constant::getNullValue(Type::getInt32Ty(Context)),
-                     MI->getAlignment(), MI->getName(), MI);
-    Value* Indices[2];
-    Indices[0] = Indices[1] = Constant::getNullValue(Type::getInt32Ty(Context));
-    Value *NewGEP = GetElementPtrInst::Create(NewMI, Indices, Indices + 2,
-                                              NewMI->getName()+".el0", MI);
-    MI->replaceAllUsesWith(NewGEP);
-    MI->eraseFromParent();
-    MI = NewMI;
-  }
-
-  // Create the new global variable.  The contents of the malloc'd memory is
-  // undefined, so initialize with an undef value.
-  // FIXME: This new global should have the alignment returned by malloc.  Code
-  // could depend on malloc returning large alignment (on the mac, 16 bytes) but
-  // this would only guarantee some lower alignment.
-  Constant *Init = UndefValue::get(MI->getAllocatedType());
-  GlobalVariable *NewGV = new GlobalVariable(*GV->getParent(), 
-                                             MI->getAllocatedType(), false,
-                                             GlobalValue::InternalLinkage, Init,
-                                             GV->getName()+".body",
-                                             GV,
-                                             GV->isThreadLocal());
-  
-  // Anything that used the malloc now uses the global directly.
-  MI->replaceAllUsesWith(NewGV);
-
-  Constant *RepValue = NewGV;
-  if (NewGV->getType() != GV->getType()->getElementType())
-    RepValue = ConstantExpr::getBitCast(RepValue, 
-                                        GV->getType()->getElementType());
-
-  // If there is a comparison against null, we will insert a global bool to
-  // keep track of whether the global was initialized yet or not.
-  GlobalVariable *InitBool =
-    new GlobalVariable(Context, Type::getInt1Ty(Context), false,
-                       GlobalValue::InternalLinkage,
-                       ConstantInt::getFalse(Context), GV->getName()+".init",
-                       GV->isThreadLocal());
-  bool InitBoolUsed = false;
-
-  // Loop over all uses of GV, processing them in turn.
-  std::vector<StoreInst*> Stores;
-  while (!GV->use_empty())
-    if (LoadInst *LI = dyn_cast<LoadInst>(GV->use_back())) {
-      while (!LI->use_empty()) {
-        Use &LoadUse = LI->use_begin().getUse();
-        if (!isa<ICmpInst>(LoadUse.getUser()))
-          LoadUse = RepValue;
-        else {
-          ICmpInst *CI = cast<ICmpInst>(LoadUse.getUser());
-          // Replace the cmp X, 0 with a use of the bool value.
-          Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", CI);
-          InitBoolUsed = true;
-          switch (CI->getPredicate()) {
-          default: llvm_unreachable("Unknown ICmp Predicate!");
-          case ICmpInst::ICMP_ULT:
-          case ICmpInst::ICMP_SLT:
-            LV = ConstantInt::getFalse(Context);   // X < null -> always false
-            break;
-          case ICmpInst::ICMP_ULE:
-          case ICmpInst::ICMP_SLE:
-          case ICmpInst::ICMP_EQ:
-            LV = BinaryOperator::CreateNot(LV, "notinit", CI);
-            break;
-          case ICmpInst::ICMP_NE:
-          case ICmpInst::ICMP_UGE:
-          case ICmpInst::ICMP_SGE:
-          case ICmpInst::ICMP_UGT:
-          case ICmpInst::ICMP_SGT:
-            break;  // no change.
-          }
-          CI->replaceAllUsesWith(LV);
-          CI->eraseFromParent();
-        }
-      }
-      LI->eraseFromParent();
-    } else {
-      StoreInst *SI = cast<StoreInst>(GV->use_back());
-      // The global is initialized when the store to it occurs.
-      new StoreInst(ConstantInt::getTrue(Context), InitBool, SI);
-      SI->eraseFromParent();
-    }
-
-  // If the initialization boolean was used, insert it, otherwise delete it.
-  if (!InitBoolUsed) {
-    while (!InitBool->use_empty())  // Delete initializations
-      cast<Instruction>(InitBool->use_back())->eraseFromParent();
-    delete InitBool;
-  } else
-    GV->getParent()->getGlobalList().insert(GV, InitBool);
-
-
-  // Now the GV is dead, nuke it and the malloc.
-  GV->eraseFromParent();
-  MI->eraseFromParent();
-
-  // To further other optimizations, loop over all users of NewGV and try to
-  // constant prop them.  This will promote GEP instructions with constant
-  // indices into GEP constant-exprs, which will allow global-opt to hack on it.
-  ConstantPropUsersOf(NewGV, Context);
-  if (RepValue != NewGV)
-    ConstantPropUsersOf(RepValue, Context);
-
-  return NewGV;
-}
-
-/// OptimizeGlobalAddressOfMalloc - This function takes the specified global
-/// variable, and transforms the program as if it always contained the result of
-/// the specified malloc.  Because it is always the result of the specified
-/// malloc, there is no reason to actually DO the malloc.  Instead, turn the
-/// malloc into a global, and any loads of GV as uses of the new global.
-static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
                                                      CallInst *CI,
                                                      BitCastInst *BCI,
                                                      LLVMContext &Context,
                                                      TargetData* TD) {
+  DEBUG(errs() << "PROMOTING MALLOC GLOBAL: " << *GV
+               << "  CALL = " << *CI << "  BCI = " << *BCI << '\n');
+
   const Type *IntPtrTy = TD->getIntPtrType(Context);
   
-  DEBUG(errs() << "PROMOTING MALLOC GLOBAL: " << *GV << "  MALLOC = " << *CI);
-
-  ConstantInt *NElements = cast<ConstantInt>(getMallocArraySize(CI,
-                                                                Context, TD));
+  Value* ArraySize = getMallocArraySize(CI, Context, TD);
+  assert(ArraySize && "not a malloc whose array size can be determined");
+  ConstantInt *NElements = cast<ConstantInt>(ArraySize);
   if (NElements->getZExtValue() != 1) {
     // If we have an array allocation, transform it to a single element
     // allocation to make the code below simpler.
@@ -976,9 +854,6 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
 
   // Create the new global variable.  The contents of the malloc'd memory is
   // undefined, so initialize with an undef value.
-  // FIXME: This new global should have the alignment returned by malloc.  Code
-  // could depend on malloc returning large alignment (on the mac, 16 bytes) but
-  // this would only guarantee some lower alignment.
   const Type *MAT = getMallocAllocatedType(CI);
   Constant *Init = UndefValue::get(MAT);
   GlobalVariable *NewGV = new GlobalVariable(*GV->getParent(), 
@@ -1398,185 +1273,6 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
   }
 }
 
-/// PerformHeapAllocSRoA - MI is an allocation of an array of structures.  Break
-/// it up into multiple allocations of arrays of the fields.
-static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, MallocInst *MI,
-                                            LLVMContext &Context){
-  DEBUG(errs() << "SROA HEAP ALLOC: " << *GV << "  MALLOC = " << *MI);
-  const StructType *STy = cast<StructType>(MI->getAllocatedType());
-
-  // There is guaranteed to be at least one use of the malloc (storing
-  // it into GV).  If there are other uses, change them to be uses of
-  // the global to simplify later code.  This also deletes the store
-  // into GV.
-  ReplaceUsesOfMallocWithGlobal(MI, GV);
-  
-  // Okay, at this point, there are no users of the malloc.  Insert N
-  // new mallocs at the same place as MI, and N globals.
-  std::vector<Value*> FieldGlobals;
-  std::vector<MallocInst*> FieldMallocs;
-  
-  for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
-    const Type *FieldTy = STy->getElementType(FieldNo);
-    const Type *PFieldTy = PointerType::getUnqual(FieldTy);
-    
-    GlobalVariable *NGV =
-      new GlobalVariable(*GV->getParent(),
-                         PFieldTy, false, GlobalValue::InternalLinkage,
-                         Constant::getNullValue(PFieldTy),
-                         GV->getName() + ".f" + Twine(FieldNo), GV,
-                         GV->isThreadLocal());
-    FieldGlobals.push_back(NGV);
-    
-    MallocInst *NMI = new MallocInst(FieldTy, MI->getArraySize(),
-                                     MI->getName() + ".f" + Twine(FieldNo), MI);
-    FieldMallocs.push_back(NMI);
-    new StoreInst(NMI, NGV, MI);
-  }
-  
-  // The tricky aspect of this transformation is handling the case when malloc
-  // fails.  In the original code, malloc failing would set the result pointer
-  // of malloc to null.  In this case, some mallocs could succeed and others
-  // could fail.  As such, we emit code that looks like this:
-  //    F0 = malloc(field0)
-  //    F1 = malloc(field1)
-  //    F2 = malloc(field2)
-  //    if (F0 == 0 || F1 == 0 || F2 == 0) {
-  //      if (F0) { free(F0); F0 = 0; }
-  //      if (F1) { free(F1); F1 = 0; }
-  //      if (F2) { free(F2); F2 = 0; }
-  //    }
-  Value *RunningOr = 0;
-  for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
-    Value *Cond = new ICmpInst(MI, ICmpInst::ICMP_EQ, FieldMallocs[i],
-                              Constant::getNullValue(FieldMallocs[i]->getType()),
-                                  "isnull");
-    if (!RunningOr)
-      RunningOr = Cond;   // First seteq
-    else
-      RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", MI);
-  }
-
-  // Split the basic block at the old malloc.
-  BasicBlock *OrigBB = MI->getParent();
-  BasicBlock *ContBB = OrigBB->splitBasicBlock(MI, "malloc_cont");
-  
-  // Create the block to check the first condition.  Put all these blocks at the
-  // end of the function as they are unlikely to be executed.
-  BasicBlock *NullPtrBlock = BasicBlock::Create(Context, "malloc_ret_null",
-                                                OrigBB->getParent());
-  
-  // Remove the uncond branch from OrigBB to ContBB, turning it into a cond
-  // branch on RunningOr.
-  OrigBB->getTerminator()->eraseFromParent();
-  BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB);
-  
-  // Within the NullPtrBlock, we need to emit a comparison and branch for each
-  // pointer, because some may be null while others are not.
-  for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
-    Value *GVVal = new LoadInst(FieldGlobals[i], "tmp", NullPtrBlock);
-    Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal, 
-                              Constant::getNullValue(GVVal->getType()),
-                              "tmp");
-    BasicBlock *FreeBlock = BasicBlock::Create(Context, "free_it", 
-                                               OrigBB->getParent());
-    BasicBlock *NextBlock = BasicBlock::Create(Context, "next", 
-                                               OrigBB->getParent());
-    BranchInst::Create(FreeBlock, NextBlock, Cmp, NullPtrBlock);
-
-    // Fill in FreeBlock.
-    new FreeInst(GVVal, FreeBlock);
-    new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
-                  FreeBlock);
-    BranchInst::Create(NextBlock, FreeBlock);
-    
-    NullPtrBlock = NextBlock;
-  }
-  
-  BranchInst::Create(ContBB, NullPtrBlock);
-  
-  // MI is no longer needed, remove it.
-  MI->eraseFromParent();
-
-  /// InsertedScalarizedLoads - As we process loads, if we can't immediately
-  /// update all uses of the load, keep track of what scalarized loads are
-  /// inserted for a given load.
-  DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues;
-  InsertedScalarizedValues[GV] = FieldGlobals;
-  
-  std::vector<std::pair<PHINode*, unsigned> > PHIsToRewrite;
-  
-  // Okay, the malloc site is completely handled.  All of the uses of GV are now
-  // loads, and all uses of those loads are simple.  Rewrite them to use loads
-  // of the per-field globals instead.
-  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E;) {
-    Instruction *User = cast<Instruction>(*UI++);
-    
-    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
-      RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite,
-                                   Context);
-      continue;
-    }
-    
-    // Must be a store of null.
-    StoreInst *SI = cast<StoreInst>(User);
-    assert(isa<ConstantPointerNull>(SI->getOperand(0)) &&
-           "Unexpected heap-sra user!");
-    
-    // Insert a store of null into each global.
-    for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
-      const PointerType *PT = cast<PointerType>(FieldGlobals[i]->getType());
-      Constant *Null = Constant::getNullValue(PT->getElementType());
-      new StoreInst(Null, FieldGlobals[i], SI);
-    }
-    // Erase the original store.
-    SI->eraseFromParent();
-  }
-
-  // While we have PHIs that are interesting to rewrite, do it.
-  while (!PHIsToRewrite.empty()) {
-    PHINode *PN = PHIsToRewrite.back().first;
-    unsigned FieldNo = PHIsToRewrite.back().second;
-    PHIsToRewrite.pop_back();
-    PHINode *FieldPN = cast<PHINode>(InsertedScalarizedValues[PN][FieldNo]);
-    assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi");
-
-    // Add all the incoming values.  This can materialize more phis.
-    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
-      Value *InVal = PN->getIncomingValue(i);
-      InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
-                               PHIsToRewrite, Context);
-      FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
-    }
-  }
-  
-  // Drop all inter-phi links and any loads that made it this far.
-  for (DenseMap<Value*, std::vector<Value*> >::iterator
-       I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
-       I != E; ++I) {
-    if (PHINode *PN = dyn_cast<PHINode>(I->first))
-      PN->dropAllReferences();
-    else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
-      LI->dropAllReferences();
-  }
-  
-  // Delete all the phis and loads now that inter-references are dead.
-  for (DenseMap<Value*, std::vector<Value*> >::iterator
-       I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
-       I != E; ++I) {
-    if (PHINode *PN = dyn_cast<PHINode>(I->first))
-      PN->eraseFromParent();
-    else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
-      LI->eraseFromParent();
-  }
-  
-  // The old global is now dead, remove it.
-  GV->eraseFromParent();
-
-  ++NumHeapSRA;
-  return cast<GlobalVariable>(FieldGlobals[0]);
-}
-
 /// PerformHeapAllocSRoA - CI is an allocation of an array of structures.  Break
 /// it up into multiple allocations of arrays of the fields.
 static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV,
@@ -1587,6 +1283,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV,
                << " BITCAST = " << *BCI << '\n');
   const Type* MAT = getMallocAllocatedType(CI);
   const StructType *STy = cast<StructType>(MAT);
+  Value* ArraySize = getMallocArraySize(CI, Context, TD);
+  assert(ArraySize && "not a malloc whose array size can be determined");
 
   // There is guaranteed to be at least one use of the malloc (storing
   // it into GV).  If there are other uses, change them to be uses of
@@ -1611,8 +1309,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV,
                          GV->isThreadLocal());
     FieldGlobals.push_back(NGV);
     
-    Value *NMI = CallInst::CreateMalloc(CI, TD->getIntPtrType(Context), FieldTy,
-                                        getMallocArraySize(CI, Context, TD),
+    Value *NMI = CallInst::CreateMalloc(CI, TD->getIntPtrType(Context),
+                                        FieldTy, ArraySize,
                                         BCI->getName() + ".f" + Twine(FieldNo));
     FieldMallocs.push_back(NMI);
     new StoreInst(NMI, NGV, BCI);
@@ -1766,95 +1464,6 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV,
 /// pointer global variable with a single value stored it that is a malloc or
 /// cast of malloc.
 static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
-                                               MallocInst *MI,
-                                               Module::global_iterator &GVI,
-                                               TargetData *TD,
-                                               LLVMContext &Context) {
-  // If this is a malloc of an abstract type, don't touch it.
-  if (!MI->getAllocatedType()->isSized())
-    return false;
-  
-  // We can't optimize this global unless all uses of it are *known* to be
-  // of the malloc value, not of the null initializer value (consider a use
-  // that compares the global's value against zero to see if the malloc has
-  // been reached).  To do this, we check to see if all uses of the global
-  // would trap if the global were null: this proves that they must all
-  // happen after the malloc.
-  if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
-    return false;
-  
-  // We can't optimize this if the malloc itself is used in a complex way,
-  // for example, being stored into multiple globals.  This allows the
-  // malloc to be stored into the specified global, loaded setcc'd, and
-  // GEP'd.  These are all things we could transform to using the global
-  // for.
-  {
-    SmallPtrSet<PHINode*, 8> PHIs;
-    if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(MI, GV, PHIs))
-      return false;
-  }
-  
-  
-  // If we have a global that is only initialized with a fixed size malloc,
-  // transform the program to use global memory instead of malloc'd memory.
-  // This eliminates dynamic allocation, avoids an indirection accessing the
-  // data, and exposes the resultant global to further GlobalOpt.
-  if (ConstantInt *NElements = dyn_cast<ConstantInt>(MI->getArraySize())) {
-    // Restrict this transformation to only working on small allocations
-    // (2048 bytes currently), as we don't want to introduce a 16M global or
-    // something.
-    if (TD &&
-        NElements->getZExtValue()*
-        TD->getTypeAllocSize(MI->getAllocatedType()) < 2048) {
-      GVI = OptimizeGlobalAddressOfMalloc(GV, MI, Context);
-      return true;
-    }
-  }
-  
-  // If the allocation is an array of structures, consider transforming this
-  // into multiple malloc'd arrays, one for each field.  This is basically
-  // SRoA for malloc'd memory.
-  const Type *AllocTy = MI->getAllocatedType();
-  
-  // If this is an allocation of a fixed size array of structs, analyze as a
-  // variable size array.  malloc [100 x struct],1 -> malloc struct, 100
-  if (!MI->isArrayAllocation())
-    if (const ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
-      AllocTy = AT->getElementType();
-  
-  if (const StructType *AllocSTy = dyn_cast<StructType>(AllocTy)) {
-    // This the structure has an unreasonable number of fields, leave it
-    // alone.
-    if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
-        AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, MI)) {
-      
-      // If this is a fixed size array, transform the Malloc to be an alloc of
-      // structs.  malloc [100 x struct],1 -> malloc struct, 100
-      if (const ArrayType *AT = dyn_cast<ArrayType>(MI->getAllocatedType())) {
-        MallocInst *NewMI = 
-          new MallocInst(AllocSTy, 
-                  ConstantInt::get(Type::getInt32Ty(Context),
-                  AT->getNumElements()),
-                         "", MI);
-        NewMI->takeName(MI);
-        Value *Cast = new BitCastInst(NewMI, MI->getType(), "tmp", MI);
-        MI->replaceAllUsesWith(Cast);
-        MI->eraseFromParent();
-        MI = NewMI;
-      }
-      
-      GVI = PerformHeapAllocSRoA(GV, MI, Context);
-      return true;
-    }
-  }
-  
-  return false;
-}  
-
-/// TryToOptimizeStoreOfMallocToGlobal - This function is called when we see a
-/// pointer global variable with a single value stored it that is a malloc or
-/// cast of malloc.
-static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
                                                CallInst *CI,
                                                BitCastInst *BCI,
                                                Module::global_iterator &GVI,
@@ -1892,52 +1501,55 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
   // transform the program to use global memory instead of malloc'd memory.
   // This eliminates dynamic allocation, avoids an indirection accessing the
   // data, and exposes the resultant global to further GlobalOpt.
-  if (ConstantInt *NElements =
-              dyn_cast<ConstantInt>(getMallocArraySize(CI, Context, TD))) {
-    // Restrict this transformation to only working on small allocations
-    // (2048 bytes currently), as we don't want to introduce a 16M global or
-    // something.
-    if (TD && 
-        NElements->getZExtValue() * TD->getTypeAllocSize(AllocTy) < 2048) {
-      GVI = OptimizeGlobalAddressOfMalloc(GV, CI, BCI, Context, TD);
-      return true;
-    }
-  }
-  
-  // If the allocation is an array of structures, consider transforming this
-  // into multiple malloc'd arrays, one for each field.  This is basically
-  // SRoA for malloc'd memory.
-
-  // If this is an allocation of a fixed size array of structs, analyze as a
-  // variable size array.  malloc [100 x struct],1 -> malloc struct, 100
-  if (!isArrayMalloc(CI, Context, TD))
-    if (const ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
-      AllocTy = AT->getElementType();
-  
-  if (const StructType *AllocSTy = dyn_cast<StructType>(AllocTy)) {
-    // This the structure has an unreasonable number of fields, leave it
-    // alone.
-    if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
-        AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, BCI)) {
-
-      // If this is a fixed size array, transform the Malloc to be an alloc of
-      // structs.  malloc [100 x struct],1 -> malloc struct, 100
-      if (const ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI))) {
-        Value* NumElements = ConstantInt::get(Type::getInt32Ty(Context),
-                                              AT->getNumElements());
-        Value* NewMI = CallInst::CreateMalloc(CI, TD->getIntPtrType(Context),
-                                              AllocSTy, NumElements,
-                                              BCI->getName());
-        Value *Cast = new BitCastInst(NewMI, getMallocType(CI), "tmp", CI);
-        BCI->replaceAllUsesWith(Cast);
-        BCI->eraseFromParent();
-        CI->eraseFromParent();
-        BCI = cast<BitCastInst>(NewMI);
-        CI = extractMallocCallFromBitCast(NewMI);
+  Value *NElems = getMallocArraySize(CI, Context, TD);
+  // We cannot optimize the malloc if we cannot determine malloc array size.
+  if (NElems) {
+    if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
+      // Restrict this transformation to only working on small allocations
+      // (2048 bytes currently), as we don't want to introduce a 16M global or
+      // something.
+      if (TD && 
+          NElements->getZExtValue() * TD->getTypeAllocSize(AllocTy) < 2048) {
+        GVI = OptimizeGlobalAddressOfMalloc(GV, CI, BCI, Context, TD);
+        return true;
       }
+  
+    // If the allocation is an array of structures, consider transforming this
+    // into multiple malloc'd arrays, one for each field.  This is basically
+    // SRoA for malloc'd memory.
+
+    // If this is an allocation of a fixed size array of structs, analyze as a
+    // variable size array.  malloc [100 x struct],1 -> malloc struct, 100
+    if (!isArrayMalloc(CI, Context, TD))
+      if (const ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
+        AllocTy = AT->getElementType();
+  
+    if (const StructType *AllocSTy = dyn_cast<StructType>(AllocTy)) {
+      // This the structure has an unreasonable number of fields, leave it
+      // alone.
+      if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
+          AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, BCI)) {
+
+        // If this is a fixed size array, transform the Malloc to be an alloc of
+        // structs.  malloc [100 x struct],1 -> malloc struct, 100
+        if (const ArrayType *AT =
+                              dyn_cast<ArrayType>(getMallocAllocatedType(CI))) {
+          Value* NumElements = ConstantInt::get(Type::getInt32Ty(Context),
+                                                AT->getNumElements());
+          Value* NewMI = CallInst::CreateMalloc(CI, TD->getIntPtrType(Context),
+                                                AllocSTy, NumElements,
+                                                BCI->getName());
+          Value *Cast = new BitCastInst(NewMI, getMallocType(CI), "tmp", CI);
+          BCI->replaceAllUsesWith(Cast);
+          BCI->eraseFromParent();
+          CI->eraseFromParent();
+          BCI = cast<BitCastInst>(NewMI);
+          CI = extractMallocCallFromBitCast(NewMI);
+        }
       
-      GVI = PerformHeapAllocSRoA(GV, CI, BCI, Context, TD);
-      return true;
+        GVI = PerformHeapAllocSRoA(GV, CI, BCI, Context, TD);
+        return true;
+      }
     }
   }
   
@@ -1966,9 +1578,6 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
       // Optimize away any trapping uses of the loaded value.
       if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, Context))
         return true;
-    } else if (MallocInst *MI = dyn_cast<MallocInst>(StoredOnceVal)) {
-      if (TryToOptimizeStoreOfMallocToGlobal(GV, MI, GVI, TD, Context))
-        return true;
     } else if (CallInst *CI = extractMallocCall(StoredOnceVal)) {
       if (getMallocAllocatedType(CI)) {
         BitCastInst* BCI = NULL;
diff --git a/lib/Transforms/IPO/RaiseAllocations.cpp b/lib/Transforms/IPO/RaiseAllocations.cpp
index 4c1f26d..deb4405 100644
--- a/lib/Transforms/IPO/RaiseAllocations.cpp
+++ b/lib/Transforms/IPO/RaiseAllocations.cpp
@@ -1,4 +1,4 @@
-//===- RaiseAllocations.cpp - Convert @malloc & @free calls to insts ------===//
+//===- RaiseAllocations.cpp - Convert @free calls to insts ------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the RaiseAllocations pass which convert malloc and free
-// calls to malloc and free instructions.
+// This file defines the RaiseAllocations pass which convert free calls to free
+// instructions.
 //
 //===----------------------------------------------------------------------===//
 
@@ -29,19 +29,19 @@ using namespace llvm;
 STATISTIC(NumRaised, "Number of allocations raised");
 
 namespace {
-  // RaiseAllocations - Turn @malloc and @free calls into the appropriate
+  // RaiseAllocations - Turn @free calls into the appropriate
   // instruction.
   //
   class VISIBILITY_HIDDEN RaiseAllocations : public ModulePass {
-    Function *MallocFunc;   // Functions in the module we are processing
-    Function *FreeFunc;     // Initialized by doPassInitializationVirt
+    Function *FreeFunc;   // Functions in the module we are processing
+                          // Initialized by doPassInitializationVirt
   public:
     static char ID; // Pass identification, replacement for typeid
     RaiseAllocations() 
-      : ModulePass(&ID), MallocFunc(0), FreeFunc(0) {}
+      : ModulePass(&ID), FreeFunc(0) {}
 
     // doPassInitialization - For the raise allocations pass, this finds a
-    // declaration for malloc and free if they exist.
+    // declaration for free if it exists.
     //
     void doInitialization(Module &M);
 
@@ -61,50 +61,16 @@ ModulePass *llvm::createRaiseAllocationsPass() {
 }
 
 
-// If the module has a symbol table, they might be referring to the malloc and
-// free functions.  If this is the case, grab the method pointers that the
-// module is using.
+// If the module has a symbol table, they might be referring to the free 
+// function.  If this is the case, grab the method pointers that the module is
+// using.
 //
-// Lookup @malloc and @free in the symbol table, for later use.  If they don't
+// Lookup @free in the symbol table, for later use.  If they don't
 // exist, or are not external, we do not worry about converting calls to that
 // function into the appropriate instruction.
 //
 void RaiseAllocations::doInitialization(Module &M) {
-  // Get Malloc and free prototypes if they exist!
-  MallocFunc = M.getFunction("malloc");
-  if (MallocFunc) {
-    const FunctionType* TyWeHave = MallocFunc->getFunctionType();
-
-    // Get the expected prototype for malloc
-    const FunctionType *Malloc1Type = 
-      FunctionType::get(Type::getInt8PtrTy(M.getContext()),
-                      std::vector<const Type*>(1,
-                                      Type::getInt64Ty(M.getContext())), false);
-
-    // Chck to see if we got the expected malloc
-    if (TyWeHave != Malloc1Type) {
-      // Check to see if the prototype is wrong, giving us i8*(i32) * malloc
-      // This handles the common declaration of: 'void *malloc(unsigned);'
-      const FunctionType *Malloc2Type = 
-        FunctionType::get(PointerType::getUnqual(
-                          Type::getInt8Ty(M.getContext())),
-                          std::vector<const Type*>(1, 
-                                      Type::getInt32Ty(M.getContext())), false);
-      if (TyWeHave != Malloc2Type) {
-        // Check to see if the prototype is missing, giving us 
-        // i8*(...) * malloc
-        // This handles the common declaration of: 'void *malloc();'
-        const FunctionType *Malloc3Type = 
-          FunctionType::get(PointerType::getUnqual(
-                                    Type::getInt8Ty(M.getContext())), 
-                                    true);
-        if (TyWeHave != Malloc3Type)
-          // Give up
-          MallocFunc = 0;
-      }
-    }
-  }
-
+  // Get free prototype if it exists!
   FreeFunc = M.getFunction("free");
   if (FreeFunc) {
     const FunctionType* TyWeHave = FreeFunc->getFunctionType();
@@ -138,72 +104,18 @@ void RaiseAllocations::doInitialization(Module &M) {
   }
 
   // Don't mess with locally defined versions of these functions...
-  if (MallocFunc && !MallocFunc->isDeclaration()) MallocFunc = 0;
   if (FreeFunc && !FreeFunc->isDeclaration())     FreeFunc = 0;
 }
 
 // run - Transform calls into instructions...
 //
 bool RaiseAllocations::runOnModule(Module &M) {
-  // Find the malloc/free prototypes...
+  // Find the free prototype...
   doInitialization(M);
   
   bool Changed = false;
 
-  // First, process all of the malloc calls...
-  if (MallocFunc) {
-    std::vector<User*> Users(MallocFunc->use_begin(), MallocFunc->use_end());
-    std::vector<Value*> EqPointers;   // Values equal to MallocFunc
-    while (!Users.empty()) {
-      User *U = Users.back();
-      Users.pop_back();
-
-      if (Instruction *I = dyn_cast<Instruction>(U)) {
-        CallSite CS = CallSite::get(I);
-        if (CS.getInstruction() && !CS.arg_empty() &&
-            (CS.getCalledFunction() == MallocFunc ||
-             std::find(EqPointers.begin(), EqPointers.end(),
-                       CS.getCalledValue()) != EqPointers.end())) {
-
-          Value *Source = *CS.arg_begin();
-
-          // If no prototype was provided for malloc, we may need to cast the
-          // source size.
-          if (Source->getType() != Type::getInt32Ty(M.getContext()))
-            Source = 
-              CastInst::CreateIntegerCast(Source, 
-                                          Type::getInt32Ty(M.getContext()), 
-                                          false/*ZExt*/,
-                                          "MallocAmtCast", I);
-
-          MallocInst *MI = new MallocInst(Type::getInt8Ty(M.getContext()),
-                                          Source, "", I);
-          MI->takeName(I);
-          I->replaceAllUsesWith(MI);
-
-          // If the old instruction was an invoke, add an unconditional branch
-          // before the invoke, which will become the new terminator.
-          if (InvokeInst *II = dyn_cast<InvokeInst>(I))
-            BranchInst::Create(II->getNormalDest(), I);
-
-          // Delete the old call site
-          I->eraseFromParent();
-          Changed = true;
-          ++NumRaised;
-        }
-      } else if (GlobalValue *GV = dyn_cast<GlobalValue>(U)) {
-        Users.insert(Users.end(), GV->use_begin(), GV->use_end());
-        EqPointers.push_back(GV);
-      } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
-        if (CE->isCast()) {
-          Users.insert(Users.end(), CE->use_begin(), CE->use_end());
-          EqPointers.push_back(CE);
-        }
-      }
-    }
-  }
-
-  // Next, process all free calls...
+  // Process all free calls...
   if (FreeFunc) {
     std::vector<User*> Users(FreeFunc->use_begin(), FreeFunc->use_end());
     std::vector<Value*> EqPointers;   // Values equal to FreeFunc
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 77d44b2..57aaf43 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -250,8 +250,6 @@ static bool StripDebugInfo(Module &M) {
   if (NMD)
     NMD->eraseFromParent();
 
-  // Remove dead metadata.
-  M.getContext().RemoveDeadMetadata();
   return true;
 }
 
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index a3e3fea..42209b8 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -73,6 +73,7 @@ namespace {
                             DenseMap<Value*,Value*> &SunkAddrs);
     bool OptimizeInlineAsmInst(Instruction *I, CallSite CS,
                                DenseMap<Value*,Value*> &SunkAddrs);
+    bool MoveExtToFormExtLoad(Instruction *I);
     bool OptimizeExtUses(Instruction *I);
     void findLoopBackEdges(const Function &F);
   };
@@ -731,6 +732,43 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(Instruction *I, CallSite CS,
   return MadeChange;
 }
 
+/// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same
+/// basic block as the load, unless conditions are unfavorable. This allows
+/// SelectionDAG to fold the extend into the load.
+///
+bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) {
+  // Look for a load being extended.
+  LoadInst *LI = dyn_cast<LoadInst>(I->getOperand(0));
+  if (!LI) return false;
+
+  // If they're already in the same block, there's nothing to do.
+  if (LI->getParent() == I->getParent())
+    return false;
+
+  // If the load has other users and the truncate is not free, this probably
+  // isn't worthwhile.
+  if (!LI->hasOneUse() &&
+      TLI && !TLI->isTruncateFree(I->getType(), LI->getType()))
+    return false;
+
+  // Check whether the target supports casts folded into loads.
+  unsigned LType;
+  if (isa<ZExtInst>(I))
+    LType = ISD::ZEXTLOAD;
+  else {
+    assert(isa<SExtInst>(I) && "Unexpected ext type!");
+    LType = ISD::SEXTLOAD;
+  }
+  if (TLI && !TLI->isLoadExtLegal(LType, TLI->getValueType(LI->getType())))
+    return false;
+
+  // Move the extend into the same block as the load, so that SelectionDAG
+  // can fold it.
+  I->removeFromParent();
+  I->insertAfter(LI);
+  return true;
+}
+
 bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
   BasicBlock *DefBB = I->getParent();
 
@@ -846,8 +884,10 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
         MadeChange |= Change;
       }
 
-      if (!Change && (isa<ZExtInst>(I) || isa<SExtInst>(I)))
+      if (!Change && (isa<ZExtInst>(I) || isa<SExtInst>(I))) {
+        MadeChange |= MoveExtToFormExtLoad(I);
         MadeChange |= OptimizeExtUses(I);
+      }
     } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
       MadeChange |= OptimizeCmpExpression(CI);
     } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 2ed4a63..8859324 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -78,13 +78,10 @@ namespace {
                             SHUFFLE, SELECT, TRUNC, ZEXT, SEXT, FPTOUI,
                             FPTOSI, UITOFP, SITOFP, FPTRUNC, FPEXT,
                             PTRTOINT, INTTOPTR, BITCAST, GEP, CALL, CONSTANT,
-                            EMPTY, TOMBSTONE };
+                            INSERTVALUE, EXTRACTVALUE, EMPTY, TOMBSTONE };
 
     ExpressionOpcode opcode;
     const Type* type;
-    uint32_t firstVN;
-    uint32_t secondVN;
-    uint32_t thirdVN;
     SmallVector<uint32_t, 4> varargs;
     Value *function;
 
@@ -100,12 +97,6 @@ namespace {
         return false;
       else if (function != other.function)
         return false;
-      else if (firstVN != other.firstVN)
-        return false;
-      else if (secondVN != other.secondVN)
-        return false;
-      else if (thirdVN != other.thirdVN)
-        return false;
       else {
         if (varargs.size() != other.varargs.size())
           return false;
@@ -146,6 +137,10 @@ namespace {
       Expression create_expression(GetElementPtrInst* G);
       Expression create_expression(CallInst* C);
       Expression create_expression(Constant* C);
+      Expression create_expression(ExtractValueInst* C);
+      Expression create_expression(InsertValueInst* C);
+      
+      uint32_t lookup_or_add_call(CallInst* C);
     public:
       ValueTable() : nextValueNumber(1) { }
       uint32_t lookup_or_add(Value *V);
@@ -176,13 +171,8 @@ template <> struct DenseMapInfo<Expression> {
   static unsigned getHashValue(const Expression e) {
     unsigned hash = e.opcode;
 
-    hash = e.firstVN + hash * 37;
-    hash = e.secondVN + hash * 37;
-    hash = e.thirdVN + hash * 37;
-
     hash = ((unsigned)((uintptr_t)e.type >> 4) ^
-            (unsigned)((uintptr_t)e.type >> 9)) +
-           hash * 37;
+            (unsigned)((uintptr_t)e.type >> 9));
 
     for (SmallVector<uint32_t, 4>::const_iterator I = e.varargs.begin(),
          E = e.varargs.end(); I != E; ++I)
@@ -290,9 +280,6 @@ Expression ValueTable::create_expression(CallInst* C) {
   Expression e;
 
   e.type = C->getType();
-  e.firstVN = 0;
-  e.secondVN = 0;
-  e.thirdVN = 0;
   e.function = C->getCalledFunction();
   e.opcode = Expression::CALL;
 
@@ -305,10 +292,8 @@ Expression ValueTable::create_expression(CallInst* C) {
 
 Expression ValueTable::create_expression(BinaryOperator* BO) {
   Expression e;
-
-  e.firstVN = lookup_or_add(BO->getOperand(0));
-  e.secondVN = lookup_or_add(BO->getOperand(1));
-  e.thirdVN = 0;
+  e.varargs.push_back(lookup_or_add(BO->getOperand(0)));
+  e.varargs.push_back(lookup_or_add(BO->getOperand(1)));
   e.function = 0;
   e.type = BO->getType();
   e.opcode = getOpcode(BO);
@@ -319,9 +304,8 @@ Expression ValueTable::create_expression(BinaryOperator* BO) {
 Expression ValueTable::create_expression(CmpInst* C) {
   Expression e;
 
-  e.firstVN = lookup_or_add(C->getOperand(0));
-  e.secondVN = lookup_or_add(C->getOperand(1));
-  e.thirdVN = 0;
+  e.varargs.push_back(lookup_or_add(C->getOperand(0)));
+  e.varargs.push_back(lookup_or_add(C->getOperand(1)));
   e.function = 0;
   e.type = C->getType();
   e.opcode = getOpcode(C);
@@ -332,9 +316,7 @@ Expression ValueTable::create_expression(CmpInst* C) {
 Expression ValueTable::create_expression(CastInst* C) {
   Expression e;
 
-  e.firstVN = lookup_or_add(C->getOperand(0));
-  e.secondVN = 0;
-  e.thirdVN = 0;
+  e.varargs.push_back(lookup_or_add(C->getOperand(0)));
   e.function = 0;
   e.type = C->getType();
   e.opcode = getOpcode(C);
@@ -345,9 +327,9 @@ Expression ValueTable::create_expression(CastInst* C) {
 Expression ValueTable::create_expression(ShuffleVectorInst* S) {
   Expression e;
 
-  e.firstVN = lookup_or_add(S->getOperand(0));
-  e.secondVN = lookup_or_add(S->getOperand(1));
-  e.thirdVN = lookup_or_add(S->getOperand(2));
+  e.varargs.push_back(lookup_or_add(S->getOperand(0)));
+  e.varargs.push_back(lookup_or_add(S->getOperand(1)));
+  e.varargs.push_back(lookup_or_add(S->getOperand(2)));
   e.function = 0;
   e.type = S->getType();
   e.opcode = Expression::SHUFFLE;
@@ -358,9 +340,8 @@ Expression ValueTable::create_expression(ShuffleVectorInst* S) {
 Expression ValueTable::create_expression(ExtractElementInst* E) {
   Expression e;
 
-  e.firstVN = lookup_or_add(E->getOperand(0));
-  e.secondVN = lookup_or_add(E->getOperand(1));
-  e.thirdVN = 0;
+  e.varargs.push_back(lookup_or_add(E->getOperand(0)));
+  e.varargs.push_back(lookup_or_add(E->getOperand(1)));
   e.function = 0;
   e.type = E->getType();
   e.opcode = Expression::EXTRACT;
@@ -371,9 +352,9 @@ Expression ValueTable::create_expression(ExtractElementInst* E) {
 Expression ValueTable::create_expression(InsertElementInst* I) {
   Expression e;
 
-  e.firstVN = lookup_or_add(I->getOperand(0));
-  e.secondVN = lookup_or_add(I->getOperand(1));
-  e.thirdVN = lookup_or_add(I->getOperand(2));
+  e.varargs.push_back(lookup_or_add(I->getOperand(0)));
+  e.varargs.push_back(lookup_or_add(I->getOperand(1)));
+  e.varargs.push_back(lookup_or_add(I->getOperand(2)));
   e.function = 0;
   e.type = I->getType();
   e.opcode = Expression::INSERT;
@@ -384,9 +365,9 @@ Expression ValueTable::create_expression(InsertElementInst* I) {
 Expression ValueTable::create_expression(SelectInst* I) {
   Expression e;
 
-  e.firstVN = lookup_or_add(I->getCondition());
-  e.secondVN = lookup_or_add(I->getTrueValue());
-  e.thirdVN = lookup_or_add(I->getFalseValue());
+  e.varargs.push_back(lookup_or_add(I->getCondition()));
+  e.varargs.push_back(lookup_or_add(I->getTrueValue()));
+  e.varargs.push_back(lookup_or_add(I->getFalseValue()));
   e.function = 0;
   e.type = I->getType();
   e.opcode = Expression::SELECT;
@@ -397,9 +378,7 @@ Expression ValueTable::create_expression(SelectInst* I) {
 Expression ValueTable::create_expression(GetElementPtrInst* G) {
   Expression e;
 
-  e.firstVN = lookup_or_add(G->getPointerOperand());
-  e.secondVN = 0;
-  e.thirdVN = 0;
+  e.varargs.push_back(lookup_or_add(G->getPointerOperand()));
   e.function = 0;
   e.type = G->getType();
   e.opcode = Expression::GEP;
@@ -411,6 +390,35 @@ Expression ValueTable::create_expression(GetElementPtrInst* G) {
   return e;
 }
 
+Expression ValueTable::create_expression(ExtractValueInst* E) {
+  Expression e;
+
+  e.varargs.push_back(lookup_or_add(E->getAggregateOperand()));
+  for (ExtractValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end();
+       II != IE; ++II)
+    e.varargs.push_back(*II);
+  e.function = 0;
+  e.type = E->getType();
+  e.opcode = Expression::EXTRACTVALUE;
+
+  return e;
+}
+
+Expression ValueTable::create_expression(InsertValueInst* E) {
+  Expression e;
+
+  e.varargs.push_back(lookup_or_add(E->getAggregateOperand()));
+  e.varargs.push_back(lookup_or_add(E->getInsertedValueOperand()));
+  for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end();
+       II != IE; ++II)
+    e.varargs.push_back(*II);
+  e.function = 0;
+  e.type = E->getType();
+  e.opcode = Expression::INSERTVALUE;
+
+  return e;
+}
+
 //===----------------------------------------------------------------------===//
 //                     ValueTable External Functions
 //===----------------------------------------------------------------------===//
@@ -420,232 +428,197 @@ void ValueTable::add(Value *V, uint32_t num) {
   valueNumbering.insert(std::make_pair(V, num));
 }
 
-/// lookup_or_add - Returns the value number for the specified value, assigning
-/// it a new number if it did not have one before.
-uint32_t ValueTable::lookup_or_add(Value *V) {
-  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
-  if (VI != valueNumbering.end())
-    return VI->second;
-
-  if (CallInst* C = dyn_cast<CallInst>(V)) {
-    if (AA->doesNotAccessMemory(C)) {
-      Expression e = create_expression(C);
-
-      DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
-      if (EI != expressionNumbering.end()) {
-        valueNumbering.insert(std::make_pair(V, EI->second));
-        return EI->second;
-      } else {
-        expressionNumbering.insert(std::make_pair(e, nextValueNumber));
-        valueNumbering.insert(std::make_pair(V, nextValueNumber));
-
-        return nextValueNumber++;
-      }
-    } else if (AA->onlyReadsMemory(C)) {
-      Expression e = create_expression(C);
-
-      if (expressionNumbering.find(e) == expressionNumbering.end()) {
-        expressionNumbering.insert(std::make_pair(e, nextValueNumber));
-        valueNumbering.insert(std::make_pair(V, nextValueNumber));
-        return nextValueNumber++;
-      }
-
-      MemDepResult local_dep = MD->getDependency(C);
-
-      if (!local_dep.isDef() && !local_dep.isNonLocal()) {
-        valueNumbering.insert(std::make_pair(V, nextValueNumber));
-        return nextValueNumber++;
-      }
-
-      if (local_dep.isDef()) {
-        CallInst* local_cdep = cast<CallInst>(local_dep.getInst());
-
-        if (local_cdep->getNumOperands() != C->getNumOperands()) {
-          valueNumbering.insert(std::make_pair(V, nextValueNumber));
-          return nextValueNumber++;
-        }
-
-        for (unsigned i = 1; i < C->getNumOperands(); ++i) {
-          uint32_t c_vn = lookup_or_add(C->getOperand(i));
-          uint32_t cd_vn = lookup_or_add(local_cdep->getOperand(i));
-          if (c_vn != cd_vn) {
-            valueNumbering.insert(std::make_pair(V, nextValueNumber));
-            return nextValueNumber++;
-          }
-        }
-
-        uint32_t v = lookup_or_add(local_cdep);
-        valueNumbering.insert(std::make_pair(V, v));
-        return v;
-      }
-
-      // Non-local case.
-      const MemoryDependenceAnalysis::NonLocalDepInfo &deps =
-        MD->getNonLocalCallDependency(CallSite(C));
-      // FIXME: call/call dependencies for readonly calls should return def, not
-      // clobber!  Move the checking logic to MemDep!
-      CallInst* cdep = 0;
-
-      // Check to see if we have a single dominating call instruction that is
-      // identical to C.
-      for (unsigned i = 0, e = deps.size(); i != e; ++i) {
-        const MemoryDependenceAnalysis::NonLocalDepEntry *I = &deps[i];
-        // Ignore non-local dependencies.
-        if (I->second.isNonLocal())
-          continue;
+uint32_t ValueTable::lookup_or_add_call(CallInst* C) {
+  if (AA->doesNotAccessMemory(C)) {
+    Expression exp = create_expression(C);
+    uint32_t& e = expressionNumbering[exp];
+    if (!e) e = nextValueNumber++;
+    valueNumbering[C] = e;
+    return e;
+  } else if (AA->onlyReadsMemory(C)) {
+    Expression exp = create_expression(C);
+    uint32_t& e = expressionNumbering[exp];
+    if (!e) {
+      e = nextValueNumber++;
+      valueNumbering[C] = e;
+      return e;
+    }
 
-        // We don't handle non-depedencies.  If we already have a call, reject
-        // instruction dependencies.
-        if (I->second.isClobber() || cdep != 0) {
-          cdep = 0;
-          break;
-        }
+    MemDepResult local_dep = MD->getDependency(C);
 
-        CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->second.getInst());
-        // FIXME: All duplicated with non-local case.
-        if (NonLocalDepCall && DT->properlyDominates(I->first, C->getParent())){
-          cdep = NonLocalDepCall;
-          continue;
-        }
+    if (!local_dep.isDef() && !local_dep.isNonLocal()) {
+      valueNumbering[C] =  nextValueNumber;
+      return nextValueNumber++;
+    }
 
-        cdep = 0;
-        break;
-      }
+    if (local_dep.isDef()) {
+      CallInst* local_cdep = cast<CallInst>(local_dep.getInst());
 
-      if (!cdep) {
-        valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      if (local_cdep->getNumOperands() != C->getNumOperands()) {
+        valueNumbering[C] = nextValueNumber;
         return nextValueNumber++;
       }
 
-      if (cdep->getNumOperands() != C->getNumOperands()) {
-        valueNumbering.insert(std::make_pair(V, nextValueNumber));
-        return nextValueNumber++;
-      }
       for (unsigned i = 1; i < C->getNumOperands(); ++i) {
         uint32_t c_vn = lookup_or_add(C->getOperand(i));
-        uint32_t cd_vn = lookup_or_add(cdep->getOperand(i));
+        uint32_t cd_vn = lookup_or_add(local_cdep->getOperand(i));
         if (c_vn != cd_vn) {
-          valueNumbering.insert(std::make_pair(V, nextValueNumber));
+          valueNumbering[C] = nextValueNumber;
           return nextValueNumber++;
         }
       }
 
-      uint32_t v = lookup_or_add(cdep);
-      valueNumbering.insert(std::make_pair(V, v));
+      uint32_t v = lookup_or_add(local_cdep);
+      valueNumbering[C] = v;
       return v;
-
-    } else {
-      valueNumbering.insert(std::make_pair(V, nextValueNumber));
-      return nextValueNumber++;
     }
-  } else if (BinaryOperator* BO = dyn_cast<BinaryOperator>(V)) {
-    Expression e = create_expression(BO);
 
-    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
-    if (EI != expressionNumbering.end()) {
-      valueNumbering.insert(std::make_pair(V, EI->second));
-      return EI->second;
-    } else {
-      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
-      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+    // Non-local case.
+    const MemoryDependenceAnalysis::NonLocalDepInfo &deps =
+      MD->getNonLocalCallDependency(CallSite(C));
+    // FIXME: call/call dependencies for readonly calls should return def, not
+    // clobber!  Move the checking logic to MemDep!
+    CallInst* cdep = 0;
+
+    // Check to see if we have a single dominating call instruction that is
+    // identical to C.
+    for (unsigned i = 0, e = deps.size(); i != e; ++i) {
+      const MemoryDependenceAnalysis::NonLocalDepEntry *I = &deps[i];
+      // Ignore non-local dependencies.
+      if (I->second.isNonLocal())
+        continue;
 
-      return nextValueNumber++;
-    }
-  } else if (CmpInst* C = dyn_cast<CmpInst>(V)) {
-    Expression e = create_expression(C);
+      // We don't handle non-depedencies.  If we already have a call, reject
+      // instruction dependencies.
+      if (I->second.isClobber() || cdep != 0) {
+        cdep = 0;
+        break;
+      }
 
-    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
-    if (EI != expressionNumbering.end()) {
-      valueNumbering.insert(std::make_pair(V, EI->second));
-      return EI->second;
-    } else {
-      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
-      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+      CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->second.getInst());
+      // FIXME: All duplicated with non-local case.
+      if (NonLocalDepCall && DT->properlyDominates(I->first, C->getParent())){
+        cdep = NonLocalDepCall;
+        continue;
+      }
 
-      return nextValueNumber++;
+      cdep = 0;
+      break;
     }
-  } else if (ShuffleVectorInst* U = dyn_cast<ShuffleVectorInst>(V)) {
-    Expression e = create_expression(U);
-
-    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
-    if (EI != expressionNumbering.end()) {
-      valueNumbering.insert(std::make_pair(V, EI->second));
-      return EI->second;
-    } else {
-      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
-      valueNumbering.insert(std::make_pair(V, nextValueNumber));
 
+    if (!cdep) {
+      valueNumbering[C] = nextValueNumber;
       return nextValueNumber++;
     }
-  } else if (ExtractElementInst* U = dyn_cast<ExtractElementInst>(V)) {
-    Expression e = create_expression(U);
-
-    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
-    if (EI != expressionNumbering.end()) {
-      valueNumbering.insert(std::make_pair(V, EI->second));
-      return EI->second;
-    } else {
-      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
-      valueNumbering.insert(std::make_pair(V, nextValueNumber));
 
+    if (cdep->getNumOperands() != C->getNumOperands()) {
+      valueNumbering[C] = nextValueNumber;
       return nextValueNumber++;
     }
-  } else if (InsertElementInst* U = dyn_cast<InsertElementInst>(V)) {
-    Expression e = create_expression(U);
-
-    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
-    if (EI != expressionNumbering.end()) {
-      valueNumbering.insert(std::make_pair(V, EI->second));
-      return EI->second;
-    } else {
-      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
-      valueNumbering.insert(std::make_pair(V, nextValueNumber));
-
-      return nextValueNumber++;
+    for (unsigned i = 1; i < C->getNumOperands(); ++i) {
+      uint32_t c_vn = lookup_or_add(C->getOperand(i));
+      uint32_t cd_vn = lookup_or_add(cdep->getOperand(i));
+      if (c_vn != cd_vn) {
+        valueNumbering[C] = nextValueNumber;
+        return nextValueNumber++;
+      }
     }
-  } else if (SelectInst* U = dyn_cast<SelectInst>(V)) {
-    Expression e = create_expression(U);
 
-    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
-    if (EI != expressionNumbering.end()) {
-      valueNumbering.insert(std::make_pair(V, EI->second));
-      return EI->second;
-    } else {
-      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
-      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+    uint32_t v = lookup_or_add(cdep);
+    valueNumbering[C] = v;
+    return v;
 
-      return nextValueNumber++;
-    }
-  } else if (CastInst* U = dyn_cast<CastInst>(V)) {
-    Expression e = create_expression(U);
-
-    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
-    if (EI != expressionNumbering.end()) {
-      valueNumbering.insert(std::make_pair(V, EI->second));
-      return EI->second;
-    } else {
-      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
-      valueNumbering.insert(std::make_pair(V, nextValueNumber));
-
-      return nextValueNumber++;
-    }
-  } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(V)) {
-    Expression e = create_expression(U);
+  } else {
+    valueNumbering[C] = nextValueNumber;
+    return nextValueNumber++;
+  }
+}
 
-    DenseMap<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
-    if (EI != expressionNumbering.end()) {
-      valueNumbering.insert(std::make_pair(V, EI->second));
-      return EI->second;
-    } else {
-      expressionNumbering.insert(std::make_pair(e, nextValueNumber));
-      valueNumbering.insert(std::make_pair(V, nextValueNumber));
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t ValueTable::lookup_or_add(Value *V) {
+  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+  if (VI != valueNumbering.end())
+    return VI->second;
 
-      return nextValueNumber++;
-    }
-  } else {
-    valueNumbering.insert(std::make_pair(V, nextValueNumber));
+  if (!isa<Instruction>(V)) {
+    valueNumbering[V] = nextValueNumber;
     return nextValueNumber++;
   }
+  
+  Instruction* I = cast<Instruction>(V);
+  Expression exp;
+  switch (I->getOpcode()) {
+    case Instruction::Call:
+      return lookup_or_add_call(cast<CallInst>(I));
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or :
+    case Instruction::Xor:
+      exp = create_expression(cast<BinaryOperator>(I));
+      break;
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+      exp = create_expression(cast<CmpInst>(I));
+      break;
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      exp = create_expression(cast<CastInst>(I));
+      break;
+    case Instruction::Select:
+      exp = create_expression(cast<SelectInst>(I));
+      break;
+    case Instruction::ExtractElement:
+      exp = create_expression(cast<ExtractElementInst>(I));
+      break;
+    case Instruction::InsertElement:
+      exp = create_expression(cast<InsertElementInst>(I));
+      break;
+    case Instruction::ShuffleVector:
+      exp = create_expression(cast<ShuffleVectorInst>(I));
+      break;
+    case Instruction::ExtractValue:
+      exp = create_expression(cast<ExtractValueInst>(I));
+      break;
+    case Instruction::InsertValue:
+      exp = create_expression(cast<InsertValueInst>(I));
+      break;      
+    case Instruction::GetElementPtr:
+      exp = create_expression(cast<GetElementPtrInst>(I));
+      break;
+    default:
+      valueNumbering[V] = nextValueNumber;
+      return nextValueNumber++;
+  }
+
+  uint32_t& e = expressionNumbering[exp];
+  if (!e) e = nextValueNumber++;
+  valueNumbering[V] = e;
+  return e;
 }
 
 /// lookup - Returns the value number of the specified value. Fails if
@@ -1557,15 +1530,18 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
     // actually have the same type.  See if we know how to reuse the stored
     // value (depending on its type).
     const TargetData *TD = 0;
-    if (StoredVal->getType() != L->getType() &&
-        (TD = getAnalysisIfAvailable<TargetData>())) {
-      StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(),
-                                                 L, *TD);
-      if (StoredVal == 0)
+    if (StoredVal->getType() != L->getType()) {
+      if ((TD = getAnalysisIfAvailable<TargetData>())) {
+        StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(),
+                                                   L, *TD);
+        if (StoredVal == 0)
+          return false;
+        
+        DEBUG(errs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal
+                     << '\n' << *L << "\n\n\n");
+      }
+      else 
         return false;
-      
-      DEBUG(errs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal
-                   << '\n' << *L << "\n\n\n");
     }
 
     // Remove it!
@@ -1584,14 +1560,17 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
     // the same type.  See if we know how to reuse the previously loaded value
     // (depending on its type).
     const TargetData *TD = 0;
-    if (DepLI->getType() != L->getType() &&
-        (TD = getAnalysisIfAvailable<TargetData>())) {
-      AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), L,*TD);
-      if (AvailableVal == 0)
-        return false;
+    if (DepLI->getType() != L->getType()) {
+      if ((TD = getAnalysisIfAvailable<TargetData>())) {
+        AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), L,*TD);
+        if (AvailableVal == 0)
+          return false;
       
-      DEBUG(errs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal
-                   << "\n" << *L << "\n\n\n");
+        DEBUG(errs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal
+                     << "\n" << *L << "\n\n\n");
+      }
+      else 
+        return false;
     }
     
     // Remove it!
diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp
index f635af3..b41b5d4 100644
--- a/lib/Transforms/Scalar/InstructionCombining.cpp
+++ b/lib/Transforms/Scalar/InstructionCombining.cpp
@@ -6300,25 +6300,33 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
           return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
         break;
       }
-      case Instruction::Malloc:
-        // If we have (malloc != null), and if the malloc has a single use, we
-        // can assume it is successful and remove the malloc.
-        if (LHSI->hasOneUse() && isa<ConstantPointerNull>(RHSC)) {
-          Worklist.Add(LHSI);
-          return ReplaceInstUsesWith(I,
-                                     ConstantInt::get(Type::getInt1Ty(*Context),
-                                                      !I.isTrueWhenEqual()));
-        }
-        break;
       case Instruction::Call:
         // If we have (malloc != null), and if the malloc has a single use, we
         // can assume it is successful and remove the malloc.
         if (isMalloc(LHSI) && LHSI->hasOneUse() &&
             isa<ConstantPointerNull>(RHSC)) {
-          Worklist.Add(LHSI);
-          return ReplaceInstUsesWith(I,
+          // Need to explicitly erase malloc call here, instead of adding it to
+          // Worklist, because it won't get DCE'd from the Worklist since
+          // isInstructionTriviallyDead() returns false for function calls.
+          // It is OK to replace LHSI/MallocCall with Undef because the 
+          // instruction that uses it will be erased via Worklist.
+          if (extractMallocCall(LHSI)) {
+            LHSI->replaceAllUsesWith(UndefValue::get(LHSI->getType()));
+            EraseInstFromFunction(*LHSI);
+            return ReplaceInstUsesWith(I,
                                      ConstantInt::get(Type::getInt1Ty(*Context),
                                                       !I.isTrueWhenEqual()));
+          }
+          if (CallInst* MallocCall = extractMallocCallFromBitCast(LHSI))
+            if (MallocCall->hasOneUse()) {
+              MallocCall->replaceAllUsesWith(
+                                        UndefValue::get(MallocCall->getType()));
+              EraseInstFromFunction(*MallocCall);
+              Worklist.Add(LHSI); // The malloc's bitcast use.
+              return ReplaceInstUsesWith(I,
+                                     ConstantInt::get(Type::getInt1Ty(*Context),
+                                                      !I.isTrueWhenEqual()));
+            }
         }
         break;
       }
@@ -7809,11 +7817,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
     Amt = AllocaBuilder.CreateAdd(Amt, Off, "tmp");
   }
   
-  AllocationInst *New;
-  if (isa<MallocInst>(AI))
-    New = AllocaBuilder.CreateMalloc(CastElTy, Amt);
-  else
-    New = AllocaBuilder.CreateAlloca(CastElTy, Amt);
+  AllocationInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt);
   New->setAlignment(AI.getAlignment());
   New->takeName(&AI);
   
@@ -9270,14 +9274,44 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
   return Changed ? &SI : 0;
 }
 
-/// isDefinedInBB - Return true if the value is an instruction defined in the
-/// specified basicblock.
-static bool isDefinedInBB(const Value *V, const BasicBlock *BB) {
+
+/// CanSelectOperandBeMappingIntoPredBlock - SI is a select whose condition is a
+/// PHI node (but the two may be in different blocks).  See if the true/false
+/// values (V) are live in all of the predecessor blocks of the PHI.  For
+/// example, cases like this cannot be mapped:
+///
+///   X = phi [ C1, BB1], [C2, BB2]
+///   Y = add
+///   Z = select X, Y, 0
+///
+/// because Y is not live in BB1/BB2.
+///
+static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V,
+                                                   const SelectInst &SI) {
+  // If the value is a non-instruction value like a constant or argument, it
+  // can always be mapped.
   const Instruction *I = dyn_cast<Instruction>(V);
-  return I != 0 && I->getParent() == BB;
+  if (I == 0) return true;
+  
+  // If V is a PHI node defined in the same block as the condition PHI, we can
+  // map the arguments.
+  const PHINode *CondPHI = cast<PHINode>(SI.getCondition());
+  
+  if (const PHINode *VP = dyn_cast<PHINode>(I))
+    if (VP->getParent() == CondPHI->getParent())
+      return true;
+  
+  // Otherwise, if the PHI and select are defined in the same block and if V is
+  // defined in a different block, then we can transform it.
+  if (SI.getParent() == CondPHI->getParent() &&
+      I->getParent() != CondPHI->getParent())
+    return true;
+  
+  // Otherwise we have a 'hard' case and we can't tell without doing more
+  // detailed dominator based analysis, punt.
+  return false;
 }
 
-
 Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
   Value *CondVal = SI.getCondition();
   Value *TrueVal = SI.getTrueValue();
@@ -9489,16 +9523,13 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
       return FoldI;
   }
 
-  // See if we can fold the select into a phi node.  The true/false values have
-  // to be live in the predecessor blocks.  If they are instructions in SI's
-  // block, we can't map to the predecessor.
-  if (isa<PHINode>(SI.getCondition()) &&
-      (!isDefinedInBB(SI.getTrueValue(), SI.getParent()) ||
-       isa<PHINode>(SI.getTrueValue())) &&
-      (!isDefinedInBB(SI.getFalseValue(), SI.getParent()) ||
-       isa<PHINode>(SI.getFalseValue())))
-    if (Instruction *NV = FoldOpIntoPhi(SI))
-      return NV;
+  // See if we can fold the select into a phi node if the condition is a select.
+  if (isa<PHINode>(SI.getCondition())) 
+    // The true/false values have to be live in the PHI predecessor's blocks.
+    if (CanSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
+        CanSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
+      if (Instruction *NV = FoldOpIntoPhi(SI))
+        return NV;
 
   if (BinaryOperator::isNot(CondVal)) {
     SI.setOperand(0, BinaryOperator::getNotArgument(CondVal));
@@ -11213,15 +11244,8 @@ Instruction *InstCombiner::visitAllocationInst(AllocationInst &AI) {
     if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
       const Type *NewTy = 
         ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
-      AllocationInst *New = 0;
-
-      // Create and insert the replacement instruction...
-      if (isa<MallocInst>(AI))
-        New = Builder->CreateMalloc(NewTy, 0, AI.getName());
-      else {
-        assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!");
-        New = Builder->CreateAlloca(NewTy, 0, AI.getName());
-      }
+      assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!");
+      AllocationInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName());
       New->setAlignment(AI.getAlignment());
 
       // Scan to the end of the allocation instructions, to skip over a block of
@@ -11294,12 +11318,6 @@ Instruction *InstCombiner::visitFreeInst(FreeInst &FI) {
     }
   }
   
-  // Change free(malloc) into nothing, if the malloc has a single use.
-  if (MallocInst *MI = dyn_cast<MallocInst>(Op))
-    if (MI->hasOneUse()) {
-      EraseInstFromFunction(FI);
-      return EraseInstFromFunction(*MI);
-    }
   if (isMalloc(Op)) {
     if (CallInst* CI = extractMallocCallFromBitCast(Op)) {
       if (Op->hasOneUse() && CI->hasOneUse()) {
@@ -11327,40 +11345,6 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
   Value *CastOp = CI->getOperand(0);
   LLVMContext *Context = IC.getContext();
 
-  if (TD) {
-    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(CI)) {
-      // Instead of loading constant c string, use corresponding integer value
-      // directly if string length is small enough.
-      std::string Str;
-      if (GetConstantStringInfo(CE->getOperand(0), Str) && !Str.empty()) {
-        unsigned len = Str.length();
-        const Type *Ty = cast<PointerType>(CE->getType())->getElementType();
-        unsigned numBits = Ty->getPrimitiveSizeInBits();
-        // Replace LI with immediate integer store.
-        if ((numBits >> 3) == len + 1) {
-          APInt StrVal(numBits, 0);
-          APInt SingleChar(numBits, 0);
-          if (TD->isLittleEndian()) {
-            for (signed i = len-1; i >= 0; i--) {
-              SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
-              StrVal = (StrVal << 8) | SingleChar;
-            }
-          } else {
-            for (unsigned i = 0; i < len; i++) {
-              SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
-              StrVal = (StrVal << 8) | SingleChar;
-            }
-            // Append NULL at the end.
-            SingleChar = 0;
-            StrVal = (StrVal << 8) | SingleChar;
-          }
-          Value *NL = ConstantInt::get(*Context, StrVal);
-          return IC.ReplaceInstUsesWith(LI, NL);
-        }
-      }
-    }
-  }
-
   const PointerType *DestTy = cast<PointerType>(CI->getType());
   const Type *DestPTy = DestTy->getElementType();
   if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
@@ -11380,7 +11364,8 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
         if (Constant *CSrc = dyn_cast<Constant>(CastOp))
           if (ASrcTy->getNumElements() != 0) {
             Value *Idxs[2];
-            Idxs[0] = Idxs[1] = Constant::getNullValue(Type::getInt32Ty(*Context));
+            Idxs[0] = Constant::getNullValue(Type::getInt32Ty(*Context));
+            Idxs[1] = Idxs[0];
             CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs, 2);
             SrcTy = cast<PointerType>(CastOp->getType());
             SrcPTy = SrcTy->getElementType();
@@ -11436,6 +11421,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
   if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6))
     return ReplaceInstUsesWith(LI, AvailableVal);
 
+  // load(gep null, ...) -> unreachable
   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
     const Value *GEPI0 = GEPI->getOperand(0);
     // TODO: Consider a target hook for valid address spaces for this xform.
@@ -11450,60 +11436,24 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
     }
   } 
 
-  if (Constant *C = dyn_cast<Constant>(Op)) {
-    // load null/undef -> undef
-    // TODO: Consider a target hook for valid address spaces for this xform.
-    if (isa<UndefValue>(C) ||
-        (C->isNullValue() && LI.getPointerAddressSpace() == 0)) {
-      // Insert a new store to null instruction before the load to indicate that
-      // this code is not reachable.  We do this instead of inserting an
-      // unreachable instruction directly because we cannot modify the CFG.
-      new StoreInst(UndefValue::get(LI.getType()),
-                    Constant::getNullValue(Op->getType()), &LI);
-      return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
-    }
-
-    // Instcombine load (constant global) into the value loaded.
-    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op))
-      if (GV->isConstant() && GV->hasDefinitiveInitializer())
-        return ReplaceInstUsesWith(LI, GV->getInitializer());
-
-    // Instcombine load (constantexpr_GEP global, 0, ...) into the value loaded.
-    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op)) {
-      if (CE->getOpcode() == Instruction::GetElementPtr) {
-        if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
-          if (GV->isConstant() && GV->hasDefinitiveInitializer())
-            if (Constant *V = 
-               ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE))
-              return ReplaceInstUsesWith(LI, V);
-        if (CE->getOperand(0)->isNullValue()) {
-          // Insert a new store to null instruction before the load to indicate
-          // that this code is not reachable.  We do this instead of inserting
-          // an unreachable instruction directly because we cannot modify the
-          // CFG.
-          new StoreInst(UndefValue::get(LI.getType()),
-                        Constant::getNullValue(Op->getType()), &LI);
-          return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
-        }
-
-      } else if (CE->isCast()) {
-        if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
-          return Res;
-      }
-    }
-  }
-    
-  // If this load comes from anywhere in a constant global, and if the global
-  // is all undef or zero, we know what it loads.
-  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op->getUnderlyingObject())){
-    if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
-      if (GV->getInitializer()->isNullValue())
-        return ReplaceInstUsesWith(LI, Constant::getNullValue(LI.getType()));
-      else if (isa<UndefValue>(GV->getInitializer()))
-        return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
-    }
+  // load null/undef -> unreachable
+  // TODO: Consider a target hook for valid address spaces for this xform.
+  if (isa<UndefValue>(Op) ||
+      (isa<ConstantPointerNull>(Op) && LI.getPointerAddressSpace() == 0)) {
+    // Insert a new store to null instruction before the load to indicate that
+    // this code is not reachable.  We do this instead of inserting an
+    // unreachable instruction directly because we cannot modify the CFG.
+    new StoreInst(UndefValue::get(LI.getType()),
+                  Constant::getNullValue(Op->getType()), &LI);
+    return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
   }
 
+  // Instcombine load (constantexpr_cast global) -> cast (load global)
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op))
+    if (CE->isCast())
+      if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
+        return Res;
+  
   if (Op->hasOneUse()) {
     // Change select and PHI nodes to select values instead of addresses: this
     // helps alias analysis out a lot, allows many others simplifications, and
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index f6de362..223d2b9 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -138,7 +138,6 @@ namespace {
     void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks);
 
     bool UnswitchIfProfitable(Value *LoopCond, Constant *Val);
-    unsigned getLoopUnswitchCost(Value *LIC);
     void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
                                   BasicBlock *ExitBlock);
     void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L);
@@ -400,25 +399,6 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val,
   return true;
 }
 
-/// getLoopUnswitchCost - Return the cost (code size growth) that will happen if
-/// we choose to unswitch current loop on the specified value.
-///
-unsigned LoopUnswitch::getLoopUnswitchCost(Value *LIC) {
-  // If the condition is trivial, always unswitch.  There is no code growth for
-  // this case.
-  if (IsTrivialUnswitchCondition(LIC))
-    return 0;
-  
-  // FIXME: This is overly conservative because it does not take into
-  // consideration code simplification opportunities.
-  CodeMetrics Metrics;
-  for (Loop::block_iterator I = currentLoop->block_begin(), 
-         E = currentLoop->block_end();
-       I != E; ++I)
-    Metrics.analyzeBasicBlock(*I);
-  return Metrics.NumInsts;
-}
-
 /// UnswitchIfProfitable - We have found that we can unswitch currentLoop when
 /// LoopCond == Val to simplify the loop.  If we decide that this is profitable,
 /// unswitch the loop, reprocess the pieces, then return true.
@@ -427,24 +407,35 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val){
   initLoopData();
   Function *F = loopHeader->getParent();
 
+  // If the condition is trivial, always unswitch.  There is no code growth for
+  // this case.
+  if (!IsTrivialUnswitchCondition(LoopCond)) {
+    // Check to see if it would be profitable to unswitch current loop.
 
-  // Check to see if it would be profitable to unswitch current loop.
-  unsigned Cost = getLoopUnswitchCost(LoopCond);
-
-  // Do not do non-trivial unswitch while optimizing for size.
-  if (Cost && OptimizeForSize)
-    return false;
-  if (Cost && !F->isDeclaration() && F->hasFnAttr(Attribute::OptimizeForSize))
-    return false;
+    // Do not do non-trivial unswitch while optimizing for size.
+    if (OptimizeForSize || F->hasFnAttr(Attribute::OptimizeForSize))
+      return false;
 
-  if (Cost > Threshold) {
-    // FIXME: this should estimate growth by the amount of code shared by the
-    // resultant unswitched loops.
-    //
-    DEBUG(errs() << "NOT unswitching loop %"
-          << currentLoop->getHeader()->getName() << ", cost too high: "
-          << currentLoop->getBlocks().size() << "\n");
-    return false;
+    // FIXME: This is overly conservative because it does not take into
+    // consideration code simplification opportunities and code that can
+    // be shared by the resultant unswitched loops.
+    CodeMetrics Metrics;
+    for (Loop::block_iterator I = currentLoop->block_begin(), 
+           E = currentLoop->block_end();
+         I != E; ++I)
+      Metrics.analyzeBasicBlock(*I);
+
+    // Limit the number of instructions to avoid causing significant code
+    // expansion, and the number of basic blocks, to avoid loops with
+    // large numbers of branches which cause loop unswitching to go crazy.
+    // This is a very ad-hoc heuristic.
+    if (Metrics.NumInsts > Threshold ||
+        Metrics.NumBlocks * 5 > Threshold) {
+      DEBUG(errs() << "NOT unswitching loop %"
+            << currentLoop->getHeader()->getName() << ", cost too high: "
+            << currentLoop->getBlocks().size() << "\n");
+      return false;
+    }
   }
 
   Constant *CondVal;
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index e6ffac2..af29f97 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -121,7 +121,6 @@ static bool isUnmovableInstruction(Instruction *I) {
   if (I->getOpcode() == Instruction::PHI ||
       I->getOpcode() == Instruction::Alloca ||
       I->getOpcode() == Instruction::Load ||
-      I->getOpcode() == Instruction::Malloc ||
       I->getOpcode() == Instruction::Invoke ||
       (I->getOpcode() == Instruction::Call &&
        !isa<DbgInfoIntrinsic>(I)) ||
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index b5edf4e..b745097 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -1229,7 +1229,10 @@ CallOverdefined:
     TMRVI = TrackedMultipleRetVals.find(std::make_pair(F, 0));
     if (TMRVI == TrackedMultipleRetVals.end())
       goto CallOverdefined;
-    
+
+    // Need to mark as overdefined, otherwise it stays undefined which
+    // creates extractvalue undef, <idx>
+    markOverdefined(I);
     // If we are tracking this callee, propagate the return values of the call
     // into this call site.  We do this by walking all the uses. Single-index
     // ExtractValueInst uses can be tracked; anything more complicated is
@@ -1271,7 +1274,6 @@ CallOverdefined:
   }
 }
 
-
 void SCCPSolver::Solve() {
   // Process the work lists until they are empty!
   while (!BBWorkList.empty() || !InstWorkList.empty() ||
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 4931ab3..35907fd 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -425,14 +425,26 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
 
   if (L) {
     if (IsLoopEntry) {
-      if (Loop *PredLoop = LI->getLoopFor(Preds[0])) {
-        // Add the new block to the nearest enclosing loop (and not an
-        // adjacent loop).
-        while (PredLoop && !PredLoop->contains(BB))
-          PredLoop = PredLoop->getParentLoop();
-        if (PredLoop)
-          PredLoop->addBasicBlockToLoop(NewBB, LI->getBase());
-      }
+      // Add the new block to the nearest enclosing loop (and not an
+      // adjacent loop). To find this, examine each of the predecessors and
+      // determine which loops enclose them, and select the most-nested loop
+      // which contains the loop containing the block being split.
+      Loop *InnermostPredLoop = 0;
+      for (unsigned i = 0; i != NumPreds; ++i)
+        if (Loop *PredLoop = LI->getLoopFor(Preds[i])) {
+          // Seek a loop which actually contains the block being split (to
+          // avoid adjacent loops).
+          while (PredLoop && !PredLoop->contains(BB))
+            PredLoop = PredLoop->getParentLoop();
+          // Select the most-nested of these loops which contains the block.
+          if (PredLoop &&
+              PredLoop->contains(BB) &&
+              (!InnermostPredLoop ||
+               InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth()))
+            InnermostPredLoop = PredLoop;
+        }
+      if (InnermostPredLoop)
+        InnermostPredLoop->addBasicBlockToLoop(NewBB, LI->getBase());
     } else {
       L->addBasicBlockToLoop(NewBB, LI->getBase());
       if (SplitMakesNewLoopHeader)
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 0d00d69..619c939 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -444,18 +444,15 @@ bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD,
   if (InlinedFunctionInfo.ContainsDynamicAllocas) {
     Module *M = Caller->getParent();
     // Get the two intrinsics we care about.
-    Constant *StackSave, *StackRestore;
-    StackSave    = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
-    StackRestore = Intrinsic::getDeclaration(M, Intrinsic::stackrestore);
+    Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
+    Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore);
 
     // If we are preserving the callgraph, add edges to the stacksave/restore
     // functions for the calls we insert.
     CallGraphNode *StackSaveCGN = 0, *StackRestoreCGN = 0, *CallerNode = 0;
     if (CG) {
-      // We know that StackSave/StackRestore are Function*'s, because they are
-      // intrinsics which must have the right types.
-      StackSaveCGN    = CG->getOrInsertFunction(cast<Function>(StackSave));
-      StackRestoreCGN = CG->getOrInsertFunction(cast<Function>(StackRestore));
+      StackSaveCGN    = CG->getOrInsertFunction(StackSave);
+      StackRestoreCGN = CG->getOrInsertFunction(StackRestore);
       CallerNode = (*CG)[Caller];
     }
 
@@ -480,7 +477,8 @@ bool llvm::InlineFunction(CallSite CS, CallGraph *CG, const TargetData *TD,
       for (Function::iterator BB = FirstNewBlock, E = Caller->end();
            BB != E; ++BB)
         if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
-          CallInst::Create(StackRestore, SavedPtr, "", UI);
+          CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", UI);
+          if (CG) CallerNode->addCalledFunction(CI, StackRestoreCGN);
           ++NumStackRestores;
         }
     }
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp
index 1fa51a3..7f11acf 100644
--- a/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/lib/Transforms/Utils/InstructionNamer.cpp
@@ -33,11 +33,11 @@ namespace {
       for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
            AI != AE; ++AI)
         if (!AI->hasName() && AI->getType() != Type::getVoidTy(F.getContext()))
-          AI->setName("tmp");
+          AI->setName("arg");
 
       for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
         if (!BB->hasName())
-          BB->setName("BB");
+          BB->setName("bb");
         
         for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
           if (!I->hasName() && I->getType() != Type::getVoidTy(F.getContext()))
diff --git a/lib/Transforms/Utils/LowerAllocations.cpp b/lib/Transforms/Utils/LowerAllocations.cpp
index f26d7c1..9c9113d 100644
--- a/lib/Transforms/Utils/LowerAllocations.cpp
+++ b/lib/Transforms/Utils/LowerAllocations.cpp
@@ -1,4 +1,4 @@
-//===- LowerAllocations.cpp - Reduce malloc & free insts to calls ---------===//
+//===- LowerAllocations.cpp - Reduce free insts to calls ------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -29,18 +29,15 @@ using namespace llvm;
 STATISTIC(NumLowered, "Number of allocations lowered");
 
 namespace {
-  /// LowerAllocations - Turn malloc and free instructions into @malloc and
-  /// @free calls.
+  /// LowerAllocations - Turn free instructions into @free calls.
   ///
   class VISIBILITY_HIDDEN LowerAllocations : public BasicBlockPass {
     Constant *FreeFunc;   // Functions in the module we are processing
                           // Initialized by doInitialization
-    bool LowerMallocArgToInteger;
   public:
     static char ID; // Pass ID, replacement for typeid
-    explicit LowerAllocations(bool LowerToInt = false)
-      : BasicBlockPass(&ID), FreeFunc(0), 
-        LowerMallocArgToInteger(LowerToInt) {}
+    explicit LowerAllocations()
+      : BasicBlockPass(&ID), FreeFunc(0) {}
 
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addRequired<TargetData>();
@@ -54,7 +51,7 @@ namespace {
     }
 
     /// doPassInitialization - For the lower allocations pass, this ensures that
-    /// a module contains a declaration for a malloc and a free function.
+    /// a module contains a declaration for a free function.
     ///
     bool doInitialization(Module &M);
 
@@ -76,13 +73,13 @@ X("lowerallocs", "Lower allocations from instructions to calls");
 // Publically exposed interface to pass...
 const PassInfo *const llvm::LowerAllocationsID = &X;
 // createLowerAllocationsPass - Interface to this file...
-Pass *llvm::createLowerAllocationsPass(bool LowerMallocArgToInteger) {
-  return new LowerAllocations(LowerMallocArgToInteger);
+Pass *llvm::createLowerAllocationsPass() {
+  return new LowerAllocations();
 }
 
 
 // doInitialization - For the lower allocations pass, this ensures that a
-// module contains a declaration for a malloc and a free function.
+// module contains a declaration for a free function.
 //
 // This function is always successful.
 //
@@ -102,25 +99,9 @@ bool LowerAllocations::runOnBasicBlock(BasicBlock &BB) {
 
   BasicBlock::InstListType &BBIL = BB.getInstList();
 
-  const TargetData &TD = getAnalysis<TargetData>();
-  const Type *IntPtrTy = TD.getIntPtrType(BB.getContext());
-
-  // Loop over all of the instructions, looking for malloc or free instructions
+  // Loop over all of the instructions, looking for free instructions
   for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
-    if (MallocInst *MI = dyn_cast<MallocInst>(I)) {
-      Value *ArraySize = MI->getOperand(0);
-      if (ArraySize->getType() != IntPtrTy)
-        ArraySize = CastInst::CreateIntegerCast(ArraySize, IntPtrTy,
-                                                false /*ZExt*/, "", I);
-      Value *MCast = CallInst::CreateMalloc(I, IntPtrTy,
-                                            MI->getAllocatedType(), ArraySize);
-
-      // Replace all uses of the old malloc inst with the cast inst
-      MI->replaceAllUsesWith(MCast);
-      I = --BBIL.erase(I);         // remove and delete the malloc instr...
-      Changed = true;
-      ++NumLowered;
-    } else if (FreeInst *FI = dyn_cast<FreeInst>(I)) {
+    if (FreeInst *FI = dyn_cast<FreeInst>(I)) {
       Value *PtrCast = 
         new BitCastInst(FI->getOperand(0),
                Type::getInt8PtrTy(BB.getContext()), "", I);
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index 780ee26..8a07c35 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -48,7 +48,7 @@ void SSAUpdater::Initialize(Value *ProtoValue) {
     AV = new AvailableValsTy();
   else
     getAvailableVals(AV).clear();
-  
+
   if (IPI == 0)
     IPI = new IncomingPredInfoTy();
   else
@@ -104,12 +104,12 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
   // GetValueAtEndOfBlock to do our work.
   if (!getAvailableVals(AV).count(BB))
     return GetValueAtEndOfBlock(BB);
-  
+
   // Otherwise, we have the hard case.  Get the live-in values for each
   // predecessor.
   SmallVector<std::pair<BasicBlock*, Value*>, 8> PredValues;
   Value *SingularValue = 0;
-  
+
   // We can get our predecessor info by walking the pred_iterator list, but it
   // is relatively slow.  If we already have PHI nodes in this block, walk one
   // of them to get the predecessor list instead.
@@ -118,7 +118,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
       BasicBlock *PredBB = SomePhi->getIncomingBlock(i);
       Value *PredVal = GetValueAtEndOfBlock(PredBB);
       PredValues.push_back(std::make_pair(PredBB, PredVal));
-      
+
       // Compute SingularValue.
       if (i == 0)
         SingularValue = PredVal;
@@ -131,7 +131,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
       BasicBlock *PredBB = *PI;
       Value *PredVal = GetValueAtEndOfBlock(PredBB);
       PredValues.push_back(std::make_pair(PredBB, PredVal));
-      
+
       // Compute SingularValue.
       if (isFirstPred) {
         SingularValue = PredVal;
@@ -140,25 +140,25 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
         SingularValue = 0;
     }
   }
-  
+
   // If there are no predecessors, just return undef.
   if (PredValues.empty())
     return UndefValue::get(PrototypeValue->getType());
-  
+
   // Otherwise, if all the merged values are the same, just use it.
   if (SingularValue != 0)
     return SingularValue;
-  
+
   // Otherwise, we do need a PHI: insert one now.
   PHINode *InsertedPHI = PHINode::Create(PrototypeValue->getType(),
                                          PrototypeValue->getName(),
                                          &BB->front());
   InsertedPHI->reserveOperandSpace(PredValues.size());
-  
+
   // Fill in all the predecessors of the PHI.
   for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
     InsertedPHI->addIncoming(PredValues[i].second, PredValues[i].first);
-  
+
   // See if the PHI node can be merged to a single value.  This can happen in
   // loop cases when we get a PHI of itself and one other value.
   if (Value *ConstVal = InsertedPHI->hasConstantValue()) {
@@ -168,7 +168,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
 
   // If the client wants to know about all new instructions, tell it.
   if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
-  
+
   DEBUG(errs() << "  Inserted PHI: " << *InsertedPHI << "\n");
   return InsertedPHI;
 }
@@ -177,11 +177,14 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
 /// which use their value in the corresponding predecessor.
 void SSAUpdater::RewriteUse(Use &U) {
   Instruction *User = cast<Instruction>(U.getUser());
-  BasicBlock *UseBB = User->getParent();
-  if (PHINode *UserPN = dyn_cast<PHINode>(User))
-    UseBB = UserPN->getIncomingBlock(U);
   
-  U.set(GetValueInMiddleOfBlock(UseBB));
+  Value *V;
+  if (PHINode *UserPN = dyn_cast<PHINode>(User))
+    V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
+  else
+    V = GetValueInMiddleOfBlock(User->getParent());
+
+  U.set(V);
 }
 
 
@@ -192,11 +195,11 @@ void SSAUpdater::RewriteUse(Use &U) {
 ///
 Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
   AvailableValsTy &AvailableVals = getAvailableVals(AV);
-  
+
   // Query AvailableVals by doing an insertion of null.
   std::pair<AvailableValsTy::iterator, bool> InsertRes =
   AvailableVals.insert(std::make_pair(BB, WeakVH()));
-  
+
   // Handle the case when the insertion fails because we have already seen BB.
   if (!InsertRes.second) {
     // If the insertion failed, there are two cases.  The first case is that the
@@ -204,7 +207,7 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
     // return the value.
     if (InsertRes.first->second != 0)
       return InsertRes.first->second;
-    
+
     // Otherwise, if the value we find is null, then this is the value is not
     // known but it is being computed elsewhere in our recursion.  This means
     // that we have a cycle.  Handle this by inserting a PHI node and returning
@@ -214,7 +217,7 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
     PHINode::Create(PrototypeValue->getType(), PrototypeValue->getName(),
                     &BB->front());
   }
-  
+
   // Okay, the value isn't in the map and we just inserted a null in the entry
   // to indicate that we're processing the block.  Since we have no idea what
   // value is in this block, we have to recurse through our predecessors.
@@ -225,13 +228,13 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
   // of the recursion, just use IncomingPredInfo as an explicit stack.
   IncomingPredInfoTy &IncomingPredInfo = getIncomingPredInfo(IPI);
   unsigned FirstPredInfoEntry = IncomingPredInfo.size();
-  
+
   // As we're walking the predecessors, keep track of whether they are all
   // producing the same value.  If so, this value will capture it, if not, it
   // will get reset to null.  We distinguish the no-predecessor case explicitly
   // below.
   TrackingVH<Value> SingularValue;
-  
+
   // We can get our predecessor info by walking the pred_iterator list, but it
   // is relatively slow.  If we already have PHI nodes in this block, walk one
   // of them to get the predecessor list instead.
@@ -240,7 +243,7 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
       BasicBlock *PredBB = SomePhi->getIncomingBlock(i);
       Value *PredVal = GetValueAtEndOfBlockInternal(PredBB);
       IncomingPredInfo.push_back(std::make_pair(PredBB, PredVal));
-      
+
       // Compute SingularValue.
       if (i == 0)
         SingularValue = PredVal;
@@ -253,7 +256,7 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
       BasicBlock *PredBB = *PI;
       Value *PredVal = GetValueAtEndOfBlockInternal(PredBB);
       IncomingPredInfo.push_back(std::make_pair(PredBB, PredVal));
-      
+
       // Compute SingularValue.
       if (isFirstPred) {
         SingularValue = PredVal;
@@ -262,19 +265,19 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
         SingularValue = 0;
     }
   }
-  
+
   // If there are no predecessors, then we must have found an unreachable block
   // just return 'undef'.  Since there are no predecessors, InsertRes must not
   // be invalidated.
   if (IncomingPredInfo.size() == FirstPredInfoEntry)
     return InsertRes.first->second = UndefValue::get(PrototypeValue->getType());
-  
+
   /// Look up BB's entry in AvailableVals.  'InsertRes' may be invalidated.  If
   /// this block is involved in a loop, a no-entry PHI node will have been
   /// inserted as InsertedVal.  Otherwise, we'll still have the null we inserted
   /// above.
   TrackingVH<Value> &InsertedVal = AvailableVals[BB];
-  
+
   // If all the predecessor values are the same then we don't need to insert a
   // PHI.  This is the simple and common case.
   if (SingularValue) {
@@ -291,31 +294,31 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
     } else {
       InsertedVal = SingularValue;
     }
-    
+
     // Drop the entries we added in IncomingPredInfo to restore the stack.
     IncomingPredInfo.erase(IncomingPredInfo.begin()+FirstPredInfoEntry,
                            IncomingPredInfo.end());
     return InsertedVal;
   }
-  
+
   // Otherwise, we do need a PHI: insert one now if we don't already have one.
   if (InsertedVal == 0)
     InsertedVal = PHINode::Create(PrototypeValue->getType(),
                                   PrototypeValue->getName(), &BB->front());
-  
+
   PHINode *InsertedPHI = cast<PHINode>(InsertedVal);
   InsertedPHI->reserveOperandSpace(IncomingPredInfo.size()-FirstPredInfoEntry);
-  
+
   // Fill in all the predecessors of the PHI.
   for (IncomingPredInfoTy::iterator I =
          IncomingPredInfo.begin()+FirstPredInfoEntry,
        E = IncomingPredInfo.end(); I != E; ++I)
     InsertedPHI->addIncoming(I->second, I->first);
-  
+
   // Drop the entries we added in IncomingPredInfo to restore the stack.
   IncomingPredInfo.erase(IncomingPredInfo.begin()+FirstPredInfoEntry,
                          IncomingPredInfo.end());
-  
+
   // See if the PHI node can be merged to a single value.  This can happen in
   // loop cases when we get a PHI of itself and one other value.
   if (Value *ConstVal = InsertedPHI->hasConstantValue()) {
@@ -324,12 +327,10 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
     InsertedVal = ConstVal;
   } else {
     DEBUG(errs() << "  Inserted PHI: " << *InsertedPHI << "\n");
-    
+
     // If the client wants to know about all new instructions, tell it.
     if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
   }
-  
+
   return InsertedVal;
 }
-
-
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index b5ae81b..d8a708d 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -680,6 +680,8 @@ void SlotTracker::processFunction() {
   ST_DEBUG("Inserting Instructions:\n");
 
   MetadataContext &TheMetadata = TheFunction->getContext().getMetadata();
+  typedef SmallVector<std::pair<unsigned, TrackingVH<MDNode> >, 2> MDMapTy;
+  MDMapTy MDs;
 
   // Add all of the basic blocks and instructions with no names.
   for (Function::const_iterator BB = TheFunction->begin(),
@@ -696,12 +698,11 @@ void SlotTracker::processFunction() {
           CreateMetadataSlot(N);
 
       // Process metadata attached with this instruction.
-      const MetadataContext::MDMapTy *MDs = TheMetadata.getMDs(I);
-      if (MDs)
-        for (MetadataContext::MDMapTy::const_iterator MI = MDs->begin(),
-               ME = MDs->end(); MI != ME; ++MI)
-          if (MDNode *MDN = dyn_cast_or_null<MDNode>(MI->second))
-            CreateMetadataSlot(MDN);
+      MDs.clear();
+      TheMetadata.getMDs(I, MDs);
+      for (MDMapTy::const_iterator MI = MDs.begin(), ME = MDs.end(); MI != ME; 
+           ++MI)
+        CreateMetadataSlot(MI->second);
     }
   }
 
@@ -818,9 +819,8 @@ void SlotTracker::CreateMetadataSlot(const MDNode *N) {
   unsigned DestSlot = mdnNext++;
   mdnMap[N] = DestSlot;
 
-  for (MDNode::const_elem_iterator MDI = N->elem_begin(),
-         MDE = N->elem_end(); MDI != MDE; ++MDI) {
-    const Value *TV = *MDI;
+  for (unsigned i = 0, e = N->getNumElements(); i != e; ++i) {
+    const Value *TV = N->getElement(i);
     if (TV)
       if (const MDNode *N2 = dyn_cast<MDNode>(TV))
         CreateMetadataSlot(N2);
@@ -906,9 +906,8 @@ static void WriteMDNodes(formatted_raw_ostream &Out, TypePrinting &TypePrinter,
     Out << '!' << i << " = metadata ";
     const MDNode *Node = Nodes[i];
     Out << "!{";
-    for (MDNode::const_elem_iterator NI = Node->elem_begin(),
-           NE = Node->elem_end(); NI != NE;) {
-      const Value *V = *NI;
+    for (unsigned mi = 0, me = Node->getNumElements(); mi != me; ++mi) {
+      const Value *V = Node->getElement(mi);
       if (!V)
         Out << "null";
       else if (const MDNode *N = dyn_cast<MDNode>(V)) {
@@ -916,11 +915,12 @@ static void WriteMDNodes(formatted_raw_ostream &Out, TypePrinting &TypePrinter,
         Out << '!' << Machine.getMetadataSlot(N);
       }
       else {
-        TypePrinter.print((*NI)->getType(), Out);
+        TypePrinter.print(V->getType(), Out);
         Out << ' ';
-        WriteAsOperandInternal(Out, *NI, &TypePrinter, &Machine);
+        WriteAsOperandInternal(Out, Node->getElement(mi), 
+                               &TypePrinter, &Machine);
       }
-      if (++NI != NE)
+      if (mi + 1 != me)
         Out << ", ";
     }
 
@@ -1206,8 +1206,8 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
     Out << "asm ";
     if (IA->hasSideEffects())
       Out << "sideeffect ";
-    if (IA->isMsAsm())
-      Out << "msasm ";
+    if (IA->isAlignStack())
+      Out << "alignstack ";
     Out << '"';
     PrintEscapedString(IA->getAsmString(), Out);
     Out << "\", \"";
@@ -1296,7 +1296,7 @@ class AssemblyWriter {
   TypePrinting TypePrinter;
   AssemblyAnnotationWriter *AnnotationWriter;
   std::vector<const Type*> NumberedTypes;
-  DenseMap<unsigned, const char *> MDNames;
+  DenseMap<unsigned, StringRef> MDNames;
 
 public:
   inline AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
@@ -1307,11 +1307,12 @@ public:
     // FIXME: Provide MDPrinter
     if (M) {
       MetadataContext &TheMetadata = M->getContext().getMetadata();
-      const StringMap<unsigned> *Names = TheMetadata.getHandlerNames();
-      for (StringMapConstIterator<unsigned> I = Names->begin(),
-             E = Names->end(); I != E; ++I) {
-        const StringMapEntry<unsigned> &Entry = *I;
-        MDNames[I->second] = Entry.getKeyData();
+      SmallVector<std::pair<unsigned, StringRef>, 4> Names;
+      TheMetadata.getHandlerNames(Names);
+      for (SmallVector<std::pair<unsigned, StringRef>, 4>::iterator 
+             I = Names.begin(),
+             E = Names.end(); I != E; ++I) {
+      MDNames[I->first] = I->second;
       }
     }
   }
@@ -2035,13 +2036,13 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
   // Print Metadata info
   if (!MDNames.empty()) {
     MetadataContext &TheMetadata = I.getContext().getMetadata();
-    const MetadataContext::MDMapTy *MDMap = TheMetadata.getMDs(&I);
-    if (MDMap)
-      for (MetadataContext::MDMapTy::const_iterator MI = MDMap->begin(),
-             ME = MDMap->end(); MI != ME; ++MI)
-        if (const MDNode *MD = dyn_cast_or_null<MDNode>(MI->second))
-          Out << ", !" << MDNames[MI->first]
-              << " !" << Machine.getMetadataSlot(MD);
+    typedef SmallVector<std::pair<unsigned, TrackingVH<MDNode> >, 2> MDMapTy;
+    MDMapTy MDs;
+    TheMetadata.getMDs(&I, MDs);
+    for (MDMapTy::const_iterator MI = MDs.begin(), ME = MDs.end(); MI != ME; 
+         ++MI)
+      Out << ", !" << MDNames[MI->first]
+          << " !" << Machine.getMetadataSlot(MI->second);
   }
   printInfoComment(I);
 }
diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp
index c1fcc5f..2c0a67f 100644
--- a/lib/VMCore/ConstantFold.cpp
+++ b/lib/VMCore/ConstantFold.cpp
@@ -179,6 +179,151 @@ static Constant *FoldBitCast(LLVMContext &Context,
 }
 
 
+/// ExtractConstantBytes - V is an integer constant which only has a subset of
+/// its bytes used.  The bytes used are indicated by ByteStart (which is the
+/// first byte used, counting from the least significant byte) and ByteSize,
+/// which is the number of bytes used.
+///
+/// This function analyzes the specified constant to see if the specified byte
+/// range can be returned as a simplified constant.  If so, the constant is
+/// returned, otherwise null is returned.
+/// 
+static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
+                                      unsigned ByteSize) {
+  assert(isa<IntegerType>(C->getType()) &&
+         (cast<IntegerType>(C->getType())->getBitWidth() & 7) == 0 &&
+         "Non-byte sized integer input");
+  unsigned CSize = cast<IntegerType>(C->getType())->getBitWidth()/8;
+  assert(ByteSize && "Must be accessing some piece");
+  assert(ByteStart+ByteSize <= CSize && "Extracting invalid piece from input");
+  assert(ByteSize != CSize && "Should not extract everything");
+  
+  // Constant Integers are simple.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+    APInt V = CI->getValue();
+    if (ByteStart)
+      V = V.lshr(ByteStart*8);
+    V.trunc(ByteSize*8);
+    return ConstantInt::get(CI->getContext(), V);
+  }
+  
+  // In the input is a constant expr, we might be able to recursively simplify.
+  // If not, we definitely can't do anything.
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
+  if (CE == 0) return 0;
+  
+  switch (CE->getOpcode()) {
+  default: return 0;
+  case Instruction::Or: {
+    Constant *RHS = ExtractConstantBytes(C->getOperand(1), ByteStart, ByteSize);
+    if (RHS == 0)
+      return 0;
+    
+    // X | -1 -> -1.
+    if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS))
+      if (RHSC->isAllOnesValue())
+        return RHSC;
+    
+    Constant *LHS = ExtractConstantBytes(C->getOperand(0), ByteStart, ByteSize);
+    if (LHS == 0)
+      return 0;
+    return ConstantExpr::getOr(LHS, RHS);
+  }
+  case Instruction::And: {
+    Constant *RHS = ExtractConstantBytes(C->getOperand(1), ByteStart, ByteSize);
+    if (RHS == 0)
+      return 0;
+    
+    // X & 0 -> 0.
+    if (RHS->isNullValue())
+      return RHS;
+    
+    Constant *LHS = ExtractConstantBytes(C->getOperand(0), ByteStart, ByteSize);
+    if (LHS == 0)
+      return 0;
+    return ConstantExpr::getAnd(LHS, RHS);
+  }
+  case Instruction::LShr: {
+    ConstantInt *Amt = dyn_cast<ConstantInt>(CE->getOperand(1));
+    if (Amt == 0)
+      return 0;
+    unsigned ShAmt = Amt->getZExtValue();
+    // Cannot analyze non-byte shifts.
+    if ((ShAmt & 7) != 0)
+      return 0;
+    ShAmt >>= 3;
+    
+    // If the extract is known to be all zeros, return zero.
+    if (ByteStart >= CSize-ShAmt)
+      return Constant::getNullValue(IntegerType::get(CE->getContext(),
+                                                     ByteSize*8));
+    // If the extract is known to be fully in the input, extract it.
+    if (ByteStart+ByteSize+ShAmt <= CSize)
+      return ExtractConstantBytes(C->getOperand(0), ByteStart+ShAmt, ByteSize);
+    
+    // TODO: Handle the 'partially zero' case.
+    return 0;
+  }
+    
+  case Instruction::Shl: {
+    ConstantInt *Amt = dyn_cast<ConstantInt>(CE->getOperand(1));
+    if (Amt == 0)
+      return 0;
+    unsigned ShAmt = Amt->getZExtValue();
+    // Cannot analyze non-byte shifts.
+    if ((ShAmt & 7) != 0)
+      return 0;
+    ShAmt >>= 3;
+    
+    // If the extract is known to be all zeros, return zero.
+    if (ByteStart+ByteSize <= ShAmt)
+      return Constant::getNullValue(IntegerType::get(CE->getContext(),
+                                                     ByteSize*8));
+    // If the extract is known to be fully in the input, extract it.
+    if (ByteStart >= ShAmt)
+      return ExtractConstantBytes(C->getOperand(0), ByteStart-ShAmt, ByteSize);
+    
+    // TODO: Handle the 'partially zero' case.
+    return 0;
+  }
+      
+  case Instruction::ZExt: {
+    unsigned SrcBitSize =
+      cast<IntegerType>(C->getOperand(0)->getType())->getBitWidth();
+    
+    // If extracting something that is completely zero, return 0.
+    if (ByteStart*8 >= SrcBitSize)
+      return Constant::getNullValue(IntegerType::get(CE->getContext(),
+                                                     ByteSize*8));
+
+    // If exactly extracting the input, return it.
+    if (ByteStart == 0 && ByteSize*8 == SrcBitSize)
+      return C->getOperand(0);
+    
+    // If extracting something completely in the input, if if the input is a
+    // multiple of 8 bits, recurse.
+    if ((SrcBitSize&7) == 0 && (ByteStart+ByteSize)*8 <= SrcBitSize)
+      return ExtractConstantBytes(C->getOperand(0), ByteStart, ByteSize);
+      
+    // Otherwise, if extracting a subset of the input, which is not multiple of
+    // 8 bits, do a shift and trunc to get the bits.
+    if ((ByteStart+ByteSize)*8 < SrcBitSize) {
+      assert((SrcBitSize&7) && "Shouldn't get byte sized case here");
+      Constant *Res = C->getOperand(0);
+      if (ByteStart)
+        Res = ConstantExpr::getLShr(Res, 
+                                 ConstantInt::get(Res->getType(), ByteStart*8));
+      return ConstantExpr::getTrunc(Res, IntegerType::get(C->getContext(),
+                                                          ByteSize*8));
+    }
+    
+    // TODO: Handle the 'partially zero' case.
+    return 0;
+  }
+  }
+}
+
+
 Constant *llvm::ConstantFoldCastInstruction(LLVMContext &Context, 
                                             unsigned opc, Constant *V,
                                             const Type *DestTy) {
@@ -236,6 +381,8 @@ Constant *llvm::ConstantFoldCastInstruction(LLVMContext &Context,
   // We actually have to do a cast now. Perform the cast according to the
   // opcode specified.
   switch (opc) {
+  default:
+    llvm_unreachable("Failed to cast constant expression");
   case Instruction::FPTrunc:
   case Instruction::FPExt:
     if (ConstantFP *FPC = dyn_cast<ConstantFP>(V)) {
@@ -300,23 +447,27 @@ Constant *llvm::ConstantFoldCastInstruction(LLVMContext &Context,
       return ConstantInt::get(Context, Result);
     }
     return 0;
-  case Instruction::Trunc:
+  case Instruction::Trunc: {
+    uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
     if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
-      uint32_t BitWidth = cast<IntegerType>(DestTy)->getBitWidth();
       APInt Result(CI->getValue());
-      Result.trunc(BitWidth);
+      Result.trunc(DestBitWidth);
       return ConstantInt::get(Context, Result);
     }
+    
+    // The input must be a constantexpr.  See if we can simplify this based on
+    // the bytes we are demanding.  Only do this if the source and dest are an
+    // even multiple of a byte.
+    if ((DestBitWidth & 7) == 0 &&
+        (cast<IntegerType>(V->getType())->getBitWidth() & 7) == 0)
+      if (Constant *Res = ExtractConstantBytes(V, 0, DestBitWidth / 8))
+        return Res;
+      
     return 0;
+  }
   case Instruction::BitCast:
     return FoldBitCast(Context, V, DestTy);
-  default:
-    assert(!"Invalid CE CastInst opcode");
-    break;
   }
-
-  llvm_unreachable("Failed to cast constant expression");
-  return 0;
 }
 
 Constant *llvm::ConstantFoldSelectInstruction(LLVMContext&,
diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp
index 529c455..02c3352 100644
--- a/lib/VMCore/Constants.cpp
+++ b/lib/VMCore/Constants.cpp
@@ -232,7 +232,6 @@ ConstantInt::ConstantInt(const IntegerType *Ty, const APInt& V)
 
 ConstantInt* ConstantInt::getTrue(LLVMContext &Context) {
   LLVMContextImpl *pImpl = Context.pImpl;
-  sys::SmartScopedWriter<true>(pImpl->ConstantsLock);
   if (pImpl->TheTrueVal)
     return pImpl->TheTrueVal;
   else
@@ -242,7 +241,6 @@ ConstantInt* ConstantInt::getTrue(LLVMContext &Context) {
 
 ConstantInt* ConstantInt::getFalse(LLVMContext &Context) {
   LLVMContextImpl *pImpl = Context.pImpl;
-  sys::SmartScopedWriter<true>(pImpl->ConstantsLock);
   if (pImpl->TheFalseVal)
     return pImpl->TheFalseVal;
   else
@@ -261,22 +259,9 @@ ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt& V) {
   const IntegerType *ITy = IntegerType::get(Context, V.getBitWidth());
   // get an existing value or the insertion position
   DenseMapAPIntKeyInfo::KeyTy Key(V, ITy);
-  
-  Context.pImpl->ConstantsLock.reader_acquire();
   ConstantInt *&Slot = Context.pImpl->IntConstants[Key]; 
-  Context.pImpl->ConstantsLock.reader_release();
-    
-  if (!Slot) {
-    sys::SmartScopedWriter<true> Writer(Context.pImpl->ConstantsLock);
-    ConstantInt *&NewSlot = Context.pImpl->IntConstants[Key]; 
-    if (!Slot) {
-      NewSlot = new ConstantInt(ITy, V);
-    }
-    
-    return NewSlot;
-  } else {
-    return Slot;
-  }
+  if (!Slot) Slot = new ConstantInt(ITy, V);
+  return Slot;
 }
 
 Constant* ConstantInt::get(const Type* Ty, uint64_t V, bool isSigned) {
@@ -405,32 +390,24 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
   
   LLVMContextImpl* pImpl = Context.pImpl;
   
-  pImpl->ConstantsLock.reader_acquire();
   ConstantFP *&Slot = pImpl->FPConstants[Key];
-  pImpl->ConstantsLock.reader_release();
     
   if (!Slot) {
-    sys::SmartScopedWriter<true> Writer(pImpl->ConstantsLock);
-    ConstantFP *&NewSlot = pImpl->FPConstants[Key];
-    if (!NewSlot) {
-      const Type *Ty;
-      if (&V.getSemantics() == &APFloat::IEEEsingle)
-        Ty = Type::getFloatTy(Context);
-      else if (&V.getSemantics() == &APFloat::IEEEdouble)
-        Ty = Type::getDoubleTy(Context);
-      else if (&V.getSemantics() == &APFloat::x87DoubleExtended)
-        Ty = Type::getX86_FP80Ty(Context);
-      else if (&V.getSemantics() == &APFloat::IEEEquad)
-        Ty = Type::getFP128Ty(Context);
-      else {
-        assert(&V.getSemantics() == &APFloat::PPCDoubleDouble && 
-               "Unknown FP format");
-        Ty = Type::getPPC_FP128Ty(Context);
-      }
-      NewSlot = new ConstantFP(Ty, V);
+    const Type *Ty;
+    if (&V.getSemantics() == &APFloat::IEEEsingle)
+      Ty = Type::getFloatTy(Context);
+    else if (&V.getSemantics() == &APFloat::IEEEdouble)
+      Ty = Type::getDoubleTy(Context);
+    else if (&V.getSemantics() == &APFloat::x87DoubleExtended)
+      Ty = Type::getX86_FP80Ty(Context);
+    else if (&V.getSemantics() == &APFloat::IEEEquad)
+      Ty = Type::getFP128Ty(Context);
+    else {
+      assert(&V.getSemantics() == &APFloat::PPCDoubleDouble && 
+             "Unknown FP format");
+      Ty = Type::getPPC_FP128Ty(Context);
     }
-    
-    return NewSlot;
+    Slot = new ConstantFP(Ty, V);
   }
   
   return Slot;
@@ -1908,7 +1885,6 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To,
     Replacement = ConstantAggregateZero::get(getType());
   } else {
     // Check to see if we have this array type already.
-    sys::SmartScopedWriter<true> Writer(pImpl->ConstantsLock);
     bool Exists;
     LLVMContextImpl::ArrayConstantsTy::MapTy::iterator I =
       pImpl->ArrayConstants.InsertOrGetItem(Lookup, Exists);
@@ -1987,7 +1963,6 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To,
     Replacement = ConstantAggregateZero::get(getType());
   } else {
     // Check to see if we have this array type already.
-    sys::SmartScopedWriter<true> Writer(pImpl->ConstantsLock);
     bool Exists;
     LLVMContextImpl::StructConstantsTy::MapTy::iterator I =
       pImpl->StructConstants.InsertOrGetItem(Lookup, Exists);
diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp
index bff3087..a28037d 100644
--- a/lib/VMCore/Core.cpp
+++ b/lib/VMCore/Core.cpp
@@ -885,9 +885,9 @@ LLVMValueRef LLVMConstInsertValue(LLVMValueRef AggConstant,
 
 LLVMValueRef LLVMConstInlineAsm(LLVMTypeRef Ty, const char *AsmString, 
                                 const char *Constraints, int HasSideEffects,
-                                int IsMsAsm) {
+                                int IsAlignStack) {
   return wrap(InlineAsm::get(dyn_cast<FunctionType>(unwrap(Ty)), AsmString, 
-                             Constraints, HasSideEffects, IsMsAsm));
+                             Constraints, HasSideEffects, IsAlignStack));
 }
 
 /*--.. Operations on global variables, functions, and aliases (globals) ....--*/
@@ -1699,12 +1699,18 @@ LLVMValueRef LLVMBuildNot(LLVMBuilderRef B, LLVMValueRef V, const char *Name) {
 
 LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
                              const char *Name) {
-  return wrap(unwrap(B)->CreateMalloc(unwrap(Ty), 0, Name));
+  const Type* IntPtrT = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
+  return wrap(unwrap(B)->Insert(CallInst::CreateMalloc(
+      unwrap(B)->GetInsertBlock(), IntPtrT, unwrap(Ty), 0, 0, ""),
+      Twine(Name)));
 }
 
 LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
                                   LLVMValueRef Val, const char *Name) {
-  return wrap(unwrap(B)->CreateMalloc(unwrap(Ty), unwrap(Val), Name));
+  const Type* IntPtrT = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
+  return wrap(unwrap(B)->Insert(CallInst::CreateMalloc(
+      unwrap(B)->GetInsertBlock(), IntPtrT, unwrap(Ty), unwrap(Val), 0, ""),
+      Twine(Name)));
 }
 
 LLVMValueRef LLVMBuildAlloca(LLVMBuilderRef B, LLVMTypeRef Ty,
diff --git a/lib/VMCore/InlineAsm.cpp b/lib/VMCore/InlineAsm.cpp
index 0520dfa..3a36a1b 100644
--- a/lib/VMCore/InlineAsm.cpp
+++ b/lib/VMCore/InlineAsm.cpp
@@ -28,18 +28,20 @@ InlineAsm::~InlineAsm() {
 
 InlineAsm *InlineAsm::get(const FunctionType *Ty, const StringRef &AsmString,
                           const StringRef &Constraints, bool hasSideEffects,
-                          bool isMsAsm) {
+                          bool isAlignStack) {
   // FIXME: memoize!
-  return new InlineAsm(Ty, AsmString, Constraints, hasSideEffects, isMsAsm);  
+  return new InlineAsm(Ty, AsmString, Constraints, hasSideEffects, 
+                       isAlignStack);
 }
 
 InlineAsm::InlineAsm(const FunctionType *Ty, const StringRef &asmString,
                      const StringRef &constraints, bool hasSideEffects,
-                     bool isMsAsm)
+                     bool isAlignStack)
   : Value(PointerType::getUnqual(Ty), 
           Value::InlineAsmVal), 
     AsmString(asmString), 
-    Constraints(constraints), HasSideEffects(hasSideEffects), IsMsAsm(isMsAsm) {
+    Constraints(constraints), HasSideEffects(hasSideEffects), 
+    IsAlignStack(isAlignStack) {
 
   // Do various checks on the constraint string and type.
   assert(Verify(Ty, constraints) && "Function type not legal for constraints!");
diff --git a/lib/VMCore/Instruction.cpp b/lib/VMCore/Instruction.cpp
index 4df536e..dd8a543 100644
--- a/lib/VMCore/Instruction.cpp
+++ b/lib/VMCore/Instruction.cpp
@@ -127,7 +127,6 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
   case Xor: return "xor";
 
   // Memory instructions...
-  case Malloc:        return "malloc";
   case Free:          return "free";
   case Alloca:        return "alloca";
   case Load:          return "load";
@@ -442,7 +441,6 @@ bool Instruction::isSafeToSpeculativelyExecute() const {
                   // overflow-checking arithmetic, etc.)
   case VAArg:
   case Alloca:
-  case Malloc:
   case Invoke:
   case PHI:
   case Store:
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index f3d15cb..e212d5c 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -460,9 +460,10 @@ static Value *checkArraySize(Value *Amt, const Type *IntPtrTy) {
   return Amt;
 }
 
-static Value *createMalloc(Instruction *InsertBefore, BasicBlock *InsertAtEnd,
-                           const Type *IntPtrTy, const Type *AllocTy,
-                           Value *ArraySize, const Twine &NameStr) {
+static Instruction *createMalloc(Instruction *InsertBefore,
+                                 BasicBlock *InsertAtEnd, const Type *IntPtrTy,
+                                 const Type *AllocTy, Value *ArraySize,
+                                 Function *MallocF, const Twine &NameStr) {
   assert(((!InsertBefore && InsertAtEnd) || (InsertBefore && !InsertAtEnd)) &&
          "createMalloc needs either InsertBefore or InsertAtEnd");
 
@@ -499,27 +500,34 @@ static Value *createMalloc(Instruction *InsertBefore, BasicBlock *InsertAtEnd,
   BasicBlock* BB = InsertBefore ? InsertBefore->getParent() : InsertAtEnd;
   Module* M = BB->getParent()->getParent();
   const Type *BPTy = Type::getInt8PtrTy(BB->getContext());
-  // prototype malloc as "void *malloc(size_t)"
-  Constant *MallocF = M->getOrInsertFunction("malloc", BPTy, IntPtrTy, NULL);
-  if (!cast<Function>(MallocF)->doesNotAlias(0))
-    cast<Function>(MallocF)->setDoesNotAlias(0);
+  if (!MallocF)
+    // prototype malloc as "void *malloc(size_t)"
+    MallocF = cast<Function>(M->getOrInsertFunction("malloc", BPTy,
+                                                    IntPtrTy, NULL));
+  if (!MallocF->doesNotAlias(0)) MallocF->setDoesNotAlias(0);
   const PointerType *AllocPtrType = PointerType::getUnqual(AllocTy);
   CallInst *MCall = NULL;
-  Value    *MCast = NULL;
+  Instruction *Result = NULL;
   if (InsertBefore) {
     MCall = CallInst::Create(MallocF, AllocSize, "malloccall", InsertBefore);
-    // Create a cast instruction to convert to the right type...
-    MCast = new BitCastInst(MCall, AllocPtrType, NameStr, InsertBefore);
+    Result = MCall;
+    if (Result->getType() != AllocPtrType)
+      // Create a cast instruction to convert to the right type...
+      Result = new BitCastInst(MCall, AllocPtrType, NameStr, InsertBefore);
   } else {
-    MCall = CallInst::Create(MallocF, AllocSize, "malloccall", InsertAtEnd);
-    // Create a cast instruction to convert to the right type...
-    MCast = new BitCastInst(MCall, AllocPtrType, NameStr);
+    MCall = CallInst::Create(MallocF, AllocSize, "malloccall");
+    Result = MCall;
+    if (Result->getType() != AllocPtrType) {
+      InsertAtEnd->getInstList().push_back(MCall);
+      // Create a cast instruction to convert to the right type...
+      Result = new BitCastInst(MCall, AllocPtrType, NameStr);
+    }
   }
   MCall->setTailCall();
   assert(MCall->getType() != Type::getVoidTy(BB->getContext()) &&
          "Malloc has void return type");
 
-  return MCast;
+  return Result;
 }
 
 /// CreateMalloc - Generate the IR for a call to malloc:
@@ -528,10 +536,11 @@ static Value *createMalloc(Instruction *InsertBefore, BasicBlock *InsertAtEnd,
 ///    constant 1.
 /// 2. Call malloc with that argument.
 /// 3. Bitcast the result of the malloc call to the specified type.
-Value *CallInst::CreateMalloc(Instruction *InsertBefore, const Type *IntPtrTy,
-                              const Type *AllocTy, Value *ArraySize,
-                              const Twine &Name) {
-  return createMalloc(InsertBefore, NULL, IntPtrTy, AllocTy, ArraySize, Name);
+Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
+                                    const Type *IntPtrTy, const Type *AllocTy,
+                                    Value *ArraySize, const Twine &Name) {
+  return createMalloc(InsertBefore, NULL, IntPtrTy, AllocTy, 
+                      ArraySize, NULL, Name);
 }
 
 /// CreateMalloc - Generate the IR for a call to malloc:
@@ -542,10 +551,12 @@ Value *CallInst::CreateMalloc(Instruction *InsertBefore, const Type *IntPtrTy,
 /// 3. Bitcast the result of the malloc call to the specified type.
 /// Note: This function does not add the bitcast to the basic block, that is the
 /// responsibility of the caller.
-Value *CallInst::CreateMalloc(BasicBlock *InsertAtEnd, const Type *IntPtrTy,
-                              const Type *AllocTy, Value *ArraySize, 
-                              const Twine &Name) {
-  return createMalloc(NULL, InsertAtEnd, IntPtrTy, AllocTy, ArraySize, Name);
+Instruction *CallInst::CreateMalloc(BasicBlock *InsertAtEnd,
+                                    const Type *IntPtrTy, const Type *AllocTy,
+                                    Value *ArraySize, Function* MallocF,
+                                    const Twine &Name) {
+  return createMalloc(NULL, InsertAtEnd, IntPtrTy, AllocTy,
+                      ArraySize, MallocF, Name);
 }
 
 //===----------------------------------------------------------------------===//
@@ -837,7 +848,7 @@ static Value *getAISize(LLVMContext &Context, Value *Amt) {
     assert(!isa<BasicBlock>(Amt) &&
            "Passed basic block into allocation size parameter! Use other ctor");
     assert(Amt->getType() == Type::getInt32Ty(Context) &&
-           "Malloc/Allocation array size is not a 32-bit integer!");
+           "Allocation array size is not a 32-bit integer!");
   }
   return Amt;
 }
@@ -3073,18 +3084,6 @@ InsertValueInst *InsertValueInst::clone() const {
   return New;
 }
 
-MallocInst *MallocInst::clone() const {
-  MallocInst *New = new MallocInst(getAllocatedType(),
-                                   (Value*)getOperand(0),
-                                   getAlignment());
-  New->SubclassOptionalData = SubclassOptionalData;
-  if (hasMetadata()) {
-    LLVMContext &Context = getContext();
-    Context.pImpl->TheMetadata.ValueIsCloned(this, New);
-  }
-  return New;
-}
-
 AllocaInst *AllocaInst::clone() const {
   AllocaInst *New = new AllocaInst(getAllocatedType(),
                                    (Value*)getOperand(0),
diff --git a/lib/VMCore/LLVMContext.cpp b/lib/VMCore/LLVMContext.cpp
index 39ed7ed..3b4a1a3 100644
--- a/lib/VMCore/LLVMContext.cpp
+++ b/lib/VMCore/LLVMContext.cpp
@@ -45,32 +45,6 @@ GetElementPtrConstantExpr::GetElementPtrConstantExpr
     OperandList[i+1] = IdxList[i];
 }
 
-bool LLVMContext::RemoveDeadMetadata() {
-  std::vector<WeakVH> DeadMDNodes;
-  bool Changed = false;
-  while (1) {
-
-    for (FoldingSet<MDNode>::iterator 
-           I = pImpl->MDNodeSet.begin(),
-           E = pImpl->MDNodeSet.end(); I != E; ++I) {
-      MDNode *N = &(*I);
-      if (N->use_empty()) 
-        DeadMDNodes.push_back(WeakVH(N));
-    }
-    
-    if (DeadMDNodes.empty())
-      return Changed;
-
-    while (!DeadMDNodes.empty()) {
-      Value *V = DeadMDNodes.back(); DeadMDNodes.pop_back();
-      if (const MDNode *N = dyn_cast_or_null<MDNode>(V))
-        if (N->use_empty())
-          delete N;
-    }
-  }
-  return Changed;
-}
-
 MetadataContext &LLVMContext::getMetadata() {
   return pImpl->TheMetadata;
 }
diff --git a/lib/VMCore/LLVMContextImpl.h b/lib/VMCore/LLVMContextImpl.h
index 83888c3..84902d5 100644
--- a/lib/VMCore/LLVMContextImpl.h
+++ b/lib/VMCore/LLVMContextImpl.h
@@ -96,7 +96,6 @@ struct DenseMapAPFloatKeyInfo {
 
 class LLVMContextImpl {
 public:
-  sys::SmartRWMutex<true> ConstantsLock;
   typedef DenseMap<DenseMapAPIntKeyInfo::KeyTy, ConstantInt*, 
                          DenseMapAPIntKeyInfo> IntMapTy;
   IntMapTy IntConstants;
@@ -204,9 +203,6 @@ public:
     AggZeroConstants.freeConstants();
     NullPtrConstants.freeConstants();
     UndefValueConstants.freeConstants();
-    for (FoldingSet<MDNode>::iterator I = MDNodeSet.begin(), 
-           E = MDNodeSet.end(); I != E; ++I)
-      I->dropAllReferences();
     for (IntMapTy::iterator I = IntConstants.begin(), E = IntConstants.end(); 
          I != E; ++I) {
       if (I->second->use_empty())
@@ -217,6 +213,7 @@ public:
       if (I->second->use_empty())
         delete I->second;
     }
+    MDNodeSet.clear();
   }
 };
 
diff --git a/lib/VMCore/Metadata.cpp b/lib/VMCore/Metadata.cpp
index 110c5e3..ad9653f 100644
--- a/lib/VMCore/Metadata.cpp
+++ b/lib/VMCore/Metadata.cpp
@@ -16,73 +16,44 @@
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
 #include "llvm/Instruction.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
 #include "SymbolTableListTraitsImpl.h"
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
-//MetadataBase implementation
+// MetadataBase implementation.
 //
 
-/// resizeOperands - Metadata keeps track of other metadata uses using 
-/// OperandList. Resize this list to hold anticipated number of metadata
-/// operands.
-void MetadataBase::resizeOperands(unsigned NumOps) {
-  unsigned e = getNumOperands();
-  if (NumOps == 0) {
-    NumOps = e*2;
-    if (NumOps < 2) NumOps = 2;  
-  } else if (NumOps > NumOperands) {
-    // No resize needed.
-    if (ReservedSpace >= NumOps) return;
-  } else if (NumOps == NumOperands) {
-    if (ReservedSpace == NumOps) return;
-  } else {
-    return;
-  }
-
-  ReservedSpace = NumOps;
-  Use *OldOps = OperandList;
-  Use *NewOps = allocHungoffUses(NumOps);
-  std::copy(OldOps, OldOps + e, NewOps);
-  OperandList = NewOps;
-  if (OldOps) Use::zap(OldOps, OldOps + e, true);
-}
 //===----------------------------------------------------------------------===//
-//MDString implementation
+// MDString implementation.
 //
-MDString *MDString::get(LLVMContext &Context, const StringRef &Str) {
+MDString *MDString::get(LLVMContext &Context, StringRef Str) {
   LLVMContextImpl *pImpl = Context.pImpl;
-  sys::SmartScopedWriter<true> Writer(pImpl->ConstantsLock);
   StringMapEntry<MDString *> &Entry = 
     pImpl->MDStringCache.GetOrCreateValue(Str);
   MDString *&S = Entry.getValue();
-  if (!S) S = new MDString(Context, Entry.getKeyData(),
-                           Entry.getKeyLength());
-
-  return S;
+  if (S) return S;
+  
+  return S = 
+    new MDString(Context, Entry.getKey());
 }
 
 //===----------------------------------------------------------------------===//
-//MDNode implementation
+// MDNode implementation.
 //
-MDNode::MDNode(LLVMContext &C, Value*const* Vals, unsigned NumVals)
+MDNode::MDNode(LLVMContext &C, Value *const *Vals, unsigned NumVals)
   : MetadataBase(Type::getMetadataTy(C), Value::MDNodeVal) {
-  NumOperands = 0;
-  resizeOperands(NumVals);
-  for (unsigned i = 0; i != NumVals; ++i) {
-    // Only record metadata uses.
-    if (MetadataBase *MB = dyn_cast_or_null<MetadataBase>(Vals[i]))
-      OperandList[NumOperands++] = MB;
-    else if(Vals[i] && 
-            Vals[i]->getType()->getTypeID() == Type::MetadataTyID)
-      OperandList[NumOperands++] = Vals[i];
-    Node.push_back(ElementVH(Vals[i], this));
-  }
+  NodeSize = NumVals;
+  Node = new ElementVH[NodeSize];
+  ElementVH *Ptr = Node;
+  for (unsigned i = 0; i != NumVals; ++i) 
+    *Ptr++ = ElementVH(Vals[i], this);
 }
 
 void MDNode::Profile(FoldingSetNodeID &ID) const {
-  for (const_elem_iterator I = elem_begin(), E = elem_end(); I != E; ++I)
-    ID.AddPointer(*I);
+  for (unsigned i = 0, e = getNumElements(); i != e; ++i)
+    ID.AddPointer(getElement(i));
 }
 
 MDNode *MDNode::get(LLVMContext &Context, Value*const* Vals, unsigned NumVals) {
@@ -91,37 +62,31 @@ MDNode *MDNode::get(LLVMContext &Context, Value*const* Vals, unsigned NumVals) {
   for (unsigned i = 0; i != NumVals; ++i)
     ID.AddPointer(Vals[i]);
 
-  pImpl->ConstantsLock.reader_acquire();
   void *InsertPoint;
-  MDNode *N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint);
-  pImpl->ConstantsLock.reader_release();
+  MDNode *N;
+  {
+    N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint);
+  }  
+  if (N) return N;
   
+  N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint);
   if (!N) {
-    sys::SmartScopedWriter<true> Writer(pImpl->ConstantsLock);
-    N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint);
-    if (!N) {
-      // InsertPoint will have been set by the FindNodeOrInsertPos call.
-      N = new MDNode(Context, Vals, NumVals);
-      pImpl->MDNodeSet.InsertNode(N, InsertPoint);
-    }
+    // InsertPoint will have been set by the FindNodeOrInsertPos call.
+    N = new MDNode(Context, Vals, NumVals);
+    pImpl->MDNodeSet.InsertNode(N, InsertPoint);
   }
 
   return N;
 }
 
-/// dropAllReferences - Remove all uses and clear node vector.
-void MDNode::dropAllReferences() {
-  User::dropAllReferences();
-  Node.clear();
-}
-
+/// ~MDNode - Destroy MDNode.
 MDNode::~MDNode() {
   {
     LLVMContextImpl *pImpl = getType()->getContext().pImpl;
-    sys::SmartScopedWriter<true> Writer(pImpl->ConstantsLock);
     pImpl->MDNodeSet.RemoveNode(this);
   }
-  dropAllReferences();
+  delete [] Node;
+  Node = NULL;
 }
 
 // Replace value from this node's element list.
@@ -136,9 +101,8 @@ void MDNode::replaceElement(Value *From, Value *To) {
   // From in this MDNode's element list.
   SmallVector<unsigned, 4> Indexes;
   unsigned Index = 0;
-  for (SmallVector<ElementVH, 4>::iterator I = Node.begin(),
-         E = Node.end(); I != E; ++I, ++Index) {
-    Value *V = *I;
+  for (unsigned i = 0, e = getNumElements(); i != e; ++i, ++Index) {
+    Value *V = getElement(i);
     if (V && V == From) 
       Indexes.push_back(Index);
   }
@@ -147,31 +111,7 @@ void MDNode::replaceElement(Value *From, Value *To) {
     return;
 
   // Remove "this" from the context map. 
-  {
-    sys::SmartScopedWriter<true> Writer(pImpl->ConstantsLock);
-    pImpl->MDNodeSet.RemoveNode(this);
-  }
-
-  // MDNode only lists metadata elements in operand list, because MDNode
-  // used by MDNode is considered a valid use. However on the side, MDNode
-  // using a non-metadata value is not considered a "use" of non-metadata
-  // value.
-  SmallVector<unsigned, 4> OpIndexes;
-  unsigned OpIndex = 0;
-  for (User::op_iterator OI = op_begin(), OE = op_end();
-       OI != OE; ++OI, OpIndex++) {
-    if (*OI == From)
-      OpIndexes.push_back(OpIndex);
-  }
-  if (MetadataBase *MDTo = dyn_cast_or_null<MetadataBase>(To)) {
-    for (SmallVector<unsigned, 4>::iterator OI = OpIndexes.begin(),
-           OE = OpIndexes.end(); OI != OE; ++OI)
-      setOperand(*OI, MDTo);
-  } else {
-    for (SmallVector<unsigned, 4>::iterator OI = OpIndexes.begin(),
-           OE = OpIndexes.end(); OI != OE; ++OI)
-      setOperand(*OI, 0);
-  }
+  pImpl->MDNodeSet.RemoveNode(this);
 
   // Replace From element(s) in place.
   for (SmallVector<unsigned, 4>::iterator I = Indexes.begin(), E = Indexes.end(); 
@@ -186,10 +126,8 @@ void MDNode::replaceElement(Value *From, Value *To) {
   // node with updated "this" node.
   FoldingSetNodeID ID;
   Profile(ID);
-  pImpl->ConstantsLock.reader_acquire();
   void *InsertPoint;
   MDNode *N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint);
-  pImpl->ConstantsLock.reader_release();
 
   if (N) {
     N->replaceAllUsesWith(this);
@@ -197,39 +135,32 @@ void MDNode::replaceElement(Value *From, Value *To) {
     N = 0;
   }
 
-  {
-    sys::SmartScopedWriter<true> Writer(pImpl->ConstantsLock);
-    N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint);
-    if (!N) {
-      // InsertPoint will have been set by the FindNodeOrInsertPos call.
-      N = this;
-      pImpl->MDNodeSet.InsertNode(N, InsertPoint);
-    }
+  N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint);
+  if (!N) {
+    // InsertPoint will have been set by the FindNodeOrInsertPos call.
+    N = this;
+    pImpl->MDNodeSet.InsertNode(N, InsertPoint);
   }
 }
 
 //===----------------------------------------------------------------------===//
-//NamedMDNode implementation
+// NamedMDNode implementation.
 //
 NamedMDNode::NamedMDNode(LLVMContext &C, const Twine &N,
-                         MetadataBase*const* MDs, 
+                         MetadataBase *const *MDs, 
                          unsigned NumMDs, Module *ParentModule)
   : MetadataBase(Type::getMetadataTy(C), Value::NamedMDNodeVal), Parent(0) {
   setName(N);
-  NumOperands = 0;
-  resizeOperands(NumMDs);
 
-  for (unsigned i = 0; i != NumMDs; ++i) {
-    if (MDs[i])
-      OperandList[NumOperands++] = MDs[i];
-    Node.push_back(WeakMetadataVH(MDs[i]));
-  }
+  for (unsigned i = 0; i != NumMDs; ++i)
+    Node.push_back(TrackingVH<MetadataBase>(MDs[i]));
+
   if (ParentModule)
     ParentModule->getNamedMDList().push_back(this);
 }
 
 NamedMDNode *NamedMDNode::Create(const NamedMDNode *NMD, Module *M) {
-  assert (NMD && "Invalid source NamedMDNode!");
+  assert(NMD && "Invalid source NamedMDNode!");
   SmallVector<MetadataBase *, 4> Elems;
   for (unsigned i = 0, e = NMD->getNumElements(); i != e; ++i)
     Elems.push_back(NMD->getElement(i));
@@ -245,7 +176,6 @@ void NamedMDNode::eraseFromParent() {
 
 /// dropAllReferences - Remove all uses and clear node vector.
 void NamedMDNode::dropAllReferences() {
-  User::dropAllReferences();
   Node.clear();
 }
 
@@ -254,65 +184,100 @@ NamedMDNode::~NamedMDNode() {
 }
 
 //===----------------------------------------------------------------------===//
-//Metadata implementation
+// MetadataContextImpl implementation.
 //
+namespace llvm {
+class MetadataContextImpl {
+public:
+  typedef std::pair<unsigned, TrackingVH<MDNode> > MDPairTy;
+  typedef SmallVector<MDPairTy, 2> MDMapTy;
+  typedef DenseMap<const Instruction *, MDMapTy> MDStoreTy;
+  friend class BitcodeReader;
+private:
+
+  /// MetadataStore - Collection of metadata used in this context.
+  MDStoreTy MetadataStore;
+
+  /// MDHandlerNames - Map to hold metadata handler names.
+  StringMap<unsigned> MDHandlerNames;
+
+public:
+  /// registerMDKind - Register a new metadata kind and return its ID.
+  /// A metadata kind can be registered only once. 
+  unsigned registerMDKind(StringRef Name);
+
+  /// getMDKind - Return metadata kind. If the requested metadata kind
+  /// is not registered then return 0.
+  unsigned getMDKind(StringRef Name) const;
+
+  /// getMD - Get the metadata of given kind attached to an Instruction.
+  /// If the metadata is not found then return 0.
+  MDNode *getMD(unsigned Kind, const Instruction *Inst);
+
+  /// getMDs - Get the metadata attached to an Instruction.
+  void getMDs(const Instruction *Inst, SmallVectorImpl<MDPairTy> &MDs) const;
+
+  /// addMD - Attach the metadata of given kind to an Instruction.
+  void addMD(unsigned Kind, MDNode *Node, Instruction *Inst);
+  
+  /// removeMD - Remove metadata of given kind attached with an instuction.
+  void removeMD(unsigned Kind, Instruction *Inst);
+  
+  /// removeAllMetadata - Remove all metadata attached with an instruction.
+  void removeAllMetadata(Instruction *Inst);
+
+  /// copyMD - If metadata is attached with Instruction In1 then attach
+  /// the same metadata to In2.
+  void copyMD(Instruction *In1, Instruction *In2);
+
+  /// getHandlerNames - Populate client supplied smallvector using custome
+  /// metadata name and ID.
+  void getHandlerNames(SmallVectorImpl<std::pair<unsigned, StringRef> >&) const;
+
+  /// ValueIsDeleted - This handler is used to update metadata store
+  /// when a value is deleted.
+  void ValueIsDeleted(const Value *) {}
+  void ValueIsDeleted(Instruction *Inst) {
+    removeAllMetadata(Inst);
+  }
+  void ValueIsRAUWd(Value *V1, Value *V2);
 
-/// RegisterMDKind - Register a new metadata kind and return its ID.
-/// A metadata kind can be registered only once. 
-unsigned MetadataContext::RegisterMDKind(const char *Name) {
-  assert (validName(Name) && "Invalid custome metadata name!");
-  unsigned Count = MDHandlerNames.size();
-  assert(MDHandlerNames.find(Name) == MDHandlerNames.end() 
-         && "Already registered MDKind!");
-  MDHandlerNames[Name] = Count + 1;
-  return Count + 1;
+  /// ValueIsCloned - This handler is used to update metadata store
+  /// when In1 is cloned to create In2.
+  void ValueIsCloned(const Instruction *In1, Instruction *In2);
+};
 }
 
-/// validName - Return true if Name is a valid custom metadata handler name.
-bool MetadataContext::validName(const char *Name) {
-  if (!Name)
-    return false;
-
-  if (!isalpha(*Name))
-    return false;
-
-  unsigned Length = strlen(Name);  
-  unsigned Count = 1;
-  ++Name;
-  while (Name &&
-         (isalnum(*Name) || *Name == '_' || *Name == '-' || *Name == '.')) {
-    ++Name;
-    ++Count;
-  }
-  if (Length != Count)
-    return false;
-  return true;
+/// registerMDKind - Register a new metadata kind and return its ID.
+/// A metadata kind can be registered only once. 
+unsigned MetadataContextImpl::registerMDKind(StringRef Name) {
+  unsigned Count = MDHandlerNames.size();
+  assert(MDHandlerNames.count(Name) == 0 && "Already registered MDKind!");
+  return MDHandlerNames[Name] = Count + 1;
 }
 
 /// getMDKind - Return metadata kind. If the requested metadata kind
 /// is not registered then return 0.
-unsigned MetadataContext::getMDKind(const char *Name) {
-  assert (validName(Name) && "Invalid custome metadata name!");
-  StringMap<unsigned>::iterator I = MDHandlerNames.find(Name);
+unsigned MetadataContextImpl::getMDKind(StringRef Name) const {
+  StringMap<unsigned>::const_iterator I = MDHandlerNames.find(Name);
   if (I == MDHandlerNames.end())
     return 0;
 
   return I->getValue();
 }
 
-/// addMD - Attach the metadata of given kind with an Instruction.
-void MetadataContext::addMD(unsigned MDKind, MDNode *Node, Instruction *Inst) {
-  assert (Node && "Unable to add custome metadata");
+/// addMD - Attach the metadata of given kind to an Instruction.
+void MetadataContextImpl::addMD(unsigned MDKind, MDNode *Node, 
+                                Instruction *Inst) {
+  assert(Node && "Invalid null MDNode");
   Inst->HasMetadata = true;
-  MDStoreTy::iterator I = MetadataStore.find(Inst);
-  if (I == MetadataStore.end()) {
-    MDMapTy Info;
+  MDMapTy &Info = MetadataStore[Inst];
+  if (Info.empty()) {
     Info.push_back(std::make_pair(MDKind, Node));
     MetadataStore.insert(std::make_pair(Inst, Info));
     return;
   }
 
-  MDMapTy &Info = I->second;
   // If there is an entry for this MDKind then replace it.
   for (unsigned i = 0, e = Info.size(); i != e; ++i) {
     MDPairTy &P = Info[i];
@@ -324,11 +289,10 @@ void MetadataContext::addMD(unsigned MDKind, MDNode *Node, Instruction *Inst) {
 
   // Otherwise add a new entry.
   Info.push_back(std::make_pair(MDKind, Node));
-  return;
 }
 
 /// removeMD - Remove metadata of given kind attached with an instuction.
-void MetadataContext::removeMD(unsigned Kind, Instruction *Inst) {
+void MetadataContextImpl::removeMD(unsigned Kind, Instruction *Inst) {
   MDStoreTy::iterator I = MetadataStore.find(Inst);
   if (I == MetadataStore.end())
     return;
@@ -341,87 +305,80 @@ void MetadataContext::removeMD(unsigned Kind, Instruction *Inst) {
       return;
     }
   }
-
-  return;
 }
   
-/// removeMDs - Remove all metadata attached with an instruction.
-void MetadataContext::removeMDs(const Instruction *Inst) {
-  // Find Metadata handles for this instruction.
-  MDStoreTy::iterator I = MetadataStore.find(Inst);
-  assert (I != MetadataStore.end() && "Invalid custom metadata info!");
-  MDMapTy &Info = I->second;
-  
-  // FIXME : Give all metadata handlers a chance to adjust.
-  
-  // Remove the entries for this instruction.
-  Info.clear();
-  MetadataStore.erase(I);
+/// removeAllMetadata - Remove all metadata attached with an instruction.
+void MetadataContextImpl::removeAllMetadata(Instruction *Inst) {
+  MetadataStore.erase(Inst);
+  Inst->HasMetadata = false;
 }
 
 /// copyMD - If metadata is attached with Instruction In1 then attach
 /// the same metadata to In2.
-void MetadataContext::copyMD(Instruction *In1, Instruction *In2) {
-  assert (In1 && In2 && "Invalid instruction!");
-   MDStoreTy::iterator I = MetadataStore.find(In1);
-  if (I == MetadataStore.end())
+void MetadataContextImpl::copyMD(Instruction *In1, Instruction *In2) {
+  assert(In1 && In2 && "Invalid instruction!");
+  MDMapTy &In1Info = MetadataStore[In1];
+  if (In1Info.empty())
     return;
 
-  MDMapTy &In1Info = I->second;
-  MDMapTy In2Info;
   for (MDMapTy::iterator I = In1Info.begin(), E = In1Info.end(); I != E; ++I)
-    if (MDNode *MD = dyn_cast_or_null<MDNode>(I->second))
-      addMD(I->first, MD, In2);
+    addMD(I->first, I->second, In2);
 }
 
-/// getMD - Get the metadata of given kind attached with an Instruction.
+/// getMD - Get the metadata of given kind attached to an Instruction.
 /// If the metadata is not found then return 0.
-MDNode *MetadataContext::getMD(unsigned MDKind, const Instruction *Inst) {
-  MDStoreTy::iterator I = MetadataStore.find(Inst);
-  if (I == MetadataStore.end())
+MDNode *MetadataContextImpl::getMD(unsigned MDKind, const Instruction *Inst) {
+  MDMapTy &Info = MetadataStore[Inst];
+  if (Info.empty())
     return NULL;
-  
-  MDMapTy &Info = I->second;
+
   for (MDMapTy::iterator I = Info.begin(), E = Info.end(); I != E; ++I)
     if (I->first == MDKind)
-      return dyn_cast_or_null<MDNode>(I->second);
+      return I->second;
   return NULL;
 }
 
-/// getMDs - Get the metadata attached with an Instruction.
-const MetadataContext::MDMapTy *MetadataContext::getMDs(const Instruction *Inst) {
+/// getMDs - Get the metadata attached to an Instruction.
+void MetadataContextImpl::
+getMDs(const Instruction *Inst, SmallVectorImpl<MDPairTy> &MDs) const {
   MDStoreTy::iterator I = MetadataStore.find(Inst);
   if (I == MetadataStore.end())
-    return NULL;
-  
-  return &(I->second);
+    return;
+  for (MDMapTy::iterator MI = I->second.begin(), ME = I->second.end();
+       MI != ME; ++MI)
+    MDs.push_back(std::make_pair(MI->first, MI->second));
+  std::sort(MDs.begin(), MDs.end());
 }
 
-/// getHandlerNames - Get handler names. This is used by bitcode
-/// writer.
-const StringMap<unsigned> *MetadataContext::getHandlerNames() {
-  return &MDHandlerNames;
+/// getHandlerNames - Populate client supplied smallvector using custome
+/// metadata name and ID.
+void MetadataContextImpl::
+getHandlerNames(SmallVectorImpl<std::pair<unsigned, StringRef> >&Names) const {
+  for (StringMap<unsigned>::const_iterator I = MDHandlerNames.begin(),
+         E = MDHandlerNames.end(); I != E; ++I) 
+    Names.push_back(std::make_pair(I->second, I->first()));
+  std::sort(Names.begin(), Names.end());
 }
 
 /// ValueIsCloned - This handler is used to update metadata store
 /// when In1 is cloned to create In2.
-void MetadataContext::ValueIsCloned(const Instruction *In1, Instruction *In2) {
+void MetadataContextImpl::ValueIsCloned(const Instruction *In1, 
+                                        Instruction *In2) {
   // Find Metadata handles for In1.
   MDStoreTy::iterator I = MetadataStore.find(In1);
-  assert (I != MetadataStore.end() && "Invalid custom metadata info!");
+  assert(I != MetadataStore.end() && "Invalid custom metadata info!");
 
   // FIXME : Give all metadata handlers a chance to adjust.
 
   MDMapTy &In1Info = I->second;
   MDMapTy In2Info;
   for (MDMapTy::iterator I = In1Info.begin(), E = In1Info.end(); I != E; ++I)
-    if (MDNode *MD = dyn_cast_or_null<MDNode>(I->second))
-      addMD(I->first, MD, In2);
+    addMD(I->first, I->second, In2);
 }
 
 /// ValueIsRAUWd - This handler is used when V1's all uses are replaced by
 /// V2.
-void MetadataContext::ValueIsRAUWd(Value *V1, Value *V2) {
+void MetadataContextImpl::ValueIsRAUWd(Value *V1, Value *V2) {
   Instruction *I1 = dyn_cast<Instruction>(V1);
   Instruction *I2 = dyn_cast<Instruction>(V2);
   if (!I1 || !I2)
@@ -431,3 +388,94 @@ void MetadataContext::ValueIsRAUWd(Value *V1, Value *V2) {
   ValueIsCloned(I1, I2);
 }
 
+//===----------------------------------------------------------------------===//
+// MetadataContext implementation.
+//
+MetadataContext::MetadataContext() 
+  : pImpl(new MetadataContextImpl()) { }
+MetadataContext::~MetadataContext() { delete pImpl; }
+
+/// isValidName - Return true if Name is a valid custom metadata handler name.
+bool MetadataContext::isValidName(StringRef MDName) {
+  if (MDName.empty())
+    return false;
+
+  if (!isalpha(MDName[0]))
+    return false;
+
+  for (StringRef::iterator I = MDName.begin() + 1, E = MDName.end(); I != E;
+       ++I) {
+    if (!isalnum(*I) && *I != '_' && *I != '-' && *I != '.')
+        return false;
+  }
+  return true;
+}
+
+/// registerMDKind - Register a new metadata kind and return its ID.
+/// A metadata kind can be registered only once. 
+unsigned MetadataContext::registerMDKind(StringRef Name) {
+  assert(isValidName(Name) && "Invalid custome metadata name!");
+  return pImpl->registerMDKind(Name);
+}
+
+/// getMDKind - Return metadata kind. If the requested metadata kind
+/// is not registered then return 0.
+unsigned MetadataContext::getMDKind(StringRef Name) const {
+  return pImpl->getMDKind(Name);
+}
+
+/// getMD - Get the metadata of given kind attached to an Instruction.
+/// If the metadata is not found then return 0.
+MDNode *MetadataContext::getMD(unsigned Kind, const Instruction *Inst) {
+  return pImpl->getMD(Kind, Inst);
+}
+
+/// getMDs - Get the metadata attached to an Instruction.
+void MetadataContext::
+getMDs(const Instruction *Inst, 
+       SmallVectorImpl<std::pair<unsigned, TrackingVH<MDNode> > > &MDs) const {
+  return pImpl->getMDs(Inst, MDs);
+}
+
+/// addMD - Attach the metadata of given kind to an Instruction.
+void MetadataContext::addMD(unsigned Kind, MDNode *Node, Instruction *Inst) {
+  pImpl->addMD(Kind, Node, Inst);
+}
+  
+/// removeMD - Remove metadata of given kind attached with an instuction.
+void MetadataContext::removeMD(unsigned Kind, Instruction *Inst) {
+  pImpl->removeMD(Kind, Inst);
+}
+  
+/// removeAllMetadata - Remove all metadata attached with an instruction.
+void MetadataContext::removeAllMetadata(Instruction *Inst) {
+  pImpl->removeAllMetadata(Inst);
+}
+
+/// copyMD - If metadata is attached with Instruction In1 then attach
+/// the same metadata to In2.
+void MetadataContext::copyMD(Instruction *In1, Instruction *In2) {
+  pImpl->copyMD(In1, In2);
+}
+
+/// getHandlerNames - Populate client supplied smallvector using custome
+/// metadata name and ID.
+void MetadataContext::
+getHandlerNames(SmallVectorImpl<std::pair<unsigned, StringRef> >&N) const {
+  pImpl->getHandlerNames(N);
+}
+
+/// ValueIsDeleted - This handler is used to update metadata store
+/// when a value is deleted.
+void MetadataContext::ValueIsDeleted(Instruction *Inst) {
+  pImpl->ValueIsDeleted(Inst);
+}
+void MetadataContext::ValueIsRAUWd(Value *V1, Value *V2) {
+  pImpl->ValueIsRAUWd(V1, V2);
+}
+
+/// ValueIsCloned - This handler is used to update metadata store
+/// when In1 is cloned to create In2.
+void MetadataContext::ValueIsCloned(const Instruction *In1, Instruction *In2) {
+  pImpl->ValueIsCloned(In1, In2);
+}
diff --git a/lib/VMCore/Pass.cpp b/lib/VMCore/Pass.cpp
index a2831d3..a17eed8 100644
--- a/lib/VMCore/Pass.cpp
+++ b/lib/VMCore/Pass.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Module.h"
 #include "llvm/ModuleProvider.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/System/Atomic.h"
diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp
index ba72af6..35ec9be 100644
--- a/lib/VMCore/Value.cpp
+++ b/lib/VMCore/Value.cpp
@@ -27,7 +27,6 @@
 #include "llvm/Support/LeakDetector.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/ValueHandle.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/System/RWMutex.h"
 #include "llvm/System/Threading.h"
 #include "llvm/ADT/DenseMap.h"
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
index 75ea4c3..3bfd47c 100644
--- a/lib/VMCore/Verifier.cpp
+++ b/lib/VMCore/Verifier.cpp
@@ -339,10 +339,10 @@ namespace {
     void WriteValue(const Value *V) {
       if (!V) return;
       if (isa<Instruction>(V)) {
-        MessagesStr << *V;
+        MessagesStr << *V << '\n';
       } else {
         WriteAsOperand(MessagesStr, V, true, Mod);
-        MessagesStr << "\n";
+        MessagesStr << '\n';
       }
     }
author	rdivacky <rdivacky@FreeBSD.org>	2009-10-23 14:19:52 +0000
committer	rdivacky <rdivacky@FreeBSD.org>	2009-10-23 14:19:52 +0000
commit	9643cca39fb9fb3b49a8912926de98acf882283c (patch)
tree	22cc59e4b240d84c3a5a60531119c4eca914a256 /lib
parent	1adacceba9c9ee0f16e54388e56c9a249b296f75 (diff)
download	FreeBSD-src-9643cca39fb9fb3b49a8912926de98acf882283c.zip FreeBSD-src-9643cca39fb9fb3b49a8912926de98acf882283c.tar.gz