Diffstat (limited to 'lib')
59 files changed, 2202 insertions, 912 deletions
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index fd97db8..98ab6f4 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -937,6 +937,48 @@ SCEVHandle ScalarEvolution::getSignExtendExpr(const SCEVHandle &Op, return Result; } +/// getAnyExtendExpr - Return a SCEV for the given operand extended with +/// unspecified bits out to the given type. +/// +SCEVHandle ScalarEvolution::getAnyExtendExpr(const SCEVHandle &Op, + const Type *Ty) { + assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) && + "This is not an extending conversion!"); + assert(isSCEVable(Ty) && + "This is not a conversion to a SCEVable type!"); + Ty = getEffectiveSCEVType(Ty); + + // Sign-extend negative constants. + if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op)) + if (SC->getValue()->getValue().isNegative()) + return getSignExtendExpr(Op, Ty); + + // Peel off a truncate cast. + if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(Op)) { + SCEVHandle NewOp = T->getOperand(); + if (getTypeSizeInBits(NewOp->getType()) < getTypeSizeInBits(Ty)) + return getAnyExtendExpr(NewOp, Ty); + return getTruncateOrNoop(NewOp, Ty); + } + + // Next try a zext cast. If the cast is folded, use it. + SCEVHandle ZExt = getZeroExtendExpr(Op, Ty); + if (!isa<SCEVZeroExtendExpr>(ZExt)) + return ZExt; + + // Next try a sext cast. If the cast is folded, use it. + SCEVHandle SExt = getSignExtendExpr(Op, Ty); + if (!isa<SCEVSignExtendExpr>(SExt)) + return SExt; + + // If the expression is obviously signed, use the sext cast value. + if (isa<SCEVSMaxExpr>(Op)) + return SExt; + + // Absent any other information, use the zext cast value. + return ZExt; +} + /// getAddExpr - Get a canonical add expression, or something simpler if /// possible. SCEVHandle ScalarEvolution::getAddExpr(std::vector<SCEVHandle> &Ops) { @@ -1903,6 +1945,23 @@ ScalarEvolution::getNoopOrSignExtend(const SCEVHandle &V, const Type *Ty) { return getSignExtendExpr(V, Ty); } +/// getNoopOrAnyExtend - Return a SCEV corresponding to a conversion of +/// the input value to the specified type. If the type must be extended, +/// it is extended with unspecified bits. The conversion must not be +/// narrowing. +SCEVHandle +ScalarEvolution::getNoopOrAnyExtend(const SCEVHandle &V, const Type *Ty) { + const Type *SrcTy = V->getType(); + assert((SrcTy->isInteger() || (TD && isa<PointerType>(SrcTy))) && + (Ty->isInteger() || (TD && isa<PointerType>(Ty))) && + "Cannot noop or any extend with non-integer arguments!"); + assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) && + "getNoopOrAnyExtend cannot truncate!"); + if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty)) + return V; // No conversion + return getAnyExtendExpr(V, Ty); +} + /// getTruncateOrNoop - Return a SCEV corresponding to a conversion of the /// input value to the specified type. The conversion must not be widening. 
SCEVHandle diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index ef77e46..e1f8fa4 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Target/TargetData.h" +#include "llvm/ADT/STLExtras.h" using namespace llvm; /// InsertCastOfTo - Insert a cast of V to the specified type, doing what @@ -319,8 +320,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEVHandle *op_begin, if (!AnyNonZeroIndices) { V = InsertNoopCastOfTo(V, Type::Int8Ty->getPointerTo(PTy->getAddressSpace())); - Value *Idx = expand(SE.getAddExpr(Ops)); - Idx = InsertNoopCastOfTo(Idx, Ty); + Value *Idx = expandCodeFor(SE.getAddExpr(Ops), Ty); // Fold a GEP with constant operands. if (Constant *CLHS = dyn_cast<Constant>(V)) @@ -374,8 +374,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) { // Emit a bunch of add instructions for (int i = S->getNumOperands()-2; i >= 0; --i) { - Value *W = expand(S->getOperand(i)); - W = InsertNoopCastOfTo(W, Ty); + Value *W = expandCodeFor(S->getOperand(i), Ty); V = InsertBinop(Instruction::Add, V, W, InsertPt); } return V; @@ -389,13 +388,11 @@ Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) { FirstOp = 1; int i = S->getNumOperands()-2; - Value *V = expand(S->getOperand(i+1)); - V = InsertNoopCastOfTo(V, Ty); + Value *V = expandCodeFor(S->getOperand(i+1), Ty); // Emit a bunch of multiply instructions for (; i >= FirstOp; --i) { - Value *W = expand(S->getOperand(i)); - W = InsertNoopCastOfTo(W, Ty); + Value *W = expandCodeFor(S->getOperand(i), Ty); V = InsertBinop(Instruction::Mul, V, W, InsertPt); } @@ -408,8 +405,7 @@ Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) { Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) { const Type *Ty = SE.getEffectiveSCEVType(S->getType()); - Value *LHS = expand(S->getLHS()); - LHS = InsertNoopCastOfTo(LHS, Ty); + Value *LHS = expandCodeFor(S->getLHS(), Ty); if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) { const APInt &RHS = SC->getValue()->getValue(); if (RHS.isPowerOf2()) @@ -418,8 +414,7 @@ Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) { InsertPt); } - Value *RHS = expand(S->getRHS()); - RHS = InsertNoopCastOfTo(RHS, Ty); + Value *RHS = expandCodeFor(S->getRHS(), Ty); return InsertBinop(Instruction::UDiv, LHS, RHS, InsertPt); } @@ -448,6 +443,34 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { const Type *Ty = SE.getEffectiveSCEVType(S->getType()); const Loop *L = S->getLoop(); + // First check for an existing canonical IV in a suitable type. + PHINode *CanonicalIV = 0; + if (PHINode *PN = L->getCanonicalInductionVariable()) + if (SE.isSCEVable(PN->getType()) && + isa<IntegerType>(SE.getEffectiveSCEVType(PN->getType())) && + SE.getTypeSizeInBits(PN->getType()) >= SE.getTypeSizeInBits(Ty)) + CanonicalIV = PN; + + // Rewrite an AddRec in terms of the canonical induction variable, if + // its type is more narrow. 
+ if (CanonicalIV && + SE.getTypeSizeInBits(CanonicalIV->getType()) > + SE.getTypeSizeInBits(Ty)) { + SCEVHandle Start = SE.getAnyExtendExpr(S->getStart(), + CanonicalIV->getType()); + SCEVHandle Step = SE.getAnyExtendExpr(S->getStepRecurrence(SE), + CanonicalIV->getType()); + Value *V = expand(SE.getAddRecExpr(Start, Step, S->getLoop())); + BasicBlock::iterator SaveInsertPt = getInsertionPoint(); + BasicBlock::iterator NewInsertPt = + next(BasicBlock::iterator(cast<Instruction>(V))); + while (isa<PHINode>(NewInsertPt)) ++NewInsertPt; + V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), 0, + NewInsertPt); + setInsertionPoint(SaveInsertPt); + return V; + } + // {X,+,F} --> X + {0,+,F} if (!S->getStart()->isZero()) { std::vector<SCEVHandle> NewOps(S->getOperands()); @@ -481,6 +504,14 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { // {0,+,1} --> Insert a canonical induction variable into the loop! if (S->isAffine() && S->getOperand(1) == SE.getIntegerSCEV(1, Ty)) { + // If there's a canonical IV, just use it. + if (CanonicalIV) { + assert(Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()) && + "IVs with types different from the canonical IV should " + "already have been handled!"); + return CanonicalIV; + } + // Create and insert the PHI node for the induction variable in the // specified loop. BasicBlock *Header = L->getHeader(); @@ -508,19 +539,16 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { return PN; } + // {0,+,F} --> {0,+,1} * F // Get the canonical induction variable I for this loop. - Value *I = getOrInsertCanonicalInductionVariable(L, Ty); + Value *I = CanonicalIV ? + CanonicalIV : + getOrInsertCanonicalInductionVariable(L, Ty); // If this is a simple linear addrec, emit it now as a special case. if (S->isAffine()) { // {0,+,F} --> i*F - Value *F = expand(S->getOperand(1)); - F = InsertNoopCastOfTo(F, Ty); - - // IF the step is by one, just return the inserted IV. - if (ConstantInt *CI = dyn_cast<ConstantInt>(F)) - if (CI->getValue() == 1) - return I; - + Value *F = expandCodeFor(S->getOperand(1), Ty); + // If the insert point is directly inside of the loop, emit the multiply at // the insert point. Otherwise, L is a loop that is a parent of the insert // point loop. If we can, move the multiply to the outer most loop that it @@ -555,16 +583,24 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { // into this folder. SCEVHandle IH = SE.getUnknown(I); // Get I as a "symbolic" SCEV. - SCEVHandle V = S->evaluateAtIteration(IH, SE); + // Promote S up to the canonical IV type, if the cast is foldable. + SCEVHandle NewS = S; + SCEVHandle Ext = SE.getNoopOrAnyExtend(S, I->getType()); + if (isa<SCEVAddRecExpr>(Ext)) + NewS = Ext; + + SCEVHandle V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE); //cerr << "Evaluated: " << *this << "\n to: " << *V << "\n"; + // Truncate the result down to the original type, if needed. 
+ SCEVHandle T = SE.getTruncateOrNoop(V, Ty); return expand(V); } Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) { const Type *Ty = SE.getEffectiveSCEVType(S->getType()); - Value *V = expand(S->getOperand()); - V = InsertNoopCastOfTo(V, SE.getEffectiveSCEVType(V->getType())); + Value *V = expandCodeFor(S->getOperand(), + SE.getEffectiveSCEVType(S->getOperand()->getType())); Instruction *I = new TruncInst(V, Ty, "tmp.", InsertPt); InsertedValues.insert(I); return I; @@ -572,8 +608,8 @@ Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) { Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) { const Type *Ty = SE.getEffectiveSCEVType(S->getType()); - Value *V = expand(S->getOperand()); - V = InsertNoopCastOfTo(V, SE.getEffectiveSCEVType(V->getType())); + Value *V = expandCodeFor(S->getOperand(), + SE.getEffectiveSCEVType(S->getOperand()->getType())); Instruction *I = new ZExtInst(V, Ty, "tmp.", InsertPt); InsertedValues.insert(I); return I; @@ -581,8 +617,8 @@ Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) { Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) { const Type *Ty = SE.getEffectiveSCEVType(S->getType()); - Value *V = expand(S->getOperand()); - V = InsertNoopCastOfTo(V, SE.getEffectiveSCEVType(V->getType())); + Value *V = expandCodeFor(S->getOperand(), + SE.getEffectiveSCEVType(S->getOperand()->getType())); Instruction *I = new SExtInst(V, Ty, "tmp.", InsertPt); InsertedValues.insert(I); return I; @@ -590,11 +626,9 @@ Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) { Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { const Type *Ty = SE.getEffectiveSCEVType(S->getType()); - Value *LHS = expand(S->getOperand(0)); - LHS = InsertNoopCastOfTo(LHS, Ty); + Value *LHS = expandCodeFor(S->getOperand(0), Ty); for (unsigned i = 1; i < S->getNumOperands(); ++i) { - Value *RHS = expand(S->getOperand(i)); - RHS = InsertNoopCastOfTo(RHS, Ty); + Value *RHS = expandCodeFor(S->getOperand(i), Ty); Instruction *ICmp = new ICmpInst(ICmpInst::ICMP_SGT, LHS, RHS, "tmp", InsertPt); InsertedValues.insert(ICmp); @@ -607,11 +641,9 @@ Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) { Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) { const Type *Ty = SE.getEffectiveSCEVType(S->getType()); - Value *LHS = expand(S->getOperand(0)); - LHS = InsertNoopCastOfTo(LHS, Ty); + Value *LHS = expandCodeFor(S->getOperand(0), Ty); for (unsigned i = 1; i < S->getNumOperands(); ++i) { - Value *RHS = expand(S->getOperand(i)); - RHS = InsertNoopCastOfTo(RHS, Ty); + Value *RHS = expandCodeFor(S->getOperand(i), Ty); Instruction *ICmp = new ICmpInst(ICmpInst::ICMP_UGT, LHS, RHS, "tmp", InsertPt); InsertedValues.insert(ICmp); diff --git a/lib/Archive/ArchiveWriter.cpp b/lib/Archive/ArchiveWriter.cpp index 336a2bd..cebb087 100644 --- a/lib/Archive/ArchiveWriter.cpp +++ b/lib/Archive/ArchiveWriter.cpp @@ -167,10 +167,11 @@ Archive::addFileBefore(const sys::Path& filePath, iterator where, mbr->data = 0; mbr->path = filePath; const sys::FileStatus *FSInfo = mbr->path.getFileStatus(false, ErrMsg); - if (FSInfo) - mbr->info = *FSInfo; - else + if (!FSInfo) { + delete mbr; return true; + } + mbr->info = *FSInfo; unsigned flags = 0; bool hasSlash = filePath.toString().find('/') != std::string::npos; diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 9f16728..6dcdded 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp 
@@ -1308,16 +1308,6 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream) { // Emit constants. WriteModuleConstants(VE, Stream); - // If we have any aggregate values in the value table, purge them - these can - // only be used to initialize global variables. Doing so makes the value - // namespace smaller for code in functions. - int NumNonAggregates = VE.PurgeAggregateValues(); - if (NumNonAggregates != -1) { - SmallVector<unsigned, 1> Vals; - Vals.push_back(NumNonAggregates); - Stream.EmitRecord(bitc::MODULE_CODE_PURGEVALS, Vals); - } - // Emit function bodies. for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) if (!I->isDeclaration()) diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index 8002a36..32b2819 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -277,22 +277,6 @@ void ValueEnumerator::EnumerateAttributes(const AttrListPtr &PAL) { } -/// PurgeAggregateValues - If there are any aggregate values at the end of the -/// value list, remove them and return the count of the remaining values. If -/// there are none, return -1. -int ValueEnumerator::PurgeAggregateValues() { - // If there are no aggregate values at the end of the list, return -1. - if (Values.empty() || Values.back().first->getType()->isSingleValueType()) - return -1; - - // Otherwise, remove aggregate values... - while (!Values.empty() && !Values.back().first->getType()->isSingleValueType()) - Values.pop_back(); - - // ... and return the new size. - return Values.size(); -} - void ValueEnumerator::incorporateFunction(const Function &F) { NumModuleValues = Values.size(); diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h index bb0324b..40eeabb 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.h +++ b/lib/Bitcode/Writer/ValueEnumerator.h @@ -99,11 +99,6 @@ public: return Attributes; } - /// PurgeAggregateValues - If there are any aggregate values at the end of the - /// value list, remove them and return the count of the remaining values. If - /// there are none, return -1. - int PurgeAggregateValues(); - /// incorporateFunction/purgeFunction - If you'd like to deal with a function, /// use these two methods to get its data into the ValueEnumerator! /// diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 5a66f4b..c773378 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1581,6 +1581,7 @@ void DwarfDebug::EndFunction(MachineFunction *MF) { FunctionDbgScope = NULL; LexicalScopeStack.clear(); AbstractInstanceRootList.clear(); + AbstractInstanceRootMap.clear(); } Lines.clear(); @@ -1669,7 +1670,11 @@ unsigned DwarfDebug::RecordRegionEnd(GlobalVariable *V) { DbgScope *Scope = getOrCreateScope(V); unsigned ID = MMI->NextLabelID(); Scope->setEndLabelID(ID); - if (LexicalScopeStack.size() != 0) + // FIXME : region.end() may not be in the last basic block. + // For now, do not pop last lexical scope because next basic + // block may start new inlined function's body. 
+ unsigned LSSize = LexicalScopeStack.size(); + if (LSSize != 0 && LSSize != 1) LexicalScopeStack.pop_back(); if (TimePassesIsEnabled) diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index ff917a7..5ba8b3c 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -12,6 +12,7 @@ add_llvm_library(LLVMCodeGen IntrinsicLowering.cpp LLVMTargetMachine.cpp LatencyPriorityQueue.cpp + LazyLiveness.cpp LiveInterval.cpp LiveIntervalAnalysis.cpp LiveStackAnalysis.cpp diff --git a/lib/CodeGen/ELF.h b/lib/CodeGen/ELF.h index c22f6ed..796bc2c 100644 --- a/lib/CodeGen/ELF.h +++ b/lib/CodeGen/ELF.h @@ -10,23 +10,24 @@ // This header contains common, non-processor-specific data structures and // constants for the ELF file format. // -// The details of the ELF32 bits in this file are largely based on -// the Tool Interface Standard (TIS) Executable and Linking Format -// (ELF) Specification Version 1.2, May 1995. The ELF64 stuff is not -// standardized, as far as I can tell. It was largely based on information -// I found in OpenBSD header files. +// The details of the ELF32 bits in this file are largely based on the Tool +// Interface Standard (TIS) Executable and Linking Format (ELF) Specification +// Version 1.2, May 1995. The ELF64 is based on HP/Intel definition of the +// ELF-64 object file format document, Version 1.5 Draft 2 May 27, 1998 // //===----------------------------------------------------------------------===// #ifndef CODEGEN_ELF_H #define CODEGEN_ELF_H +#include "llvm/GlobalVariable.h" +#include "llvm/CodeGen/BinaryObject.h" #include "llvm/CodeGen/MachineRelocation.h" #include "llvm/Support/DataTypes.h" #include <cstring> namespace llvm { - class GlobalVariable; + class BinaryObject; // Identification Indexes enum { @@ -47,71 +48,28 @@ namespace llvm { ET_HIPROC = 0xffff // Processor-specific }; - // Object file classes. - enum { - ELFCLASS32 = 1, // 32-bit object file - ELFCLASS64 = 2 // 64-bit object file - }; - - // Object file byte orderings. - enum { - ELFDATA2LSB = 1, // Little-endian object file - ELFDATA2MSB = 2 // Big-endian object file - }; - // Versioning enum { EV_NONE = 0, EV_CURRENT = 1 }; - struct ELFHeader { - // e_machine - This field is the target specific value to emit as the - // e_machine member of the ELF header. - unsigned short e_machine; - - // e_flags - The machine flags for the target. This defaults to zero. - unsigned e_flags; - - // e_size - Holds the ELF header's size in bytes - unsigned e_ehsize; - - // Endianess and ELF Class (64 or 32 bits) - unsigned ByteOrder; - unsigned ElfClass; - - unsigned getByteOrder() const { return ByteOrder; } - unsigned getElfClass() const { return ElfClass; } - unsigned getSize() const { return e_ehsize; } - unsigned getMachine() const { return e_machine; } - unsigned getFlags() const { return e_flags; } - - ELFHeader(unsigned short machine, unsigned flags, - bool is64Bit, bool isLittleEndian) - : e_machine(machine), e_flags(flags) { - ElfClass = is64Bit ? ELFCLASS64 : ELFCLASS32; - ByteOrder = isLittleEndian ? ELFDATA2LSB : ELFDATA2MSB; - e_ehsize = is64Bit ? 64 : 52; - } - }; - /// ELFSection - This struct contains information about each section that is /// emitted to the file. This is eventually turned into the section header /// table at the end of the file. - struct ELFSection { - + class ELFSection : public BinaryObject { + public: // ELF specific fields - std::string Name; // Name of the section. - unsigned NameIdx; // Index in .shstrtab of name, once emitted. 
- unsigned Type; - unsigned Flags; - uint64_t Addr; - unsigned Offset; - unsigned Size; - unsigned Link; - unsigned Info; - unsigned Align; - unsigned EntSize; + unsigned NameIdx; // sh_name - .shstrtab idx of name, once emitted. + unsigned Type; // sh_type - Section contents & semantics + unsigned Flags; // sh_flags - Section flags. + uint64_t Addr; // sh_addr - The mem addr this section is in. + unsigned Offset; // sh_offset - Offset from the file start + unsigned Size; // sh_size - The section size. + unsigned Link; // sh_link - Section header table index link. + unsigned Info; // sh_info - Auxillary information. + unsigned Align; // sh_addralign - Alignment of section. + unsigned EntSize; // sh_entsize - Size of entries in the section e // Section Header Flags enum { @@ -141,8 +99,8 @@ namespace llvm { SHT_REL = 9, // Relocation entries; no explicit addends. SHT_SHLIB = 10, // Reserved. SHT_DYNSYM = 11, // Symbol table. - SHT_LOPROC = 0x70000000, // Lowest processor architecture-specific type. - SHT_HIPROC = 0x7fffffff, // Highest processor architecture-specific type. + SHT_LOPROC = 0x70000000, // Lowest processor arch-specific type. + SHT_HIPROC = 0x7fffffff, // Highest processor arch-specific type. SHT_LOUSER = 0x80000000, // Lowest type reserved for applications. SHT_HIUSER = 0xffffffff // Highest type reserved for applications. }; @@ -161,22 +119,9 @@ namespace llvm { /// SectionIdx - The number of the section in the Section Table. unsigned short SectionIdx; - /// SectionData - The actual data for this section which we are building - /// up for emission to the file. - std::vector<unsigned char> SectionData; - - /// Relocations - The relocations that we have encountered so far in this - /// section that we will need to convert to Elf relocation entries when - /// the file is written. - std::vector<MachineRelocation> Relocations; - - /// Section Header Size - static unsigned getSectionHdrSize(bool is64Bit) - { return is64Bit ? 
64 : 40; } - - ELFSection(const std::string &name) - : Name(name), Type(0), Flags(0), Addr(0), Offset(0), Size(0), - Link(0), Info(0), Align(0), EntSize(0) {} + ELFSection(const std::string &name, bool isLittleEndian, bool is64Bit) + : BinaryObject(name, isLittleEndian, is64Bit), Type(0), Flags(0), Addr(0), + Offset(0), Size(0), Link(0), Info(0), Align(0), EntSize(0) {} }; /// ELFSym - This struct contains information about each symbol that is @@ -207,9 +152,33 @@ namespace llvm { STT_FILE = 4 }; + enum { + STV_DEFAULT = 0, // Visibility is specified by binding type + STV_INTERNAL = 1, // Defined by processor supplements + STV_HIDDEN = 2, // Not visible to other components + STV_PROTECTED = 3 // Visible in other components but not preemptable + }; + ELFSym(const GlobalValue *gv) : GV(gv), NameIdx(0), Value(0), Size(0), Info(0), Other(0), - SectionIdx(ELFSection::SHN_UNDEF) {} + SectionIdx(ELFSection::SHN_UNDEF) { + if (!GV) + return; + + switch (GV->getVisibility()) { + default: + assert(0 && "unknown visibility type"); + case GlobalValue::DefaultVisibility: + Other = STV_DEFAULT; + break; + case GlobalValue::HiddenVisibility: + Other = STV_HIDDEN; + break; + case GlobalValue::ProtectedVisibility: + Other = STV_PROTECTED; + break; + } + } void SetBind(unsigned X) { assert(X == (X & 0xF) && "Bind value out of range!"); diff --git a/lib/CodeGen/ELFCodeEmitter.cpp b/lib/CodeGen/ELFCodeEmitter.cpp index c7bd873..ca68396 100644 --- a/lib/CodeGen/ELFCodeEmitter.cpp +++ b/lib/CodeGen/ELFCodeEmitter.cpp @@ -13,9 +13,9 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/CodeGen/BinaryObject.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/Debug.h" @@ -28,27 +28,22 @@ namespace llvm { /// startFunction - This callback is invoked when a new machine function is /// about to be emitted. void ELFCodeEmitter::startFunction(MachineFunction &MF) { - const TargetData *TD = TM.getTargetData(); - const Function *F = MF.getFunction(); - - // Align the output buffer to the appropriate alignment, power of 2. - unsigned FnAlign = F->getAlignment(); - unsigned TDAlign = TD->getPrefTypeAlignment(F->getType()); - unsigned Align = std::max(FnAlign, TDAlign); - assert(!(Align & (Align-1)) && "Alignment is not a power of two!"); - // Get the ELF Section that this function belongs in. ES = &EW.getTextSection(); - // FIXME: better memory management, this will be replaced by BinaryObjects - ES->SectionData.reserve(4096); - BufferBegin = &ES->SectionData[0]; - BufferEnd = BufferBegin + ES->SectionData.capacity(); + DOUT << "processing function: " << MF.getFunction()->getName() << "\n"; - // Upgrade the section alignment if required. + // FIXME: better memory management, this will be replaced by BinaryObjects + BinaryData &BD = ES->getData(); + BD.reserve(4096); + BufferBegin = &BD[0]; + BufferEnd = BufferBegin + BD.capacity(); + + // Align the output buffer with function alignment, and + // upgrade the section alignment if required + unsigned Align = + TM.getELFWriterInfo()->getFunctionAlignment(MF.getFunction()); if (ES->Align < Align) ES->Align = Align; - - // Round the size up to the correct alignment for starting the new function. 
ES->Size = (ES->Size + (Align-1)) & (-Align); // Snaity check on allocated space for text section @@ -107,7 +102,7 @@ bool ELFCodeEmitter::finishFunction(MachineFunction &MF) { FnSym.Value = FnStartPtr-BufferBegin; // Finally, add it to the symtab. - EW.SymbolTable.push_back(FnSym); + EW.SymbolList.push_back(FnSym); // Relocations // ----------- @@ -128,7 +123,7 @@ bool ELFCodeEmitter::finishFunction(MachineFunction &MF) { } else { assert(0 && "Unhandled relocation type"); } - ES->Relocations.push_back(MR); + ES->addRelocation(MR); } Relocations.clear(); diff --git a/lib/CodeGen/ELFWriter.cpp b/lib/CodeGen/ELFWriter.cpp index 3859ea3..aeccefb 100644 --- a/lib/CodeGen/ELFWriter.cpp +++ b/lib/CodeGen/ELFWriter.cpp @@ -26,9 +26,6 @@ // ... // #N. ".shstrtab" entry - String table for the section names. // -// NOTE: This code should eventually be extended to support 64-bit ELF (this -// won't be hard), but we haven't done so yet! -// //===----------------------------------------------------------------------===// #define DEBUG_TYPE "elfwriter" @@ -36,18 +33,18 @@ #include "ELFWriter.h" #include "ELFCodeEmitter.h" #include "ELF.h" +#include "llvm/Constants.h" #include "llvm/Module.h" #include "llvm/PassManager.h" #include "llvm/DerivedTypes.h" +#include "llvm/CodeGen/BinaryObject.h" #include "llvm/CodeGen/FileWriters.h" #include "llvm/CodeGen/MachineCodeEmitter.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetELFWriterInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/Mangler.h" -#include "llvm/Support/OutputBuffer.h" #include "llvm/Support/Streams.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Debug.h" @@ -70,21 +67,23 @@ MachineCodeEmitter *llvm::AddELFWriter(PassManagerBase &PM, //===----------------------------------------------------------------------===// ELFWriter::ELFWriter(raw_ostream &o, TargetMachine &tm) - : MachineFunctionPass(&ID), O(o), TM(tm), ElfHdr() { - is64Bit = TM.getTargetData()->getPointerSizeInBits() == 64; - isLittleEndian = TM.getTargetData()->isLittleEndian(); + : MachineFunctionPass(&ID), O(o), TM(tm), + is64Bit(TM.getTargetData()->getPointerSizeInBits() == 64), + isLittleEndian(TM.getTargetData()->isLittleEndian()), + ElfHdr(isLittleEndian, is64Bit) { - ElfHdr = new ELFHeader(TM.getELFWriterInfo()->getEMachine(), 0, - is64Bit, isLittleEndian); + TAI = TM.getTargetAsmInfo(); + TEW = TM.getELFWriterInfo(); // Create the machine code emitter object for this target. MCE = new ELFCodeEmitter(*this); + + // Inital number of sections NumSections = 0; } ELFWriter::~ELFWriter() { delete MCE; - delete ElfHdr; } // doInitialization - Emit the file header and all of the global variables for @@ -92,10 +91,6 @@ ELFWriter::~ELFWriter() { bool ELFWriter::doInitialization(Module &M) { Mang = new Mangler(M); - // Local alias to shortenify coming code. 
- std::vector<unsigned char> &FH = FileHeader; - OutputBuffer FHOut(FH, is64Bit, isLittleEndian); - // ELF Header // ---------- // Fields e_shnum e_shstrndx are only known after all section have @@ -104,54 +99,58 @@ bool ELFWriter::doInitialization(Module &M) { // // Note // ---- - // FHOut.outaddr method behaves differently for ELF32 and ELF64 writing + // emitWord method behaves differently for ELF32 and ELF64, writing // 4 bytes in the former and 8 in the last for *_off and *_addr elf types - FHOut.outbyte(0x7f); // e_ident[EI_MAG0] - FHOut.outbyte('E'); // e_ident[EI_MAG1] - FHOut.outbyte('L'); // e_ident[EI_MAG2] - FHOut.outbyte('F'); // e_ident[EI_MAG3] - - FHOut.outbyte(ElfHdr->getElfClass()); // e_ident[EI_CLASS] - FHOut.outbyte(ElfHdr->getByteOrder()); // e_ident[EI_DATA] - FHOut.outbyte(EV_CURRENT); // e_ident[EI_VERSION] - - FH.resize(16); // e_ident[EI_NIDENT-EI_PAD] - - FHOut.outhalf(ET_REL); // e_type - FHOut.outhalf(ElfHdr->getMachine()); // e_machine = target - FHOut.outword(EV_CURRENT); // e_version - FHOut.outaddr(0); // e_entry = 0, no entry point in .o file - FHOut.outaddr(0); // e_phoff = 0, no program header for .o - ELFHdr_e_shoff_Offset = FH.size(); - FHOut.outaddr(0); // e_shoff = sec hdr table off in bytes - FHOut.outword(ElfHdr->getFlags()); // e_flags = whatever the target wants - FHOut.outhalf(ElfHdr->getSize()); // e_ehsize = ELF header size - FHOut.outhalf(0); // e_phentsize = prog header entry size - FHOut.outhalf(0); // e_phnum = # prog header entries = 0 + ElfHdr.emitByte(0x7f); // e_ident[EI_MAG0] + ElfHdr.emitByte('E'); // e_ident[EI_MAG1] + ElfHdr.emitByte('L'); // e_ident[EI_MAG2] + ElfHdr.emitByte('F'); // e_ident[EI_MAG3] + + ElfHdr.emitByte(TEW->getEIClass()); // e_ident[EI_CLASS] + ElfHdr.emitByte(TEW->getEIData()); // e_ident[EI_DATA] + ElfHdr.emitByte(EV_CURRENT); // e_ident[EI_VERSION] + ElfHdr.emitAlignment(16); // e_ident[EI_NIDENT-EI_PAD] + + ElfHdr.emitWord16(ET_REL); // e_type + ElfHdr.emitWord16(TEW->getEMachine()); // e_machine = target + ElfHdr.emitWord32(EV_CURRENT); // e_version + ElfHdr.emitWord(0); // e_entry, no entry point in .o file + ElfHdr.emitWord(0); // e_phoff, no program header for .o + ELFHdr_e_shoff_Offset = ElfHdr.size(); + ElfHdr.emitWord(0); // e_shoff = sec hdr table off in bytes + ElfHdr.emitWord32(TEW->getEFlags()); // e_flags = whatever the target wants + ElfHdr.emitWord16(TEW->getHdrSize()); // e_ehsize = ELF header size + ElfHdr.emitWord16(0); // e_phentsize = prog header entry size + ElfHdr.emitWord16(0); // e_phnum = # prog header entries = 0 // e_shentsize = Section header entry size - FHOut.outhalf(ELFSection::getSectionHdrSize(is64Bit)); + ElfHdr.emitWord16(TEW->getSHdrSize()); // e_shnum = # of section header ents - ELFHdr_e_shnum_Offset = FH.size(); - FHOut.outhalf(0); + ELFHdr_e_shnum_Offset = ElfHdr.size(); + ElfHdr.emitWord16(0); // Placeholder // e_shstrndx = Section # of '.shstrtab' - ELFHdr_e_shstrndx_Offset = FH.size(); - FHOut.outhalf(0); + ELFHdr_e_shstrndx_Offset = ElfHdr.size(); + ElfHdr.emitWord16(0); // Placeholder // Add the null section, which is required to be first in the file. getSection("", ELFSection::SHT_NULL, 0); - // Start up the symbol table. The first entry in the symtab is the null + // Start up the symbol table. The first entry in the symtab is the null // entry. - SymbolTable.push_back(ELFSym(0)); + SymbolList.push_back(ELFSym(0)); return false; } void ELFWriter::EmitGlobal(GlobalVariable *GV) { + + // XXX: put local symbols *before* global ones! 
+ const Section *S = TAI->SectionForGlobal(GV); + DOUT << "Section " << S->getName() << " for global " << GV->getName() << "\n"; + // If this is an external global, emit it now. TODO: Note that it would be // better to ignore the symbol here and only add it to the symbol table if // referenced. @@ -160,17 +159,17 @@ void ELFWriter::EmitGlobal(GlobalVariable *GV) { ExternalSym.SetBind(ELFSym::STB_GLOBAL); ExternalSym.SetType(ELFSym::STT_NOTYPE); ExternalSym.SectionIdx = ELFSection::SHN_UNDEF; - SymbolTable.push_back(ExternalSym); + SymbolList.push_back(ExternalSym); return; } - unsigned Align = TM.getTargetData()->getPreferredAlignment(GV); - unsigned Size = - TM.getTargetData()->getTypeAllocSize(GV->getType()->getElementType()); + const TargetData *TD = TM.getTargetData(); + unsigned Align = TD->getPreferredAlignment(GV); + Constant *CV = GV->getInitializer(); + unsigned Size = TD->getTypeAllocSize(CV->getType()); - // If this global has a zero initializer, it is part of the .bss or common - // section. - if (GV->getInitializer()->isNullValue()) { + // If this global has a zero initializer, go to .bss or common section. + if (CV->isNullValue() || isa<UndefValue>(CV)) { // If this global is part of the common block, add it now. Variables are // part of the common block if they are zero initialized and allowed to be // merged with other symbols. @@ -182,14 +181,14 @@ void ELFWriter::EmitGlobal(GlobalVariable *GV) { CommonSym.Size = Size; CommonSym.SetBind(ELFSym::STB_GLOBAL); CommonSym.SetType(ELFSym::STT_OBJECT); - // TODO SOMEDAY: add ELF visibility. CommonSym.SectionIdx = ELFSection::SHN_COMMON; - SymbolTable.push_back(CommonSym); + SymbolList.push_back(CommonSym); + getSection(S->getName(), ELFSection::SHT_NOBITS, + ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC, 1); return; } // Otherwise, this symbol is part of the .bss section. Emit it now. - // Handle alignment. Ensure section is aligned at least as much as required // by this symbol. ELFSection &BSSSection = getBSSSection(); @@ -220,18 +219,128 @@ void ELFWriter::EmitGlobal(GlobalVariable *GV) { // Set the idx of the .bss section BSSSym.SectionIdx = BSSSection.SectionIdx; if (!GV->hasPrivateLinkage()) - SymbolTable.push_back(BSSSym); + SymbolList.push_back(BSSSym); // Reserve space in the .bss section for this symbol. 
BSSSection.Size += Size; return; } - // FIXME: handle .rodata - //assert(!GV->isConstant() && "unimp"); + /// Emit the Global symbol to the right ELF section + ELFSym GblSym(GV); + GblSym.Size = Size; + GblSym.SetType(ELFSym::STT_OBJECT); + GblSym.SetBind(ELFSym::STB_GLOBAL); + unsigned Flags = S->getFlags(); + unsigned SectType = ELFSection::SHT_PROGBITS; + unsigned SHdrFlags = ELFSection::SHF_ALLOC; + + if (Flags & SectionFlags::Code) + SHdrFlags |= ELFSection::SHF_EXECINSTR; + if (Flags & SectionFlags::Writeable) + SHdrFlags |= ELFSection::SHF_WRITE; + if (Flags & SectionFlags::Mergeable) + SHdrFlags |= ELFSection::SHF_MERGE; + if (Flags & SectionFlags::TLS) + SHdrFlags |= ELFSection::SHF_TLS; + if (Flags & SectionFlags::Strings) + SHdrFlags |= ELFSection::SHF_STRINGS; + + // Remove tab from section name prefix + std::string SectionName(S->getName()); + size_t Pos = SectionName.find("\t"); + if (Pos != std::string::npos) + SectionName.erase(Pos, 1); + + // The section alignment should be bound to the element with + // the largest alignment + ELFSection &ElfS = getSection(SectionName, SectType, SHdrFlags); + GblSym.SectionIdx = ElfS.SectionIdx; + if (Align > ElfS.Align) + ElfS.Align = Align; + + // S.Value should contain the symbol index inside the section, + // and all symbols should start on their required alignment boundary + GblSym.Value = (ElfS.size() + (Align-1)) & (-Align); + ElfS.emitAlignment(Align); + + // Emit the constant symbol to its section + EmitGlobalConstant(CV, ElfS); + SymbolList.push_back(GblSym); +} - // FIXME: handle .data - //assert(0 && "unimp"); +void ELFWriter::EmitGlobalConstantStruct(const ConstantStruct *CVS, + ELFSection &GblS) { + + // Print the fields in successive locations. Pad to align if needed! + const TargetData *TD = TM.getTargetData(); + unsigned Size = TD->getTypeAllocSize(CVS->getType()); + const StructLayout *cvsLayout = TD->getStructLayout(CVS->getType()); + uint64_t sizeSoFar = 0; + for (unsigned i = 0, e = CVS->getNumOperands(); i != e; ++i) { + const Constant* field = CVS->getOperand(i); + + // Check if padding is needed and insert one or more 0s. + uint64_t fieldSize = TD->getTypeAllocSize(field->getType()); + uint64_t padSize = ((i == e-1 ? Size : cvsLayout->getElementOffset(i+1)) + - cvsLayout->getElementOffset(i)) - fieldSize; + sizeSoFar += fieldSize + padSize; + + // Now print the actual field value. + EmitGlobalConstant(field, GblS); + + // Insert padding - this may include padding to increase the size of the + // current field up to the ABI size (if the struct is not packed) as well + // as padding to ensure that the next field starts at the right offset. + for (unsigned p=0; p < padSize; p++) + GblS.emitByte(0); + } + assert(sizeSoFar == cvsLayout->getSizeInBytes() && + "Layout of constant struct may be incorrect!"); +} + +void ELFWriter::EmitGlobalConstant(const Constant *CV, ELFSection &GblS) { + const TargetData *TD = TM.getTargetData(); + unsigned Size = TD->getTypeAllocSize(CV->getType()); + + if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV)) { + if (CVA->isString()) { + std::string GblStr = CVA->getAsString(); + GblS.emitString(GblStr); + } else { // Not a string. 
Print the values in successive locations + for (unsigned i = 0, e = CVA->getNumOperands(); i != e; ++i) + EmitGlobalConstant(CVA->getOperand(i), GblS); + } + return; + } else if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV)) { + EmitGlobalConstantStruct(CVS, GblS); + return; + } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) { + uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); + if (CFP->getType() == Type::DoubleTy) + GblS.emitWord64(Val); + else if (CFP->getType() == Type::FloatTy) + GblS.emitWord32(Val); + else if (CFP->getType() == Type::X86_FP80Ty) { + assert(0 && "X86_FP80Ty global emission not implemented"); + } else if (CFP->getType() == Type::PPC_FP128Ty) + assert(0 && "PPC_FP128Ty global emission not implemented"); + return; + } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) { + if (Size == 4) + GblS.emitWord32(CI->getZExtValue()); + else if (Size == 8) + GblS.emitWord64(CI->getZExtValue()); + else + assert(0 && "LargeInt global emission not implemented"); + return; + } else if (const ConstantVector *CP = dyn_cast<ConstantVector>(CV)) { + const VectorType *PTy = CP->getType(); + for (unsigned I = 0, E = PTy->getNumElements(); I < E; ++I) + EmitGlobalConstant(CP->getOperand(I), GblS); + return; + } + assert(0 && "unknown global constant"); } @@ -243,22 +352,41 @@ bool ELFWriter::runOnMachineFunction(MachineFunction &MF) { /// doFinalization - Now that the module has been completely processed, emit /// the ELF file to 'O'. bool ELFWriter::doFinalization(Module &M) { - // Okay, the ELF header and .text sections have been completed, build the - // .data, .bss, and "common" sections next. + /// FIXME: This should be removed when moving to ObjectCodeEmiter. Since the + /// current ELFCodeEmiter uses CurrBuff, ... it doesn't update S.Data + /// vector size for .text sections, so this is a quick dirty fix + ELFSection &TS = getTextSection(); + if (TS.Size) { + BinaryData &BD = TS.getData(); + for (unsigned e=0; e<TS.Size; ++e) + BD.push_back(BD[e]); + } + + // Emit .data section placeholder + getDataSection(); + + // Emit .bss section placeholder + getBSSSection(); + + // Build and emit data, bss and "common" sections. for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) EmitGlobal(I); + // Emit non-executable stack note + if (TAI->getNonexecutableStackDirective()) + getNonExecStackSection(); + // Emit the symbol table now, if non-empty. EmitSymbolTable(); // Emit the relocation sections. EmitRelocations(); - // Emit the string table for the sections in the ELF file. + // Emit the sections string table. EmitSectionTableStringTable(); - // Emit the sections to the .o file, and emit the section table for the file. + // Dump the sections and section table to the .o file. OutputSectionsAndSectionTable(); // We are done with the abstract symbols. 
@@ -274,78 +402,97 @@ bool ELFWriter::doFinalization(Module &M) { void ELFWriter::EmitRelocations() { } +/// EmitSymbol - Write symbol 'Sym' to the symbol table 'SymbolTable' +void ELFWriter::EmitSymbol(BinaryObject &SymbolTable, ELFSym &Sym) { + if (is64Bit) { + SymbolTable.emitWord32(Sym.NameIdx); + SymbolTable.emitByte(Sym.Info); + SymbolTable.emitByte(Sym.Other); + SymbolTable.emitWord16(Sym.SectionIdx); + SymbolTable.emitWord64(Sym.Value); + SymbolTable.emitWord64(Sym.Size); + } else { + SymbolTable.emitWord32(Sym.NameIdx); + SymbolTable.emitWord32(Sym.Value); + SymbolTable.emitWord32(Sym.Size); + SymbolTable.emitByte(Sym.Info); + SymbolTable.emitByte(Sym.Other); + SymbolTable.emitWord16(Sym.SectionIdx); + } +} + +/// EmitSectionHeader - Write section 'Section' header in 'SHdrTab' +/// Section Header Table +void ELFWriter::EmitSectionHeader(BinaryObject &SHdrTab, + const ELFSection &SHdr) { + SHdrTab.emitWord32(SHdr.NameIdx); + SHdrTab.emitWord32(SHdr.Type); + if (is64Bit) { + SHdrTab.emitWord64(SHdr.Flags); + SHdrTab.emitWord(SHdr.Addr); + SHdrTab.emitWord(SHdr.Offset); + SHdrTab.emitWord64(SHdr.Size); + SHdrTab.emitWord32(SHdr.Link); + SHdrTab.emitWord32(SHdr.Info); + SHdrTab.emitWord64(SHdr.Align); + SHdrTab.emitWord64(SHdr.EntSize); + } else { + SHdrTab.emitWord32(SHdr.Flags); + SHdrTab.emitWord(SHdr.Addr); + SHdrTab.emitWord(SHdr.Offset); + SHdrTab.emitWord32(SHdr.Size); + SHdrTab.emitWord32(SHdr.Link); + SHdrTab.emitWord32(SHdr.Info); + SHdrTab.emitWord32(SHdr.Align); + SHdrTab.emitWord32(SHdr.EntSize); + } +} + /// EmitSymbolTable - If the current symbol table is non-empty, emit the string /// table for it and then the symbol table itself. void ELFWriter::EmitSymbolTable() { - if (SymbolTable.size() == 1) return; // Only the null entry. + if (SymbolList.size() == 1) return; // Only the null entry. // FIXME: compact all local symbols to the start of the symtab. unsigned FirstNonLocalSymbol = 1; - ELFSection &StrTab = getSection(".strtab", ELFSection::SHT_STRTAB, 0); - StrTab.Align = 1; - - DataBuffer &StrTabBuf = StrTab.SectionData; - OutputBuffer StrTabOut(StrTabBuf, is64Bit, isLittleEndian); + ELFSection &StrTab = getStringTableSection(); // Set the zero'th symbol to a null byte, as required. - StrTabOut.outbyte(0); + StrTab.emitByte(0); + unsigned Index = 1; - for (unsigned i = 1, e = SymbolTable.size(); i != e; ++i) { + for (unsigned i = 1, e = SymbolList.size(); i != e; ++i) { // Use the name mangler to uniquify the LLVM symbol. - std::string Name = Mang->getValueName(SymbolTable[i].GV); + std::string Name = Mang->getValueName(SymbolList[i].GV); if (Name.empty()) { - SymbolTable[i].NameIdx = 0; + SymbolList[i].NameIdx = 0; } else { - SymbolTable[i].NameIdx = Index; - - // Add the name to the output buffer, including the null terminator. - StrTabBuf.insert(StrTabBuf.end(), Name.begin(), Name.end()); - - // Add a null terminator. - StrTabBuf.push_back(0); + SymbolList[i].NameIdx = Index; + StrTab.emitString(Name); // Keep track of the number of bytes emitted to this section. Index += Name.size()+1; } } - assert(Index == StrTabBuf.size()); + assert(Index == StrTab.size()); StrTab.Size = Index; // Now that we have emitted the string table and know the offset into the // string table of each symbol, emit the symbol table itself. - ELFSection &SymTab = getSection(".symtab", ELFSection::SHT_SYMTAB, 0); - SymTab.Align = is64Bit ? 8 : 4; - SymTab.Link = StrTab.SectionIdx; // Section Index of .strtab. - SymTab.Info = FirstNonLocalSymbol; // First non-STB_LOCAL symbol. 
- SymTab.EntSize = is64Bit ? 24 : 16; // Size of each symtab entry. - DataBuffer &SymTabBuf = SymTab.SectionData; - OutputBuffer SymTabOut(SymTabBuf, is64Bit, isLittleEndian); - - if (!is64Bit) { // 32-bit and 64-bit formats are shuffled a bit. - for (unsigned i = 0, e = SymbolTable.size(); i != e; ++i) { - ELFSym &Sym = SymbolTable[i]; - SymTabOut.outword(Sym.NameIdx); - SymTabOut.outaddr32(Sym.Value); - SymTabOut.outword(Sym.Size); - SymTabOut.outbyte(Sym.Info); - SymTabOut.outbyte(Sym.Other); - SymTabOut.outhalf(Sym.SectionIdx); - } - } else { - for (unsigned i = 0, e = SymbolTable.size(); i != e; ++i) { - ELFSym &Sym = SymbolTable[i]; - SymTabOut.outword(Sym.NameIdx); - SymTabOut.outbyte(Sym.Info); - SymTabOut.outbyte(Sym.Other); - SymTabOut.outhalf(Sym.SectionIdx); - SymTabOut.outaddr64(Sym.Value); - SymTabOut.outxword(Sym.Size); - } - } + ELFSection &SymTab = getSymbolTableSection(); + SymTab.Align = TEW->getSymTabAlignment(); + SymTab.Link = StrTab.SectionIdx; // Section Index of .strtab. + SymTab.Info = FirstNonLocalSymbol; // First non-STB_LOCAL symbol. + + // Size of each symtab entry. + SymTab.EntSize = TEW->getSymTabEntrySize(); + + for (unsigned i = 0, e = SymbolList.size(); i != e; ++i) + EmitSymbol(SymTab, SymbolList[i]); - SymTab.Size = SymTabBuf.size(); + SymTab.Size = SymTab.size(); } /// EmitSectionTableStringTable - This method adds and emits a section for the @@ -357,32 +504,25 @@ void ELFWriter::EmitSectionTableStringTable() { // Now that we know which section number is the .shstrtab section, update the // e_shstrndx entry in the ELF header. - OutputBuffer FHOut(FileHeader, is64Bit, isLittleEndian); - FHOut.fixhalf(SHStrTab.SectionIdx, ELFHdr_e_shstrndx_Offset); + ElfHdr.fixWord16(SHStrTab.SectionIdx, ELFHdr_e_shstrndx_Offset); // Set the NameIdx of each section in the string table and emit the bytes for // the string table. unsigned Index = 0; - DataBuffer &Buf = SHStrTab.SectionData; for (std::list<ELFSection>::iterator I = SectionList.begin(), E = SectionList.end(); I != E; ++I) { // Set the index into the table. Note if we have lots of entries with // common suffixes, we could memoize them here if we cared. I->NameIdx = Index; - - // Add the name to the output buffer, including the null terminator. - Buf.insert(Buf.end(), I->Name.begin(), I->Name.end()); - - // Add a null terminator. - Buf.push_back(0); + SHStrTab.emitString(I->getName()); // Keep track of the number of bytes emitted to this section. - Index += I->Name.size()+1; + Index += I->getName().size()+1; } // Set the size of .shstrtab now that we know what it is. - assert(Index == Buf.size()); + assert(Index == SHStrTab.size()); SHStrTab.Size = Index; } @@ -391,9 +531,9 @@ void ELFWriter::EmitSectionTableStringTable() { /// SectionTable. void ELFWriter::OutputSectionsAndSectionTable() { // Pass #1: Compute the file offset for each section. - size_t FileOff = FileHeader.size(); // File header first. + size_t FileOff = ElfHdr.size(); // File header first. - // Emit all of the section data in order. + // Adjust alignment of all section if needed. for (std::list<ELFSection>::iterator I = SectionList.begin(), E = SectionList.end(); I != E; ++I) { @@ -401,9 +541,14 @@ void ELFWriter::OutputSectionsAndSectionTable() { if (!I->SectionIdx) continue; + if (!I->size()) { + I->Offset = FileOff; + continue; + } + // Update Section size if (!I->Size) - I->Size = I->SectionData.size(); + I->Size = I->size(); // Align FileOff to whatever the alignment restrictions of the section are. 
if (I->Align) @@ -419,49 +564,40 @@ void ELFWriter::OutputSectionsAndSectionTable() { // Now that we know where all of the sections will be emitted, set the e_shnum // entry in the ELF header. - OutputBuffer FHOut(FileHeader, is64Bit, isLittleEndian); - FHOut.fixhalf(NumSections, ELFHdr_e_shnum_Offset); + ElfHdr.fixWord16(NumSections, ELFHdr_e_shnum_Offset); // Now that we know the offset in the file of the section table, update the // e_shoff address in the ELF header. - FHOut.fixaddr(FileOff, ELFHdr_e_shoff_Offset); + ElfHdr.fixWord(FileOff, ELFHdr_e_shoff_Offset); // Now that we know all of the data in the file header, emit it and all of the // sections! - O.write((char*)&FileHeader[0], FileHeader.size()); - FileOff = FileHeader.size(); - DataBuffer().swap(FileHeader); + O.write((char *)&ElfHdr.getData()[0], ElfHdr.size()); + FileOff = ElfHdr.size(); - DataBuffer Table; - OutputBuffer TableOut(Table, is64Bit, isLittleEndian); + // Section Header Table blob + BinaryObject SHdrTable(isLittleEndian, is64Bit); - // Emit all of the section data and build the section table itself. + // Emit all of sections to the file and build the section header table. while (!SectionList.empty()) { - const ELFSection &S = *SectionList.begin(); + ELFSection &S = *SectionList.begin(); + DOUT << "SectionIdx: " << S.SectionIdx << ", Name: " << S.getName() + << ", Size: " << S.Size << ", Offset: " << S.Offset + << ", SectionData Size: " << S.size() << "\n"; // Align FileOff to whatever the alignment restrictions of the section are. - if (S.Align) + if (S.Align) { for (size_t NewFileOff = (FileOff+S.Align-1) & ~(S.Align-1); - FileOff != NewFileOff; ++FileOff) + FileOff != NewFileOff; ++FileOff) O << (char)0xAB; - O.write((char*)&S.SectionData[0], S.Size); - - DOUT << "SectionIdx: " << S.SectionIdx << ", Name: " << S.Name - << ", Size: " << S.Size << ", Offset: " << S.Offset << "\n"; - - FileOff += S.Size; + } - TableOut.outword(S.NameIdx); // sh_name - Symbol table name idx - TableOut.outword(S.Type); // sh_type - Section contents & semantics - TableOut.outaddr(S.Flags); // sh_flags - Section flags. - TableOut.outaddr(S.Addr); // sh_addr - The mem addr this section is in. - TableOut.outaddr(S.Offset); // sh_offset - Offset from the file start. - TableOut.outaddr(S.Size); // sh_size - The section size. - TableOut.outword(S.Link); // sh_link - Section header table index link. - TableOut.outword(S.Info); // sh_info - Auxillary information. - TableOut.outaddr(S.Align); // sh_addralign - Alignment of section. - TableOut.outaddr(S.EntSize); // sh_entsize - Size of entries in the section + if (S.size()) { + O.write((char *)&S.getData()[0], S.Size); + FileOff += S.Size; + } + EmitSectionHeader(SHdrTable, S); SectionList.pop_front(); } @@ -471,5 +607,5 @@ void ELFWriter::OutputSectionsAndSectionTable() { O << (char)0xAB; // Emit the section table itself. 
- O.write((char*)&Table[0], Table.size()); + O.write((char *)&SHdrTable.getData()[0], SHdrTable.size()); } diff --git a/lib/CodeGen/ELFWriter.h b/lib/CodeGen/ELFWriter.h index 14a44f0..8a380f0 100644 --- a/lib/CodeGen/ELFWriter.h +++ b/lib/CodeGen/ELFWriter.h @@ -16,15 +16,20 @@ #include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/OutputBuffer.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetELFWriterInfo.h" #include "ELF.h" #include <list> #include <map> namespace llvm { + class BinaryObject; + class ConstantStruct; + class ELFCodeEmitter; class GlobalVariable; class Mangler; class MachineCodeEmitter; - class ELFCodeEmitter; class raw_ostream; /// ELFWriter - This class implements the common target-independent code for @@ -52,6 +57,9 @@ namespace llvm { /// Target machine description. TargetMachine &TM; + /// Target Elf Writer description. + const TargetELFWriterInfo *TEW; + /// Mang - The object used to perform name mangling for this module. Mangler *Mang; @@ -59,6 +67,10 @@ namespace llvm { /// code for functions to the .o file. ELFCodeEmitter *MCE; + /// TAI - Target Asm Info, provide information about section names for + /// globals and other target specific stuff. + const TargetAsmInfo *TAI; + //===------------------------------------------------------------------===// // Properties inferred automatically from the target machine. //===------------------------------------------------------------------===// @@ -77,13 +89,8 @@ namespace llvm { bool doFinalization(Module &M); private: - // The buffer we accumulate the file header into. Note that this should be - // changed into something much more efficient later (and the bitcode writer - // as well!). - DataBuffer FileHeader; - - /// ElfHdr - Hold information about the ELF Header - ELFHeader *ElfHdr; + // Blob containing the Elf header + BinaryObject ElfHdr; /// SectionList - This is the list of sections that we have emitted to the /// file. Once the file has been completely built, the section header table @@ -97,17 +104,18 @@ namespace llvm { /// getSection - Return the section with the specified name, creating a new /// section if one does not already exist. - ELFSection &getSection(const std::string &Name, - unsigned Type, unsigned Flags = 0) { + ELFSection &getSection(const std::string &Name, unsigned Type, + unsigned Flags = 0, unsigned Align = 0) { ELFSection *&SN = SectionLookup[Name]; if (SN) return *SN; - SectionList.push_back(Name); + SectionList.push_back(ELFSection(Name, isLittleEndian, is64Bit)); SN = &SectionList.back(); SN->SectionIdx = NumSections++; SN->Type = Type; SN->Flags = Flags; SN->Link = ELFSection::SHN_UNDEF; + SN->Align = Align; return *SN; } @@ -116,23 +124,36 @@ namespace llvm { ELFSection::SHF_EXECINSTR | ELFSection::SHF_ALLOC); } + ELFSection &getNonExecStackSection() { + return getSection(".note.GNU-stack", ELFSection::SHT_PROGBITS, 0, 1); + } + + ELFSection &getSymbolTableSection() { + return getSection(".symtab", ELFSection::SHT_SYMTAB, 0); + } + + ELFSection &getStringTableSection() { + return getSection(".strtab", ELFSection::SHT_STRTAB, 0, 1); + } + ELFSection &getDataSection() { return getSection(".data", ELFSection::SHT_PROGBITS, ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC); } + ELFSection &getBSSSection() { return getSection(".bss", ELFSection::SHT_NOBITS, ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC); } - /// SymbolTable - This is the list of symbols we have emitted to the file. 
+ /// SymbolList - This is the list of symbols we have emitted to the file. /// This actually gets rearranged before emission to the file (to put the /// local symbols first in the list). - std::vector<ELFSym> SymbolTable; + std::vector<ELFSym> SymbolList; - /// PendingSyms - This is a list of externally defined symbols that we have - /// been asked to emit, but have not seen a reference to. When a reference - /// is seen, the symbol will move from this list to the SymbolTable. + /// PendingGlobals - List of externally defined symbols that we have been + /// asked to emit, but have not seen a reference to. When a reference + /// is seen, the symbol will move from this list to the SymbolList. SetVector<GlobalValue*> PendingGlobals; // As we complete the ELF file, we need to update fields in the ELF header @@ -142,11 +163,17 @@ namespace llvm { unsigned ELFHdr_e_shoff_Offset; // e_shoff in ELF header. unsigned ELFHdr_e_shstrndx_Offset; // e_shstrndx in ELF header. unsigned ELFHdr_e_shnum_Offset; // e_shnum in ELF header. + private: void EmitGlobal(GlobalVariable *GV); - void EmitSymbolTable(); + void EmitGlobalConstant(const Constant *C, ELFSection &GblS); + void EmitGlobalConstantStruct(const ConstantStruct *CVS, + ELFSection &GblS); void EmitRelocations(); + void EmitSectionHeader(BinaryObject &SHdrTab, const ELFSection &SHdr); void EmitSectionTableStringTable(); + void EmitSymbol(BinaryObject &SymbolTable, ELFSym &Sym); + void EmitSymbolTable(); void OutputSectionsAndSectionTable(); }; } diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index b3c60e6..a163cac 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -240,7 +240,7 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, if (OptLevel != CodeGenOpt::None) { PM.add(createMachineLICMPass()); PM.add(createMachineSinkingPass()); - printAndVerify(PM, /* allowDoubleDefs= */ true); + printAndVerify(PM, /* allowDoubleDefs= */ false); } // Run pre-ra passes. diff --git a/lib/CodeGen/LazyLiveness.cpp b/lib/CodeGen/LazyLiveness.cpp new file mode 100644 index 0000000..6fb35d2 --- /dev/null +++ b/lib/CodeGen/LazyLiveness.cpp @@ -0,0 +1,158 @@ +//===- LazyLiveness.cpp - Lazy, CFG-invariant liveness information --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements a lazy liveness analysis as per "Fast Liveness Checking +// for SSA-form Programs," by Boissinot, et al. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "lazyliveness" +#include "llvm/CodeGen/LazyLiveness.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +using namespace llvm; + +char LazyLiveness::ID = 0; +static RegisterPass<LazyLiveness> X("lazy-liveness", "Lazy Liveness Analysis"); + +void LazyLiveness::computeBackedgeChain(MachineFunction& mf, + MachineBasicBlock* MBB) { + SparseBitVector<128> tmp = rv[MBB]; + tmp.set(preorder[MBB]); + tmp &= backedge_source; + calculated.set(preorder[MBB]); + + for (SparseBitVector<128>::iterator I = tmp.begin(); I != tmp.end(); ++I) { + MachineBasicBlock* SrcMBB = rev_preorder[*I]; + + for (MachineBasicBlock::succ_iterator SI = SrcMBB->succ_begin(); + SI != SrcMBB->succ_end(); ++SI) { + MachineBasicBlock* TgtMBB = *SI; + + if (backedges.count(std::make_pair(SrcMBB, TgtMBB)) && + !rv[MBB].test(preorder[TgtMBB])) { + if (!calculated.test(preorder[TgtMBB])) + computeBackedgeChain(mf, TgtMBB); + + tv[MBB].set(preorder[TgtMBB]); + tv[MBB] |= tv[TgtMBB]; + } + } + + tv[MBB].reset(preorder[MBB]); + } +} + +bool LazyLiveness::runOnMachineFunction(MachineFunction &mf) { + rv.clear(); + tv.clear(); + backedges.clear(); + backedge_source.clear(); + backedge_target.clear(); + calculated.clear(); + preorder.clear(); + + MRI = &mf.getRegInfo(); + MachineDominatorTree& MDT = getAnalysis<MachineDominatorTree>(); + + // Step 0: Compute preorder numbering for all MBBs. + unsigned num = 0; + for (df_iterator<MachineDomTreeNode*> DI = df_begin(MDT.getRootNode()), + DE = df_end(MDT.getRootNode()); DI != DE; ++DI) { + preorder[(*DI)->getBlock()] = num++; + rev_preorder.push_back((*DI)->getBlock()); + } + + // Step 1: Compute the transitive closure of the CFG, ignoring backedges. + for (po_iterator<MachineBasicBlock*> POI = po_begin(&*mf.begin()), + POE = po_end(&*mf.begin()); POI != POE; ++POI) { + MachineBasicBlock* MBB = *POI; + SparseBitVector<128>& entry = rv[MBB]; + entry.set(preorder[MBB]); + + for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), + SE = MBB->succ_end(); SI != SE; ++SI) { + DenseMap<MachineBasicBlock*, SparseBitVector<128> >::iterator SII = + rv.find(*SI); + + // Because we're iterating in postorder, any successor that does not yet + // have an rv entry must be on a backedge. 
+ if (SII != rv.end()) { + entry |= SII->second; + } else { + backedges.insert(std::make_pair(MBB, *SI)); + backedge_source.set(preorder[MBB]); + backedge_target.set(preorder[*SI]); + } + } + } + + for (SparseBitVector<128>::iterator I = backedge_source.begin(); + I != backedge_source.end(); ++I) + computeBackedgeChain(mf, rev_preorder[*I]); + + for (po_iterator<MachineBasicBlock*> POI = po_begin(&*mf.begin()), + POE = po_end(&*mf.begin()); POI != POE; ++POI) + if (!backedge_target.test(preorder[*POI])) + for (MachineBasicBlock::succ_iterator SI = (*POI)->succ_begin(), + SE = (*POI)->succ_end(); SI != SE; ++SI) + if (!backedges.count(std::make_pair(*POI, *SI)) && tv.count(*SI)) { + SparseBitVector<128>& PBV = tv[*POI]; + PBV = tv[*SI]; + } + + for (po_iterator<MachineBasicBlock*> POI = po_begin(&*mf.begin()), + POE = po_end(&*mf.begin()); POI != POE; ++POI) + tv[*POI].set(preorder[*POI]); + + return false; +} + +bool LazyLiveness::vregLiveIntoMBB(unsigned vreg, MachineBasicBlock* MBB) { + MachineDominatorTree& MDT = getAnalysis<MachineDominatorTree>(); + + MachineBasicBlock* DefMBB = MRI->def_begin(vreg)->getParent(); + unsigned def = preorder[DefMBB]; + unsigned max_dom = 0; + for (df_iterator<MachineDomTreeNode*> DI = df_begin(MDT[DefMBB]), + DE = df_end(MDT[DefMBB]); DI != DE; ++DI) { + if (preorder[DI->getBlock()] > max_dom) { + max_dom = preorder[(*DI)->getBlock()]; + } + } + + if (preorder[MBB] <= def || max_dom < preorder[MBB]) + return false; + + SparseBitVector<128>::iterator I = tv[MBB].begin(); + while (I != tv[MBB].end() && *I <= def) ++I; + while (I != tv[MBB].end() && *I < max_dom) { + for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(vreg), + UE = MachineRegisterInfo::use_end(); UI != UE; ++UI) { + MachineBasicBlock* UseMBB = UI->getParent(); + if (rv[rev_preorder[*I]].test(preorder[UseMBB])) + return true; + + unsigned t_dom = 0; + for (df_iterator<MachineDomTreeNode*> DI = + df_begin(MDT[rev_preorder[*I]]), DE = df_end(MDT[rev_preorder[*I]]); + DI != DE; ++DI) + if (preorder[DI->getBlock()] > t_dom) { + max_dom = preorder[(*DI)->getBlock()]; + } + I = tv[MBB].begin(); + while (I != tv[MBB].end() && *I < t_dom) ++I; + } + } + + return false; +} diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 944468e..3feb92f 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -214,26 +214,33 @@ void RegScavenger::forward() { } // Process uses first. - BitVector UseRegs(NumPhysRegs); + BitVector KillRegs(NumPhysRegs); for (unsigned i = 0, e = UseMOs.size(); i != e; ++i) { const MachineOperand MO = *UseMOs[i].first; unsigned Reg = MO.getReg(); assert(isUsed(Reg) && "Using an undefined register!"); - if (MO.isKill() && !isReserved(Reg)) { - UseRegs.set(Reg); + // Kill of implicit_def defined registers are ignored. e.g. + // entry: 0x2029ab8, LLVM BB @0x1b06080, ID#0: + // Live Ins: %R0 + // %R0<def> = IMPLICIT_DEF + // %R0<def> = IMPLICIT_DEF + // STR %R0<kill>, %R0, %reg0, 0, 14, %reg0, Mem:ST(4,4) [0x1b06510 + 0] + // %R1<def> = LDR %R0, %reg0, 24, 14, %reg0, Mem:LD(4,4) [0x1b065bc + 0] + if (MO.isKill() && !isReserved(Reg) && !isImplicitlyDefined(Reg)) { + KillRegs.set(Reg); // Mark sub-registers as used. for (const unsigned *SubRegs = TRI->getSubRegisters(Reg); unsigned SubReg = *SubRegs; ++SubRegs) - UseRegs.set(SubReg); + KillRegs.set(SubReg); } } // Change states of all registers after all the uses are processed to guard // against multiple uses. 
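The guard described in that comment is a two-phase update: kill flags are only recorded into a scratch bit vector while the operands are scanned, and the scavenger's state is flipped in a single step afterwards, so an instruction that reads the same register through several operands never sees it go dead mid-scan. A reduced sketch of the pattern, with std::bitset standing in for BitVector and a made-up operand record in place of MachineOperand:

  #include <bitset>
  #include <cstdio>
  #include <vector>

  struct Operand { unsigned Reg; bool IsKill; };

  int main() {
    std::bitset<16> RegsAvailable;                 // 1 = register is free
    // One instruction that uses r3 twice; only the last use carries the kill.
    std::vector<Operand> Uses = {{3, false}, {3, true}, {5, true}};

    std::bitset<16> KillRegs;                      // phase 1: record kills only
    for (const Operand &MO : Uses) {
      // Setting RegsAvailable here instead would mark r3 free while the
      // other r3 operand still expects it to be live.
      if (MO.IsKill)
        KillRegs.set(MO.Reg);
    }
    RegsAvailable |= KillRegs;                     // phase 2: apply all at once

    std::printf("freed: %s\n", KillRegs.to_string().c_str());
    return 0;
  }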
- setUnused(UseRegs); + setUnused(KillRegs); // Process early clobber defs then process defs. We can have a early clobber // that is dead, it should not conflict with a def that happens one "slot" diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp index f8ae884..43995cb 100644 --- a/lib/ExecutionEngine/JIT/JIT.cpp +++ b/lib/ExecutionEngine/JIT/JIT.cpp @@ -563,6 +563,11 @@ void *JIT::getPointerToFunction(Function *F) { return Addr; // Check if function already code gen'd MutexGuard locked(lock); + + // Now that this thread owns the lock, check if another thread has already + // code gen'd the function. + if (void *Addr = getPointerToGlobalIfAvailable(F)) + return Addr; // Make sure we read in the function if it exists in this Module. if (F->hasNotBeenReadFromBitcode()) { diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index ac7de91..7edd118 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -98,12 +98,12 @@ FunctionPass *createARMCodePrinterPass(raw_ostream &O, FunctionPass *createARMCodeEmitterPass(ARMTargetMachine &TM, MachineCodeEmitter &MCE); -FunctionPass *createARMCodeEmitterPass( ARMTargetMachine &TM, - MachineCodeEmitter &MCE); -FunctionPass *createARMJITCodeEmitterPass( ARMTargetMachine &TM, - JITCodeEmitter &JCE); +FunctionPass *createARMCodeEmitterPass(ARMTargetMachine &TM, + MachineCodeEmitter &MCE); +FunctionPass *createARMJITCodeEmitterPass(ARMTargetMachine &TM, + JITCodeEmitter &JCE); -FunctionPass *createARMLoadStoreOptimizationPass(); +FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMConstantIslandPass(); } // end namespace llvm; diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 4ac6857..594811d 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -28,6 +28,8 @@ def ArchV5TE : SubtargetFeature<"v5te", "ARMArchVersion", "V5TE", "ARM v5TE, v5TEj, v5TExp">; def ArchV6 : SubtargetFeature<"v6", "ARMArchVersion", "V6", "ARM v6">; +def ArchV6T2 : SubtargetFeature<"v6t2", "ARMArchVersion", "V6T2", + "ARM v6t2">; def ArchV7A : SubtargetFeature<"v7a", "ARMArchVersion", "V7A", "ARM v7A">; def FeatureVFP2 : SubtargetFeature<"vfp2", "ARMFPUType", "VFPv2", @@ -92,9 +94,11 @@ def : Proc<"arm1176jzf-s", [ArchV6, FeatureVFP2]>; def : Proc<"mpcorenovfp", [ArchV6]>; def : Proc<"mpcore", [ArchV6, FeatureVFP2]>; -def : Proc<"arm1156t2-s", [ArchV6, FeatureThumb2]>; -def : Proc<"arm1156t2f-s", [ArchV6, FeatureThumb2, FeatureVFP2]>; +// V6T2 Processors. +def : Proc<"arm1156t2-s", [ArchV6T2, FeatureThumb2]>; +def : Proc<"arm1156t2f-s", [ArchV6T2, FeatureThumb2, FeatureVFP2]>; +// V7 Processors. def : Proc<"cortex-a8", [ArchV7A, FeatureThumb2, FeatureNEON]>; def : Proc<"cortex-a9", [ArchV7A, FeatureThumb2, FeatureNEON]>; diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 6cd786e..f126760 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -17,6 +17,11 @@ class CCIfSubtarget<string F, CCAction A>: class CCIfAlign<string Align, CCAction A>: CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; +/// CCIfFloatABI - Match of the float ABI and the arg. ABIType may be "Hard" or +/// "Soft". 
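One note on the JIT.cpp hunk further up: the new lookup is the classic check / lock / re-check shape, which keeps the common already-compiled case on the fast path while ensuring only one thread actually generates code. A rough sketch of that shape with std::mutex — the map, the codeGen placeholder, and all names here are invented, not the JIT's real data structures, and a production fast path would need its own thread-safe lookup:

  #include <cstdio>
  #include <map>
  #include <mutex>
  #include <string>

  static std::mutex Lock;
  static std::map<std::string, void*> Compiled;    // stand-in for the JIT map

  static void *codeGen(const std::string &Name) {  // stand-in "expensive" step
    std::printf("compiling %s\n", Name.c_str());
    return reinterpret_cast<void*>(0x1000);
  }

  void *getPointer(const std::string &Name) {
    {                                              // unlocked fast path
      std::map<std::string, void*>::const_iterator I = Compiled.find(Name);
      if (I != Compiled.end())
        return I->second;
    }
    std::lock_guard<std::mutex> Guard(Lock);
    // Re-check now that this thread owns the lock: another thread may have
    // finished the same function between the first lookup and acquisition.
    std::map<std::string, void*>::const_iterator I = Compiled.find(Name);
    if (I != Compiled.end())
      return I->second;
    return Compiled[Name] = codeGen(Name);
  }

  int main() {
    getPointer("f");
    getPointer("f");                               // second call hits the cache
    return 0;
  }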
+class CCIfFloatABI<string ABIType, CCAction A>: + CCIf<!strconcat("llvm::FloatABIType == llvm::FloatABI::", ABIType), A>; + //===----------------------------------------------------------------------===// // ARM APCS Calling Convention //===----------------------------------------------------------------------===// @@ -43,9 +48,10 @@ def RetCC_ARM_APCS : CallingConv<[ ]>; //===----------------------------------------------------------------------===// -// ARM AAPCS (EABI) Calling Convention +// ARM AAPCS (EABI) Calling Convention, common parts //===----------------------------------------------------------------------===// -def CC_ARM_AAPCS : CallingConv<[ + +def CC_ARM_AAPCS_Common : CallingConv<[ CCIfType<[i8, i16], CCPromoteToType<i32>>, @@ -53,23 +59,51 @@ def CC_ARM_AAPCS : CallingConv<[ // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register // (and the same is true for f64 if VFP is not enabled) CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>, - CCIfType<[f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>, - - CCIfType<[f32], CCBitConvertToType<i32>>, CCIfType<[i32], CCIf<"State.getNextStackOffset() == 0 &&" "ArgFlags.getOrigAlign() != 8", CCAssignToReg<[R0, R1, R2, R3]>>>, - CCIfType<[i32], CCAssignToStack<4, 4>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[f64], CCAssignToStack<8, 8>> ]>; -def RetCC_ARM_AAPCS : CallingConv<[ +def RetCC_ARM_AAPCS_Common : CallingConv<[ + CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>> +]>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS (EABI) Calling Convention +//===----------------------------------------------------------------------===// + +def CC_ARM_AAPCS : CallingConv<[ + CCIfType<[f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>, CCIfType<[f32], CCBitConvertToType<i32>>, + CCDelegateTo<CC_ARM_AAPCS_Common> +]>; + +def RetCC_ARM_AAPCS : CallingConv<[ CCIfType<[f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>, + CCIfType<[f32], CCBitConvertToType<i32>>, + CCDelegateTo<RetCC_ARM_AAPCS_Common> +]>; - CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>> +//===----------------------------------------------------------------------===// +// ARM AAPCS-VFP (EABI) Calling Convention +//===----------------------------------------------------------------------===// + +def CC_ARM_AAPCS_VFP : CallingConv<[ + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<CC_ARM_AAPCS_Common> +]>; + +def RetCC_ARM_AAPCS_VFP : CallingConv<[ + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, + CCDelegateTo<RetCC_ARM_AAPCS_Common> ]>; //===----------------------------------------------------------------------===// @@ -77,11 +111,19 @@ def RetCC_ARM_AAPCS : CallingConv<[ //===----------------------------------------------------------------------===// def CC_ARM : CallingConv<[ + CCIfSubtarget<"isAAPCS_ABI()", + CCIfSubtarget<"hasVFP2()", + CCIfFloatABI<"Hard", + CCDelegateTo<CC_ARM_AAPCS_VFP>>>>, CCIfSubtarget<"isAAPCS_ABI()", CCDelegateTo<CC_ARM_AAPCS>>, CCDelegateTo<CC_ARM_APCS> ]>; def RetCC_ARM : CallingConv<[ + CCIfSubtarget<"isAAPCS_ABI()", + CCIfSubtarget<"hasVFP2()", + CCIfFloatABI<"Hard", + 
CCDelegateTo<RetCC_ARM_AAPCS_VFP>>>>, CCIfSubtarget<"isAAPCS_ABI()", CCDelegateTo<RetCC_ARM_AAPCS>>, CCDelegateTo<RetCC_ARM_APCS> ]>; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index c0fd9dc..ec8bd1f 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1101,7 +1101,12 @@ ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { else RC = ARM::GPRRegisterClass; - if (RegVT == MVT::f64) { + if (FloatABIType == FloatABI::Hard) { + if (RegVT == MVT::f32) + RC = ARM::SPRRegisterClass; + else if (RegVT == MVT::f64) + RC = ARM::DPRRegisterClass; + } else if (RegVT == MVT::f64) { // f64 is passed in pairs of GPRs and must be combined. RegVT = MVT::i32; } else if (!((RegVT == MVT::i32) || (RegVT == MVT::f32))) diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 680e772..cc9f1a5 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -451,7 +451,7 @@ multiclass AsXI1_bin_c_irs<bits<4> opcod, string opc, PatFrag opnode> { /// the function. The first operand is the ID# for this instruction, the second /// is the index into the MachineConstantPool that this is, the third is the /// size in bytes of this constant pool entry. -let isNotDuplicable = 1 in +let neverHasSideEffects = 1, isNotDuplicable = 1 in def CONSTPOOL_ENTRY : PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx, i32imm:$size), @@ -771,6 +771,7 @@ def STM : AXI4st<(outs), // Move Instructions. // +let neverHasSideEffects = 1 in def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, "mov", " $dst, $src", []>, UnaryDP; def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm, @@ -946,6 +947,7 @@ def MLA : AsMul1I<0b0000001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>; // Extra precision multiplies with low / high results +let neverHasSideEffects = 1 in { def SMULL : AsMul1I<0b0000110, (outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), "smull", " $ldst, $hdst, $a, $b", []>; @@ -967,6 +969,7 @@ def UMAAL : AMul1I <0b0000010, (outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), "umaal", " $ldst, $hdst, $a, $b", []>, Requires<[IsARM, HasV6]>; +} // neverHasSideEffects // Most significant word multiply def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b), diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index ffb83a8..54232f6 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -298,6 +298,7 @@ def tADDrr : TI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), "add $dst, $lhs, $rhs", [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>; +let neverHasSideEffects = 1 in def tADDhirr : TIt<(outs tGPR:$dst), (ins GPR:$lhs, GPR:$rhs), "add $dst, $rhs @ addhirr", []>; @@ -387,6 +388,7 @@ def tMOVi8 : TI<(outs tGPR:$dst), (ins i32imm:$src), // Note: MOV(2) of two low regs updates the flags, so we emit this as 'cpy', // which is MOV(3). This also supports high registers. 
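Back in the LowerFORMAL_ARGUMENTS hunk above, the effect of the hard-float ABI is simply a different register-class choice: f32 and f64 arguments stay in VFP registers instead of being bit-cast or split into GPRs. A stripped-down sketch of that decision — the enums below are simplified stand-ins, not LLVM's MVT or register-class types:

  #include <cstdio>

  enum class ValType { i32, f32, f64 };
  enum class FloatABI { Soft, Hard };
  enum class RegClass { GPR, SPR, DPR, GPRPair };

  // Hard float keeps FP arguments in VFP registers; otherwise f64 arrives as
  // a pair of GPR copies to be combined and f32 as a single GPR bit-pattern.
  RegClass classForIncomingArg(ValType VT, FloatABI ABI) {
    if (ABI == FloatABI::Hard) {
      if (VT == ValType::f32) return RegClass::SPR;
      if (VT == ValType::f64) return RegClass::DPR;
    } else if (VT == ValType::f64) {
      return RegClass::GPRPair;
    }
    return RegClass::GPR;
  }

  int main() {
    std::printf("f64 hard -> %d, f64 soft -> %d\n",
                static_cast<int>(classForIncomingArg(ValType::f64, FloatABI::Hard)),
                static_cast<int>(classForIncomingArg(ValType::f64, FloatABI::Soft)));
    return 0;
  }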
+let neverHasSideEffects = 1 in { def tMOVr : TI<(outs tGPR:$dst), (ins tGPR:$src), "cpy $dst, $src", []>; def tMOVhir2lor : TI<(outs tGPR:$dst), (ins GPR:$src), @@ -395,6 +397,7 @@ def tMOVlor2hir : TI<(outs GPR:$dst), (ins tGPR:$src), "cpy $dst, $src\t@ lor2hir", []>; def tMOVhir2hir : TI<(outs GPR:$dst), (ins GPR:$src), "cpy $dst, $src\t@ hir2hir", []>; +} // neverHasSideEffects def tMUL : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), "mul $dst, $rhs", diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 0247daf..9104c77 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -192,11 +192,13 @@ def FCVTSD : AI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm, let Inst{7-4} = 0b1100; } +let neverHasSideEffects = 1 in { def FCPYD : ADuI<0b11101011, 0b0000, 0b0100, (outs DPR:$dst), (ins DPR:$a), "fcpyd", " $dst, $a", []>; def FCPYS : ASuI<0b11101011, 0b0000, 0b0100, (outs SPR:$dst), (ins SPR:$a), "fcpys", " $dst, $a", []>; +} // neverHasSideEffects def FNEGD : ADuI<0b11101011, 0b0001, 0b0100, (outs DPR:$dst), (ins DPR:$a), "fnegd", " $dst, $a", diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 963ff0d..684ecb4 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -17,24 +17,31 @@ #include "ARMAddressingModes.h" #include "ARMMachineFunctionInfo.h" #include "ARMRegisterInfo.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumLDMGened , "Number of ldm instructions generated"); STATISTIC(NumSTMGened , "Number of stm instructions generated"); STATISTIC(NumFLDMGened, "Number of fldm instructions generated"); STATISTIC(NumFSTMGened, "Number of fstm instructions generated"); +STATISTIC(NumLdStMoved, "Number of load / store instructions moved"); + +/// ARMAllocLoadStoreOpt - Post- register allocation pass the combine +/// load / store instructions to form ldm / stm instructions. namespace { struct VISIBILITY_HIDDEN ARMLoadStoreOpt : public MachineFunctionPass { @@ -81,12 +88,6 @@ namespace { char ARMLoadStoreOpt::ID = 0; } -/// createARMLoadStoreOptimizationPass - returns an instance of the load / store -/// optimization pass. -FunctionPass *llvm::createARMLoadStoreOptimizationPass() { - return new ARMLoadStoreOpt(); -} - static int getLoadStoreMultipleOpcode(int Opcode) { switch (Opcode) { case ARM::LDR: @@ -582,6 +583,23 @@ void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) { RS->forward(prior(Loc)); } +static int getMemoryOpOffset(const MachineInstr *MI) { + int Opcode = MI->getOpcode(); + bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR; + unsigned NumOperands = MI->getDesc().getNumOperands(); + unsigned OffField = MI->getOperand(NumOperands-3).getImm(); + int Offset = isAM2 + ? 
ARM_AM::getAM2Offset(OffField) : ARM_AM::getAM5Offset(OffField) * 4; + if (isAM2) { + if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub) + Offset = -Offset; + } else { + if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub) + Offset = -Offset; + } + return Offset; +} + /// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR /// ops of the same base and incrementing offset into LDM / STM ops. bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { @@ -606,22 +624,11 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { bool isMemOp = isMemoryOp(MBBI); if (isMemOp) { int Opcode = MBBI->getOpcode(); - bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR; unsigned Size = getLSMultipleTransferSize(MBBI); unsigned Base = MBBI->getOperand(1).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg); - unsigned NumOperands = MBBI->getDesc().getNumOperands(); - unsigned OffField = MBBI->getOperand(NumOperands-3).getImm(); - int Offset = isAM2 - ? ARM_AM::getAM2Offset(OffField) : ARM_AM::getAM5Offset(OffField) * 4; - if (isAM2) { - if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub) - Offset = -Offset; - } else { - if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub) - Offset = -Offset; - } + int Offset = getMemoryOpOffset(MBBI); // Watch out for: // r4 := ldr [r5] // r5 := ldr [r5, #4] @@ -744,6 +751,17 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { return NumMerges > 0; } +namespace { + struct OffsetCompare { + bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const { + int LOffset = getMemoryOpOffset(LHS); + int ROffset = getMemoryOpOffset(RHS); + assert(LHS == RHS || LOffset != ROffset); + return LOffset > ROffset; + } + }; +} + /// MergeReturnIntoLDM - If this is a exit BB, try merging the return op /// (bx lr) into the preceeding stack restore so it directly restore the value /// of LR into pc. @@ -788,3 +806,277 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { delete RS; return Modified; } + + +/// ARMPreAllocLoadStoreOpt - Pre- register allocation pass that move +/// load / stores from consecutive locations close to make it more +/// likely they will be combined later. 
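That is the whole strategy of the new pre-allocation pass: loads and stores sharing a base register are sorted by the offset decoded in getMemoryOpOffset, and only a run of same-width accesses at strictly consecutive offsets (capped at four) is worth pulling together. A toy version of the grouping step — MemOp is a made-up record, and it sorts ascending for readability where the pass's OffsetCompare sorts descending and scans from the back:

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  struct MemOp { int Offset; unsigned Bytes; };   // one access off a common base

  // Length of the run of same-size, gap-free accesses starting at the lowest
  // offset, capped at 4 just like the pass.
  unsigned consecutiveRun(std::vector<MemOp> &Ops) {
    std::sort(Ops.begin(), Ops.end(),
              [](const MemOp &L, const MemOp &R) { return L.Offset < R.Offset; });
    unsigned Run = 1;
    for (unsigned i = 1; i < Ops.size() && Run < 4; ++i) {
      if (Ops[i].Bytes != Ops[i - 1].Bytes ||
          Ops[i].Offset != Ops[i - 1].Offset + (int)Ops[i - 1].Bytes)
        break;                                    // width changed or a gap
      ++Run;
    }
    return Run;
  }

  int main() {
    std::vector<MemOp> Ops = {{8, 4}, {0, 4}, {4, 4}, {16, 4}};    // 16 leaves a gap
    std::printf("movable run: %u accesses\n", consecutiveRun(Ops)); // prints 3
    return 0;
  }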
+ +namespace { + struct VISIBILITY_HIDDEN ARMPreAllocLoadStoreOpt : public MachineFunctionPass{ + static char ID; + ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {} + + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "ARM pre- register allocation load / store optimization pass"; + } + + private: + bool RescheduleOps(MachineBasicBlock *MBB, + SmallVector<MachineInstr*, 4> &Ops, + unsigned Base, bool isLd, + DenseMap<MachineInstr*, unsigned> &MI2LocMap); + bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB); + }; + char ARMPreAllocLoadStoreOpt::ID = 0; +} + +bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + TII = Fn.getTarget().getInstrInfo(); + TRI = Fn.getTarget().getRegisterInfo(); + MRI = &Fn.getRegInfo(); + + bool Modified = false; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) + Modified |= RescheduleLoadStoreInstrs(MFI); + + return Modified; +} + +static bool IsSafeToMove(bool isLd, unsigned Base, + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator E, + SmallPtrSet<MachineInstr*, 4> MoveOps, + const TargetRegisterInfo *TRI) { + // Are there stores / loads / calls between them? + // FIXME: This is overly conservative. We should make use of alias information + // some day. + while (++I != E) { + const TargetInstrDesc &TID = I->getDesc(); + if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects()) + return false; + if (isLd && TID.mayStore()) + return false; + if (!isLd) { + if (TID.mayLoad()) + return false; + // It's not safe to move the first 'str' down. + // str r1, [r0] + // strh r5, [r0] + // str r4, [r0, #+4] + if (TID.mayStore() && !MoveOps.count(&*I)) + return false; + } + for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) { + MachineOperand &MO = I->getOperand(j); + if (MO.isReg() && MO.isDef() && TRI->regsOverlap(MO.getReg(), Base)) + return false; + } + } + return true; +} + +bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, + SmallVector<MachineInstr*, 4> &Ops, + unsigned Base, bool isLd, + DenseMap<MachineInstr*, unsigned> &MI2LocMap) { + bool RetVal = false; + + // Sort by offset (in reverse order). + std::sort(Ops.begin(), Ops.end(), OffsetCompare()); + + // The loads / stores of the same base are in order. Scan them from first to + // last and check for the followins: + // 1. Any def of base. + // 2. Any gaps. + while (Ops.size() > 1) { + unsigned FirstLoc = ~0U; + unsigned LastLoc = 0; + MachineInstr *FirstOp = 0; + MachineInstr *LastOp = 0; + int LastOffset = 0; + unsigned LastBytes = 0; + unsigned NumMove = 0; + for (int i = Ops.size() - 1; i >= 0; --i) { + MachineInstr *Op = Ops[i]; + unsigned Loc = MI2LocMap[Op]; + if (Loc <= FirstLoc) { + FirstLoc = Loc; + FirstOp = Op; + } + if (Loc >= LastLoc) { + LastLoc = Loc; + LastOp = Op; + } + + int Offset = getMemoryOpOffset(Op); + unsigned Bytes = getLSMultipleTransferSize(Op); + if (LastBytes) { + if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes)) + break; + } + LastOffset = Offset; + LastBytes = Bytes; + if (++NumMove == 4) + break; + } + + if (NumMove <= 1) + Ops.pop_back(); + else { + SmallPtrSet<MachineInstr*, 4> MoveOps; + for (int i = NumMove-1; i >= 0; --i) + MoveOps.insert(Ops[i]); + + // Be conservative, if the instructions are too far apart, don't + // move them. We want to limit the increase of register pressure. 
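Distance is only one of the two gates applied below; the other is the IsSafeToMove scan defined above, which walks everything between the first and last candidate and gives up on calls, unmodeled side effects, conflicting memory operations, or a redefinition of the base register. A reduced sketch of that legality walk over a flat list — Inst is a stand-in for the MachineInstr/TargetInstrDesc queries, and the real code additionally exempts the stores it is itself moving:

  #include <cstdio>
  #include <vector>

  struct Inst {
    bool IsCall;
    bool MayLoad;
    bool MayStore;
    int DefReg;                // register defined by this instruction, -1 if none
  };

  // Conservatively decide whether loads (IsLoad) or stores off BaseReg may be
  // moved past every instruction in Between.
  bool isSafeToMove(bool IsLoad, int BaseReg, const std::vector<Inst> &Between) {
    for (const Inst &I : Between) {
      if (I.IsCall)
        return false;                             // calls act as barriers
      if (IsLoad && I.MayStore)
        return false;                             // a store could feed the loads
      if (!IsLoad && (I.MayLoad || I.MayStore))
        return false;                             // keep memory order intact
      if (I.DefReg == BaseReg)
        return false;                             // base changes in between
    }
    return true;
  }

  int main() {
    std::vector<Inst> Between = {{false, true, false, -1}};        // a stray load
    std::printf("safe: %d\n", isSafeToMove(/*IsLoad=*/false, 0, Between)); // 0
    return 0;
  }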
+ bool DoMove = (LastLoc - FirstLoc) < NumMove*4; + if (DoMove) + DoMove = IsSafeToMove(isLd, Base, FirstOp, LastOp, MoveOps, TRI); + if (!DoMove) { + for (unsigned i = 0; i != NumMove; ++i) + Ops.pop_back(); + } else { + // This is the new location for the loads / stores. + MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp; + while (InsertPos != MBB->end() && MoveOps.count(InsertPos)) + ++InsertPos; + for (unsigned i = 0; i != NumMove; ++i) { + MachineInstr *Op = Ops.back(); + Ops.pop_back(); + MBB->splice(InsertPos, MBB, Op); + } + + NumLdStMoved += NumMove; + RetVal = true; + } + } + } + + return RetVal; +} + +bool +ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { + bool RetVal = false; + + DenseMap<MachineInstr*, unsigned> MI2LocMap; + DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap; + DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap; + SmallVector<unsigned, 4> LdBases; + SmallVector<unsigned, 4> StBases; + + unsigned Loc = 0; + MachineBasicBlock::iterator MBBI = MBB->begin(); + MachineBasicBlock::iterator E = MBB->end(); + while (MBBI != E) { + for (; MBBI != E; ++MBBI) { + MachineInstr *MI = MBBI; + const TargetInstrDesc &TID = MI->getDesc(); + if (TID.isCall() || TID.isTerminator()) { + // Stop at barriers. + ++MBBI; + break; + } + + MI2LocMap[MI] = Loc++; + if (!isMemoryOp(MI)) + continue; + unsigned PredReg = 0; + if (getInstrPredicate(MI, PredReg) != ARMCC::AL) + continue; + + int Opcode = MI->getOpcode(); + bool isLd = Opcode == ARM::LDR || + Opcode == ARM::FLDS || Opcode == ARM::FLDD; + unsigned Base = MI->getOperand(1).getReg(); + int Offset = getMemoryOpOffset(MI); + + bool StopHere = false; + if (isLd) { + DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI = + Base2LdsMap.find(Base); + if (BI != Base2LdsMap.end()) { + for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { + if (Offset == getMemoryOpOffset(BI->second[i])) { + StopHere = true; + break; + } + } + if (!StopHere) + BI->second.push_back(MI); + } else { + SmallVector<MachineInstr*, 4> MIs; + MIs.push_back(MI); + Base2LdsMap[Base] = MIs; + LdBases.push_back(Base); + } + } else { + DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI = + Base2StsMap.find(Base); + if (BI != Base2StsMap.end()) { + for (unsigned i = 0, e = BI->second.size(); i != e; ++i) { + if (Offset == getMemoryOpOffset(BI->second[i])) { + StopHere = true; + break; + } + } + if (!StopHere) + BI->second.push_back(MI); + } else { + SmallVector<MachineInstr*, 4> MIs; + MIs.push_back(MI); + Base2StsMap[Base] = MIs; + StBases.push_back(Base); + } + } + + if (StopHere) { + // Found a duplicate (a base+offset combination that's seen earlier). Backtrack. + --Loc; + break; + } + } + + // Re-schedule loads. + for (unsigned i = 0, e = LdBases.size(); i != e; ++i) { + unsigned Base = LdBases[i]; + SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base]; + if (Lds.size() > 1) + RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap); + } + + // Re-schedule stores. + for (unsigned i = 0, e = StBases.size(); i != e; ++i) { + unsigned Base = StBases[i]; + SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base]; + if (Sts.size() > 1) + RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap); + } + + if (MBBI != E) { + Base2LdsMap.clear(); + Base2StsMap.clear(); + LdBases.clear(); + StBases.clear(); + } + } + + return RetVal; +} + + +/// createARMLoadStoreOptimizationPass - returns an instance of the load / store +/// optimization pass. 
+FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) { + if (PreAlloc) + return new ARMPreAllocLoadStoreOpt(); + return new ARMLoadStoreOpt(); +} diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index b95d1f9..ebe7d58 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -219,3 +219,18 @@ def DPR : RegisterClass<"ARM", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7, D8, // Condition code registers. def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>; + +//===----------------------------------------------------------------------===// +// Subregister Set Definitions... now that we have all of the pieces, define the +// sub registers for each register. +// + +def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7, + D8, D9, D10, D11, D12, D13, D14, D15], + [S0, S2, S4, S6, S8, S10, S12, S14, + S16, S18, S20, S22, S24, S26, S28, S30]>; + +def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7, + D8, D9, D10, D11, D12, D13, D14, D15], + [S1, S3, S5, S7, S9, S11, S13, S15, + S17, S19, S21, S23, S25, S27, S29, S31]>; diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index ef78cd5..a978380 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -14,6 +14,8 @@ #include "ARMSubtarget.h" #include "ARMGenSubtarget.inc" #include "llvm/Module.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" using namespace llvm; ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS, @@ -28,6 +30,10 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS, , CPUString("generic") , TargetType(isELF) // Default to ELF unless otherwise specified. , TargetABI(ARM_ABI_APCS) { + // default to soft float ABI + if (FloatABIType == FloatABI::Default) + FloatABIType = FloatABI::Soft; + // Determine default and user specified characteristics // Parse features string. 
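A small observation on the SubRegSet definitions added above: they encode the VFP register pairing D<n> = {S<2n>, S<2n+1>}, with subreg index 1 naming the even (low) single and index 2 the odd (high) one. The arithmetic is easy to sanity-check in isolation (nothing below touches the real register descriptors):

  #include <cassert>
  #include <cstdio>

  // Subreg index 1 -> low single, index 2 -> high single, for D0..D15.
  unsigned singleRegOfDouble(unsigned D, unsigned Index) {
    assert(D < 16 && (Index == 1 || Index == 2));
    return 2 * D + (Index - 1);
  }

  int main() {
    // D7 overlaps S14 and S15, matching the two tables in ARMRegisterInfo.td.
    std::printf("D7 -> S%u, S%u\n",
                singleRegOfDouble(7, 1), singleRegOfDouble(7, 2));
    return 0;
  }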
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 8b469cf..0704055 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -23,7 +23,7 @@ class Module; class ARMSubtarget : public TargetSubtarget { protected: enum ARMArchEnum { - V4T, V5T, V5TE, V6, V7A + V4T, V5T, V5TE, V6, V6T2, V7A }; enum ARMFPEnum { @@ -92,6 +92,7 @@ protected: bool hasV5TOps() const { return ARMArchVersion >= V5T; } bool hasV5TEOps() const { return ARMArchVersion >= V5TE; } bool hasV6Ops() const { return ARMArchVersion >= V6; } + bool hasV6T2Ops() const { return ARMArchVersion >= V6T2; } bool hasV7Ops() const { return ARMArchVersion >= V7A; } bool hasVFP2() const { return ARMFPUType >= VFPv2; } @@ -105,6 +106,7 @@ protected: bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; } bool isThumb() const { return IsThumb; } + bool isThumb1() const { return IsThumb && (ThumbMode == Thumb1); } bool isThumb2() const { return IsThumb && (ThumbMode >= Thumb2); } bool useThumbBacktraces() const { return UseThumbBacktraces; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 1dc7d19..7033907 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -23,6 +23,9 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; +static cl::opt<bool> +EnablePreLdStOpti("arm-pre-alloc-loadstore-opti", cl::Hidden, + cl::desc("Enable pre-regalloc load store optimization pass")); static cl::opt<bool> DisableLdStOpti("disable-arm-loadstore-opti", cl::Hidden, cl::desc("Disable load store optimization pass")); static cl::opt<bool> DisableIfConversion("disable-arm-if-conversion",cl::Hidden, @@ -144,6 +147,16 @@ bool ARMTargetMachine::addInstSelector(PassManagerBase &PM, return false; } +bool ARMTargetMachine::addPreRegAlloc(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (!EnablePreLdStOpti) + return false; + // FIXME: temporarily disabling load / store optimization pass for Thumb mode. + if (OptLevel != CodeGenOpt::None && !DisableLdStOpti && !Subtarget.isThumb()) + PM.add(createARMLoadStoreOptimizationPass(true)); + return true; +} + bool ARMTargetMachine::addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { // FIXME: temporarily disabling load / store optimization pass for Thumb mode. diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 916a8aa..7192c1b 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -71,6 +71,7 @@ public: // Pass Pipeline Configuration virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addAssemblyEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt index 1cf0a91..7cffd0e 100644 --- a/lib/Target/CMakeLists.txt +++ b/lib/Target/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_library(LLVMTarget Target.cpp TargetAsmInfo.cpp TargetData.cpp + TargetELFWriterInfo.cpp TargetFrameInfo.cpp TargetInstrInfo.cpp TargetMachOWriterInfo.cpp @@ -14,4 +15,4 @@ add_llvm_library(LLVMTarget TargetSubtarget.cpp ) -# TODO: Support other targets besides X86. See Makefile.
\ No newline at end of file +# TODO: Support other targets besides X86. See Makefile. diff --git a/lib/Target/PIC16/PIC16AsmPrinter.cpp b/lib/Target/PIC16/PIC16AsmPrinter.cpp index b42ee45..f9a8801 100644 --- a/lib/Target/PIC16/PIC16AsmPrinter.cpp +++ b/lib/Target/PIC16/PIC16AsmPrinter.cpp @@ -33,8 +33,9 @@ bool PIC16AsmPrinter::printMachineInstruction(const MachineInstr *MI) { return true; } -/// runOnMachineFunction - This uses the printInstruction() -/// method to print assembly for each instruction. +/// runOnMachineFunction - This emits the frame section, autos section and +/// assembly for each instruction. Also takes care of function begin debug +/// directive and file begin debug directive (if required) for the function. /// bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) { this->MF = &MF; @@ -47,20 +48,38 @@ bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) { const Function *F = MF.getFunction(); CurrentFnName = Mang->getValueName(F); - DbgInfo.EmitFileDirective(F); - // Emit the function variables. + // Iterate over the first basic block instructions to find if it has a + // DebugLoc. If so emit .file directive. Instructions such as movlw do not + // have valid DebugLoc, so need to iterate over instructions. + MachineFunction::const_iterator I = MF.begin(); + for (MachineBasicBlock::const_iterator MBBI = I->begin(), E = I->end(); + MBBI != E; MBBI++) { + const DebugLoc DLoc = MBBI->getDebugLoc(); + if (!DLoc.isUnknown()) { + GlobalVariable *CU = MF.getDebugLocTuple(DLoc).CompileUnit; + unsigned line = MF.getDebugLocTuple(DLoc).Line; + DbgInfo.EmitFileDirective(CU); + DbgInfo.SetFunctBeginLine(line); + break; + } + } + + // Emit the function frame (args and temps). EmitFunctionFrame(MF); - // Emit function begin debug directives + // Emit function begin debug directive. DbgInfo.EmitFunctBeginDI(F); + // Emit the autos section of function. EmitAutos(CurrentFnName); + + // Now emit the instructions of function in its code section. const char *codeSection = PAN::getCodeSectionName(CurrentFnName).c_str(); const Section *fCodeSection = TAI->getNamedSection(codeSection, SectionFlags::Code); - O << "\n"; // Start the Code Section. + O << "\n"; SwitchToSection (fCodeSection); // Emit the frame address of the function at the beginning of code. @@ -77,14 +96,17 @@ bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) { // Print out code for the function. for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + // Print a label for the basic block. if (I != MF.begin()) { printBasicBlockLabel(I, true); O << '\n'; } + // Print a basic block. for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); II != E; ++II) { + // Emit the line directive if source line changed. const DebugLoc DL = II->getDebugLoc(); if (!DL.isUnknown()) { @@ -102,6 +124,7 @@ bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) { // Emit function end debug directives. DbgInfo.EmitFunctEndDI(F, CurLine); + return false; // we didn't modify anything. } @@ -158,11 +181,16 @@ void PIC16AsmPrinter::printOperand(const MachineInstr *MI, int opNum) { } } +/// printCCOperand - Print the cond code operand. +/// void PIC16AsmPrinter::printCCOperand(const MachineInstr *MI, int opNum) { int CC = (int)MI->getOperand(opNum).getImm(); O << PIC16CondCodeToString((PIC16CC::CondCodes)CC); } +/// printLibcallDecls - print the extern declarations for compiler +/// intrinsics. 
+/// void PIC16AsmPrinter::printLibcallDecls(void) { // If no libcalls used, return. if (LibcallDecls.empty()) return; @@ -180,6 +208,10 @@ void PIC16AsmPrinter::printLibcallDecls(void) { O << TAI->getCommentString() << "External decls for libcalls - END." <<"\n"; } +/// doInitialization - Perfrom Module level initializations here. +/// One task that we do here is to sectionize all global variables. +/// The MemSelOptimizer pass depends on the sectionizing. +/// bool PIC16AsmPrinter::doInitialization (Module &M) { bool Result = AsmPrinter::doInitialization(M); @@ -194,23 +226,23 @@ bool PIC16AsmPrinter::doInitialization (Module &M) { I->setSection(TAI->SectionForGlobal(I)->getName()); } - DbgInfo.EmitFileDirective(M); + DbgInfo.Init(M); EmitFunctionDecls(M); EmitUndefinedVars(M); EmitDefinedVars(M); EmitIData(M); EmitUData(M); EmitRomData(M); - DbgInfo.PopulateFunctsDI(M); return Result; } -// Emit extern decls for functions imported from other modules, and emit -// global declarations for function defined in this module and which are -// available to other modules. +/// Emit extern decls for functions imported from other modules, and emit +/// global declarations for function defined in this module and which are +/// available to other modules. +/// void PIC16AsmPrinter::EmitFunctionDecls (Module &M) { // Emit declarations for external functions. - O << TAI->getCommentString() << "Function Declarations - BEGIN." <<"\n"; + O <<"\n"<<TAI->getCommentString() << "Function Declarations - BEGIN." <<"\n"; for (Module::iterator I = M.begin(), E = M.end(); I != E; I++) { std::string Name = Mang->getValueName(I); if (Name.compare("@abort") == 0) @@ -280,6 +312,7 @@ void PIC16AsmPrinter::EmitRomData (Module &M) bool PIC16AsmPrinter::doFinalization(Module &M) { printLibcallDecls(); + EmitRemainingAutos(); DbgInfo.EmitVarDebugInfo(M); DbgInfo.EmitEOF(); O << "\n\t" << "END\n"; @@ -383,6 +416,8 @@ void PIC16AsmPrinter::EmitAutos (std::string FunctName) for (unsigned i = 0; i < AutosSections.size(); i++) { O << "\n"; if (AutosSections[i]->S_->getName() == SectionName) { + // Set the printing status to true + AutosSections[i]->setPrintedStatus(true); SwitchToSection(AutosSections[i]->S_); std::vector<const GlobalVariable*> Items = AutosSections[i]->Items; for (unsigned j = 0; j < Items.size(); j++) { @@ -398,3 +433,34 @@ void PIC16AsmPrinter::EmitAutos (std::string FunctName) } } +// Print autos that were not printed during the code printing of functions. +// As the functions might themselves would have got deleted by the optimizer. +void PIC16AsmPrinter::EmitRemainingAutos() +{ + const TargetData *TD = TM.getTargetData(); + + // Now print Autos section for this function. + std::vector <PIC16Section *>AutosSections = PTAI->AutosSections; + for (unsigned i = 0; i < AutosSections.size(); i++) { + + // if the section is already printed then don't print again + if (AutosSections[i]->isPrinted()) + continue; + + // Set status as printed + AutosSections[i]->setPrintedStatus(true); + + O << "\n"; + SwitchToSection(AutosSections[i]->S_); + std::vector<const GlobalVariable*> Items = AutosSections[i]->Items; + for (unsigned j = 0; j < Items.size(); j++) { + std::string VarName = Mang->getValueName(Items[j]); + Constant *C = Items[j]->getInitializer(); + const Type *Ty = C->getType(); + unsigned Size = TD->getTypeAllocSize(Ty); + // Emit memory reserve directive. 
+ O << VarName << " RES " << Size << "\n"; + } + } +} + diff --git a/lib/Target/PIC16/PIC16AsmPrinter.h b/lib/Target/PIC16/PIC16AsmPrinter.h index 2545dfd..8bdcf72 100644 --- a/lib/Target/PIC16/PIC16AsmPrinter.h +++ b/lib/Target/PIC16/PIC16AsmPrinter.h @@ -52,6 +52,7 @@ namespace llvm { void EmitIData (Module &M); void EmitUData (Module &M); void EmitAutos (std::string FunctName); + void EmitRemainingAutos (); void EmitRomData (Module &M); void EmitFunctionFrame(MachineFunction &MF); void printLibcallDecls(void); diff --git a/lib/Target/PIC16/PIC16DebugInfo.cpp b/lib/Target/PIC16/PIC16DebugInfo.cpp index faf4590..d7ebea7 100644 --- a/lib/Target/PIC16/PIC16DebugInfo.cpp +++ b/lib/Target/PIC16/PIC16DebugInfo.cpp @@ -18,13 +18,6 @@ using namespace llvm; -PIC16DbgInfo::~PIC16DbgInfo() { - for(std::map<std::string, DISubprogram *>::iterator i = FunctNameMap.begin(); - i!=FunctNameMap.end(); i++) - delete i->second; - FunctNameMap.clear(); -} - void PIC16DbgInfo::PopulateDebugInfo(DIType Ty, unsigned short &TypeNo, bool &HasAux, int Aux[], std::string &TypeName) { @@ -70,7 +63,7 @@ void PIC16DbgInfo::PopulateDebugInfo(DIType Ty, unsigned short &TypeNo, } HasAux = true; // In auxillary entry for array, 7th and 8th byte represent array size. - Aux[6] = size; + Aux[6] = size & 0xff; Aux[7] = size >> 8; DIType BaseType = CTy.getTypeDerivedFrom(); PopulateDebugInfo(BaseType, TypeNo, HasAux, Aux, TypeName); @@ -86,10 +79,14 @@ void PIC16DbgInfo::PopulateDebugInfo(DIType Ty, unsigned short &TypeNo, else TypeNo = TypeNo | PIC16Dbg::T_UNION; CTy.getName(TypeName); - unsigned size = CTy.getSizeInBits()/8; + // UniqueSuffix is .number where number is obtained from + // llvm.dbg.composite<number>. + std::string UniqueSuffix = "." + Ty.getGV()->getName().substr(18); + TypeName += UniqueSuffix; + unsigned short size = CTy.getSizeInBits()/8; // 7th and 8th byte represent size. HasAux = true; - Aux[6] = size; + Aux[6] = size & 0xff; Aux[7] = size >> 8; break; } @@ -145,37 +142,84 @@ short PIC16DbgInfo::getClass(DIGlobalVariable DIGV) { return ClassNo; } -void PIC16DbgInfo::PopulateFunctsDI(Module &M) { - GlobalVariable *Root = M.getGlobalVariable("llvm.dbg.subprograms"); - if (!Root) - return; - Constant *RootC = cast<Constant>(*Root->use_begin()); - - for (Value::use_iterator UI = RootC->use_begin(), UE = Root->use_end(); - UI != UE; ++UI) - for (Value::use_iterator UUI = UI->use_begin(), UUE = UI->use_end(); - UUI != UUE; ++UUI) { - GlobalVariable *GVSP = cast<GlobalVariable>(*UUI); - DISubprogram *SP = new DISubprogram(GVSP); - std::string Name; - SP->getLinkageName(Name); - FunctNameMap[Name] = SP; - } - return; +void PIC16DbgInfo::Init(Module &M) { + // Do all debug related initializations here. + EmitFileDirective(M); + EmitCompositeTypeDecls(M); } -DISubprogram* PIC16DbgInfo::getFunctDI(std::string FunctName) { - return FunctNameMap[FunctName]; +void PIC16DbgInfo::EmitCompositeTypeDecls(Module &M) { + for(iplist<GlobalVariable>::iterator I = M.getGlobalList().begin(), + E = M.getGlobalList().end(); I != E; I++) { + // Structures and union declaration's debug info has llvm.dbg.composite + // in its name. + if(I->getName().find("llvm.dbg.composite") != std::string::npos) { + GlobalVariable *GV = cast<GlobalVariable >(I); + DICompositeType CTy(GV); + if (CTy.getTag() == dwarf::DW_TAG_union_type || + CTy.getTag() == dwarf::DW_TAG_structure_type ) { + std::string name; + CTy.getName(name); + std::string DIVar = I->getName(); + // Get the number after llvm.dbg.composite and make UniqueSuffix from + // it. 
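The "number after llvm.dbg.composite" mentioned in that comment is why the code below uses substr(18): the literal "llvm.dbg.composite" is exactly 18 characters, so everything past it is the numeric tail, and the mangled type name becomes <name>.<number>. A tiny check of that string handling (the global name used here is hypothetical, assuming only the naming scheme the comment states):

  #include <cassert>
  #include <cstdio>
  #include <string>

  int main() {
    const std::string Prefix = "llvm.dbg.composite";
    assert(Prefix.size() == 18);                   // why the code says substr(18)

    std::string DIVar = "llvm.dbg.composite9";     // hypothetical debug global
    std::string UniqueSuffix = "." + DIVar.substr(18);
    std::string MangledCTyName = "mystruct" + UniqueSuffix;

    std::printf("%s\n", MangledCTyName.c_str());   // prints mystruct.9
    return 0;
  }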
+ std::string UniqueSuffix = "." + DIVar.substr(18); + std::string MangledCTyName = name + UniqueSuffix; + unsigned short size = CTy.getSizeInBits()/8; + int Aux[PIC16Dbg::AuxSize] = {0}; + // 7th and 8th byte represent size of structure/union. + Aux[6] = size & 0xff; + Aux[7] = size >> 8; + // Emit .def for structure/union tag. + if( CTy.getTag() == dwarf::DW_TAG_union_type) + EmitSymbol(MangledCTyName, PIC16Dbg::C_UNTAG); + else if (CTy.getTag() == dwarf::DW_TAG_structure_type) + EmitSymbol(MangledCTyName, PIC16Dbg::C_STRTAG); + + // Emit auxiliary debug information for structure/union tag. + EmitAuxEntry(MangledCTyName, Aux, PIC16Dbg::AuxSize); + unsigned long Value = 0; + DIArray Elements = CTy.getTypeArray(); + for (unsigned i = 0, N = Elements.getNumElements(); i < N; i++) { + DIDescriptor Element = Elements.getElement(i); + unsigned short TypeNo = 0; + bool HasAux = false; + int ElementAux[PIC16Dbg::AuxSize] = { 0 }; + std::string TypeName = ""; + std::string ElementName; + GlobalVariable *GV = Element.getGV(); + DIDerivedType DITy(GV); + DITy.getName(ElementName); + unsigned short ElementSize = DITy.getSizeInBits()/8; + // Get mangleddd name for this structure/union element. + std::string MangMemName = ElementName + UniqueSuffix; + PopulateDebugInfo(DITy, TypeNo, HasAux, ElementAux, TypeName); + short Class; + if( CTy.getTag() == dwarf::DW_TAG_union_type) + Class = PIC16Dbg::C_MOU; + else if (CTy.getTag() == dwarf::DW_TAG_structure_type) + Class = PIC16Dbg::C_MOS; + EmitSymbol(MangMemName, Class, TypeNo, Value); + if (CTy.getTag() == dwarf::DW_TAG_structure_type) + Value += ElementSize; + if (HasAux) + EmitAuxEntry(MangMemName, ElementAux, PIC16Dbg::AuxSize, TypeName); + } + // Emit mangled Symbol for end of structure/union. + std::string EOSSymbol = ".eos" + UniqueSuffix; + EmitSymbol(EOSSymbol, PIC16Dbg::C_EOS); + EmitAuxEntry(EOSSymbol, Aux, PIC16Dbg::AuxSize, MangledCTyName); + } + } + } } void PIC16DbgInfo::EmitFunctBeginDI(const Function *F) { std::string FunctName = F->getName(); - DISubprogram *SP = getFunctDI(FunctName); - if (SP) { + if (EmitDebugDirectives) { std::string FunctBeginSym = ".bf." + FunctName; std::string BlockBeginSym = ".bb." + FunctName; - int FunctBeginLine = SP->getLineNumber(); int BFAux[PIC16Dbg::AuxSize] = {0}; BFAux[4] = FunctBeginLine; BFAux[5] = FunctBeginLine >> 8; @@ -189,8 +233,7 @@ void PIC16DbgInfo::EmitFunctBeginDI(const Function *F) { void PIC16DbgInfo::EmitFunctEndDI(const Function *F, unsigned Line) { std::string FunctName = F->getName(); - DISubprogram *SP = getFunctDI(FunctName); - if (SP) { + if (EmitDebugDirectives) { std::string FunctEndSym = ".ef." + FunctName; std::string BlockEndSym = ".eb." + FunctName; @@ -208,14 +251,21 @@ void PIC16DbgInfo::EmitFunctEndDI(const Function *F, unsigned Line) { /// EmitAuxEntry - Emit Auxiliary debug information. 
/// -void PIC16DbgInfo::EmitAuxEntry(const std::string VarName, int Aux[], int num) { +void PIC16DbgInfo::EmitAuxEntry(const std::string VarName, int Aux[], int num, + std::string tag) { O << "\n\t.dim " << VarName << ", 1" ; + if (tag != "") + O << ", " << tag; for (int i = 0; i<num; i++) O << "," << Aux[i]; } -void PIC16DbgInfo::EmitSymbol(std::string Name, int Class) { - O << "\n\t" << ".def "<< Name << ", debug, class = " << Class; +void PIC16DbgInfo::EmitSymbol(std::string Name, short Class, unsigned short + Type, unsigned long Value) { + O << "\n\t" << ".def "<< Name << ", type = " << Type << ", class = " + << Class; + if (Value > 0) + O << ", value = " << Value; } void PIC16DbgInfo::EmitVarDebugInfo(Module &M) { @@ -241,18 +291,8 @@ void PIC16DbgInfo::EmitVarDebugInfo(Module &M) { O << "\n\t.type " << VarName << ", " << TypeNo; short ClassNo = getClass(DIGV); O << "\n\t.class " << VarName << ", " << ClassNo; - if (HasAux) { - if (TypeName != "") { - // Emit debug info for structure and union objects after - // .dim directive supports structure/union tag name in aux entry. - /* O << "\n\t.dim " << VarName << ", 1," << TypeName; - for (int i = 0; i<PIC16Dbg::AuxSize; i++) - O << "," << Aux[i];*/ - } - else { - EmitAuxEntry(VarName, Aux, PIC16Dbg::AuxSize); - } - } + if (HasAux) + EmitAuxEntry(VarName, Aux, PIC16Dbg::AuxSize, TypeName); } } } @@ -262,26 +302,20 @@ void PIC16DbgInfo::EmitVarDebugInfo(Module &M) { void PIC16DbgInfo::EmitFileDirective(Module &M) { GlobalVariable *CU = M.getNamedGlobal("llvm.dbg.compile_unit"); if (CU) { - DICompileUnit DIUnit(CU); - std::string Dir, FN; - std::string File = DIUnit.getDirectory(Dir) + "/" + DIUnit.getFilename(FN); - O << "\n\t.file\t\"" << File << "\"\n" ; - CurFile = File; + EmitDebugDirectives = true; + EmitFileDirective(CU, false); } } -void PIC16DbgInfo::EmitFileDirective(const Function *F) { - std::string FunctName = F->getName(); - DISubprogram *SP = getFunctDI(FunctName); - if (SP) { - std::string Dir, FN; - DICompileUnit CU = SP->getCompileUnit(); - std::string File = CU.getDirectory(Dir) + "/" + CU.getFilename(FN); - if ( File != CurFile) { +void PIC16DbgInfo::EmitFileDirective(GlobalVariable *CU, bool EmitEof) { + std::string Dir, FN; + DICompileUnit DIUnit(CU); + std::string File = DIUnit.getDirectory(Dir) + "/" + DIUnit.getFilename(FN); + if ( File != CurFile ) { + if (EmitEof) EmitEOF(); - O << "\n\t.file\t\"" << File << "\"\n" ; - CurFile = File; - } + O << "\n\t.file\t\"" << File << "\"\n" ; + CurFile = File; } } @@ -290,3 +324,6 @@ void PIC16DbgInfo::EmitEOF() { O << "\n\t.EOF"; } +void PIC16DbgInfo::SetFunctBeginLine(unsigned line) { + FunctBeginLine = line; +} diff --git a/lib/Target/PIC16/PIC16DebugInfo.h b/lib/Target/PIC16/PIC16DebugInfo.h index be39393..9d50380 100644 --- a/lib/Target/PIC16/PIC16DebugInfo.h +++ b/lib/Target/PIC16/PIC16DebugInfo.h @@ -91,29 +91,36 @@ namespace llvm { class raw_ostream; class PIC16DbgInfo { - std::map <std::string, DISubprogram *> FunctNameMap; raw_ostream &O; const TargetAsmInfo *TAI; std::string CurFile; + // EmitDebugDirectives is set if debug information is available. Default + // value for it is false. 
+ bool EmitDebugDirectives; + unsigned FunctBeginLine; public: PIC16DbgInfo(raw_ostream &o, const TargetAsmInfo *T) : O(o), TAI(T) { - CurFile = ""; + CurFile = ""; + EmitDebugDirectives = false; } - ~PIC16DbgInfo(); void PopulateDebugInfo(DIType Ty, unsigned short &TypeNo, bool &HasAux, int Aux[], std::string &TypeName); unsigned GetTypeDebugNumber(std::string &type); short getClass(DIGlobalVariable DIGV); - void PopulateFunctsDI(Module &M); - DISubprogram *getFunctDI(std::string FunctName); void EmitFunctBeginDI(const Function *F); + void Init(Module &M); + void EmitCompositeTypeDecls(Module &M); void EmitFunctEndDI(const Function *F, unsigned Line); - void EmitAuxEntry(const std::string VarName, int Aux[], int num); - inline void EmitSymbol(std::string Name, int Class); + void EmitAuxEntry(const std::string VarName, int Aux[], + int num = PIC16Dbg::AuxSize, std::string tag = ""); + inline void EmitSymbol(std::string Name, short Class, + unsigned short Type = PIC16Dbg::T_NULL, + unsigned long Value = 0); void EmitVarDebugInfo(Module &M); void EmitFileDirective(Module &M); - void EmitFileDirective(const Function *F); + void EmitFileDirective(GlobalVariable *CU, bool EmitEof = true); void EmitEOF(); + void SetFunctBeginLine(unsigned line); }; } // end namespace llvm; #endif diff --git a/lib/Target/PIC16/PIC16ISelLowering.cpp b/lib/Target/PIC16/PIC16ISelLowering.cpp index ac9a143..ba465f3 100644 --- a/lib/Target/PIC16/PIC16ISelLowering.cpp +++ b/lib/Target/PIC16/PIC16ISelLowering.cpp @@ -56,6 +56,17 @@ static const char *getIntrinsicName(unsigned opcode) { case RTLIB::SREM_I32: Basename = "srem.i32"; break; case RTLIB::UREM_I16: Basename = "urem.i16"; break; case RTLIB::UREM_I32: Basename = "urem.i32"; break; + + case RTLIB::FPTOSINT_F32_I32: + Basename = "f32_to_si32"; break; + case RTLIB::SINTTOFP_I32_F32: + Basename = "si32_to_f32"; break; + + case RTLIB::ADD_F32: Basename = "add.f32"; break; + case RTLIB::SUB_F32: Basename = "sub.f32"; break; + case RTLIB::MUL_F32: Basename = "mul.f32"; break; + case RTLIB::DIV_F32: Basename = "div.f32"; break; + } std::string prefix = PAN::getTagName(PAN::PREFIX_SYMBOL); @@ -113,7 +124,17 @@ PIC16TargetLowering::PIC16TargetLowering(PIC16TargetMachine &TM) // Unsigned remainder lib call names setLibcallName(RTLIB::UREM_I16, getIntrinsicName(RTLIB::UREM_I16)); setLibcallName(RTLIB::UREM_I32, getIntrinsicName(RTLIB::UREM_I32)); - + + // Floating point operations + setLibcallName(RTLIB::FPTOSINT_F32_I32, + getIntrinsicName(RTLIB::FPTOSINT_F32_I32)); + setLibcallName(RTLIB::SINTTOFP_I32_F32, + getIntrinsicName(RTLIB::SINTTOFP_I32_F32)); + setLibcallName(RTLIB::ADD_F32, getIntrinsicName(RTLIB::ADD_F32)); + setLibcallName(RTLIB::SUB_F32, getIntrinsicName(RTLIB::SUB_F32)); + setLibcallName(RTLIB::MUL_F32, getIntrinsicName(RTLIB::MUL_F32)); + setLibcallName(RTLIB::DIV_F32, getIntrinsicName(RTLIB::DIV_F32)); + setOperationAction(ISD::GlobalAddress, MVT::i16, Custom); setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom); diff --git a/lib/Target/PIC16/PIC16TargetAsmInfo.h b/lib/Target/PIC16/PIC16TargetAsmInfo.h index e464e36..b7292b8 100644 --- a/lib/Target/PIC16/PIC16TargetAsmInfo.h +++ b/lib/Target/PIC16/PIC16TargetAsmInfo.h @@ -33,9 +33,13 @@ namespace llvm { struct PIC16Section { const Section *S_; // Connection to actual Section. unsigned Size; // Total size of the objects contained. 
+ bool SectionPrinted; std::vector<const GlobalVariable*> Items; - PIC16Section (const Section *s) { S_ = s; Size = 0; } + PIC16Section (const Section *s) { S_ = s; Size = 0; + SectionPrinted = false;} + bool isPrinted() { return SectionPrinted ; } + void setPrintedStatus(bool status) { SectionPrinted = status ;} }; struct PIC16TargetAsmInfo : public TargetAsmInfo { diff --git a/lib/Target/TargetELFWriterInfo.cpp b/lib/Target/TargetELFWriterInfo.cpp new file mode 100644 index 0000000..9651e65 --- /dev/null +++ b/lib/Target/TargetELFWriterInfo.cpp @@ -0,0 +1,36 @@ +//===-- lib/Target/TargetELFWriterInfo.cpp - ELF Writer Info --0-*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the TargetELFWriterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Function.h" +#include "llvm/Target/TargetELFWriterInfo.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; + +TargetELFWriterInfo::TargetELFWriterInfo(TargetMachine &tm) : TM(tm) { + is64Bit = TM.getTargetData()->getPointerSizeInBits() == 64; + isLittleEndian = TM.getTargetData()->isLittleEndian(); +} + +TargetELFWriterInfo::~TargetELFWriterInfo() {} + +/// getFunctionAlignment - Returns the alignment for function 'F', targets +/// with different alignment constraints should overload this method +unsigned TargetELFWriterInfo::getFunctionAlignment(const Function *F) const { + const TargetData *TD = TM.getTargetData(); + unsigned FnAlign = F->getAlignment(); + unsigned TDAlign = TD->getPointerABIAlignment(); + unsigned Align = std::max(FnAlign, TDAlign); + assert(!(Align & (Align-1)) && "Alignment is not a power of two!"); + return Align; +} diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index dea293b..c487cb8 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -30,6 +30,7 @@ namespace llvm { bool FiniteOnlyFPMathOption; bool HonorSignDependentRoundingFPMathOption; bool UseSoftFloat; + FloatABI::ABIType FloatABIType; bool NoImplicitFloat; bool NoZerosInBSS; bool ExceptionHandling; @@ -84,6 +85,19 @@ GenerateSoftFloatCalls("soft-float", cl::desc("Generate software floating point library calls"), cl::location(UseSoftFloat), cl::init(false)); +static cl::opt<llvm::FloatABI::ABIType, true> +FloatABIForCalls("float-abi", + cl::desc("Choose float ABI type"), + cl::location(FloatABIType), + cl::init(FloatABI::Default), + cl::values( + clEnumValN(FloatABI::Default, "default", + "Target default float ABI type"), + clEnumValN(FloatABI::Soft, "soft", + "Soft float ABI (implied by -soft-float)"), + clEnumValN(FloatABI::Hard, "hard", + "Hard float ABI (uses FP registers)"), + clEnumValEnd)); static cl::opt<bool, true> DontPlaceZerosInBSS("nozero-initialized-in-bss", cl::desc("Don't place zero-initialized symbols into bss section"), @@ -162,6 +176,14 @@ EnableStrongPHIElim(cl::Hidden, "strong-phi-elim", // TargetMachine Class // +TargetMachine::TargetMachine() + : AsmInfo(0) { + // Typically it will be subtargets that will adjust FloatABIType from Default + // to Soft or Hard. 
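Together with the constructor body that follows, the float-ABI plumbing in this patch appears to resolve in three steps: -soft-float forces Soft here in TargetMachine, the ARM subtarget turns a remaining Default into Soft, and the AAPCS-VFP calling conventions are only selected when the ABI ended up Hard and VFP2 is available. A compact sketch of that resolution order (plain enums and free functions, purely illustrative):

  #include <cstdio>

  enum class FloatABI { Default, Soft, Hard };

  FloatABI resolveFloatABI(FloatABI FromCommandLine, bool SoftFloatFlag) {
    FloatABI ABI = FromCommandLine;
    if (SoftFloatFlag)
      ABI = FloatABI::Soft;        // TargetMachine constructor
    if (ABI == FloatABI::Default)
      ABI = FloatABI::Soft;        // ARMSubtarget's fallback
    return ABI;
  }

  bool useAAPCSVFP(FloatABI ABI, bool IsAAPCS, bool HasVFP2) {
    return IsAAPCS && HasVFP2 && ABI == FloatABI::Hard;    // CC_ARM's guard
  }

  int main() {
    FloatABI ABI = resolveFloatABI(FloatABI::Hard, /*SoftFloatFlag=*/false);
    std::printf("AAPCS-VFP used: %d\n", useAAPCSVFP(ABI, true, true));   // 1
    return 0;
  }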
+ if (UseSoftFloat) + FloatABIType = FloatABI::Soft; +} + TargetMachine::~TargetMachine() { delete AsmInfo; } diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 710bd03..3796aac 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -482,35 +482,6 @@ _usesbb: //===---------------------------------------------------------------------===// -Currently we don't have elimination of redundant stack manipulations. Consider -the code: - -int %main() { -entry: - call fastcc void %test1( ) - call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) ) - ret int 0 -} - -declare fastcc void %test1() - -declare fastcc void %test2(sbyte*) - - -This currently compiles to: - - subl $16, %esp - call _test5 - addl $12, %esp - subl $16, %esp - movl $_test5, (%esp) - call _test6 - addl $12, %esp - -The add\sub pair is really unneeded here. - -//===---------------------------------------------------------------------===// - Consider the expansion of: define i32 @test3(i32 %X) { @@ -902,34 +873,6 @@ condition register is dead. xor reg reg is shorter than mov reg, #0. //===---------------------------------------------------------------------===// -We aren't matching RMW instructions aggressively -enough. Here's a reduced testcase (more in PR1160): - -define void @test(i32* %huge_ptr, i32* %target_ptr) { - %A = load i32* %huge_ptr ; <i32> [#uses=1] - %B = load i32* %target_ptr ; <i32> [#uses=1] - %C = or i32 %A, %B ; <i32> [#uses=1] - store i32 %C, i32* %target_ptr - ret void -} - -$ llvm-as < t.ll | llc -march=x86-64 - -_test: - movl (%rdi), %eax - orl (%rsi), %eax - movl %eax, (%rsi) - ret - -That should be something like: - -_test: - movl (%rdi), %eax - orl %eax, (%rsi) - ret - -//===---------------------------------------------------------------------===// - The following code: bb114.preheader: ; preds = %cond_next94 @@ -1897,3 +1840,60 @@ The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, Core 2, and "Generic" //===---------------------------------------------------------------------===// + +Testcase: +int a(int x) { return (x & 127) > 31; } + +Current output: + movl 4(%esp), %eax + andl $127, %eax + cmpl $31, %eax + seta %al + movzbl %al, %eax + ret + +Ideal output: + xorl %eax, %eax + testl $96, 4(%esp) + setne %al + ret + +We could do this transformation in instcombine, but it's only clearly +beneficial on platforms with a test instruction. + +//===---------------------------------------------------------------------===// +Testcase: +int x(int a) { return (a&0xf0)>>4; } + +Current output: + movl 4(%esp), %eax + shrl $4, %eax + andl $15, %eax + ret + +Ideal output: + movzbl 4(%esp), %eax + shrl $4, %eax + ret + +//===---------------------------------------------------------------------===// + +Testcase: +int x(int a) { return (a & 0x80) ? 0x100 : 0; } + +Current output: + testl $128, 4(%esp) + setne %al + movzbl %al, %eax + shll $8, %eax + ret + +Ideal output: + movl 4(%esp), %eax + addl %eax, %eax + andl $256, %eax + ret + +We generally want to fold shifted tests of a single bit into a shift+and on x86. 
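All three new README entries rest on small bit identities; the first, for instance, works because within a 7-bit value "greater than 31" is the same as "bit 5 or bit 6 set", i.e. (x & 96) != 0. The three rewrites are cheap to verify exhaustively over a representative range (plain C++, separate from the README's asm listings):

  #include <cassert>
  #include <cstdio>

  int main() {
    for (int x = -512; x <= 512; ++x) {
      // (x & 127) > 31         <=>  bit 5 or bit 6 of x is set
      assert(((x & 127) > 31) == ((x & 96) != 0));
      // (x & 0xf0) >> 4         ==  zero-extended low byte shifted right by 4
      assert(((x & 0xf0) >> 4) == ((unsigned char)x >> 4));
      // (x & 0x80) ? 0x100 : 0  ==  bit 7 of x moved up one position
      assert(((x & 0x80) ? 0x100 : 0) == ((x + x) & 256));
    }
    std::printf("all three rewrites agree on [-512, 512]\n");
    return 0;
  }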
+ +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 7f99203..e9fcbd5 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -215,50 +215,6 @@ def CC_X86_Win64_C : CallingConv<[ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 16>> ]>; -// Tail call convention (fast): One register is reserved for target address, -// namely R9 -def CC_X86_64_TailCall : CallingConv<[ - // Handles byval parameters. - CCIfByVal<CCPassByVal<8, 8>>, - - // Promote i8/i16 arguments to i32. - CCIfType<[i8, i16], CCPromoteToType<i32>>, - - // The 'nest' parameter, if any, is passed in R10. - CCIfNest<CCAssignToReg<[R10]>>, - - // The first 6 integer arguments are passed in integer registers. - CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D]>>, - CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>, - - // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfSubtarget<"hasSSE1()", - CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, - - // The first 8 MMX (except for v1i64) vector arguments are passed in XMM - // registers on Darwin. - CCIfType<[v8i8, v4i16, v2i32, v2f32], - CCIfSubtarget<"isTargetDarwin()", - CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, - - // The first 8 v1i64 vector arguments are passed in GPRs on Darwin. - CCIfType<[v1i64], - CCIfSubtarget<"isTargetDarwin()", - CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>>, - - // Integer/FP values get stored in stack slots that are 8 bytes in size and - // 8-byte aligned if there are no more registers to hold them. - CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, - - // Vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, - - // __m64 vectors get 8-byte stack slots that are 8-byte aligned. - CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>> -]>; - - //===----------------------------------------------------------------------===// // X86 C Calling Convention //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp index 2604741..d84034b 100644 --- a/lib/Target/X86/X86ELFWriterInfo.cpp +++ b/lib/Target/X86/X86ELFWriterInfo.cpp @@ -12,8 +12,27 @@ //===----------------------------------------------------------------------===// #include "X86ELFWriterInfo.h" +#include "llvm/Function.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; -X86ELFWriterInfo::X86ELFWriterInfo(bool is64Bit) : - TargetELFWriterInfo(is64Bit ? EM_X86_64 : EM_386) {} +X86ELFWriterInfo::X86ELFWriterInfo(TargetMachine &TM) + : TargetELFWriterInfo(TM) { + bool is64Bit = TM.getTargetData()->getPointerSizeInBits() == 64; + EMachine = is64Bit ? 
EM_X86_64 : EM_386; + } + X86ELFWriterInfo::~X86ELFWriterInfo() {} + +unsigned X86ELFWriterInfo::getFunctionAlignment(const Function *F) const { + unsigned FnAlign = 4; + + if (F->hasFnAttr(Attribute::OptimizeForSize)) + FnAlign = 1; + + if (F->getAlignment()) + FnAlign = Log2_32(F->getAlignment()); + + return (1 << FnAlign); +} diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h index acfa501..e9c5bc4 100644 --- a/lib/Target/X86/X86ELFWriterInfo.h +++ b/lib/Target/X86/X86ELFWriterInfo.h @@ -20,8 +20,10 @@ namespace llvm { class X86ELFWriterInfo : public TargetELFWriterInfo { public: - X86ELFWriterInfo(bool is64Bit); + X86ELFWriterInfo(TargetMachine &TM); virtual ~X86ELFWriterInfo(); + + virtual unsigned getFunctionAlignment(const Function *F) const; }; } // end llvm namespace diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 33332e4..2bcfd76 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -171,8 +171,6 @@ CCAssignFn *X86FastISel::CCAssignFnForCall(unsigned CC, bool isTaillCall) { if (Subtarget->is64Bit()) { if (Subtarget->isTargetWin64()) return CC_X86_Win64_C; - else if (CC == CallingConv::Fast && isTaillCall) - return CC_X86_64_TailCall; else return CC_X86_64_C; } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9e15a54..36e3ab2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -944,7 +944,7 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { SDValue StackAdjustment = TailCall.getOperand(2); assert(((TargetAddress.getOpcode() == ISD::Register && (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX || - cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) || + cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R11)) || TargetAddress.getOpcode() == ISD::TargetExternalSymbol || TargetAddress.getOpcode() == ISD::TargetGlobalAddress) && "Expecting an global address, external symbol, or register"); @@ -1171,8 +1171,6 @@ CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const { if (Subtarget->is64Bit()) { if (Subtarget->isTargetWin64()) return CC_X86_Win64_C; - else if (CC == CallingConv::Fast && PerformTailCallOpt) - return CC_X86_64_TailCall; else return CC_X86_64_C; } @@ -1799,7 +1797,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy()); } else if (IsTailCall) { - unsigned Opc = Is64Bit ? X86::R9 : X86::EAX; + unsigned Opc = Is64Bit ? X86::R11 : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, DAG.getRegister(Opc, getPointerTy()), @@ -7696,7 +7694,7 @@ static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, SelectionDAG &DAG, MachineFrameInfo *MFI, const TargetLowering &TLI) { LDBase = NULL; - LastLoadedElt = -1; + LastLoadedElt = -1U; for (unsigned i = 0; i < NumElems; ++i) { if (N->getMaskElt(i) < 0) { if (!LDBase) diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index c733f26..6c0074e 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -822,6 +822,13 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } + unsigned ReadyLabelId = 0; + if (needsFrameMoves) { + // Mark effective beginning of when frame pointer is ready. 
+ ReadyLabelId = MMI->NextLabelID(); + BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(ReadyLabelId); + } + // Skip the callee-saved push instructions. while (MBBI != MBB.end() && (MBBI->getOpcode() == X86::PUSH32r || @@ -831,67 +838,61 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); - if (NumBytes) { // Adjust stack pointer: ESP -= numbytes. - if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) { - // Check, whether EAX is livein for this function. - bool isEAXAlive = false; - for (MachineRegisterInfo::livein_iterator + // Adjust stack pointer: ESP -= numbytes. + if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) { + // Check, whether EAX is livein for this function. + bool isEAXAlive = false; + for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), EE = MF.getRegInfo().livein_end(); (II != EE) && !isEAXAlive; ++II) { - unsigned Reg = II->first; - isEAXAlive = (Reg == X86::EAX || Reg == X86::AX || - Reg == X86::AH || Reg == X86::AL); - } + unsigned Reg = II->first; + isEAXAlive = (Reg == X86::EAX || Reg == X86::AX || + Reg == X86::AH || Reg == X86::AL); + } - // Function prologue calls _alloca to probe the stack when allocating more - // than 4k bytes in one go. Touching the stack at 4K increments is - // necessary to ensure that the guard pages used by the OS virtual memory - // manager are allocated in correct sequence. - if (!isEAXAlive) { - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(NumBytes); - BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) - .addExternalSymbol("_alloca"); - } else { - // Save EAX - BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) - .addReg(X86::EAX, RegState::Kill); - - // Allocate NumBytes-4 bytes on stack. We'll also use 4 already - // allocated bytes for EAX. - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(NumBytes-4); - BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) - .addExternalSymbol("_alloca"); - - // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes-4); - MBB.insert(MBBI, MI); - } + // Function prologue calls _alloca to probe the stack when allocating more + // than 4k bytes in one go. Touching the stack at 4K increments is necessary + // to ensure that the guard pages used by the OS virtual memory manager are + // allocated in correct sequence. + if (!isEAXAlive) { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(NumBytes); + BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("_alloca"); } else { - // If there is an SUB32ri of ESP immediately before this instruction, - // merge the two. This can be the case when tail call elimination is - // enabled and the callee has more arguments then the caller. - NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true); + // Save EAX + BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) + .addReg(X86::EAX, RegState::Kill); + + // Allocate NumBytes-4 bytes on stack. We'll also use 4 already + // allocated bytes for EAX. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(NumBytes - 4); + BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("_alloca"); + + // Restore EAX + MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), + X86::EAX), + StackPtr, false, NumBytes - 4); + MBB.insert(MBBI, MI); + } + } else if (NumBytes) { + // If there is an SUB32ri of ESP immediately before this instruction, merge + // the two. 
This can be the case when tail call elimination is enabled and + // the callee has more arguments then the caller. + NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true); - // If there is an ADD32ri or SUB32ri of ESP immediately after this - // instruction, merge the two instructions. - mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes); + // If there is an ADD32ri or SUB32ri of ESP immediately after this + // instruction, merge the two instructions. + mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes); - if (NumBytes) - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII); - } + if (NumBytes) + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII); } - if (needsFrameMoves) { - // Mark effective beginning of when frame pointer is ready. - unsigned ReadyLabelId = 0; - ReadyLabelId = MMI->NextLabelID(); - BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(ReadyLabelId); + if (needsFrameMoves) emitFrameMoves(MF, FrameLabelId, ReadyLabelId); - } } void X86RegisterInfo::emitEpilogue(MachineFunction &MF, diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 03ce1ae..56983ce 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -350,6 +350,10 @@ X86Subtarget::X86Subtarget(const Module &M, const std::string &FS, bool is64Bit) , MaxInlineSizeThreshold(128) , Is64Bit(is64Bit) , TargetType(isELF) { // Default to ELF unless otherwise specified. + + // default to hard float ABI + if (FloatABIType == FloatABI::Default) + FloatABIType = FloatABI::Hard; // Determine default and user specified characteristics if (!FS.empty()) { diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 88ab247..dfb055f 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -133,8 +133,7 @@ X86TargetMachine::X86TargetMachine(const Module &M, const std::string &FS, DataLayout(Subtarget.getDataLayout()), FrameInfo(TargetFrameInfo::StackGrowsDown, Subtarget.getStackAlignment(), Subtarget.is64Bit() ? -8 : -4), - InstrInfo(*this), JITInfo(*this), TLInfo(*this), - ELFWriterInfo(Subtarget.is64Bit()) { + InstrInfo(*this), JITInfo(*this), TLInfo(*this), ELFWriterInfo(*this) { DefRelocModel = getRelocationModel(); // FIXME: Correctly select PIC model for Win64 stuff if (getRelocationModel() == Reloc::Default) { diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 2bb6428..a612634 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -127,17 +127,8 @@ bool ArgPromotion::PromoteArguments(CallGraphNode *CGN) { // Second check: make sure that all callers are direct callers. We can't // transform functions that have indirect callers. - for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); - UI != E; ++UI) { - CallSite CS = CallSite::get(*UI); - if (!CS.getInstruction()) // "Taking the address" of the function - return false; - - // Ensure that this call site is CALLING the function, not passing it as - // an argument. - if (!CS.isCallee(UI)) - return false; - } + if (F->hasAddressTaken()) + return false; // Check to see which arguments are promotable. If an argument is promotable, // add it to ArgsToPromote. 
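
Both this change and the DeadArgumentElimination change below delegate the "only called directly" test to the Function::hasAddressTaken() helper that this same patch adds in lib/VMCore/Function.cpp. A minimal sketch of its semantics, mirroring that helper (illustrative only; addressEscapes is a made-up name, and the includes of llvm/Function.h and llvm/Instructions.h are assumed):

// A function's address is considered taken unless every use is the callee
// operand (operand 0, in this era of the IR) of a CallInst or InvokeInst.
static bool addressEscapes(const llvm::Function &F) {
  for (llvm::Value::use_const_iterator I = F.use_begin(), E = F.use_end();
       I != E; ++I)
    if (I.getOperandNo() != 0 ||
        (!llvm::isa<llvm::CallInst>(*I) && !llvm::isa<llvm::InvokeInst>(*I)))
      return true;  // stored, bitcast, or passed as an argument
  return false;     // every use is a direct call or invoke of F
}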
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 666db7e..e480dad 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -175,15 +175,8 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false; // Ensure that the function is only directly called. - for (Value::use_iterator I = Fn.use_begin(), E = Fn.use_end(); I != E; ++I) { - // If this use is anything other than a call site, give up. - CallSite CS = CallSite::get(*I); - Instruction *TheCall = CS.getInstruction(); - if (!TheCall) return false; // Not a direct call site? - - // The addr of this function is passed to the call. - if (!CS.isCallee(I)) return false; - } + if (Fn.hasAddressTaken()) + return false; // Okay, we know we can transform this function if safe. Scan its body // looking for calls to llvm.vastart. diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index db378b0..9c652b9 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -47,7 +47,6 @@ namespace { void GlobalIsNeeded(GlobalValue *GV); void MarkUsedGlobalsAsNeeded(Constant *C); - bool SafeToDestroyConstant(Constant* C); bool RemoveUnusedGlobalValue(GlobalValue &GV); }; } @@ -211,17 +210,3 @@ bool GlobalDCE::RemoveUnusedGlobalValue(GlobalValue &GV) { GV.removeDeadConstantUsers(); return GV.use_empty(); } - -// SafeToDestroyConstant - It is safe to destroy a constant iff it is only used -// by constants itself. Note that constants cannot be cyclic, so this test is -// pretty easy to implement recursively. -// -bool GlobalDCE::SafeToDestroyConstant(Constant *C) { - for (Value::use_iterator I = C->use_begin(), E = C->use_end(); I != E; ++I) - if (Constant *User = dyn_cast<Constant>(*I)) { - if (!SafeToDestroyConstant(User)) return false; - } else { - return false; - } - return true; -} diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 5f12825..9a1b294 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -136,16 +136,16 @@ struct VISIBILITY_HIDDEN GlobalStatus { } -/// ConstantIsDead - Return true if the specified constant is (transitively) -/// dead. The constant may be used by other constants (e.g. constant arrays and -/// constant exprs) as long as they are dead, but it cannot be used by anything -/// else. -static bool ConstantIsDead(Constant *C) { +// SafeToDestroyConstant - It is safe to destroy a constant iff it is only used +// by constants itself. Note that constants cannot be cyclic, so this test is +// pretty easy to implement recursively. +// +static bool SafeToDestroyConstant(Constant *C) { if (isa<GlobalValue>(C)) return false; for (Value::use_iterator UI = C->use_begin(), E = C->use_end(); UI != E; ++UI) if (Constant *CU = dyn_cast<Constant>(*UI)) { - if (!ConstantIsDead(CU)) return false; + if (!SafeToDestroyConstant(CU)) return false; } else return false; return true; @@ -233,7 +233,7 @@ static bool AnalyzeGlobal(Value *V, GlobalStatus &GS, } else if (Constant *C = dyn_cast<Constant>(*UI)) { GS.HasNonInstructionUser = true; // We might have a dead and dangling constant hanging off of here. 
- if (!ConstantIsDead(C)) + if (!SafeToDestroyConstant(C)) return true; } else { GS.HasNonInstructionUser = true; @@ -338,7 +338,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init) { } else if (Constant *C = dyn_cast<Constant>(U)) { // If we have a chain of dead constantexprs or other things dangling from // us, and if they are all dead, nuke them without remorse. - if (ConstantIsDead(C)) { + if (SafeToDestroyConstant(C)) { C->destroyConstant(); // This could have invalidated UI, start over from scratch. CleanupConstantGlobalUsers(V, Init); @@ -354,7 +354,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init) { static bool isSafeSROAElementUse(Value *V) { // We might have a dead and dangling constant hanging off of here. if (Constant *C = dyn_cast<Constant>(V)) - return ConstantIsDead(C); + return SafeToDestroyConstant(C); Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; @@ -1769,22 +1769,6 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, return false; } -/// OnlyCalledDirectly - Return true if the specified function is only called -/// directly. In other words, its address is never taken. -static bool OnlyCalledDirectly(Function *F) { - for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){ - Instruction *User = dyn_cast<Instruction>(*UI); - if (!User) return false; - if (!isa<CallInst>(User) && !isa<InvokeInst>(User)) return false; - - // See if the function address is passed as an argument. - for (User::op_iterator i = User->op_begin() + 1, e = User->op_end(); - i != e; ++i) - if (*i == F) return false; - } - return true; -} - /// ChangeCalleesToFastCall - Walk all of the direct calls of the specified /// function, changing them to FastCC. static void ChangeCalleesToFastCall(Function *F) { @@ -1830,7 +1814,7 @@ bool GlobalOpt::OptimizeFunctions(Module &M) { ++NumFnDeleted; } else if (F->hasLocalLinkage()) { if (F->getCallingConv() == CallingConv::C && !F->isVarArg() && - OnlyCalledDirectly(F)) { + !F->hasAddressTaken()) { // If this function has C calling conventions, is not a varargs // function, and is only called directly, promote it to use the Fast // calling convention. @@ -1841,7 +1825,7 @@ bool GlobalOpt::OptimizeFunctions(Module &M) { } if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) && - OnlyCalledDirectly(F)) { + !F->hasAddressTaken()) { // The function is not used by a trampoline intrinsic, so it is safe // to remove the 'nest' attribute. RemoveNestAttribute(F); diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 17bc2d4..5693cc0 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -9,10 +9,6 @@ // // This pass looks for equivalent functions that are mergable and folds them. // -// A Function will not be analyzed if: -// * it is overridable at runtime (except for weak linkage), or -// * it is used by anything other than the callee parameter of a call/invoke -// // A hash is computed from the function, based on its type and number of // basic blocks. // @@ -24,8 +20,6 @@ // When a match is found, the functions are folded. We can only fold two // functions when we know that the definition of one of them is not // overridable. -// * fold a function marked internal by replacing all of its users. 
-// * fold extern or weak functions by replacing them with a global alias // //===----------------------------------------------------------------------===// // @@ -48,6 +42,7 @@ #define DEBUG_TYPE "mergefunc" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Constants.h" #include "llvm/InlineAsm.h" @@ -62,7 +57,6 @@ using namespace llvm; STATISTIC(NumFunctionsMerged, "Number of functions merged"); -STATISTIC(NumMergeFails, "Number of identical function pairings not merged"); namespace { struct VISIBILITY_HIDDEN MergeFunctions : public ModulePass { @@ -81,16 +75,169 @@ ModulePass *llvm::createMergeFunctionsPass() { return new MergeFunctions(); } +// ===----------------------------------------------------------------------=== +// Comparison of functions +// ===----------------------------------------------------------------------=== + static unsigned long hash(const Function *F) { - return F->size() ^ reinterpret_cast<unsigned long>(F->getType()); - //return F->size() ^ F->arg_size() ^ F->getReturnType(); + const FunctionType *FTy = F->getFunctionType(); + + FoldingSetNodeID ID; + ID.AddInteger(F->size()); + ID.AddInteger(F->getCallingConv()); + ID.AddBoolean(F->hasGC()); + ID.AddBoolean(FTy->isVarArg()); + ID.AddInteger(FTy->getReturnType()->getTypeID()); + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + ID.AddInteger(FTy->getParamType(i)->getTypeID()); + return ID.ComputeHash(); +} + +/// IgnoreBitcasts - given a bitcast, returns the first non-bitcast found by +/// walking the chain of cast operands. Otherwise, returns the argument. +static Value* IgnoreBitcasts(Value *V) { + while (BitCastInst *BC = dyn_cast<BitCastInst>(V)) + V = BC->getOperand(0); + + return V; +} + +/// isEquivalentType - any two pointers are equivalent. Otherwise, standard +/// type equivalence rules apply. +static bool isEquivalentType(const Type *Ty1, const Type *Ty2) { + if (Ty1 == Ty2) + return true; + if (Ty1->getTypeID() != Ty2->getTypeID()) + return false; + + switch(Ty1->getTypeID()) { + case Type::VoidTyID: + case Type::FloatTyID: + case Type::DoubleTyID: + case Type::X86_FP80TyID: + case Type::FP128TyID: + case Type::PPC_FP128TyID: + case Type::LabelTyID: + case Type::MetadataTyID: + return true; + + case Type::IntegerTyID: + case Type::OpaqueTyID: + // Ty1 == Ty2 would have returned true earlier. 
+ return false; + + default: + assert(0 && "Unknown type!"); + return false; + + case Type::PointerTyID: { + const PointerType *PTy1 = cast<PointerType>(Ty1); + const PointerType *PTy2 = cast<PointerType>(Ty2); + return PTy1->getAddressSpace() == PTy2->getAddressSpace(); + } + + case Type::StructTyID: { + const StructType *STy1 = cast<StructType>(Ty1); + const StructType *STy2 = cast<StructType>(Ty2); + if (STy1->getNumElements() != STy2->getNumElements()) + return false; + + if (STy1->isPacked() != STy2->isPacked()) + return false; + + for (unsigned i = 0, e = STy1->getNumElements(); i != e; ++i) { + if (!isEquivalentType(STy1->getElementType(i), STy2->getElementType(i))) + return false; + } + return true; + } + + case Type::FunctionTyID: { + const FunctionType *FTy1 = cast<FunctionType>(Ty1); + const FunctionType *FTy2 = cast<FunctionType>(Ty2); + if (FTy1->getNumParams() != FTy2->getNumParams() || + FTy1->isVarArg() != FTy2->isVarArg()) + return false; + + if (!isEquivalentType(FTy1->getReturnType(), FTy2->getReturnType())) + return false; + + for (unsigned i = 0, e = FTy1->getNumParams(); i != e; ++i) { + if (!isEquivalentType(FTy1->getParamType(i), FTy2->getParamType(i))) + return false; + } + return true; + } + + case Type::ArrayTyID: + case Type::VectorTyID: { + const SequentialType *STy1 = cast<SequentialType>(Ty1); + const SequentialType *STy2 = cast<SequentialType>(Ty2); + return isEquivalentType(STy1->getElementType(), STy2->getElementType()); + } + } +} + +/// isEquivalentOperation - determine whether the two operations are the same +/// except that pointer-to-A and pointer-to-B are equivalent. This should be +/// kept in sync with Instruction::isSameOperationAs. +static bool +isEquivalentOperation(const Instruction *I1, const Instruction *I2) { + if (I1->getOpcode() != I2->getOpcode() || + I1->getNumOperands() != I2->getNumOperands() || + !isEquivalentType(I1->getType(), I2->getType())) + return false; + + // We have two instructions of identical opcode and #operands. Check to see + // if all operands are the same type + for (unsigned i = 0, e = I1->getNumOperands(); i != e; ++i) + if (!isEquivalentType(I1->getOperand(i)->getType(), + I2->getOperand(i)->getType())) + return false; + + // Check special state that is a part of some instructions. 
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I1)) + return LI->isVolatile() == cast<LoadInst>(I2)->isVolatile() && + LI->getAlignment() == cast<LoadInst>(I2)->getAlignment(); + if (const StoreInst *SI = dyn_cast<StoreInst>(I1)) + return SI->isVolatile() == cast<StoreInst>(I2)->isVolatile() && + SI->getAlignment() == cast<StoreInst>(I2)->getAlignment(); + if (const CmpInst *CI = dyn_cast<CmpInst>(I1)) + return CI->getPredicate() == cast<CmpInst>(I2)->getPredicate(); + if (const CallInst *CI = dyn_cast<CallInst>(I1)) + return CI->isTailCall() == cast<CallInst>(I2)->isTailCall() && + CI->getCallingConv() == cast<CallInst>(I2)->getCallingConv() && + CI->getAttributes().getRawPointer() == + cast<CallInst>(I2)->getAttributes().getRawPointer(); + if (const InvokeInst *CI = dyn_cast<InvokeInst>(I1)) + return CI->getCallingConv() == cast<InvokeInst>(I2)->getCallingConv() && + CI->getAttributes().getRawPointer() == + cast<InvokeInst>(I2)->getAttributes().getRawPointer(); + if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(I1)) { + if (IVI->getNumIndices() != cast<InsertValueInst>(I2)->getNumIndices()) + return false; + for (unsigned i = 0, e = IVI->getNumIndices(); i != e; ++i) + if (IVI->idx_begin()[i] != cast<InsertValueInst>(I2)->idx_begin()[i]) + return false; + return true; + } + if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I1)) { + if (EVI->getNumIndices() != cast<ExtractValueInst>(I2)->getNumIndices()) + return false; + for (unsigned i = 0, e = EVI->getNumIndices(); i != e; ++i) + if (EVI->idx_begin()[i] != cast<ExtractValueInst>(I2)->idx_begin()[i]) + return false; + return true; + } + + return true; } static bool compare(const Value *V, const Value *U) { assert(!isa<BasicBlock>(V) && !isa<BasicBlock>(U) && "Must not compare basic blocks."); - assert(V->getType() == U->getType() && + assert(isEquivalentType(V->getType(), U->getType()) && "Two of the same operation have operands of different type."); // TODO: If the constant is an expression of F, we should accept that it's @@ -117,20 +264,40 @@ static bool compare(const Value *V, const Value *U) { static bool equals(const BasicBlock *BB1, const BasicBlock *BB2, DenseMap<const Value *, const Value *> &ValueMap, DenseMap<const Value *, const Value *> &SpeculationMap) { - // Specutively add it anyways. If it's false, we'll notice a difference later, and - // this won't matter. + // Speculatively add it anyways. If it's false, we'll notice a difference + // later, and this won't matter. ValueMap[BB1] = BB2; BasicBlock::const_iterator FI = BB1->begin(), FE = BB1->end(); BasicBlock::const_iterator GI = BB2->begin(), GE = BB2->end(); do { - if (!FI->isSameOperationAs(const_cast<Instruction *>(&*GI))) - return false; + if (isa<BitCastInst>(FI)) { + ++FI; + continue; + } + if (isa<BitCastInst>(GI)) { + ++GI; + continue; + } - if (FI->getNumOperands() != GI->getNumOperands()) + if (!isEquivalentOperation(FI, GI)) return false; + if (isa<GetElementPtrInst>(FI)) { + const GetElementPtrInst *GEPF = cast<GetElementPtrInst>(FI); + const GetElementPtrInst *GEPG = cast<GetElementPtrInst>(GI); + if (GEPF->hasAllZeroIndices() && GEPG->hasAllZeroIndices()) { + // It's effectively a bitcast. 
+ ++FI, ++GI; + continue; + } + + // TODO: we only really care about the elements before the index + if (FI->getOperand(0)->getType() != GI->getOperand(0)->getType()) + return false; + } + if (ValueMap[FI] == GI) { ++FI, ++GI; continue; @@ -140,8 +307,8 @@ static bool equals(const BasicBlock *BB1, const BasicBlock *BB2, return false; for (unsigned i = 0, e = FI->getNumOperands(); i != e; ++i) { - Value *OpF = FI->getOperand(i); - Value *OpG = GI->getOperand(i); + Value *OpF = IgnoreBitcasts(FI->getOperand(i)); + Value *OpG = IgnoreBitcasts(GI->getOperand(i)); if (ValueMap[OpF] == OpG) continue; @@ -149,10 +316,8 @@ static bool equals(const BasicBlock *BB1, const BasicBlock *BB2, if (ValueMap[OpF] != NULL) return false; - assert(OpF->getType() == OpG->getType() && - "Two of the same operation has operands of different type."); - - if (OpF->getValueID() != OpG->getValueID()) + if (OpF->getValueID() != OpG->getValueID() || + !isEquivalentType(OpF->getType(), OpG->getType())) return false; if (isa<PHINode>(FI)) { @@ -203,14 +368,15 @@ static bool equals(const Function *F, const Function *G) { if (F->hasSection() && F->getSection() != G->getSection()) return false; + if (F->isVarArg() != G->isVarArg()) + return false; + // TODO: if it's internal and only used in direct calls, we could handle this // case too. if (F->getCallingConv() != G->getCallingConv()) return false; - // TODO: We want to permit cases where two functions take T* and S* but - // only load or store them into T** and S**. - if (F->getType() != G->getType()) + if (!isEquivalentType(F->getFunctionType(), G->getFunctionType())) return false; DenseMap<const Value *, const Value *> ValueMap; @@ -237,89 +403,213 @@ static bool equals(const Function *F, const Function *G) { return true; } -static bool fold(std::vector<Function *> &FnVec, unsigned i, unsigned j) { - if (FnVec[i]->mayBeOverridden() && !FnVec[j]->mayBeOverridden()) - std::swap(FnVec[i], FnVec[j]); - - Function *F = FnVec[i]; - Function *G = FnVec[j]; +// ===----------------------------------------------------------------------=== +// Folding of functions +// ===----------------------------------------------------------------------=== + +// Cases: +// * F is external strong, G is external strong: +// turn G into a thunk to F (1) +// * F is external strong, G is external weak: +// turn G into a thunk to F (1) +// * F is external weak, G is external weak: +// unfoldable +// * F is external strong, G is internal: +// address of G taken: +// turn G into a thunk to F (1) +// address of G not taken: +// make G an alias to F (2) +// * F is internal, G is external weak +// address of F is taken: +// turn G into a thunk to F (1) +// address of F is not taken: +// make G an alias of F (2) +// * F is internal, G is internal: +// address of F and G are taken: +// turn G into a thunk to F (1) +// address of G is not taken: +// make G an alias to F (2) +// +// alias requires linkage == (external,local,weak) fallback to creating a thunk +// external means 'externally visible' linkage != (internal,private) +// internal means linkage == (internal,private) +// weak means linkage mayBeOverridable +// being external implies that the address is taken +// +// 1. turn G into a thunk to F +// 2. 
make G an alias to F + +enum LinkageCategory { + ExternalStrong, + ExternalWeak, + Internal +}; + +static LinkageCategory categorize(const Function *F) { + switch (F->getLinkage()) { + case GlobalValue::InternalLinkage: + case GlobalValue::PrivateLinkage: + return Internal; + + case GlobalValue::WeakAnyLinkage: + case GlobalValue::WeakODRLinkage: + case GlobalValue::ExternalWeakLinkage: + return ExternalWeak; + + case GlobalValue::ExternalLinkage: + case GlobalValue::AvailableExternallyLinkage: + case GlobalValue::LinkOnceAnyLinkage: + case GlobalValue::LinkOnceODRLinkage: + case GlobalValue::AppendingLinkage: + case GlobalValue::DLLImportLinkage: + case GlobalValue::DLLExportLinkage: + case GlobalValue::GhostLinkage: + case GlobalValue::CommonLinkage: + return ExternalStrong; + } - if (!F->mayBeOverridden()) { - if (G->hasLocalLinkage()) { - F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); - G->replaceAllUsesWith(F); - G->eraseFromParent(); - ++NumFunctionsMerged; - return true; - } + assert(0 && "Unknown LinkageType."); + return ExternalWeak; +} - if (G->hasExternalLinkage() || G->hasWeakLinkage()) { - GlobalAlias *GA = new GlobalAlias(G->getType(), G->getLinkage(), "", - F, G->getParent()); - F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); - GA->takeName(G); - GA->setVisibility(G->getVisibility()); - G->replaceAllUsesWith(GA); - G->eraseFromParent(); - ++NumFunctionsMerged; - return true; +static void ThunkGToF(Function *F, Function *G) { + Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "", + G->getParent()); + BasicBlock *BB = BasicBlock::Create("", NewG); + + std::vector<Value *> Args; + unsigned i = 0; + const FunctionType *FFTy = F->getFunctionType(); + for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end(); + AI != AE; ++AI) { + if (FFTy->getParamType(i) == AI->getType()) + Args.push_back(AI); + else { + Value *BCI = new BitCastInst(AI, FFTy->getParamType(i), "", BB); + Args.push_back(BCI); } + ++i; } - if (F->hasWeakLinkage() && G->hasWeakLinkage()) { - GlobalAlias *GA_F = new GlobalAlias(F->getType(), F->getLinkage(), "", - 0, F->getParent()); - GA_F->takeName(F); - GA_F->setVisibility(F->getVisibility()); - F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); - F->replaceAllUsesWith(GA_F); - F->setName("folded." + GA_F->getName()); - F->setLinkage(GlobalValue::ExternalLinkage); - GA_F->setAliasee(F); - - GlobalAlias *GA_G = new GlobalAlias(G->getType(), G->getLinkage(), "", - F, G->getParent()); - GA_G->takeName(G); - GA_G->setVisibility(G->getVisibility()); - G->replaceAllUsesWith(GA_G); - G->eraseFromParent(); - - ++NumFunctionsMerged; - return true; + CallInst *CI = CallInst::Create(F, Args.begin(), Args.end(), "", BB); + CI->setTailCall(); + CI->setCallingConv(F->getCallingConv()); + if (NewG->getReturnType() == Type::VoidTy) { + ReturnInst::Create(BB); + } else if (CI->getType() != NewG->getReturnType()) { + Value *BCI = new BitCastInst(CI, NewG->getReturnType(), "", BB); + ReturnInst::Create(BCI, BB); + } else { + ReturnInst::Create(CI, BB); } - DOUT << "Failed on " << F->getName() << " and " << G->getName() << "\n"; + NewG->copyAttributesFrom(G); + NewG->takeName(G); + G->replaceAllUsesWith(NewG); + G->eraseFromParent(); - ++NumMergeFails; - return false; + // TODO: look at direct callers to G and make them all direct callers to F. 
} -static bool hasAddressTaken(User *U) { - for (User::use_iterator I = U->use_begin(), E = U->use_end(); I != E; ++I) { - User *Use = *I; +static void AliasGToF(Function *F, Function *G) { + if (!G->hasExternalLinkage() && !G->hasLocalLinkage() && !G->hasWeakLinkage()) + return ThunkGToF(F, G); + + GlobalAlias *GA = new GlobalAlias( + G->getType(), G->getLinkage(), "", + ConstantExpr::getBitCast(F, G->getType()), G->getParent()); + F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); + GA->takeName(G); + GA->setVisibility(G->getVisibility()); + G->replaceAllUsesWith(GA); + G->eraseFromParent(); +} - // 'call (bitcast @F to ...)' happens a lot. - while (isa<ConstantExpr>(Use) && Use->hasOneUse()) { - Use = *Use->use_begin(); - } +static bool fold(std::vector<Function *> &FnVec, unsigned i, unsigned j) { + Function *F = FnVec[i]; + Function *G = FnVec[j]; - if (isa<ConstantExpr>(Use)) { - if (hasAddressTaken(Use)) - return true; - } + LinkageCategory catF = categorize(F); + LinkageCategory catG = categorize(G); - if (!isa<CallInst>(Use) && !isa<InvokeInst>(Use)) - return true; + if (catF == ExternalWeak || (catF == Internal && catG == ExternalStrong)) { + std::swap(FnVec[i], FnVec[j]); + std::swap(F, G); + std::swap(catF, catG); + } - // Make sure we aren't passing U as a parameter to call instead of the - // callee. - if (CallSite(cast<Instruction>(Use)).hasArgument(U)) - return true; + switch (catF) { + case ExternalStrong: + switch (catG) { + case ExternalStrong: + case ExternalWeak: + ThunkGToF(F, G); + break; + case Internal: + if (G->hasAddressTaken()) + ThunkGToF(F, G); + else + AliasGToF(F, G); + break; + } + break; + + case ExternalWeak: { + assert(catG == ExternalWeak); + + // Make them both thunks to the same internal function. + F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); + Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "", + F->getParent()); + H->copyAttributesFrom(F); + H->takeName(F); + F->replaceAllUsesWith(H); + + ThunkGToF(F, G); + ThunkGToF(F, H); + + F->setLinkage(GlobalValue::InternalLinkage); + } break; + + case Internal: + switch (catG) { + case ExternalStrong: + assert(0); + // fall-through + case ExternalWeak: + if (F->hasAddressTaken()) + ThunkGToF(F, G); + else + AliasGToF(F, G); + break; + case Internal: { + bool addrTakenF = F->hasAddressTaken(); + bool addrTakenG = G->hasAddressTaken(); + if (!addrTakenF && addrTakenG) { + std::swap(FnVec[i], FnVec[j]); + std::swap(F, G); + std::swap(addrTakenF, addrTakenG); + } + + if (addrTakenF && addrTakenG) { + ThunkGToF(F, G); + } else { + assert(!addrTakenG); + AliasGToF(F, G); + } + } break; + } + break; } - return false; + ++NumFunctionsMerged; + return true; } +// ===----------------------------------------------------------------------=== +// Pass definition +// ===----------------------------------------------------------------------=== + bool MergeFunctions::runOnModule(Module &M) { bool Changed = false; @@ -329,25 +619,19 @@ bool MergeFunctions::runOnModule(Module &M) { if (F->isDeclaration() || F->isIntrinsic()) continue; - if (!F->hasLocalLinkage() && !F->hasExternalLinkage() && - !F->hasWeakLinkage()) - continue; - - if (hasAddressTaken(F)) - continue; - FnMap[hash(F)].push_back(F); } - // TODO: instead of running in a loop, we could also fold functions in callgraph - // order. Constructing the CFG probably isn't cheaper than just running in a loop. + // TODO: instead of running in a loop, we could also fold functions in + // callgraph order. 
Constructing the CFG probably isn't cheaper than just + // running in a loop, unless it happened to already be available. bool LocalChanged; do { LocalChanged = false; + DOUT << "size: " << FnMap.size() << "\n"; for (std::map<unsigned long, std::vector<Function *> >::iterator I = FnMap.begin(), E = FnMap.end(); I != E; ++I) { - DOUT << "size: " << FnMap.size() << "\n"; std::vector<Function *> &FnVec = I->second; DOUT << "hash (" << I->first << "): " << FnVec.size() << "\n"; diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp new file mode 100644 index 0000000..b3a25540 --- /dev/null +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -0,0 +1,171 @@ +//===- PartialInlining.cpp - Inline parts of functions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs partial inlining, typically by inlining an if statement +// that surrounds the body of the function. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "partialinlining" +#include "llvm/Transforms/IPO.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/FunctionUtils.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/CFG.h" +using namespace llvm; + +namespace { + struct VISIBILITY_HIDDEN PartialInliner : public ModulePass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const { } + static char ID; // Pass identification, replacement for typeid + PartialInliner() : ModulePass(&ID) {} + + bool runOnModule(Module& M); + + private: + Function* unswitchFunction(Function* F); + }; +} + +char PartialInliner::ID = 0; +static RegisterPass<PartialInliner> X("partial-inliner", "Partial Inliner"); + +ModulePass* llvm::createPartialInliningPass() { return new PartialInliner(); } + +Function* PartialInliner::unswitchFunction(Function* F) { + // First, verify that this function is an unswitching candidate... + BasicBlock* entryBlock = F->begin(); + if (!isa<BranchInst>(entryBlock->getTerminator())) + return 0; + + BasicBlock* returnBlock = 0; + BasicBlock* nonReturnBlock = 0; + unsigned returnCount = 0; + for (succ_iterator SI = succ_begin(entryBlock), SE = succ_end(entryBlock); + SI != SE; ++SI) + if (isa<ReturnInst>((*SI)->getTerminator())) { + returnBlock = *SI; + returnCount++; + } else + nonReturnBlock = *SI; + + if (returnCount != 1) + return 0; + + // Clone the function, so that we can hack away on it. + DenseMap<const Value*, Value*> ValueMap; + Function* duplicateFunction = CloneFunction(F, ValueMap); + duplicateFunction->setLinkage(GlobalValue::InternalLinkage); + F->getParent()->getFunctionList().push_back(duplicateFunction); + BasicBlock* newEntryBlock = cast<BasicBlock>(ValueMap[entryBlock]); + BasicBlock* newReturnBlock = cast<BasicBlock>(ValueMap[returnBlock]); + BasicBlock* newNonReturnBlock = cast<BasicBlock>(ValueMap[nonReturnBlock]); + + // Go ahead and update all uses to the duplicate, so that we can just + // use the inliner functionality when we're done hacking. + F->replaceAllUsesWith(duplicateFunction); + + // Special hackery is needed with PHI nodes that have inputs from more than + // one extracted block. 
For simplicity, just split the PHIs into a two-level + // sequence of PHIs, some of which will go in the extracted region, and some + // of which will go outside. + BasicBlock* preReturn = newReturnBlock; + newReturnBlock = newReturnBlock->splitBasicBlock( + newReturnBlock->getFirstNonPHI()); + BasicBlock::iterator I = preReturn->begin(); + BasicBlock::iterator Ins = newReturnBlock->begin(); + while (I != preReturn->end()) { + PHINode* OldPhi = dyn_cast<PHINode>(I); + if (!OldPhi) break; + + PHINode* retPhi = PHINode::Create(OldPhi->getType(), "", Ins); + OldPhi->replaceAllUsesWith(retPhi); + Ins = newReturnBlock->getFirstNonPHI(); + + retPhi->addIncoming(I, preReturn); + retPhi->addIncoming(OldPhi->getIncomingValueForBlock(newEntryBlock), + newEntryBlock); + OldPhi->removeIncomingValue(newEntryBlock); + + ++I; + } + newEntryBlock->getTerminator()->replaceUsesOfWith(preReturn, newReturnBlock); + + // Gather up the blocks that we're going to extract. + std::vector<BasicBlock*> toExtract; + toExtract.push_back(newNonReturnBlock); + for (Function::iterator FI = duplicateFunction->begin(), + FE = duplicateFunction->end(); FI != FE; ++FI) + if (&*FI != newEntryBlock && &*FI != newReturnBlock && + &*FI != newNonReturnBlock) + toExtract.push_back(FI); + + // The CodeExtractor needs a dominator tree. + DominatorTree DT; + DT.runOnFunction(*duplicateFunction); + + // Extract the body of the the if. + Function* extractedFunction = ExtractCodeRegion(DT, toExtract); + + // Inline the top-level if test into all callers. + std::vector<User*> Users(duplicateFunction->use_begin(), + duplicateFunction->use_end()); + for (std::vector<User*>::iterator UI = Users.begin(), UE = Users.end(); + UI != UE; ++UI) + if (CallInst* CI = dyn_cast<CallInst>(*UI)) + InlineFunction(CI); + else if (InvokeInst* II = dyn_cast<InvokeInst>(*UI)) + InlineFunction(II); + + // Ditch the duplicate, since we're done with it, and rewrite all remaining + // users (function pointers, etc.) back to the original function. + duplicateFunction->replaceAllUsesWith(F); + duplicateFunction->eraseFromParent(); + + return extractedFunction; +} + +bool PartialInliner::runOnModule(Module& M) { + std::vector<Function*> worklist; + worklist.reserve(M.size()); + for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) + if (!FI->use_empty() && !FI->isDeclaration()) + worklist.push_back(&*FI); + + bool changed = false; + while (!worklist.empty()) { + Function* currFunc = worklist.back(); + worklist.pop_back(); + + if (currFunc->use_empty()) continue; + + bool recursive = false; + for (Function::use_iterator UI = currFunc->use_begin(), + UE = currFunc->use_end(); UI != UE; ++UI) + if (Instruction* I = dyn_cast<Instruction>(UI)) + if (I->getParent()->getParent() == currFunc) { + recursive = true; + break; + } + if (recursive) continue; + + + if (Function* newFunc = unswitchFunction(currFunc)) { + worklist.push_back(newFunc); + changed = true; + } + + } + + return changed; +}
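
For context, a hedged illustration of the shape of function the new pass targets (the names are invented): the entry block ends in a conditional branch, exactly one successor returns immediately, and everything else is the expensive region that gets handed to CodeExtractor. After the pass runs, callers effectively see only the cheap guard inline and call the extracted function on the slow path.

/* stand-in for the large body that ends up in the extracted function */
static int slow_path(int key) { return key * 37 + 11; }

int guarded(int key) {
  if (key < 0)
    return -1;            /* cheap early exit: this test is what callers inline */
  return slow_path(key);  /* the rest stays out of line after extraction */
}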
\ No newline at end of file diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 83503fd..38b1198 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -168,7 +168,7 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, // Expand the code for the iteration count into the preheader of the loop. BasicBlock *Preheader = L->getLoopPreheader(); - Value *ExitCnt = Rewriter.expandCodeFor(RHS, CmpIndVar->getType(), + Value *ExitCnt = Rewriter.expandCodeFor(RHS, IndVar->getType(), Preheader->getTerminator()); // Insert a new icmp_ne or icmp_eq instruction before the branch. @@ -392,10 +392,31 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // in this loop, insert a canonical induction variable of the largest size. Value *IndVar = 0; if (NeedCannIV) { + // Check to see if the loop already has a canonical-looking induction + // variable. If one is present and it's wider than the planned canonical + // induction variable, temporarily remove it, so that the Rewriter + // doesn't attempt to reuse it. + PHINode *OldCannIV = L->getCanonicalInductionVariable(); + if (OldCannIV) { + if (SE->getTypeSizeInBits(OldCannIV->getType()) > + SE->getTypeSizeInBits(LargestType)) + OldCannIV->removeFromParent(); + else + OldCannIV = 0; + } + IndVar = Rewriter.getOrInsertCanonicalInductionVariable(L,LargestType); + ++NumInserted; Changed = true; DOUT << "INDVARS: New CanIV: " << *IndVar; + + // Now that the official induction variable is established, reinsert + // the old canonical-looking variable after it so that the IR remains + // consistent. It will be deleted as part of the dead-PHI deletion at + // the end of the pass. + if (OldCannIV) + OldCannIV->insertAfter(cast<Instruction>(IndVar)); } // If we have a trip count expression, rewrite the loop's exit condition @@ -459,8 +480,8 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType, E = List.end(); UI != E; ++UI) { SCEVHandle Offset = UI->getOffset(); Value *Op = UI->getOperandValToReplace(); + const Type *UseTy = Op->getType(); Instruction *User = UI->getUser(); - bool isSigned = UI->isSigned(); // Compute the final addrec to expand into code. SCEVHandle AR = IU->getReplacementExpr(*UI); @@ -471,7 +492,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType, // Expand loop-invariant values in the loop preheader. They will // be sunk to the exit block later, if possible. NewVal = - Rewriter.expandCodeFor(AR, LargestType, + Rewriter.expandCodeFor(AR, UseTy, L->getLoopPreheader()->getTerminator()); Rewriter.setInsertionPoint(I); ++NumReplaced; @@ -485,74 +506,6 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType, if (!Stride->isLoopInvariant(L)) continue; - const Type *IVTy = Offset->getType(); - const Type *UseTy = Op->getType(); - - // Promote the Offset and Stride up to the canonical induction - // variable's bit width. - SCEVHandle PromotedOffset = Offset; - SCEVHandle PromotedStride = Stride; - if (SE->getTypeSizeInBits(IVTy) != SE->getTypeSizeInBits(LargestType)) { - // It doesn't matter for correctness whether zero or sign extension - // is used here, since the value is truncated away below, but if the - // value is signed, sign extension is more likely to be folded. 
- if (isSigned) { - PromotedOffset = SE->getSignExtendExpr(PromotedOffset, LargestType); - PromotedStride = SE->getSignExtendExpr(PromotedStride, LargestType); - } else { - PromotedOffset = SE->getZeroExtendExpr(PromotedOffset, LargestType); - // If the stride is obviously negative, use sign extension to - // produce things like x-1 instead of x+255. - if (isa<SCEVConstant>(PromotedStride) && - cast<SCEVConstant>(PromotedStride) - ->getValue()->getValue().isNegative()) - PromotedStride = SE->getSignExtendExpr(PromotedStride, - LargestType); - else - PromotedStride = SE->getZeroExtendExpr(PromotedStride, - LargestType); - } - } - - // Create the SCEV representing the offset from the canonical - // induction variable, still in the canonical induction variable's - // type, so that all expanded arithmetic is done in the same type. - SCEVHandle NewAR = SE->getAddRecExpr(SE->getIntegerSCEV(0, LargestType), - PromotedStride, L); - // Add the PromotedOffset as a separate step, because it may not be - // loop-invariant. - NewAR = SE->getAddExpr(NewAR, PromotedOffset); - - // Expand the addrec into instructions. - Value *V = Rewriter.expandCodeFor(NewAR); - - // Insert an explicit cast if necessary to truncate the value - // down to the original stride type. This is done outside of - // SCEVExpander because in SCEV expressions, a truncate of an - // addrec is always folded. - if (LargestType != IVTy) { - if (SE->getTypeSizeInBits(IVTy) != SE->getTypeSizeInBits(LargestType)) - NewAR = SE->getTruncateExpr(NewAR, IVTy); - if (Rewriter.isInsertedExpression(NewAR)) - V = Rewriter.expandCodeFor(NewAR); - else { - V = Rewriter.InsertCastOfTo(CastInst::getCastOpcode(V, false, - IVTy, false), - V, IVTy); - assert(!isa<SExtInst>(V) && !isa<ZExtInst>(V) && - "LargestType wasn't actually the largest type!"); - // Force the rewriter to use this trunc whenever this addrec - // appears so that it doesn't insert new phi nodes or - // arithmetic in a different type. - Rewriter.addInsertedValue(V, NewAR); - } - } - - DOUT << "INDVARS: Made offset-and-trunc IV for offset " - << *IVTy << " " << *Offset << ": "; - DEBUG(WriteAsOperand(*DOUT, V, false)); - DOUT << "\n"; - // Now expand it into actual Instructions and patch it into place. NewVal = Rewriter.expandCodeFor(AR, UseTy); } diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp index 6d2ff0e..5465e4a 100644 --- a/lib/Transforms/Scalar/InstructionCombining.cpp +++ b/lib/Transforms/Scalar/InstructionCombining.cpp @@ -2608,21 +2608,6 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { else if (Op1I->getOperand(1) == Op0) // X-(Y+X) == -Y return BinaryOperator::CreateFNeg(Op1I->getOperand(0), I.getName()); } - - if (Op1I->hasOneUse()) { - // Replace (x - (y - z)) with (x + (z - y)) if the (y - z) subexpression - // is not used by anyone else... - // - if (Op1I->getOpcode() == Instruction::FSub) { - // Swap the two operands of the subexpr... - Value *IIOp0 = Op1I->getOperand(0), *IIOp1 = Op1I->getOperand(1); - Op1I->setOperand(0, IIOp1); - Op1I->setOperand(1, IIOp0); - - // Create the new top level fadd instruction... - return BinaryOperator::CreateFAdd(Op0, Op1); - } - } } return 0; @@ -11824,7 +11809,8 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (SI.isVolatile()) return 0; // Don't hack volatile stores. 
// store X, null -> turns into 'unreachable' in SimplifyCFG - if (isa<ConstantPointerNull>(Ptr)) { + if (isa<ConstantPointerNull>(Ptr) && + cast<PointerType>(Ptr->getType())->getAddressSpace() == 0) { if (!isa<UndefValue>(Val)) { SI.setOperand(0, UndefValue::get(Val->getType())); if (Instruction *U = dyn_cast<Instruction>(Val)) diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index b499279..5a85a04 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -125,13 +125,17 @@ static bool MarkAliveBlocks(BasicBlock *BB, } } - if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) - if (isa<ConstantPointerNull>(SI->getOperand(1)) || - isa<UndefValue>(SI->getOperand(1))) { + if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { + Value *Ptr = SI->getOperand(1); + + if (isa<UndefValue>(Ptr) || + (isa<ConstantPointerNull>(Ptr) && + cast<PointerType>(Ptr->getType())->getAddressSpace() == 0)) { ChangeToUnreachable(SI); Changed = true; break; } + } } // Turn invokes that call 'nounwind' functions into ordinary calls. diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp index 3a991f6..54bd895 100644 --- a/lib/VMCore/Function.cpp +++ b/lib/VMCore/Function.cpp @@ -364,4 +364,15 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, const Type **Tys, #include "llvm/Intrinsics.gen" #undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN + /// hasAddressTaken - returns true if there are any uses of this function + /// other than direct calls or invokes to it. +bool Function::hasAddressTaken() const { + for (Value::use_const_iterator I = use_begin(), E = use_end(); I != E; ++I) { + if (I.getOperandNo() != 0 || + (!isa<CallInst>(*I) && !isa<InvokeInst>(*I))) + return true; + } + return false; +} + // vim: sw=2 ai diff --git a/lib/VMCore/Instruction.cpp b/lib/VMCore/Instruction.cpp index 7556b8e..e0764e4 100644 --- a/lib/VMCore/Instruction.cpp +++ b/lib/VMCore/Instruction.cpp @@ -218,9 +218,12 @@ bool Instruction::isIdenticalTo(const Instruction *I) const { } // isSameOperationAs +// This should be kept in sync with isEquivalentOperation in +// lib/Transforms/IPO/MergeFunctions.cpp. bool Instruction::isSameOperationAs(const Instruction *I) const { - if (getOpcode() != I->getOpcode() || getType() != I->getType() || - getNumOperands() != I->getNumOperands()) + if (getOpcode() != I->getOpcode() || + getNumOperands() != I->getNumOperands() || + getType() != I->getType()) return false; // We have two instructions of identical opcode and #operands. Check to see diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp index b1297ff..e9f2acd 100644 --- a/lib/VMCore/Verifier.cpp +++ b/lib/VMCore/Verifier.cpp @@ -276,8 +276,8 @@ namespace { int VT, unsigned ArgNo, std::string &Suffix); void VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F, unsigned RetNum, unsigned ParamNum, ...); - void VerifyAttrs(Attributes Attrs, const Type *Ty, - bool isReturnValue, const Value *V); + void VerifyParameterAttrs(Attributes Attrs, const Type *Ty, + bool isReturnValue, const Value *V); void VerifyFunctionAttrs(const FunctionType *FT, const AttrListPtr &Attrs, const Value *V); @@ -437,22 +437,23 @@ void Verifier::visitGlobalAlias(GlobalAlias &GA) { void Verifier::verifyTypeSymbolTable(TypeSymbolTable &ST) { } -// VerifyAttrs - Check the given parameter attributes for an argument or return +// VerifyParameterAttrs - Check the given attributes for an argument or return // value of the specified type. The value V is printed in error messages. 
-void Verifier::VerifyAttrs(Attributes Attrs, const Type *Ty, - bool isReturnValue, const Value *V) { +void Verifier::VerifyParameterAttrs(Attributes Attrs, const Type *Ty, + bool isReturnValue, const Value *V) { if (Attrs == Attribute::None) return; + Attributes FnCheckAttr = Attrs & Attribute::FunctionOnly; + Assert1(!FnCheckAttr, "Attribute " + Attribute::getAsString(FnCheckAttr) + + " only applies to the function!", V); + if (isReturnValue) { Attributes RetI = Attrs & Attribute::ParameterOnly; Assert1(!RetI, "Attribute " + Attribute::getAsString(RetI) + " does not apply to return values!", V); } - Attributes FnCheckAttr = Attrs & Attribute::FunctionOnly; - Assert1(!FnCheckAttr, "Attribute " + Attribute::getAsString(FnCheckAttr) + - " only applies to functions!", V); - + for (unsigned i = 0; i < array_lengthof(Attribute::MutuallyIncompatible); ++i) { Attributes MutI = Attrs & Attribute::MutuallyIncompatible[i]; @@ -495,9 +496,9 @@ void Verifier::VerifyFunctionAttrs(const FunctionType *FT, else if (Attr.Index-1 < FT->getNumParams()) Ty = FT->getParamType(Attr.Index-1); else - break; // VarArgs attributes, don't verify. - - VerifyAttrs(Attr.Attrs, Ty, Attr.Index == 0, V); + break; // VarArgs attributes, verified elsewhere. + + VerifyParameterAttrs(Attr.Attrs, Ty, Attr.Index == 0, V); if (Attr.Attrs & Attribute::Nest) { Assert1(!SawNest, "More than one parameter has attribute nest!", V); @@ -509,10 +510,10 @@ void Verifier::VerifyFunctionAttrs(const FunctionType *FT, } Attributes FAttrs = Attrs.getFnAttributes(); - Assert1(!(FAttrs & (~Attribute::FunctionOnly)), - "Attribute " + Attribute::getAsString(FAttrs) + - " does not apply to function!", V); - + Attributes NotFn = FAttrs & (~Attribute::FunctionOnly); + Assert1(!NotFn, "Attribute " + Attribute::getAsString(NotFn) + + " does not apply to the function!", V); + for (unsigned i = 0; i < array_lengthof(Attribute::MutuallyIncompatible); ++i) { Attributes MutI = FAttrs & Attribute::MutuallyIncompatible[i]; @@ -1025,7 +1026,7 @@ void Verifier::VerifyCallSite(CallSite CS) { for (unsigned Idx = 1 + FTy->getNumParams(); Idx <= CS.arg_size(); ++Idx) { Attributes Attr = Attrs.getParamAttributes(Idx); - VerifyAttrs(Attr, CS.getArgument(Idx-1)->getType(), false, I); + VerifyParameterAttrs(Attr, CS.getArgument(Idx-1)->getType(), false, I); Attributes VArgI = Attr & Attribute::VarArgsIncompatible; Assert1(!VArgI, "Attribute " + Attribute::getAsString(VArgI) + |