Diffstat (limited to 'lib')
76 files changed, 1506 insertions, 891 deletions
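A recurring change in this diff is the split of the overloaded add/sub/mul IR opcodes into separate integer and floating-point opcodes (add/fadd, sub/fsub, mul/fmul). Readers of the older formats disambiguate by operand type, as GetDecodedBinaryOpcode does in the BitcodeReader.cpp hunk later in this diff. Below is a minimal standalone sketch of that disambiguation pattern; the enum names are hypothetical stand-ins, not the real LLVM types.

#include <cassert>

// Legacy binary-op codes shared by integer and FP operations (hypothetical).
enum OldBinOp { BINOP_ADD, BINOP_SUB, BINOP_MUL };
// Split opcodes after this change (hypothetical stand-in for Instruction::*).
enum NewOpcode { Add, FAdd, Sub, FSub, Mul, FMul };

// Pick the integer or FP opcode from the operand type, mirroring
// GetDecodedBinaryOpcode's use of Ty->isFPOrFPVector().
NewOpcode decodeBinOp(OldBinOp Op, bool OperandIsFP) {
  switch (Op) {
  case BINOP_ADD: return OperandIsFP ? FAdd : Add;
  case BINOP_SUB: return OperandIsFP ? FSub : Sub;
  case BINOP_MUL: return OperandIsFP ? FMul : Mul;
  }
  assert(0 && "unknown legacy binary opcode");
  return Add; // unreachable
}

int main() {
  assert(decodeBinOp(BINOP_MUL, true) == FMul);  // FP mul decodes to fmul
  assert(decodeBinOp(BINOP_MUL, false) == Mul);  // integer mul stays mul
  return 0;
}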
diff --git a/lib/Analysis/InstCount.cpp b/lib/Analysis/InstCount.cpp index 2dea7b3..2b34ad3 100644 --- a/lib/Analysis/InstCount.cpp +++ b/lib/Analysis/InstCount.cpp @@ -19,7 +19,6 @@ #include "llvm/Support/InstVisitor.h" #include "llvm/Support/Streams.h" #include "llvm/ADT/Statistic.h" -#include <ostream> using namespace llvm; STATISTIC(TotalInsts , "Number of instructions (of all types)"); diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index de6480a..a0d3974 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -24,7 +24,6 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include <algorithm> -#include <ostream> using namespace llvm; char LoopInfo::ID = 0; diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index f7f1849..03c5005 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -80,7 +80,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" -#include <ostream> #include <algorithm> using namespace llvm; @@ -2463,24 +2462,15 @@ void ScalarEvolution::forgetLoopPHIs(const Loop *L) { ScalarEvolution::BackedgeTakenInfo ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) { // If the loop has a non-one exit block count, we can't analyze it. - SmallVector<BasicBlock*, 8> ExitBlocks; - L->getExitBlocks(ExitBlocks); - if (ExitBlocks.size() != 1) return UnknownValue; + BasicBlock *ExitBlock = L->getExitBlock(); + if (!ExitBlock) + return UnknownValue; // Okay, there is one exit block. Try to find the condition that causes the // loop to be exited. - BasicBlock *ExitBlock = ExitBlocks[0]; - - BasicBlock *ExitingBlock = 0; - for (pred_iterator PI = pred_begin(ExitBlock), E = pred_end(ExitBlock); - PI != E; ++PI) - if (L->contains(*PI)) { - if (ExitingBlock == 0) - ExitingBlock = *PI; - else - return UnknownValue; // More than one block exiting! - } - assert(ExitingBlock && "No exits from loop, something is broken!"); + BasicBlock *ExitingBlock = L->getExitingBlock(); + if (!ExitingBlock) + return UnknownValue; // More than one block exiting! // Okay, we've computed the exiting block. See what condition causes us to // exit. diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 7ba8268..ef77e46 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -644,3 +644,16 @@ Value *SCEVExpander::expand(const SCEV *S) { InsertedExpressions[S] = V; return V; } + +/// getOrInsertCanonicalInductionVariable - This method returns the +/// canonical induction variable of the specified type for the specified +/// loop (inserting one if there is none). A canonical induction variable +/// starts at zero and steps by one on each iteration. +Value * +SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L, + const Type *Ty) { + assert(Ty->isInteger() && "Can only insert integer induction variables!"); + SCEVHandle H = SE.getAddRecExpr(SE.getIntegerSCEV(0, Ty), + SE.getIntegerSCEV(1, Ty), L); + return expand(H); +} diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 29ff8aa..45f97b8 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -771,7 +771,7 @@ bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) { if (I == 0) return false; // (add x, 0.0) is guaranteed to return +0.0, not -0.0. 
- if (I->getOpcode() == Instruction::Add && + if (I->getOpcode() == Instruction::FAdd && isa<ConstantFP>(I->getOperand(1)) && cast<ConstantFP>(I->getOperand(1))->isNullValue()) return true; diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index f2e6890..c5190ef 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -547,6 +547,8 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(optsize); KEYWORD(ssp); KEYWORD(sspreq); + KEYWORD(noredzone); + KEYWORD(noimplicitfloat); KEYWORD(type); KEYWORD(opaque); @@ -590,7 +592,9 @@ lltok::Kind LLLexer::LexIdentifier() { if (Len == strlen(#STR) && !memcmp(StartChar, #STR, strlen(#STR))) { \ UIntVal = Instruction::Enum; return lltok::kw_##STR; } - INSTKEYWORD(add, Add); INSTKEYWORD(sub, Sub); INSTKEYWORD(mul, Mul); + INSTKEYWORD(add, Add); INSTKEYWORD(fadd, FAdd); + INSTKEYWORD(sub, Sub); INSTKEYWORD(fsub, FSub); + INSTKEYWORD(mul, Mul); INSTKEYWORD(fmul, FMul); INSTKEYWORD(udiv, UDiv); INSTKEYWORD(sdiv, SDiv); INSTKEYWORD(fdiv, FDiv); INSTKEYWORD(urem, URem); INSTKEYWORD(srem, SRem); INSTKEYWORD(frem, FRem); INSTKEYWORD(shl, Shl); INSTKEYWORD(lshr, LShr); INSTKEYWORD(ashr, AShr); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 8db4c71..5c44502 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -712,25 +712,26 @@ bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) { return Error(AttrLoc, "invalid use of parameter-only attribute"); return false; - case lltok::kw_zeroext: Attrs |= Attribute::ZExt; break; - case lltok::kw_signext: Attrs |= Attribute::SExt; break; - case lltok::kw_inreg: Attrs |= Attribute::InReg; break; - case lltok::kw_sret: Attrs |= Attribute::StructRet; break; - case lltok::kw_noalias: Attrs |= Attribute::NoAlias; break; - case lltok::kw_nocapture: Attrs |= Attribute::NoCapture; break; - case lltok::kw_byval: Attrs |= Attribute::ByVal; break; - case lltok::kw_nest: Attrs |= Attribute::Nest; break; - - case lltok::kw_noreturn: Attrs |= Attribute::NoReturn; break; - case lltok::kw_nounwind: Attrs |= Attribute::NoUnwind; break; - case lltok::kw_noinline: Attrs |= Attribute::NoInline; break; - case lltok::kw_readnone: Attrs |= Attribute::ReadNone; break; - case lltok::kw_readonly: Attrs |= Attribute::ReadOnly; break; - case lltok::kw_alwaysinline: Attrs |= Attribute::AlwaysInline; break; - case lltok::kw_optsize: Attrs |= Attribute::OptimizeForSize; break; - case lltok::kw_ssp: Attrs |= Attribute::StackProtect; break; - case lltok::kw_sspreq: Attrs |= Attribute::StackProtectReq; break; - + case lltok::kw_zeroext: Attrs |= Attribute::ZExt; break; + case lltok::kw_signext: Attrs |= Attribute::SExt; break; + case lltok::kw_inreg: Attrs |= Attribute::InReg; break; + case lltok::kw_sret: Attrs |= Attribute::StructRet; break; + case lltok::kw_noalias: Attrs |= Attribute::NoAlias; break; + case lltok::kw_nocapture: Attrs |= Attribute::NoCapture; break; + case lltok::kw_byval: Attrs |= Attribute::ByVal; break; + case lltok::kw_nest: Attrs |= Attribute::Nest; break; + + case lltok::kw_noreturn: Attrs |= Attribute::NoReturn; break; + case lltok::kw_nounwind: Attrs |= Attribute::NoUnwind; break; + case lltok::kw_noinline: Attrs |= Attribute::NoInline; break; + case lltok::kw_readnone: Attrs |= Attribute::ReadNone; break; + case lltok::kw_readonly: Attrs |= Attribute::ReadOnly; break; + case lltok::kw_alwaysinline: Attrs |= Attribute::AlwaysInline; break; + case lltok::kw_optsize: Attrs |= Attribute::OptimizeForSize; break; + case lltok::kw_ssp: 
Attrs |= Attribute::StackProtect; break; + case lltok::kw_sspreq: Attrs |= Attribute::StackProtectReq; break; + case lltok::kw_noredzone: Attrs |= Attribute::NoRedZone; break; + case lltok::kw_noimplicitfloat: Attrs |= Attribute::NoImplicitFloat; break; case lltok::kw_align: { unsigned Alignment; @@ -1835,8 +1836,11 @@ bool LLParser::ParseValID(ValID &ID) { // Binary Operators. case lltok::kw_add: + case lltok::kw_fadd: case lltok::kw_sub: + case lltok::kw_fsub: case lltok::kw_mul: + case lltok::kw_fmul: case lltok::kw_udiv: case lltok::kw_sdiv: case lltok::kw_fdiv: @@ -2400,8 +2404,13 @@ bool LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, // Binary Operators. case lltok::kw_add: case lltok::kw_sub: - case lltok::kw_mul: return ParseArithmetic(Inst, PFS, KeywordVal, 0); - + case lltok::kw_mul: + // API compatibility: Accept either integer or floating-point types. + return ParseArithmetic(Inst, PFS, KeywordVal, 0); + case lltok::kw_fadd: + case lltok::kw_fsub: + case lltok::kw_fmul: return ParseArithmetic(Inst, PFS, KeywordVal, 2); + case lltok::kw_udiv: case lltok::kw_sdiv: case lltok::kw_urem: diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index d8bd38a..9335d19 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -80,6 +80,8 @@ namespace lltok { kw_optsize, kw_ssp, kw_sspreq, + kw_noredzone, + kw_noimplicitfloat, kw_type, kw_opaque, @@ -89,7 +91,8 @@ namespace lltok { kw_ueq, kw_une, // Instruction Opcodes (Opcode in UIntVal). - kw_add, kw_sub, kw_mul, kw_udiv, kw_sdiv, kw_fdiv, + kw_add, kw_fadd, kw_sub, kw_fsub, kw_mul, kw_fmul, + kw_udiv, kw_sdiv, kw_fdiv, kw_urem, kw_srem, kw_frem, kw_shl, kw_lshr, kw_ashr, kw_and, kw_or, kw_xor, kw_icmp, kw_fcmp, kw_vicmp, kw_vfcmp, diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 1dad04b..3b44f564 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -104,9 +104,12 @@ static int GetDecodedCastOpcode(unsigned Val) { static int GetDecodedBinaryOpcode(unsigned Val, const Type *Ty) { switch (Val) { default: return -1; - case bitc::BINOP_ADD: return Instruction::Add; - case bitc::BINOP_SUB: return Instruction::Sub; - case bitc::BINOP_MUL: return Instruction::Mul; + case bitc::BINOP_ADD: + return Ty->isFPOrFPVector() ? Instruction::FAdd : Instruction::Add; + case bitc::BINOP_SUB: + return Ty->isFPOrFPVector() ? Instruction::FSub : Instruction::Sub; + case bitc::BINOP_MUL: + return Ty->isFPOrFPVector() ? Instruction::FMul : Instruction::Mul; case bitc::BINOP_UDIV: return Instruction::UDiv; case bitc::BINOP_SDIV: return Ty->isFPOrFPVector() ? 
Instruction::FDiv : Instruction::SDiv; diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index bfc029c..9f16728 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -77,9 +77,12 @@ static unsigned GetEncodedCastOpcode(unsigned Opcode) { static unsigned GetEncodedBinaryOpcode(unsigned Opcode) { switch (Opcode) { default: assert(0 && "Unknown binary instruction!"); - case Instruction::Add: return bitc::BINOP_ADD; - case Instruction::Sub: return bitc::BINOP_SUB; - case Instruction::Mul: return bitc::BINOP_MUL; + case Instruction::Add: + case Instruction::FAdd: return bitc::BINOP_ADD; + case Instruction::Sub: + case Instruction::FSub: return bitc::BINOP_SUB; + case Instruction::Mul: + case Instruction::FMul: return bitc::BINOP_MUL; case Instruction::UDiv: return bitc::BINOP_UDIV; case Instruction::FDiv: case Instruction::SDiv: return bitc::BINOP_SDIV; diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 25217b0..5a66f4b 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -20,7 +20,6 @@ #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetFrameInfo.h" -#include <ostream> using namespace llvm; static TimerGroup &getDwarfTimerGroup() { diff --git a/lib/CodeGen/AsmPrinter/DwarfPrinter.cpp b/lib/CodeGen/AsmPrinter/DwarfPrinter.cpp index 45e7dd3..f7ca4f4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfPrinter.cpp @@ -21,7 +21,6 @@ #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetFrameInfo.h" #include "llvm/Target/TargetRegisterInfo.h" -#include <ostream> using namespace llvm; diff --git a/lib/CodeGen/ELF.h b/lib/CodeGen/ELF.h new file mode 100644 index 0000000..bf43622 --- /dev/null +++ b/lib/CodeGen/ELF.h @@ -0,0 +1,186 @@ +//===-- lib/CodeGen/ELF.h - ELF constants and data structures ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This header contains common, non-processor-specific data structures and +// constants for the ELF file format. +// +// The details of the ELF32 bits in this file are largely based on +// the Tool Interface Standard (TIS) Executable and Linking Format +// (ELF) Specification Version 1.2, May 1995. The ELF64 stuff is not +// standardized, as far as I can tell. It was largely based on information +// I found in OpenBSD header files. +// +//===----------------------------------------------------------------------===// + +#ifndef CODEGEN_ELF_H +#define CODEGEN_ELF_H + +#include "llvm/Support/DataTypes.h" +#include <cstring> + +namespace llvm { + class GlobalVariable; + + // Identification Indexes + enum { + EI_MAG0 = 0, + EI_MAG1 = 1, + EI_MAG2 = 2, + EI_MAG3 = 3 + }; + + // File types + enum { + ET_NONE = 0, // No file type + ET_REL = 1, // Relocatable file + ET_EXEC = 2, // Executable file + ET_DYN = 3, // Shared object file + ET_CORE = 4, // Core file + ET_LOPROC = 0xff00, // Beginning of processor-specific codes + ET_HIPROC = 0xffff // Processor-specific + }; + + // Object file classes. + enum { + ELFCLASS32 = 1, // 32-bit object file + ELFCLASS64 = 2 // 64-bit object file + }; + + // Object file byte orderings. 
+ enum { + ELFDATA2LSB = 1, // Little-endian object file + ELFDATA2MSB = 2 // Big-endian object file + }; + + // Versioning + enum { + EV_NONE = 0, + EV_CURRENT = 1 + }; + + /// ELFSection - This struct contains information about each section that is + /// emitted to the file. This is eventually turned into the section header + /// table at the end of the file. + struct ELFSection { + + // ELF specific fields + std::string Name; // Name of the section. + unsigned NameIdx; // Index in .shstrtab of name, once emitted. + unsigned Type; + unsigned Flags; + uint64_t Addr; + unsigned Offset; + unsigned Size; + unsigned Link; + unsigned Info; + unsigned Align; + unsigned EntSize; + + // Section Header Flags + enum { + SHF_WRITE = 1 << 0, // Writable + SHF_ALLOC = 1 << 1, // Mapped into the process addr space + SHF_EXECINSTR = 1 << 2, // Executable + SHF_MERGE = 1 << 4, // Might be merged if equal + SHF_STRINGS = 1 << 5, // Contains null-terminated strings + SHF_INFO_LINK = 1 << 6, // 'sh_info' contains SHT index + SHF_LINK_ORDER = 1 << 7, // Preserve order after combining + SHF_OS_NONCONFORMING = 1 << 8, // nonstandard OS support required + SHF_GROUP = 1 << 9, // Section is a member of a group + SHF_TLS = 1 << 10 // Section holds thread-local data + }; + + // Section Types + enum { + SHT_NULL = 0, // No associated section (inactive entry). + SHT_PROGBITS = 1, // Program-defined contents. + SHT_SYMTAB = 2, // Symbol table. + SHT_STRTAB = 3, // String table. + SHT_RELA = 4, // Relocation entries; explicit addends. + SHT_HASH = 5, // Symbol hash table. + SHT_DYNAMIC = 6, // Information for dynamic linking. + SHT_NOTE = 7, // Information about the file. + SHT_NOBITS = 8, // Data occupies no space in the file. + SHT_REL = 9, // Relocation entries; no explicit addends. + SHT_SHLIB = 10, // Reserved. + SHT_DYNSYM = 11, // Symbol table. + SHT_LOPROC = 0x70000000, // Lowest processor architecture-specific type. + SHT_HIPROC = 0x7fffffff, // Highest processor architecture-specific type. + SHT_LOUSER = 0x80000000, // Lowest type reserved for applications. + SHT_HIUSER = 0xffffffff // Highest type reserved for applications. + }; + + // Special section indices. + enum { + SHN_UNDEF = 0, // Undefined, missing, irrelevant, or meaningless + SHN_LORESERVE = 0xff00, // Lowest reserved index + SHN_LOPROC = 0xff00, // Lowest processor-specific index + SHN_HIPROC = 0xff1f, // Highest processor-specific index + SHN_ABS = 0xfff1, // Symbol has absolute value; does not need relocation + SHN_COMMON = 0xfff2, // FORTRAN COMMON or C external global variables + SHN_HIRESERVE = 0xffff // Highest reserved index + }; + + /// SectionIdx - The number of the section in the Section Table. + unsigned short SectionIdx; + + /// SectionData - The actual data for this section which we are building + /// up for emission to the file. + std::vector<unsigned char> SectionData; + + ELFSection(const std::string &name) + : Name(name), Type(0), Flags(0), Addr(0), Offset(0), Size(0), + Link(0), Info(0), Align(0), EntSize(0) {} + }; + + /// ELFSym - This struct contains information about each symbol that is + /// added to logical symbol table for the module. This is eventually + /// turned into a real symbol table in the file. + struct ELFSym { + const GlobalValue *GV; // The global value this corresponds to. + + // ELF specific fields + unsigned NameIdx; // Index in .strtab of name, once emitted. 
+ uint64_t Value; + unsigned Size; + uint8_t Info; + uint8_t Other; + unsigned short SectionIdx; + + enum { + STB_LOCAL = 0, + STB_GLOBAL = 1, + STB_WEAK = 2 + }; + + enum { + STT_NOTYPE = 0, + STT_OBJECT = 1, + STT_FUNC = 2, + STT_SECTION = 3, + STT_FILE = 4 + }; + + ELFSym(const GlobalValue *gv) : GV(gv), Value(0), + Size(0), Info(0), Other(0), + SectionIdx(ELFSection::SHN_UNDEF) {} + + void SetBind(unsigned X) { + assert(X == (X & 0xF) && "Bind value out of range!"); + Info = (Info & 0x0F) | (X << 4); + } + void SetType(unsigned X) { + assert(X == (X & 0xF) && "Type value out of range!"); + Info = (Info & 0xF0) | X; + } + }; + +} // end namespace llvm + +#endif diff --git a/lib/CodeGen/ELFCodeEmitter.cpp b/lib/CodeGen/ELFCodeEmitter.cpp index 0a0245f..9af276b 100644 --- a/lib/CodeGen/ELFCodeEmitter.cpp +++ b/lib/CodeGen/ELFCodeEmitter.cpp @@ -7,17 +7,17 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "elfce" + #include "ELFCodeEmitter.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/Target/TargetAsmInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Support/Mangler.h" -#include "llvm/Support/OutputBuffer.h" +#include "llvm/Support/Debug.h" //===----------------------------------------------------------------------===// // ELFCodeEmitter Implementation @@ -27,67 +27,87 @@ namespace llvm { /// startFunction - This callback is invoked when a new machine function is /// about to be emitted. -void ELFCodeEmitter::startFunction(MachineFunction &F) { - // Align the output buffer to the appropriate alignment. - unsigned Align = 16; // FIXME: GENERICIZE!! +void ELFCodeEmitter::startFunction(MachineFunction &MF) { + const TargetData *TD = TM.getTargetData(); + const Function *F = MF.getFunction(); + + // Align the output buffer to the appropriate alignment, power of 2. + unsigned FnAlign = F->getAlignment(); + unsigned TDAlign = TD->getPrefTypeAlignment(F->getType()); + unsigned Align = std::max(FnAlign, TDAlign); + assert(!(Align & (Align-1)) && "Alignment is not a power of two!"); + // Get the ELF Section that this function belongs in. - ES = &EW.getSection(".text", ELFWriter::ELFSection::SHT_PROGBITS, - ELFWriter::ELFSection::SHF_EXECINSTR | - ELFWriter::ELFSection::SHF_ALLOC); - OutBuffer = &ES->SectionData; - cerr << "FIXME: This code needs to be updated for changes in the " - << "CodeEmitter interfaces. In particular, this should set " - << "BufferBegin/BufferEnd/CurBufferPtr, not deal with OutBuffer!"; - abort(); + ES = &EW.getTextSection(); + + // FIXME: better memory management, this will be replaced by BinaryObjects + ES->SectionData.reserve(4096); + BufferBegin = &ES->SectionData[0]; + BufferEnd = BufferBegin + ES->SectionData.capacity(); // Upgrade the section alignment if required. if (ES->Align < Align) ES->Align = Align; - // Add padding zeros to the end of the buffer to make sure that the - // function will start on the correct byte alignment within the section. - OutputBuffer OB(*OutBuffer, - TM.getTargetData()->getPointerSizeInBits() == 64, - TM.getTargetData()->isLittleEndian()); - OB.align(Align); - FnStart = OutBuffer->size(); + // Round the size up to the correct alignment for starting the new function. 
+ ES->Size = (ES->Size + (Align-1)) & (-Align); + + // Sanity check on allocated space for text section + assert( ES->Size < 4096 && "no more space in TextSection" ); + + // FIXME: Using ES->Size directly here instead of calculating it from the + // output buffer size (impossible because the code emitter deals only in raw + // bytes) forces us to manually synchronize size and write padding zero bytes + // to the output buffer for all non-text sections. For text sections, we do + // not synchronize the output buffer, and we just blow up if anyone tries to + // write non-code to it. An assert should probably be added to + // AddSymbolToSection to prevent calling it on the text section. + CurBufferPtr = BufferBegin + ES->Size; + + // Record function start address relative to BufferBegin + FnStartPtr = CurBufferPtr; } /// finishFunction - This callback is invoked after the function is completely /// finished. -bool ELFCodeEmitter::finishFunction(MachineFunction &F) { - // We now know the size of the function, add a symbol to represent it. - ELFWriter::ELFSym FnSym(F.getFunction()); +bool ELFCodeEmitter::finishFunction(MachineFunction &MF) { + // Add a symbol to represent the function. + ELFSym FnSym(MF.getFunction()); // Figure out the binding (linkage) of the symbol. - switch (F.getFunction()->getLinkage()) { + switch (MF.getFunction()->getLinkage()) { default: // appending linkage is illegal for functions. assert(0 && "Unknown linkage type!"); case GlobalValue::ExternalLinkage: - FnSym.SetBind(ELFWriter::ELFSym::STB_GLOBAL); + FnSym.SetBind(ELFSym::STB_GLOBAL); break; case GlobalValue::LinkOnceAnyLinkage: case GlobalValue::LinkOnceODRLinkage: case GlobalValue::WeakAnyLinkage: case GlobalValue::WeakODRLinkage: - FnSym.SetBind(ELFWriter::ELFSym::STB_WEAK); + FnSym.SetBind(ELFSym::STB_WEAK); break; case GlobalValue::PrivateLinkage: assert (0 && "PrivateLinkage should not be in the symbol table."); case GlobalValue::InternalLinkage: - FnSym.SetBind(ELFWriter::ELFSym::STB_LOCAL); + FnSym.SetBind(ELFSym::STB_LOCAL); break; } - ES->Size = OutBuffer->size(); + // Set the symbol type as a function + FnSym.SetType(ELFSym::STT_FUNC); - FnSym.SetType(ELFWriter::ELFSym::STT_FUNC); FnSym.SectionIdx = ES->SectionIdx; - FnSym.Value = FnStart; // Value = Offset from start of Section. - FnSym.Size = OutBuffer->size()-FnStart; + FnSym.Size = CurBufferPtr-FnStartPtr; + + // Offset from start of Section + FnSym.Value = FnStartPtr-BufferBegin; // Finally, add it to the symtab. EW.SymbolTable.push_back(FnSym); + + // Update Section Size + ES->Size = CurBufferPtr - BufferBegin; return false; } diff --git a/lib/CodeGen/ELFCodeEmitter.h b/lib/CodeGen/ELFCodeEmitter.h index 11ebcc8..e9ee936 100644 --- a/lib/CodeGen/ELFCodeEmitter.h +++ b/lib/CodeGen/ELFCodeEmitter.h @@ -21,11 +21,10 @@ namespace llvm { class ELFCodeEmitter : public MachineCodeEmitter { ELFWriter &EW; TargetMachine &TM; - ELFWriter::ELFSection *ES; // Section to write to. - std::vector<unsigned char> *OutBuffer; - size_t FnStart; + ELFSection *ES; // Section to write to.
+ uint8_t *FnStartPtr; public: - explicit ELFCodeEmitter(ELFWriter &ew) : EW(ew), TM(EW.TM), OutBuffer(0) {} + explicit ELFCodeEmitter(ELFWriter &ew) : EW(ew), TM(EW.TM) {} void startFunction(MachineFunction &F); bool finishFunction(MachineFunction &F); diff --git a/lib/CodeGen/ELFWriter.cpp b/lib/CodeGen/ELFWriter.cpp index be8edce..24f12a3 100644 --- a/lib/CodeGen/ELFWriter.cpp +++ b/lib/CodeGen/ELFWriter.cpp @@ -33,6 +33,7 @@ #include "ELFWriter.h" #include "ELFCodeEmitter.h" +#include "ELF.h" #include "llvm/Module.h" #include "llvm/PassManager.h" #include "llvm/DerivedTypes.h" @@ -67,7 +68,8 @@ MachineCodeEmitter *llvm::AddELFWriter(PassManagerBase &PM, ELFWriter::ELFWriter(raw_ostream &o, TargetMachine &tm) : MachineFunctionPass(&ID), O(o), TM(tm) { - e_flags = 0; // e_flags defaults to 0, no flags. + e_flags = 0; // e_flags defaults to 0, no flags. + e_machine = TM.getELFWriterInfo()->getEMachine(); is64Bit = TM.getTargetData()->getPointerSizeInBits() == 64; isLittleEndian = TM.getTargetData()->isLittleEndian(); @@ -90,24 +92,39 @@ bool ELFWriter::doInitialization(Module &M) { std::vector<unsigned char> &FH = FileHeader; OutputBuffer FHOut(FH, is64Bit, isLittleEndian); - FHOut.outbyte(0x7F); // EI_MAG0 - FHOut.outbyte('E'); // EI_MAG1 - FHOut.outbyte('L'); // EI_MAG2 - FHOut.outbyte('F'); // EI_MAG3 - FHOut.outbyte(is64Bit ? 2 : 1); // EI_CLASS - FHOut.outbyte(isLittleEndian ? 1 : 2); // EI_DATA - FHOut.outbyte(1); // EI_VERSION - FH.resize(16); // EI_PAD up to 16 bytes. - - // This should change for shared objects. - FHOut.outhalf(1); // e_type = ET_REL - FHOut.outhalf(TM.getELFWriterInfo()->getEMachine()); // target-defined - FHOut.outword(1); // e_version = 1 - FHOut.outaddr(0); // e_entry = 0 -> no entry point in .o file - FHOut.outaddr(0); // e_phoff = 0 -> no program header for .o - - ELFHeader_e_shoff_Offset = FH.size(); - FHOut.outaddr(0); // e_shoff + unsigned ElfClass = is64Bit ? ELFCLASS64 : ELFCLASS32; + unsigned ElfEndian = isLittleEndian ? ELFDATA2LSB : ELFDATA2MSB; + + // ELF Header + // ---------- + // Fields e_shnum e_shstrndx are only known after all section have + // been emitted. They locations in the ouput buffer are recorded so + // to be patched up later. + // + // Note + // ---- + // FHOut.outaddr method behaves differently for ELF32 and ELF64 writing + // 4 bytes in the former and 8 in the last for *_off and *_addr elf types + + FHOut.outbyte(0x7f); // e_ident[EI_MAG0] + FHOut.outbyte('E'); // e_ident[EI_MAG1] + FHOut.outbyte('L'); // e_ident[EI_MAG2] + FHOut.outbyte('F'); // e_ident[EI_MAG3] + + FHOut.outbyte(ElfClass); // e_ident[EI_CLASS] + FHOut.outbyte(ElfEndian); // e_ident[EI_DATA] + FHOut.outbyte(EV_CURRENT); // e_ident[EI_VERSION] + + FH.resize(16); // e_ident[EI_NIDENT-EI_PAD] + + FHOut.outhalf(ET_REL); // e_type + FHOut.outhalf(e_machine); // e_machine = target + FHOut.outword(EV_CURRENT); // e_version + FHOut.outaddr(0); // e_entry = 0 -> no entry point in .o file + FHOut.outaddr(0); // e_phoff = 0 -> no program header for .o + + ELFHdr_e_shoff_Offset = FH.size(); + FHOut.outaddr(0); // e_shoff = sec hdr table off in bytes FHOut.outword(e_flags); // e_flags = whatever the target wants FHOut.outhalf(is64Bit ? 64 : 52); // e_ehsize = ELF header size @@ -115,14 +132,16 @@ bool ELFWriter::doInitialization(Module &M) { FHOut.outhalf(0); // e_phnum = # prog header entries = 0 FHOut.outhalf(is64Bit ? 
64 : 40); // e_shentsize = sect hdr entry size + // e_shnum = # of section header ents + ELFHdr_e_shnum_Offset = FH.size(); + FHOut.outhalf(0); - ELFHeader_e_shnum_Offset = FH.size(); - FHOut.outhalf(0); // e_shnum = # of section header ents - ELFHeader_e_shstrndx_Offset = FH.size(); - FHOut.outhalf(0); // e_shstrndx = Section # of '.shstrtab' + // e_shstrndx = Section # of '.shstrtab' + ELFHdr_e_shstrndx_Offset = FH.size(); + FHOut.outhalf(0); // Add the null section, which is required to be first in the file. - getSection("", 0, 0); + getSection("", ELFSection::SHT_NULL, 0); // Start up the symbol table. The first entry in the symtab is the null // entry. @@ -334,7 +353,7 @@ void ELFWriter::EmitSectionTableStringTable() { // Now that we know which section number is the .shstrtab section, update the // e_shstrndx entry in the ELF header. OutputBuffer FHOut(FileHeader, is64Bit, isLittleEndian); - FHOut.fixhalf(SHStrTab.SectionIdx, ELFHeader_e_shstrndx_Offset); + FHOut.fixhalf(SHStrTab.SectionIdx, ELFHdr_e_shstrndx_Offset); // Set the NameIdx of each section in the string table and emit the bytes for // the string table. @@ -386,11 +405,11 @@ void ELFWriter::OutputSectionsAndSectionTable() { // Now that we know where all of the sections will be emitted, set the e_shnum // entry in the ELF header. OutputBuffer FHOut(FileHeader, is64Bit, isLittleEndian); - FHOut.fixhalf(NumSections, ELFHeader_e_shnum_Offset); + FHOut.fixhalf(NumSections, ELFHdr_e_shnum_Offset); // Now that we know the offset in the file of the section table, update the // e_shoff address in the ELF header. - FHOut.fixaddr(FileOff, ELFHeader_e_shoff_Offset); + FHOut.fixaddr(FileOff, ELFHdr_e_shoff_Offset); // Now that we know all of the data in the file header, emit it and all of the // sections! diff --git a/lib/CodeGen/ELFWriter.h b/lib/CodeGen/ELFWriter.h index 31aa05a..0389185 100644 --- a/lib/CodeGen/ELFWriter.h +++ b/lib/CodeGen/ELFWriter.h @@ -15,6 +15,7 @@ #define ELFWRITER_H #include "llvm/CodeGen/MachineFunctionPass.h" +#include "ELF.h" #include <list> #include <map> @@ -82,10 +83,8 @@ namespace llvm { /// doInitialization - Emit the file header and all of the global variables /// for the module to the ELF file. bool doInitialization(Module &M); - bool runOnMachineFunction(MachineFunction &MF); - /// doFinalization - Now that the module has been completely processed, emit /// the ELF file to 'O'. bool doFinalization(Module &M); @@ -96,53 +95,6 @@ namespace llvm { // as well!). DataBuffer FileHeader; - /// ELFSection - This struct contains information about each section that is - /// emitted to the file. This is eventually turned into the section header - /// table at the end of the file. - struct ELFSection { - std::string Name; // Name of the section. - unsigned NameIdx; // Index in .shstrtab of name, once emitted. - unsigned Type; - unsigned Flags; - uint64_t Addr; - unsigned Offset; - unsigned Size; - unsigned Link; - unsigned Info; - unsigned Align; - unsigned EntSize; - - /// SectionIdx - The number of the section in the Section Table. - /// - unsigned short SectionIdx; - - /// SectionData - The actual data for this section which we are building - /// up for emission to the file. 
- DataBuffer SectionData; - - enum { SHT_NULL = 0, SHT_PROGBITS = 1, SHT_SYMTAB = 2, SHT_STRTAB = 3, - SHT_RELA = 4, SHT_HASH = 5, SHT_DYNAMIC = 6, SHT_NOTE = 7, - SHT_NOBITS = 8, SHT_REL = 9, SHT_SHLIB = 10, SHT_DYNSYM = 11 }; - enum { SHN_UNDEF = 0, SHN_ABS = 0xFFF1, SHN_COMMON = 0xFFF2 }; - enum { // SHF - ELF Section Header Flags - SHF_WRITE = 1 << 0, // Writable - SHF_ALLOC = 1 << 1, // Mapped into the process addr space - SHF_EXECINSTR = 1 << 2, // Executable - SHF_MERGE = 1 << 4, // Might be merged if equal - SHF_STRINGS = 1 << 5, // Contains null-terminated strings - SHF_INFO_LINK = 1 << 6, // 'sh_info' contains SHT index - SHF_LINK_ORDER = 1 << 7, // Preserve order after combining - SHF_OS_NONCONFORMING = 1 << 8, // nonstandard OS support required - SHF_GROUP = 1 << 9, // Section is a member of a group - SHF_TLS = 1 << 10 // Section holds thread-local data - }; - - ELFSection(const std::string &name) - : Name(name), Type(0), Flags(0), Addr(0), Offset(0), Size(0), - Link(0), Info(0), Align(0), EntSize(0) { - } - }; - /// SectionList - This is the list of sections that we have emitted to the /// file. Once the file has been completely built, the section header table /// is constructed from this info. @@ -165,9 +117,15 @@ namespace llvm { SN->SectionIdx = NumSections++; SN->Type = Type; SN->Flags = Flags; + SN->Link = ELFSection::SHN_UNDEF; return *SN; } + ELFSection &getTextSection() { + return getSection(".text", ELFSection::SHT_PROGBITS, + ELFSection::SHF_EXECINSTR | ELFSection::SHF_ALLOC); + } + ELFSection &getDataSection() { return getSection(".data", ELFSection::SHT_PROGBITS, ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC); @@ -177,34 +135,6 @@ namespace llvm { ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC); } - /// ELFSym - This struct contains information about each symbol that is - /// added to logical symbol table for the module. This is eventually - /// turned into a real symbol table in the file. - struct ELFSym { - const GlobalValue *GV; // The global value this corresponds to. - unsigned NameIdx; // Index in .strtab of name, once emitted. - uint64_t Value; - unsigned Size; - unsigned char Info; - unsigned char Other; - unsigned short SectionIdx; - - enum { STB_LOCAL = 0, STB_GLOBAL = 1, STB_WEAK = 2 }; - enum { STT_NOTYPE = 0, STT_OBJECT = 1, STT_FUNC = 2, STT_SECTION = 3, - STT_FILE = 4 }; - ELFSym(const GlobalValue *gv) : GV(gv), Value(0), Size(0), Info(0), - Other(0), SectionIdx(0) {} - - void SetBind(unsigned X) { - assert(X == (X & 0xF) && "Bind value out of range!"); - Info = (Info & 0x0F) | (X << 4); - } - void SetType(unsigned X) { - assert(X == (X & 0xF) && "Type value out of range!"); - Info = (Info & 0xF0) | X; - } - }; - /// SymbolTable - This is the list of symbols we have emitted to the file. /// This actually gets rearranged before emission to the file (to put the /// local symbols first in the list). @@ -214,9 +144,9 @@ namespace llvm { // (e.g. the location of the section table). These members keep track of // the offset in ELFHeader of these various pieces to update and other // locations in the file. - unsigned ELFHeader_e_shoff_Offset; // e_shoff in ELF header. - unsigned ELFHeader_e_shstrndx_Offset; // e_shstrndx in ELF header. - unsigned ELFHeader_e_shnum_Offset; // e_shnum in ELF header. + unsigned ELFHdr_e_shoff_Offset; // e_shoff in ELF header. + unsigned ELFHdr_e_shstrndx_Offset; // e_shstrndx in ELF header. + unsigned ELFHdr_e_shnum_Offset; // e_shnum in ELF header. 
private: void EmitGlobal(GlobalVariable *GV); diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index b8c8563..c351593 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -28,7 +28,6 @@ #include "llvm/Support/Streams.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/FoldingSet.h" -#include <ostream> using namespace llvm; //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/RegAllocLinearScan.cpp b/lib/CodeGen/RegAllocLinearScan.cpp index 8520888..804fae5 100644 --- a/lib/CodeGen/RegAllocLinearScan.cpp +++ b/lib/CodeGen/RegAllocLinearScan.cpp @@ -40,7 +40,6 @@ #include <queue> #include <memory> #include <cmath> -#include <iostream> using namespace llvm; @@ -399,7 +398,7 @@ unsigned RALinScan::attemptTrivialCoalescing(LiveInterval &cur, unsigned Reg) { } ++NumCoalesce; - return SrcReg; + return PhysReg; } return Reg; @@ -543,13 +542,37 @@ void RALinScan::linearScan() // Ignore splited live intervals. if (!isPhys && vrm_->getPreSplitReg(cur.reg)) continue; + + // A register defined by an implicit_def can be live out of the def BB and + // live in to a use BB. Add it to the live-in set of the use BBs. + if (!isPhys && cur.empty()) { + if (MachineInstr *DefMI = mri_->getVRegDef(cur.reg)) { + assert(DefMI->getOpcode() == TargetInstrInfo::IMPLICIT_DEF); + MachineBasicBlock *DefMBB = DefMI->getParent(); + SmallPtrSet<MachineBasicBlock*, 4> Seen; + Seen.insert(DefMBB); + for (MachineRegisterInfo::reg_iterator ri = mri_->reg_begin(cur.reg), + re = mri_->reg_end(); ri != re; ++ri) { + MachineInstr *UseMI = &*ri; + MachineBasicBlock *UseMBB = UseMI->getParent(); + if (Seen.insert(UseMBB)) { + assert(TargetRegisterInfo::isPhysicalRegister(Reg) && + "Adding a virtual register to livein set?"); + UseMBB->addLiveIn(Reg); + } + } + } + } for (LiveInterval::Ranges::const_iterator I = cur.begin(), E = cur.end(); I != E; ++I) { const LiveRange &LR = *I; if (li_->findLiveInMBBs(LR.start, LR.end, LiveInMBBs)) { for (unsigned i = 0, e = LiveInMBBs.size(); i != e; ++i) if (LiveInMBBs[i] != EntryMBB) { + assert(TargetRegisterInfo::isPhysicalRegister(Reg) && + "Adding a virtual register to livein set?"); LiveInMBBs[i]->addLiveIn(Reg); + } LiveInMBBs.clear(); } } @@ -1192,7 +1215,6 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) // The earliest start of a Spilled interval indicates up to where // in handled we need to roll back - unsigned earliestStart = cur->beginNumber(); LiveInterval *earliestStartInterval = cur; // Spill live intervals of virtual regs mapped to the physical register we @@ -1206,19 +1228,10 @@ LiveInterval *sli = spillIs.back(); spillIs.pop_back(); DOUT << "\t\t\tspilling(a): " << *sli << '\n'; - earliestStart = std::min(earliestStart, sli->beginNumber()); earliestStartInterval = (earliestStartInterval->beginNumber() < sli->beginNumber()) ?
earliestStartInterval : sli; - - if (earliestStartInterval->beginNumber()!=earliestStart) { - epicFail |= true; - std::cerr << "What the 1 - " - << "earliestStart = " << earliestStart - << "earliestStartInterval = " << earliestStartInterval->beginNumber() - << "\n"; - } - + std::vector<LiveInterval*> newIs; if (!NewSpillFramework) { newIs = li_->addIntervalsForSpills(*sli, spillIs, loopInfo, *vrm_); @@ -1229,20 +1242,12 @@ void RALinScan::assignRegOrStackSlotAtInterval(LiveInterval* cur) std::copy(newIs.begin(), newIs.end(), std::back_inserter(added)); spilled.insert(sli->reg); - if (earliestStartInterval->beginNumber()!=earliestStart) { - epicFail |= true; - std::cerr << "What the 2 - " - << "earliestStart = " << earliestStart - << "earliestStartInterval = " << earliestStartInterval->beginNumber() - << "\n"; - } - if (epicFail) { //abort(); } } - earliestStart = earliestStartInterval->beginNumber(); + unsigned earliestStart = earliestStartInterval->beginNumber(); DOUT << "\t\trolling back to: " << earliestStart << '\n'; diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4c1710d..609ec82 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3626,30 +3626,29 @@ static SDNode *getBuildPairElt(SDNode *N, unsigned i) { SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, MVT VT) { assert(N->getOpcode() == ISD::BUILD_PAIR); - SDNode *LD1 = getBuildPairElt(N, 0); - if (!ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse()) + LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0)); + LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1)); + if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse()) return SDValue(); MVT LD1VT = LD1->getValueType(0); - SDNode *LD2 = getBuildPairElt(N, 1); const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() && // If both are volatile this would reduce the number of volatile loads. // If one is volatile it might be ok, but play conservative and bail out. - !cast<LoadSDNode>(LD1)->isVolatile() && - !cast<LoadSDNode>(LD2)->isVolatile() && + !LD1->isVolatile() && + !LD2->isVolatile() && TLI.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1, MFI)) { - LoadSDNode *LD = cast<LoadSDNode>(LD1); - unsigned Align = LD->getAlignment(); + unsigned Align = LD1->getAlignment(); unsigned NewAlign = TLI.getTargetData()-> getABITypeAlignment(VT.getTypeForMVT()); if (NewAlign <= Align && (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) - return DAG.getLoad(VT, N->getDebugLoc(), LD->getChain(), LD->getBasePtr(), - LD->getSrcValue(), LD->getSrcValueOffset(), - false, Align); + return DAG.getLoad(VT, N->getDebugLoc(), LD1->getChain(), + LD1->getBasePtr(), LD1->getSrcValue(), + LD1->getSrcValueOffset(), false, Align); } return SDValue(); @@ -4019,6 +4018,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { // fold (fmul A, 0) -> 0 if (UnsafeFPMath && N1CFP && N1CFP->getValueAPF().isZero()) return N1; + // fold (fmul A, 0) -> 0, vector edition. 
+ if (UnsafeFPMath && ISD::isBuildVectorAllZeros(N1.getNode())) + return N1; // fold (fmul X, 2.0) -> (fadd X, X) if (N1CFP && N1CFP->isExactlyValue(+2.0)) return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N0); diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 6becff3..4a7dbeb 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -639,18 +639,18 @@ FastISel::FastEmitBranch(MachineBasicBlock *MSucc) { bool FastISel::SelectOperator(User *I, unsigned Opcode) { switch (Opcode) { - case Instruction::Add: { - ISD::NodeType Opc = I->getType()->isFPOrFPVector() ? ISD::FADD : ISD::ADD; - return SelectBinaryOp(I, Opc); - } - case Instruction::Sub: { - ISD::NodeType Opc = I->getType()->isFPOrFPVector() ? ISD::FSUB : ISD::SUB; - return SelectBinaryOp(I, Opc); - } - case Instruction::Mul: { - ISD::NodeType Opc = I->getType()->isFPOrFPVector() ? ISD::FMUL : ISD::MUL; - return SelectBinaryOp(I, Opc); - } + case Instruction::Add: + return SelectBinaryOp(I, ISD::ADD); + case Instruction::FAdd: + return SelectBinaryOp(I, ISD::FADD); + case Instruction::Sub: + return SelectBinaryOp(I, ISD::SUB); + case Instruction::FSub: + return SelectBinaryOp(I, ISD::FSUB); + case Instruction::Mul: + return SelectBinaryOp(I, ISD::MUL); + case Instruction::FMul: + return SelectBinaryOp(I, ISD::FMUL); case Instruction::SDiv: return SelectBinaryOp(I, ISD::SDIV); case Instruction::UDiv: diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 2cd67e6..5ae183e 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -116,6 +116,8 @@ private: /// result. SDValue LegalizeOp(SDValue O); + SDValue OptimizeFloatStore(StoreSDNode *ST); + /// PerformInsertVectorEltInMemory - Some target cannot handle a variable /// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it /// is necessary to spill the vector being inserted into to memory, perform @@ -165,6 +167,7 @@ private: SDValue ExpandBitCount(unsigned Opc, SDValue Op, DebugLoc dl); SDValue ExpandExtractFromVectorThroughStack(SDValue Op); + SDValue ExpandVectorBuildThroughStack(SDNode* Node); void ExpandNode(SDNode *Node, SmallVectorImpl<SDValue> &Results); void PromoteNode(SDNode *Node, SmallVectorImpl<SDValue> &Results); @@ -681,6 +684,59 @@ ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, DebugLoc dl) { return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl); } +SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { + // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' + // FIXME: We shouldn't do this for TargetConstantFP's. + // FIXME: move this to the DAG Combiner! Note that we can't regress due + // to phase ordering between legalized code and the dag combiner. This + // probably means that we need to integrate dag combiner and legalizer + // together. + // We generally can't do this one for long doubles. + SDValue Tmp1 = ST->getChain(); + SDValue Tmp2 = ST->getBasePtr(); + SDValue Tmp3; + int SVOffset = ST->getSrcValueOffset(); + unsigned Alignment = ST->getAlignment(); + bool isVolatile = ST->isVolatile(); + DebugLoc dl = ST->getDebugLoc(); + if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) { + if (CFP->getValueType(0) == MVT::f32 && + getTypeAction(MVT::i32) == Legal) { + Tmp3 = DAG.getConstant(CFP->getValueAPF(). 
+ bitcastToAPInt().zextOrTrunc(32), + MVT::i32); + return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(), + SVOffset, isVolatile, Alignment); + } else if (CFP->getValueType(0) == MVT::f64) { + // If this target supports 64-bit registers, do a single 64-bit store. + if (getTypeAction(MVT::i64) == Legal) { + Tmp3 = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). + zextOrTrunc(64), MVT::i64); + return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(), + SVOffset, isVolatile, Alignment); + } else if (getTypeAction(MVT::i32) == Legal && !ST->isVolatile()) { + // Otherwise, if the target supports 32-bit registers, use 2 32-bit + // stores. If the target supports neither 32- nor 64-bits, this + // xform is certainly not worth it. + const APInt &IntVal =CFP->getValueAPF().bitcastToAPInt(); + SDValue Lo = DAG.getConstant(APInt(IntVal).trunc(32), MVT::i32); + SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), MVT::i32); + if (TLI.isBigEndian()) std::swap(Lo, Hi); + + Lo = DAG.getStore(Tmp1, dl, Lo, Tmp2, ST->getSrcValue(), + SVOffset, isVolatile, Alignment); + Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, + DAG.getIntPtrConstant(4)); + Hi = DAG.getStore(Tmp1, dl, Hi, Tmp2, ST->getSrcValue(), SVOffset+4, + isVolatile, MinAlign(Alignment, 4U)); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); + } + } + } + return SDValue(); +} + /// LegalizeOp - We know that the specified value has a legal type, and /// that its operands are legal. Now ensure that the operation itself /// is legal, recursively ensuring that the operands' operations remain @@ -1293,50 +1349,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) { bool isVolatile = ST->isVolatile(); if (!ST->isTruncatingStore()) { - // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' - // FIXME: We shouldn't do this for TargetConstantFP's. - // FIXME: move this to the DAG Combiner! Note that we can't regress due - // to phase ordering between legalized code and the dag combiner. This - // probably means that we need to integrate dag combiner and legalizer - // together. - // We generally can't do this one for long doubles. - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) { - if (CFP->getValueType(0) == MVT::f32 && - getTypeAction(MVT::i32) == Legal) { - Tmp3 = DAG.getConstant(CFP->getValueAPF(). - bitcastToAPInt().zextOrTrunc(32), - MVT::i32); - Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(), - SVOffset, isVolatile, Alignment); - break; - } else if (CFP->getValueType(0) == MVT::f64) { - // If this target supports 64-bit registers, do a single 64-bit store. - if (getTypeAction(MVT::i64) == Legal) { - Tmp3 = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). - zextOrTrunc(64), MVT::i64); - Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getSrcValue(), - SVOffset, isVolatile, Alignment); - break; - } else if (getTypeAction(MVT::i32) == Legal && !ST->isVolatile()) { - // Otherwise, if the target supports 32-bit registers, use 2 32-bit - // stores. If the target supports neither 32- nor 64-bits, this - // xform is certainly not worth it. 
- const APInt &IntVal =CFP->getValueAPF().bitcastToAPInt(); - SDValue Lo = DAG.getConstant(APInt(IntVal).trunc(32), MVT::i32); - SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), MVT::i32); - if (TLI.isBigEndian()) std::swap(Lo, Hi); - - Lo = DAG.getStore(Tmp1, dl, Lo, Tmp2, ST->getSrcValue(), - SVOffset, isVolatile, Alignment); - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(4)); - Hi = DAG.getStore(Tmp1, dl, Hi, Tmp2, ST->getSrcValue(), SVOffset+4, - isVolatile, MinAlign(Alignment, 4U)); - - Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); - break; - } - } + if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) { + Result = SDValue(OptStore, 0); + break; } { @@ -1510,6 +1525,46 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, NULL, 0); } +SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { + // We can't handle this case efficiently. Allocate a sufficiently + // aligned object on the stack, store each element into it, then load + // the result as a vector. + // Create the stack frame object. + MVT VT = Node->getValueType(0); + MVT OpVT = Node->getOperand(0).getValueType(); + DebugLoc dl = Node->getDebugLoc(); + SDValue FIPtr = DAG.CreateStackTemporary(VT); + int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex(); + const Value *SV = PseudoSourceValue::getFixedStack(FI); + + // Emit a store of each element to the stack slot. + SmallVector<SDValue, 8> Stores; + unsigned TypeByteSize = OpVT.getSizeInBits() / 8; + // Store (in the right endianness) the elements to memory. + for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) { + // Ignore undef elements. + if (Node->getOperand(i).getOpcode() == ISD::UNDEF) continue; + + unsigned Offset = TypeByteSize*i; + + SDValue Idx = DAG.getConstant(Offset, FIPtr.getValueType()); + Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx); + + Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, Node->getOperand(i), + Idx, SV, Offset)); + } + + SDValue StoreChain; + if (!Stores.empty()) // Not all undef elements? + StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &Stores[0], Stores.size()); + else + StoreChain = DAG.getEntryNode(); + + // Result is a load from the stack slot. + return DAG.getLoad(VT, dl, StoreChain, FIPtr, SV, 0); +} + SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) { DebugLoc dl = Node->getDebugLoc(); SDValue Tmp1 = Node->getOperand(0); @@ -1853,40 +1908,8 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) { } } - // Otherwise, we can't handle this case efficiently. Allocate a sufficiently - // aligned object on the stack, store each element into it, then load - // the result as a vector. - // Create the stack frame object. - SDValue FIPtr = DAG.CreateStackTemporary(VT); - int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex(); - const Value *SV = PseudoSourceValue::getFixedStack(FI); - - // Emit a store of each element to the stack slot. - SmallVector<SDValue, 8> Stores; - unsigned TypeByteSize = OpVT.getSizeInBits() / 8; - // Store (in the right endianness) the elements to memory. - for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) { - // Ignore undef elements. 
- if (Node->getOperand(i).getOpcode() == ISD::UNDEF) continue; - - unsigned Offset = TypeByteSize*i; - - SDValue Idx = DAG.getConstant(Offset, FIPtr.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx); - - Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, Node->getOperand(i), - Idx, SV, Offset)); - } - - SDValue StoreChain; - if (!Stores.empty()) // Not all undef elements? - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &Stores[0], Stores.size()); - else - StoreChain = DAG.getEntryNode(); - - // Result is a load from the stack slot. - return DAG.getLoad(VT, dl, StoreChain, FIPtr, SV, 0); + // Otherwise, we can't handle this case efficiently. + return ExpandVectorBuildThroughStack(Node); } // ExpandLibCall - Expand a node into a call to a libcall. If the result value @@ -2437,23 +2460,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node, Results.push_back(ExpandExtractFromVectorThroughStack(SDValue(Node, 0))); break; case ISD::CONCAT_VECTORS: { - // Use extract/insert/build vector for now. We might try to be - // more clever later. - SmallVector<SDValue, 8> Ops; - unsigned NumOperands = Node->getNumOperands(); - for (unsigned i=0; i < NumOperands; ++i) { - SDValue SubOp = Node->getOperand(i); - MVT VVT = SubOp.getNode()->getValueType(0); - MVT EltVT = VVT.getVectorElementType(); - unsigned NumSubElem = VVT.getVectorNumElements(); - for (unsigned j=0; j < NumSubElem; ++j) { - Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, - DAG.getIntPtrConstant(j))); - } - } - Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), - &Ops[0], Ops.size()); - Results.push_back(Tmp1); + Results.push_back(ExpandVectorBuildThroughStack(Node)); break; } case ISD::SCALAR_TO_VECTOR: diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index eb9342c..0c826f6 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -356,13 +356,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) { unsigned NewOpc = N->getOpcode(); DebugLoc dl = N->getDebugLoc(); - // If we're promoting a UINT to a larger size, check to see if the new node - // will be legal. If it isn't, check to see if FP_TO_SINT is legal, since - // we can use that instead. This allows us to generate better code for - // FP_TO_UINT for small destination sizes on targets where FP_TO_UINT is not - // legal, such as PowerPC. + // If we're promoting a UINT to a larger size and the larger FP_TO_UINT is + // not Legal, check to see if we can use FP_TO_SINT instead. (If both UINT + // and SINT conversions are Custom, there is no way to tell which is preferable. + // We choose SINT because that's the right thing on PPC.) 
if (N->getOpcode() == ISD::FP_TO_UINT && - !TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NVT) && + !TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) && TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT)) NewOpc = ISD::FP_TO_SINT; @@ -1747,7 +1746,9 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, DebugLoc dl = N->getDebugLoc(); RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; - if (VT == MVT::i32) + if (VT == MVT::i16) + LC = RTLIB::SDIV_I16; + else if (VT == MVT::i32) LC = RTLIB::SDIV_I32; else if (VT == MVT::i64) LC = RTLIB::SDIV_I64; @@ -1909,7 +1910,9 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N, DebugLoc dl = N->getDebugLoc(); RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; - if (VT == MVT::i32) + if (VT == MVT::i16) + LC = RTLIB::SREM_I16; + else if (VT == MVT::i32) LC = RTLIB::SREM_I32; else if (VT == MVT::i64) LC = RTLIB::SREM_I64; @@ -1938,7 +1941,9 @@ void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N, DebugLoc dl = N->getDebugLoc(); RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; - if (VT == MVT::i32) + if (VT == MVT::i16) + LC = RTLIB::UDIV_I16; + else if (VT == MVT::i32) LC = RTLIB::UDIV_I32; else if (VT == MVT::i64) LC = RTLIB::UDIV_I64; @@ -1956,7 +1961,9 @@ void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, DebugLoc dl = N->getDebugLoc(); RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; - if (VT == MVT::i32) + if (VT == MVT::i16) + LC = RTLIB::UREM_I16; + else if (VT == MVT::i32) LC = RTLIB::UREM_I32; else if (VT == MVT::i64) LC = RTLIB::UREM_I64; diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index df9af21..335c73c 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -129,6 +129,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { if (!HasVectorValue) return TranslateLegalizeResults(Op, Result); + MVT QueryType; switch (Op.getOpcode()) { default: return TranslateLegalizeResults(Op, Result); @@ -162,8 +163,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::ANY_EXTEND: case ISD::TRUNCATE: case ISD::SIGN_EXTEND: - case ISD::SINT_TO_FP: - case ISD::UINT_TO_FP: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::FNEG: @@ -183,10 +182,15 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FRINT: case ISD::FNEARBYINT: case ISD::FFLOOR: + QueryType = Node->getValueType(0); + break; + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + QueryType = Node->getOperand(0).getValueType(); break; } - switch (TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0))) { + switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) { case TargetLowering::Promote: // "Promote" the operation by bitcasting Result = PromoteVectorOp(Op); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 195896e..a9adce8 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -154,7 +154,7 @@ bool ISD::isBuildVectorAllZeros(const SDNode *N) { // Do not accept an all-undef vector. if (i == e) return false; - // Do not accept build_vectors that aren't all constants or which have non-~0 + // Do not accept build_vectors that aren't all constants or which have non-0 // elements. 
SDValue Zero = N->getOperand(i); if (isa<ConstantSDNode>(Zero)) { @@ -166,7 +166,7 @@ bool ISD::isBuildVectorAllZeros(const SDNode *N) { } else return false; - // Okay, we have at least one ~0 value, check to see if the rest match or are + // Okay, we have at least one 0 value, check to see if the rest match or are // undefs. for (++i; i != e; ++i) if (N->getOperand(i) != Zero && @@ -2807,16 +2807,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, MVT VT, case ISD::ADDC: case ISD::ADDE: case ISD::SUB: - case ISD::FADD: - case ISD::FSUB: - case ISD::FMUL: - case ISD::FDIV: - case ISD::FREM: case ISD::UDIV: case ISD::SDIV: case ISD::UREM: case ISD::SREM: return N2; // fold op(arg1, undef) -> undef + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: + if (UnsafeFPMath) + return N2; + break; case ISD::MUL: case ISD::AND: case ISD::SRL: @@ -3059,7 +3062,7 @@ bool MeetsMaxMemopRequirement(std::vector<MVT> &MemOps, isSrcStr = isMemSrcFromString(Src, Str); bool isSrcConst = isa<ConstantSDNode>(Src); bool AllowUnalign = TLI.allowsUnalignedMemoryAccesses(); - MVT VT = TLI.getOptimalMemOpType(Size, Align, isSrcConst, isSrcStr); + MVT VT = TLI.getOptimalMemOpType(Size, Align, isSrcConst, isSrcStr, DAG); if (VT != MVT::iAny) { unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(VT.getTypeForMVT()); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp index 889d7f5..93750d6 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp @@ -842,20 +842,6 @@ void SelectionDAGLowering::visit(unsigned Opcode, User &I) { } } -void SelectionDAGLowering::visitAdd(User &I) { - if (I.getType()->isFPOrFPVector()) - visitBinary(I, ISD::FADD); - else - visitBinary(I, ISD::ADD); -} - -void SelectionDAGLowering::visitMul(User &I) { - if (I.getType()->isFPOrFPVector()) - visitBinary(I, ISD::FMUL); - else - visitBinary(I, ISD::MUL); -} - SDValue SelectionDAGLowering::getValue(const Value *V) { SDValue &N = NodeMap[V]; if (N.getNode()) return N; @@ -2161,37 +2147,33 @@ void SelectionDAGLowering::visitSwitch(SwitchInst &SI) { } -void SelectionDAGLowering::visitSub(User &I) { +void SelectionDAGLowering::visitFSub(User &I) { // -0.0 - X --> fneg const Type *Ty = I.getType(); if (isa<VectorType>(Ty)) { if (ConstantVector *CV = dyn_cast<ConstantVector>(I.getOperand(0))) { const VectorType *DestTy = cast<VectorType>(I.getType()); const Type *ElTy = DestTy->getElementType(); - if (ElTy->isFloatingPoint()) { - unsigned VL = DestTy->getNumElements(); - std::vector<Constant*> NZ(VL, ConstantFP::getNegativeZero(ElTy)); - Constant *CNZ = ConstantVector::get(&NZ[0], NZ.size()); - if (CV == CNZ) { - SDValue Op2 = getValue(I.getOperand(1)); - setValue(&I, DAG.getNode(ISD::FNEG, getCurDebugLoc(), - Op2.getValueType(), Op2)); - return; - } - } - } - } - if (Ty->isFloatingPoint()) { - if (ConstantFP *CFP = dyn_cast<ConstantFP>(I.getOperand(0))) - if (CFP->isExactlyValue(ConstantFP::getNegativeZero(Ty)->getValueAPF())) { + unsigned VL = DestTy->getNumElements(); + std::vector<Constant*> NZ(VL, ConstantFP::getNegativeZero(ElTy)); + Constant *CNZ = ConstantVector::get(&NZ[0], NZ.size()); + if (CV == CNZ) { SDValue Op2 = getValue(I.getOperand(1)); setValue(&I, DAG.getNode(ISD::FNEG, getCurDebugLoc(), Op2.getValueType(), Op2)); return; } + } } + if (ConstantFP *CFP = dyn_cast<ConstantFP>(I.getOperand(0))) + if 
(CFP->isExactlyValue(ConstantFP::getNegativeZero(Ty)->getValueAPF())) { + SDValue Op2 = getValue(I.getOperand(1)); + setValue(&I, DAG.getNode(ISD::FNEG, getCurDebugLoc(), + Op2.getValueType(), Op2)); + return; + } - visitBinary(I, Ty->isFPOrFPVector() ? ISD::FSUB : ISD::SUB); + visitBinary(I, ISD::FSUB); } void SelectionDAGLowering::visitBinary(User &I, unsigned OpCode) { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuild.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.h index 578aa591..057c841 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuild.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.h @@ -469,9 +469,12 @@ private: void visitBinary(User &I, unsigned OpCode); void visitShift(User &I, unsigned Opcode); - void visitAdd(User &I); - void visitSub(User &I); - void visitMul(User &I); + void visitAdd(User &I) { visitBinary(I, ISD::ADD); } + void visitFAdd(User &I) { visitBinary(I, ISD::FADD); } + void visitSub(User &I) { visitBinary(I, ISD::SUB); } + void visitFSub(User &I); + void visitMul(User &I) { visitBinary(I, ISD::MUL); } + void visitFMul(User &I) { visitBinary(I, ISD::FMUL); } void visitURem(User &I) { visitBinary(I, ISD::UREM); } void visitSRem(User &I) { visitBinary(I, ISD::SREM); } void visitFRem(User &I) { visitBinary(I, ISD::FREM); } diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 3334e53..ab4cd51 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2070,13 +2070,13 @@ bool TargetLowering::isGAPlusOffset(SDNode *N, GlobalValue* &GA, } -/// isConsecutiveLoad - Return true if LD (which must be a LoadSDNode) is -/// loading 'Bytes' bytes from a location that is 'Dist' units away from the -/// location that the 'Base' load is loading from. -bool TargetLowering::isConsecutiveLoad(SDNode *LD, SDNode *Base, - unsigned Bytes, int Dist, +/// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a +/// location that is 'Dist' units away from the location that the 'Base' load +/// is loading from. 
+bool TargetLowering::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, + unsigned Bytes, int Dist, const MachineFrameInfo *MFI) const { - if (LD->getOperand(0).getNode() != Base->getOperand(0).getNode()) + if (LD->getChain() != Base->getChain()) return false; MVT VT = LD->getValueType(0); if (VT.getSizeInBits() / 8 != Bytes) @@ -2094,6 +2094,11 @@ bool TargetLowering::isConsecutiveLoad(SDNode *LD, SDNode *Base, if (FS != BFS || FS != (int)Bytes) return false; return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); } + if (Loc.getOpcode() == ISD::ADD && Loc.getOperand(0) == BaseLoc) { + ConstantSDNode *V = dyn_cast<ConstantSDNode>(Loc.getOperand(1)); + if (V && (V->getSExtValue() == Dist*Bytes)) + return true; + } GlobalValue *GV1 = NULL; GlobalValue *GV2 = NULL; diff --git a/lib/CodeGen/VirtRegRewriter.cpp b/lib/CodeGen/VirtRegRewriter.cpp index c31f622..bd6584a 100644 --- a/lib/CodeGen/VirtRegRewriter.cpp +++ b/lib/CodeGen/VirtRegRewriter.cpp @@ -33,99 +33,21 @@ STATISTIC(NumSUnfold , "Number of stores unfolded"); STATISTIC(NumModRefUnfold, "Number of modref unfolded"); namespace { - enum RewriterName { simple, local, trivial }; + enum RewriterName { local, trivial }; } static cl::opt<RewriterName> RewriterOpt("rewriter", cl::desc("Rewriter to use: (default: local)"), cl::Prefix, - cl::values(clEnumVal(simple, "simple rewriter"), - clEnumVal(local, "local rewriter"), + cl::values(clEnumVal(local, "local rewriter"), clEnumVal(trivial, "trivial rewriter"), clEnumValEnd), cl::init(local)); VirtRegRewriter::~VirtRegRewriter() {} - -// ****************************** // -// Simple Spiller Implementation // -// ****************************** // - -struct VISIBILITY_HIDDEN SimpleRewriter : public VirtRegRewriter { - - bool runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM, - LiveIntervals* LIs) { - DOUT << "********** REWRITE MACHINE CODE **********\n"; - DOUT << "********** Function: " << MF.getFunction()->getName() << '\n'; - const TargetMachine &TM = MF.getTarget(); - const TargetInstrInfo &TII = *TM.getInstrInfo(); - const TargetRegisterInfo &TRI = *TM.getRegisterInfo(); - - - // LoadedRegs - Keep track of which vregs are loaded, so that we only load - // each vreg once (in the case where a spilled vreg is used by multiple - // operands). This is always smaller than the number of operands to the - // current machine instr, so it should be small. - std::vector<unsigned> LoadedRegs; - - for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); - MBBI != E; ++MBBI) { - DOUT << MBBI->getBasicBlock()->getName() << ":\n"; - MachineBasicBlock &MBB = *MBBI; - for (MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end(); - MII != E; ++MII) { - MachineInstr &MI = *MII; - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (MO.isReg() && MO.getReg()) { - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { - unsigned VirtReg = MO.getReg(); - unsigned SubIdx = MO.getSubReg(); - unsigned PhysReg = VRM.getPhys(VirtReg); - unsigned RReg = SubIdx ? 
TRI.getSubReg(PhysReg, SubIdx) : PhysReg; - if (!VRM.isAssignedReg(VirtReg)) { - int StackSlot = VRM.getStackSlot(VirtReg); - const TargetRegisterClass* RC = - MF.getRegInfo().getRegClass(VirtReg); - - if (MO.isUse() && - std::find(LoadedRegs.begin(), LoadedRegs.end(), VirtReg) - == LoadedRegs.end()) { - TII.loadRegFromStackSlot(MBB, &MI, PhysReg, StackSlot, RC); - MachineInstr *LoadMI = prior(MII); - VRM.addSpillSlotUse(StackSlot, LoadMI); - LoadedRegs.push_back(VirtReg); - ++NumLoads; - DOUT << '\t' << *LoadMI; - } - - if (MO.isDef()) { - TII.storeRegToStackSlot(MBB, next(MII), PhysReg, true, - StackSlot, RC); - MachineInstr *StoreMI = next(MII); - VRM.addSpillSlotUse(StackSlot, StoreMI); - ++NumStores; - } - } - MF.getRegInfo().setPhysRegUsed(RReg); - MI.getOperand(i).setReg(RReg); - MI.getOperand(i).setSubReg(0); - } else { - MF.getRegInfo().setPhysRegUsed(MO.getReg()); - } - } - } - - DOUT << '\t' << MI; - LoadedRegs.clear(); - } - } - return true; - } -}; /// This class is intended for use with the new spilling framework only. It /// rewrites vreg def/uses to use the assigned preg, but does not insert any @@ -2231,8 +2153,6 @@ llvm::VirtRegRewriter* llvm::createVirtRegRewriter() { default: assert(0 && "Unreachable!"); case local: return new LocalRewriter(); - case simple: - return new SimpleRewriter(); case trivial: return new TrivialRewriter(); } diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index 29a05bb..a80513f 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -573,8 +573,11 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { return GV; } case Instruction::Add: + case Instruction::FAdd: case Instruction::Sub: + case Instruction::FSub: case Instruction::Mul: + case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::URem: @@ -605,11 +608,11 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { case Type::FloatTyID: switch (CE->getOpcode()) { default: assert(0 && "Invalid float opcode"); abort(); - case Instruction::Add: + case Instruction::FAdd: GV.FloatVal = LHS.FloatVal + RHS.FloatVal; break; - case Instruction::Sub: + case Instruction::FSub: GV.FloatVal = LHS.FloatVal - RHS.FloatVal; break; - case Instruction::Mul: + case Instruction::FMul: GV.FloatVal = LHS.FloatVal * RHS.FloatVal; break; case Instruction::FDiv: GV.FloatVal = LHS.FloatVal / RHS.FloatVal; break; @@ -620,11 +623,11 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { case Type::DoubleTyID: switch (CE->getOpcode()) { default: assert(0 && "Invalid double opcode"); abort(); - case Instruction::Add: + case Instruction::FAdd: GV.DoubleVal = LHS.DoubleVal + RHS.DoubleVal; break; - case Instruction::Sub: + case Instruction::FSub: GV.DoubleVal = LHS.DoubleVal - RHS.DoubleVal; break; - case Instruction::Mul: + case Instruction::FMul: GV.DoubleVal = LHS.DoubleVal * RHS.DoubleVal; break; case Instruction::FDiv: GV.DoubleVal = LHS.DoubleVal / RHS.DoubleVal; break; @@ -638,15 +641,15 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { APFloat apfLHS = APFloat(LHS.IntVal); switch (CE->getOpcode()) { default: assert(0 && "Invalid long double opcode"); abort(); - case Instruction::Add: + case Instruction::FAdd: apfLHS.add(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven); GV.IntVal = apfLHS.bitcastToAPInt(); break; - case Instruction::Sub: + case Instruction::FSub: apfLHS.subtract(APFloat(RHS.IntVal), 
APFloat::rmNearestTiesToEven); GV.IntVal = apfLHS.bitcastToAPInt(); break; - case Instruction::Mul: + case Instruction::FMul: apfLHS.multiply(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven); GV.IntVal = apfLHS.bitcastToAPInt(); break; diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp index 765fed2..7dfeae0 100644 --- a/lib/ExecutionEngine/Interpreter/Execution.cpp +++ b/lib/ExecutionEngine/Interpreter/Execution.cpp @@ -64,45 +64,35 @@ void Interpreter::initializeExecutionEngine() { Dest.TY##Val = Src1.TY##Val OP Src2.TY##Val; \ break -#define IMPLEMENT_INTEGER_BINOP1(OP, TY) \ - case Type::IntegerTyID: { \ - Dest.IntVal = Src1.IntVal OP Src2.IntVal; \ - break; \ - } - - -static void executeAddInst(GenericValue &Dest, GenericValue Src1, - GenericValue Src2, const Type *Ty) { +static void executeFAddInst(GenericValue &Dest, GenericValue Src1, + GenericValue Src2, const Type *Ty) { switch (Ty->getTypeID()) { - IMPLEMENT_INTEGER_BINOP1(+, Ty); IMPLEMENT_BINARY_OPERATOR(+, Float); IMPLEMENT_BINARY_OPERATOR(+, Double); default: - cerr << "Unhandled type for Add instruction: " << *Ty << "\n"; + cerr << "Unhandled type for FAdd instruction: " << *Ty << "\n"; abort(); } } -static void executeSubInst(GenericValue &Dest, GenericValue Src1, - GenericValue Src2, const Type *Ty) { +static void executeFSubInst(GenericValue &Dest, GenericValue Src1, + GenericValue Src2, const Type *Ty) { switch (Ty->getTypeID()) { - IMPLEMENT_INTEGER_BINOP1(-, Ty); IMPLEMENT_BINARY_OPERATOR(-, Float); IMPLEMENT_BINARY_OPERATOR(-, Double); default: - cerr << "Unhandled type for Sub instruction: " << *Ty << "\n"; + cerr << "Unhandled type for FSub instruction: " << *Ty << "\n"; abort(); } } -static void executeMulInst(GenericValue &Dest, GenericValue Src1, - GenericValue Src2, const Type *Ty) { +static void executeFMulInst(GenericValue &Dest, GenericValue Src1, + GenericValue Src2, const Type *Ty) { switch (Ty->getTypeID()) { - IMPLEMENT_INTEGER_BINOP1(*, Ty); IMPLEMENT_BINARY_OPERATOR(*, Float); IMPLEMENT_BINARY_OPERATOR(*, Double); default: - cerr << "Unhandled type for Mul instruction: " << *Ty << "\n"; + cerr << "Unhandled type for FMul instruction: " << *Ty << "\n"; abort(); } } @@ -550,11 +540,14 @@ void Interpreter::visitBinaryOperator(BinaryOperator &I) { GenericValue R; // Result switch (I.getOpcode()) { - case Instruction::Add: executeAddInst (R, Src1, Src2, Ty); break; - case Instruction::Sub: executeSubInst (R, Src1, Src2, Ty); break; - case Instruction::Mul: executeMulInst (R, Src1, Src2, Ty); break; - case Instruction::FDiv: executeFDivInst (R, Src1, Src2, Ty); break; - case Instruction::FRem: executeFRemInst (R, Src1, Src2, Ty); break; + case Instruction::Add: R.IntVal = Src1.IntVal + Src2.IntVal; break; + case Instruction::Sub: R.IntVal = Src1.IntVal - Src2.IntVal; break; + case Instruction::Mul: R.IntVal = Src1.IntVal * Src2.IntVal; break; + case Instruction::FAdd: executeFAddInst(R, Src1, Src2, Ty); break; + case Instruction::FSub: executeFSubInst(R, Src1, Src2, Ty); break; + case Instruction::FMul: executeFMulInst(R, Src1, Src2, Ty); break; + case Instruction::FDiv: executeFDivInst(R, Src1, Src2, Ty); break; + case Instruction::FRem: executeFRemInst(R, Src1, Src2, Ty); break; case Instruction::UDiv: R.IntVal = Src1.IntVal.udiv(Src2.IntVal); break; case Instruction::SDiv: R.IntVal = Src1.IntVal.sdiv(Src2.IntVal); break; case Instruction::URem: R.IntVal = Src1.IntVal.urem(Src2.IntVal); break; @@ -1258,18 +1251,21 @@ GenericValue 
Interpreter::getConstantExprValue (ConstantExpr *CE, GenericValue Dest; const Type * Ty = CE->getOperand(0)->getType(); switch (CE->getOpcode()) { - case Instruction::Add: executeAddInst (Dest, Op0, Op1, Ty); break; - case Instruction::Sub: executeSubInst (Dest, Op0, Op1, Ty); break; - case Instruction::Mul: executeMulInst (Dest, Op0, Op1, Ty); break; + case Instruction::Add: Dest.IntVal = Op0.IntVal + Op1.IntVal; break; + case Instruction::Sub: Dest.IntVal = Op0.IntVal - Op1.IntVal; break; + case Instruction::Mul: Dest.IntVal = Op0.IntVal * Op1.IntVal; break; + case Instruction::FAdd: executeFAddInst(Dest, Op0, Op1, Ty); break; + case Instruction::FSub: executeFSubInst(Dest, Op0, Op1, Ty); break; + case Instruction::FMul: executeFMulInst(Dest, Op0, Op1, Ty); break; case Instruction::FDiv: executeFDivInst(Dest, Op0, Op1, Ty); break; case Instruction::FRem: executeFRemInst(Dest, Op0, Op1, Ty); break; case Instruction::SDiv: Dest.IntVal = Op0.IntVal.sdiv(Op1.IntVal); break; case Instruction::UDiv: Dest.IntVal = Op0.IntVal.udiv(Op1.IntVal); break; case Instruction::URem: Dest.IntVal = Op0.IntVal.urem(Op1.IntVal); break; case Instruction::SRem: Dest.IntVal = Op0.IntVal.srem(Op1.IntVal); break; - case Instruction::And: Dest.IntVal = Op0.IntVal.And(Op1.IntVal); break; - case Instruction::Or: Dest.IntVal = Op0.IntVal.Or(Op1.IntVal); break; - case Instruction::Xor: Dest.IntVal = Op0.IntVal.Xor(Op1.IntVal); break; + case Instruction::And: Dest.IntVal = Op0.IntVal & Op1.IntVal; break; + case Instruction::Or: Dest.IntVal = Op0.IntVal | Op1.IntVal; break; + case Instruction::Xor: Dest.IntVal = Op0.IntVal ^ Op1.IntVal; break; case Instruction::Shl: Dest.IntVal = Op0.IntVal.shl(Op1.IntVal.getZExtValue()); break; diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp index 89131a0..43f23e4 100644 --- a/lib/ExecutionEngine/JIT/JITEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp @@ -551,7 +551,7 @@ namespace { // When outputting a function stub in the context of some other function, we // save BufferBegin/BufferEnd/CurBufferPtr here. - unsigned char *SavedBufferBegin, *SavedBufferEnd, *SavedCurBufferPtr; + uint8_t *SavedBufferBegin, *SavedBufferEnd, *SavedCurBufferPtr; /// Relocations - These are the relocations that the function needs, as /// emitted. @@ -891,8 +891,11 @@ unsigned JITEmitter::addSizeOfGlobalsInConstantVal(const Constant *C, break; } case Instruction::Add: + case Instruction::FAdd: case Instruction::Sub: + case Instruction::FSub: case Instruction::Mul: + case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::URem: @@ -1056,11 +1059,11 @@ bool JITEmitter::finishFunction(MachineFunction &F) { // FnStart is the start of the text, not the start of the constant pool and // other per-function data. - unsigned char *FnStart = - (unsigned char *)TheJIT->getPointerToGlobalIfAvailable(F.getFunction()); + uint8_t *FnStart = + (uint8_t *)TheJIT->getPointerToGlobalIfAvailable(F.getFunction()); // FnEnd is the end of the function's machine code. 
- unsigned char *FnEnd = CurBufferPtr; + uint8_t *FnEnd = CurBufferPtr; if (!Relocations.empty()) { CurFn = F.getFunction(); @@ -1183,7 +1186,7 @@ bool JITEmitter::finishFunction(MachineFunction &F) { } else { DOUT << "JIT: Binary code:\n"; DOUT << std::hex; - unsigned char* q = FnStart; + uint8_t* q = FnStart; for (int i = 0; q < FnEnd; q += 4, ++i) { if (i == 4) i = 0; @@ -1221,7 +1224,7 @@ bool JITEmitter::finishFunction(MachineFunction &F) { BufferBegin = CurBufferPtr = MemMgr->startExceptionTable(F.getFunction(), ActualSize); BufferEnd = BufferBegin+ActualSize; - unsigned char* FrameRegister = DE->EmitDwarfTable(F, *this, FnStart, FnEnd); + uint8_t* FrameRegister = DE->EmitDwarfTable(F, *this, FnStart, FnEnd); MemMgr->endExceptionTable(F.getFunction(), BufferBegin, CurBufferPtr, FrameRegister); BufferBegin = SavedBufferBegin; @@ -1416,7 +1419,7 @@ void JITEmitter::startGVStub(const GlobalValue* GV, void *Buffer, SavedBufferEnd = BufferEnd; SavedCurBufferPtr = CurBufferPtr; - BufferBegin = CurBufferPtr = (unsigned char *)Buffer; + BufferBegin = CurBufferPtr = (uint8_t *)Buffer; BufferEnd = BufferBegin+StubSize+1; } diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp index 2819b6d..70ccdcc 100644 --- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp +++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp @@ -257,9 +257,9 @@ namespace { // When emitting code into a memory block, this is the block. MemoryRangeHeader *CurBlock; - unsigned char *CurStubPtr, *StubBase; - unsigned char *GOTBase; // Target Specific reserved memory - void *DlsymTable; // Stub external symbol information + uint8_t *CurStubPtr, *StubBase; + uint8_t *GOTBase; // Target Specific reserved memory + void *DlsymTable; // Stub external symbol information // Centralize memory block allocation. sys::MemoryBlock getNewMemoryBlock(unsigned size); @@ -273,12 +273,12 @@ namespace { void AllocateGOT(); void SetDlsymTable(void *); - unsigned char *allocateStub(const GlobalValue* F, unsigned StubSize, - unsigned Alignment); + uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize, + unsigned Alignment); /// startFunctionBody - When a function starts, allocate a block of free /// executable memory, returning a pointer to it and its actual size. - unsigned char *startFunctionBody(const Function *F, uintptr_t &ActualSize) { + uint8_t *startFunctionBody(const Function *F, uintptr_t &ActualSize) { FreeRangeHeader* candidateBlock = FreeMemoryList; FreeRangeHeader* head = FreeMemoryList; @@ -301,18 +301,18 @@ namespace { // Allocate the entire memory block. FreeMemoryList = candidateBlock->AllocateBlock(); ActualSize = CurBlock->BlockSize-sizeof(MemoryRangeHeader); - return (unsigned char *)(CurBlock+1); + return (uint8_t *)(CurBlock+1); } /// endFunctionBody - The function F is now allocated, and takes the memory /// in the range [FunctionStart,FunctionEnd). - void endFunctionBody(const Function *F, unsigned char *FunctionStart, - unsigned char *FunctionEnd) { + void endFunctionBody(const Function *F, uint8_t *FunctionStart, + uint8_t *FunctionEnd) { assert(FunctionEnd > FunctionStart); - assert(FunctionStart == (unsigned char *)(CurBlock+1) && + assert(FunctionStart == (uint8_t *)(CurBlock+1) && "Mismatched function start/end!"); - uintptr_t BlockSize = FunctionEnd - (unsigned char *)CurBlock; + uintptr_t BlockSize = FunctionEnd - (uint8_t *)CurBlock; FunctionBlocks[F] = CurBlock; // Release the memory at the end of this block that isn't needed. 
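The startFunctionBody/endFunctionBody pair above encodes a simple discipline: hand the emitter an entire free block up front, then trim the allocation down to the bytes actually written once emission finishes, returning the tail to the free list (the TrimAllocationToSize call visible in these hunks). A minimal sketch of that discipline, with hypothetical names rather than the manager's real interface:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    struct Block { uint8_t *Base; size_t Size; };

    // Give the emitter the whole block; it may write up to B.Size bytes.
    uint8_t *beginEmit(const Block &B) { return B.Base; }

    // After emission, keep only [Base, End) and return the tail length
    // that can go back on the free list -- the trim-to-size idea.
    size_t endEmit(const Block &B, uint8_t *End) {
      assert(End >= B.Base && (size_t)(End - B.Base) <= B.Size);
      return B.Size - (size_t)(End - B.Base);
    }

The trim happens after the fact because machine code size is unknown until emission completes; over-reserving and shrinking avoids a resize-and-copy mid-emission.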
@@ -320,17 +320,17 @@ namespace { } /// allocateSpace - Allocate a memory block of the given size. - unsigned char *allocateSpace(intptr_t Size, unsigned Alignment) { + uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) { CurBlock = FreeMemoryList; FreeMemoryList = FreeMemoryList->AllocateBlock(); - unsigned char *result = (unsigned char *)CurBlock+1; + uint8_t *result = (uint8_t *)CurBlock+1; if (Alignment == 0) Alignment = 1; - result = (unsigned char*)(((intptr_t)result+Alignment-1) & + result = (uint8_t*)(((intptr_t)result+Alignment-1) & ~(intptr_t)(Alignment-1)); - uintptr_t BlockSize = result + Size - (unsigned char *)CurBlock; + uintptr_t BlockSize = result + Size - (uint8_t *)CurBlock; FreeMemoryList =CurBlock->TrimAllocationToSize(FreeMemoryList, BlockSize); return result; @@ -338,28 +338,26 @@ namespace { /// startExceptionTable - Use startFunctionBody to allocate memory for the /// function's exception table. - unsigned char* startExceptionTable(const Function* F, - uintptr_t &ActualSize) { + uint8_t* startExceptionTable(const Function* F, uintptr_t &ActualSize) { return startFunctionBody(F, ActualSize); } /// endExceptionTable - The exception table of F is now allocated, /// and takes the memory in the range [TableStart,TableEnd). - void endExceptionTable(const Function *F, unsigned char *TableStart, - unsigned char *TableEnd, - unsigned char* FrameRegister) { + void endExceptionTable(const Function *F, uint8_t *TableStart, + uint8_t *TableEnd, uint8_t* FrameRegister) { assert(TableEnd > TableStart); - assert(TableStart == (unsigned char *)(CurBlock+1) && + assert(TableStart == (uint8_t *)(CurBlock+1) && "Mismatched table start/end!"); - uintptr_t BlockSize = TableEnd - (unsigned char *)CurBlock; + uintptr_t BlockSize = TableEnd - (uint8_t *)CurBlock; TableBlocks[F] = CurBlock; // Release the memory at the end of this block that isn't needed. FreeMemoryList =CurBlock->TrimAllocationToSize(FreeMemoryList, BlockSize); } - unsigned char *getGOTBase() const { + uint8_t *getGOTBase() const { return GOTBase; } @@ -433,7 +431,7 @@ DefaultJITMemoryManager::DefaultJITMemoryManager() { sys::MemoryBlock MemBlock = getNewMemoryBlock(16 << 20); #endif - unsigned char *MemBase = static_cast<unsigned char*>(MemBlock.base()); + uint8_t *MemBase = static_cast<uint8_t*>(MemBlock.base()); // Allocate stubs backwards from the base, allocate functions forward // from the base. 
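Both allocators in this file rely on the power-of-two mask idiom seen in allocateSpace above and allocateStub in the next hunk: round a pointer up when allocating forward from the base, round it down when carving stubs backwards from the end. As standalone helpers (a sketch assuming, as the code does, that Alignment is a power of two):

    #include <cstdint>

    // Round up: used when allocating forward from the base.
    static uint8_t *alignUp(uint8_t *P, intptr_t Alignment) {
      return (uint8_t*)(((intptr_t)P + Alignment - 1) & ~(Alignment - 1));
    }

    // Round down: used when allocating stubs backwards from the end.
    static uint8_t *alignDown(uint8_t *P, intptr_t Alignment) {
      return (uint8_t*)((intptr_t)P & ~(Alignment - 1));
    }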
@@ -492,7 +490,7 @@ DefaultJITMemoryManager::DefaultJITMemoryManager() {
 void DefaultJITMemoryManager::AllocateGOT() {
   assert(GOTBase == 0 && "Cannot allocate the got multiple times");
-  GOTBase = new unsigned char[sizeof(void*) * 8192];
+  GOTBase = new uint8_t[sizeof(void*) * 8192];
   HasGOT = true;
 }
@@ -508,12 +506,12 @@ DefaultJITMemoryManager::~DefaultJITMemoryManager() {
   Blocks.clear();
 }
-unsigned char *DefaultJITMemoryManager::allocateStub(const GlobalValue* F,
+uint8_t *DefaultJITMemoryManager::allocateStub(const GlobalValue* F,
                                                 unsigned StubSize,
                                                 unsigned Alignment) {
   CurStubPtr -= StubSize;
-  CurStubPtr = (unsigned char*)(((intptr_t)CurStubPtr) &
-                                ~(intptr_t)(Alignment-1));
+  CurStubPtr = (uint8_t*)(((intptr_t)CurStubPtr) &
+                          ~(intptr_t)(Alignment-1));
   if (CurStubPtr < StubBase) {
     // FIXME: allocate a new block
     fprintf(stderr, "JIT ran out of memory for function stubs!\n");
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index 6ac37bc..42e6fda 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Format.h"
 #include "llvm/System/Program.h"
+#include "llvm/System/Process.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Config/config.h"
 #include "llvm/Support/Compiler.h"
@@ -301,6 +302,35 @@ uint64_t raw_fd_ostream::seek(uint64_t off) {
   return pos;
 }
+raw_ostream &raw_fd_ostream::changeColor(enum Colors colors, bool bold,
+                                         bool bg) {
+  if (sys::Process::ColorNeedsFlush())
+    flush();
+  const char *colorcode =
+    (colors == SAVEDCOLOR) ? sys::Process::OutputBold(bg)
+                           : sys::Process::OutputColor(colors, bold, bg);
+  if (colorcode) {
+    unsigned len = strlen(colorcode);
+    write(colorcode, len);
+    // don't count the color code toward output characters
+    pos -= len;
+  }
+  return *this;
+}
+
+raw_ostream &raw_fd_ostream::resetColor() {
+  if (sys::Process::ColorNeedsFlush())
+    flush();
+  const char *colorcode = sys::Process::ResetColor();
+  if (colorcode) {
+    unsigned len = strlen(colorcode);
+    write(colorcode, len);
+    // don't count the color code toward output characters
+    pos -= len;
+  }
+  return *this;
+}
+
 //===----------------------------------------------------------------------===//
 //  raw_stdout/err_ostream
 //===----------------------------------------------------------------------===//
diff --git a/lib/System/Unix/Process.inc b/lib/System/Unix/Process.inc
index 74b9bb8..2da31c9 100644
--- a/lib/System/Unix/Process.inc
+++ b/lib/System/Unix/Process.inc
@@ -235,3 +235,62 @@ unsigned Process::StandardErrColumns() {
   return getColumns(2);
 }
+
+static bool terminalHasColors() {
+  if (const char *term = std::getenv("TERM")) {
+    // Most modern terminals support ANSI escape sequences for colors.
+    // We could check terminfo, or have a list of known terms that support
+    // colors, but that would be overkill.
+    // The user can always ask for no colors by setting TERM to dumb, or
+    // using a commandline flag.
+    return strcmp(term, "dumb") != 0;
+  }
+  return false;
+}
+
+bool Process::StandardOutHasColors() {
+  if (!StandardOutIsDisplayed())
+    return false;
+  return terminalHasColors();
+}
+
+bool Process::StandardErrHasColors() {
+  if (!StandardErrIsDisplayed())
+    return false;
+  return terminalHasColors();
+}
+
+bool Process::ColorNeedsFlush() {
+  // No, we use ANSI escape sequences.
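  // ANSI SGR sequences are plain in-band bytes -- e.g. "\033[0;31m" selects
  // red, "\033[1m" bold, and "\033[0m" resets -- and the terminal applies
  // them in stream order, so no flush is required before emitting them,
  // unlike the Win32 console-attribute API further below.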
+ return false; +} + +#define COLOR(FGBG, CODE, BOLD) "\033[0;" BOLD FGBG CODE "m" + +#define ALLCOLORS(FGBG,BOLD) {\ + COLOR(FGBG, "0", BOLD),\ + COLOR(FGBG, "1", BOLD),\ + COLOR(FGBG, "2", BOLD),\ + COLOR(FGBG, "3", BOLD),\ + COLOR(FGBG, "4", BOLD),\ + COLOR(FGBG, "5", BOLD),\ + COLOR(FGBG, "6", BOLD),\ + COLOR(FGBG, "7", BOLD)\ + } + +static const char* colorcodes[2][2][8] = { + { ALLCOLORS("3",""), ALLCOLORS("3","1;") }, + { ALLCOLORS("4",""), ALLCOLORS("4","1;") } +}; + +const char *Process::OutputColor(char code, bool bold, bool bg) { + return colorcodes[bg?1:0][bold?1:0][code&7]; +} + +const char *Process::OutputBold(bool bg) { + return "\033[1m"; +} + +const char *Process::ResetColor() { + return "\033[0m"; +} diff --git a/lib/System/Win32/Process.inc b/lib/System/Win32/Process.inc index e1d7a92..cfbe33c 100644 --- a/lib/System/Win32/Process.inc +++ b/lib/System/Win32/Process.inc @@ -147,4 +147,71 @@ unsigned Process::StandardErrColumns() { return Columns; } +// It always has colors. +bool Process::StandardErrHasColors() { + return StandardErrIsDisplayed(); +} + +bool Process::StandardOutHasColors() { + return StandardOutIsDisplayed(); +} + +namespace { +class DefaultColors +{ + private: + WORD defaultColor; + public: + DefaultColors() + :defaultColor(GetCurrentColor()) {} + static unsigned GetCurrentColor() { + CONSOLE_SCREEN_BUFFER_INFO csbi; + if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi)) + return csbi.wAttributes; + return 0; + } + WORD operator()() const { return defaultColor; } +}; + +DefaultColors defaultColors; +} + +bool Process::ColorNeedsFlush() { + return true; +} + +const char *Process::OutputBold(bool bg) { + WORD colors = DefaultColors::GetCurrentColor(); + if (bg) + colors |= BACKGROUND_INTENSITY; + else + colors |= FOREGROUND_INTENSITY; + SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), colors); + return 0; +} + +const char *Process::OutputColor(char code, bool bold, bool bg) { + WORD colors; + if (bg) { + colors = ((code&1) ? BACKGROUND_RED : 0) | + ((code&2) ? BACKGROUND_GREEN : 0 ) | + ((code&4) ? BACKGROUND_BLUE : 0); + if (bold) + colors |= BACKGROUND_INTENSITY; + } else { + colors = ((code&1) ? FOREGROUND_RED : 0) | + ((code&2) ? FOREGROUND_GREEN : 0 ) | + ((code&4) ? 
FOREGROUND_BLUE : 0); + if (bold) + colors |= FOREGROUND_INTENSITY; + } + SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), colors); + return 0; +} + +const char *Process::ResetColor() { + SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), defaultColors()); + return 0; +} + } diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp index 3a038c9..a75ed3b 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -17,7 +17,6 @@ #include "llvm/Type.h" #include "llvm/Support/Streams.h" #include "llvm/Support/raw_ostream.h" -#include <ostream> using namespace llvm; ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, unsigned id, diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 09b8ce0..963ff0d 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -64,11 +64,15 @@ namespace { typedef SmallVector<MemOpQueueEntry,8> MemOpQueue; typedef MemOpQueue::iterator MemOpQueueIter; - SmallVector<MachineBasicBlock::iterator, 4> - MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base, - int Opcode, unsigned Size, - ARMCC::CondCodes Pred, unsigned PredReg, - unsigned Scratch, MemOpQueue &MemOps); + bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + int Offset, unsigned Base, bool BaseKill, int Opcode, + ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, + DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs); + void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base, + int Opcode, unsigned Size, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned Scratch, MemOpQueue &MemOps, + SmallVector<MachineBasicBlock::iterator, 4> &Merges); void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps); bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); @@ -108,16 +112,16 @@ static int getLoadStoreMultipleOpcode(int Opcode) { return 0; } -/// mergeOps - Create and insert a LDM or STM with Base as base register and +/// MergeOps - Create and insert a LDM or STM with Base as base register and /// registers in Regs as the register operands that would be loaded / stored. /// It returns true if the transformation is done. -static bool mergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - int Offset, unsigned Base, bool BaseKill, int Opcode, - ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, - SmallVector<std::pair<unsigned, bool>, 8> &Regs, - const TargetInstrInfo *TII) { - // FIXME would it be better to take a DL from one of the loads arbitrarily? - DebugLoc dl = DebugLoc::getUnknownLoc(); +bool +ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + int Offset, unsigned Base, bool BaseKill, + int Opcode, ARMCC::CondCodes Pred, + unsigned PredReg, unsigned Scratch, DebugLoc dl, + SmallVector<std::pair<unsigned, bool>, 8> &Regs) { // Only a single register to load / store. Don't bother. unsigned NumRegs = Regs.size(); if (NumRegs <= 1) @@ -185,20 +189,21 @@ static bool mergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, /// MergeLDR_STR - Merge a number of load / store instructions into one or more /// load / store multiple instructions. 
-SmallVector<MachineBasicBlock::iterator, 4> +void ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, - unsigned Base, int Opcode, unsigned Size, - ARMCC::CondCodes Pred, unsigned PredReg, - unsigned Scratch, MemOpQueue &MemOps) { - SmallVector<MachineBasicBlock::iterator, 4> Merges; + unsigned Base, int Opcode, unsigned Size, + ARMCC::CondCodes Pred, unsigned PredReg, + unsigned Scratch, MemOpQueue &MemOps, + SmallVector<MachineBasicBlock::iterator, 4> &Merges) { bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR; int Offset = MemOps[SIndex].Offset; int SOffset = Offset; unsigned Pos = MemOps[SIndex].Position; MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI; - unsigned PReg = MemOps[SIndex].MBBI->getOperand(0).getReg(); + DebugLoc dl = Loc->getDebugLoc(); + unsigned PReg = Loc->getOperand(0).getReg(); unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg); - bool isKill = MemOps[SIndex].MBBI->getOperand(0).isKill(); + bool isKill = Loc->getOperand(0).isKill(); SmallVector<std::pair<unsigned,bool>, 8> Regs; Regs.push_back(std::make_pair(PReg, isKill)); @@ -216,18 +221,17 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, PRegNum = RegNum; } else { // Can't merge this in. Try merge the earlier ones first. - if (mergeOps(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg, - Scratch, Regs, TII)) { + if (MergeOps(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg, + Scratch, dl, Regs)) { Merges.push_back(prior(Loc)); for (unsigned j = SIndex; j < i; ++j) { MBB.erase(MemOps[j].MBBI); MemOps[j].Merged = true; } } - SmallVector<MachineBasicBlock::iterator, 4> Merges2 = - MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,MemOps); - Merges.append(Merges2.begin(), Merges2.end()); - return Merges; + MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch, + MemOps, Merges); + return; } if (MemOps[i].Position > Pos) { @@ -237,8 +241,8 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, } bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1; - if (mergeOps(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg, - Scratch, Regs, TII)) { + if (MergeOps(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg, + Scratch, dl, Regs)) { Merges.push_back(prior(Loc)); for (unsigned i = SIndex, e = MemOps.size(); i != e; ++i) { MBB.erase(MemOps[i].MBBI); @@ -246,7 +250,7 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, } } - return Merges; + return; } /// getInstrPredicate - If instruction is predicated, returns its predicate @@ -530,7 +534,7 @@ static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB, if (isAM2) // STR_PRE, STR_POST; BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) - .addReg(MO.getReg(), getKillRegState(BaseKill)) + .addReg(MO.getReg(), getKillRegState(MO.isKill())) .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); else // FSTMS, FSTMD @@ -590,6 +594,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { ARMCC::CondCodes CurrPred = ARMCC::AL; unsigned CurrPredReg = 0; unsigned Position = 0; + SmallVector<MachineBasicBlock::iterator,4> Merges; RS->enterBasicBlock(&MBB); MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); @@ -689,16 +694,16 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { RS->forward(prior(MBBI)); // Merge ops. 
-      SmallVector<MachineBasicBlock::iterator,4> MBBII =
-        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
-                     CurrPred, CurrPredReg, Scratch, MemOps);
+      Merges.clear();
+      MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
+                   CurrPred, CurrPredReg, Scratch, MemOps, Merges);
 
       // Try folding preceding/trailing base inc/dec into the generated
       // LDM/STM ops.
-      for (unsigned i = 0, e = MBBII.size(); i < e; ++i)
-        if (mergeBaseUpdateLSMultiple(MBB, MBBII[i], Advance, MBBI))
+      for (unsigned i = 0, e = Merges.size(); i < e; ++i)
+        if (mergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
           ++NumMerges;
-      NumMerges += MBBII.size();
+      NumMerges += Merges.size();
 
       // Try folding preceding/trailing base inc/dec into those load/store
       // that were not merged to form LDM/STM ops.
@@ -709,6 +714,13 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
         // RS may be pointing to an instruction that's deleted.
         RS->skipTo(prior(MBBI));
+      } else if (NumMemOps == 1) {
+        // Try folding preceding/trailing base inc/dec into the single
+        // load/store.
+        if (mergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
+          ++NumMerges;
+          RS->forward(prior(MBBI));
+        }
       }
 
       CurrBase = 0;
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 6662be1..0b0e289 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -100,7 +100,7 @@ public:
     GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
     JumpTableUId(0), ConstPoolEntryUId(0) {}
 
-  ARMFunctionInfo(MachineFunction &MF) :
+  explicit ARMFunctionInfo(MachineFunction &MF) :
     isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
     Align(isThumb ? 1U : 2U),
     VarArgsRegSaveSize(0), HasStackFrame(false),
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index e8daf74..b95d1f9 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -103,28 +103,28 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
   let MethodBodies = [{
     // FP is R11, R9 is available.
     static const unsigned ARM_GPR_AO_1[] = {
-      ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
       ARM::R12,ARM::LR,
       ARM::R4, ARM::R5, ARM::R6, ARM::R7,
       ARM::R8, ARM::R9, ARM::R10, ARM::R11 };
     // FP is R11, R9 is not available.
     static const unsigned ARM_GPR_AO_2[] = {
-      ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
       ARM::R12,ARM::LR,
       ARM::R4, ARM::R5, ARM::R6, ARM::R7,
       ARM::R8, ARM::R10, ARM::R11 };
     // FP is R7, R9 is available.
     static const unsigned ARM_GPR_AO_3[] = {
-      ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
       ARM::R12,ARM::LR,
       ARM::R4, ARM::R5, ARM::R6,
       ARM::R8, ARM::R9, ARM::R10,ARM::R11,
       ARM::R7 };
     // FP is R7, R9 is not available.
     static const unsigned ARM_GPR_AO_4[] = {
-      ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
       ARM::R12,ARM::LR,
       ARM::R4, ARM::R5, ARM::R6,
       ARM::R8, ARM::R10,ARM::R11,
@@ -186,7 +186,7 @@ def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
   // scavenging.
   let MethodBodies = [{
     static const unsigned THUMB_tGPR_AO[] = {
-      ARM::R2, ARM::R1, ARM::R0,
+      ARM::R0, ARM::R1, ARM::R2,
       ARM::R4, ARM::R5, ARM::R6, ARM::R7 };
 
     // FP is R7, only low registers available.
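A plausible reading of the two ARM changes above, taken together: LDM/STM encode an ascending register list for ascending addresses, so MergeLDR_STR can only fold the next load/store into a multiple when its register number is above the previous one, and handing out R0, R1, R2, ... in ascending allocation order makes that the common case. A toy version of that mergeability test, simplified from the pass's real data structures:

    #include <vector>

    struct MemOp { unsigned RegNum; int Offset; };

    // A run of same-base loads/stores can become one LDM/STM when offsets
    // advance by the access size and register numbers strictly ascend,
    // roughly the condition MergeLDR_STR checks before calling MergeOps.
    static bool canFormMultiple(const std::vector<MemOp> &Ops, int Size) {
      if (Ops.empty())
        return false;
      for (size_t i = 1, e = Ops.size(); i != e; ++i)
        if (Ops[i].Offset != Ops[i-1].Offset + Size ||
            Ops[i].RegNum <= Ops[i-1].RegNum)
          return false;
      return true;
    }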
diff --git a/lib/Target/Alpha/AlphaMachineFunctionInfo.h b/lib/Target/Alpha/AlphaMachineFunctionInfo.h index 47de5df..8221fc7 100644 --- a/lib/Target/Alpha/AlphaMachineFunctionInfo.h +++ b/lib/Target/Alpha/AlphaMachineFunctionInfo.h @@ -33,8 +33,8 @@ class AlphaMachineFunctionInfo : public MachineFunctionInfo { public: AlphaMachineFunctionInfo() : GlobalBaseReg(0), GlobalRetAddr(0) {} - AlphaMachineFunctionInfo(MachineFunction &MF) : GlobalBaseReg(0), - GlobalRetAddr(0) {} + explicit AlphaMachineFunctionInfo(MachineFunction &MF) : GlobalBaseReg(0), + GlobalRetAddr(0) {} unsigned getGlobalBaseReg() const { return GlobalBaseReg; } void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; } diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp index 4d7b545..5814d27 100644 --- a/lib/Target/CBackend/CBackend.cpp +++ b/lib/Target/CBackend/CBackend.cpp @@ -1000,8 +1000,11 @@ void CWriter::printConstant(Constant *CPV, bool Static) { Out << ')'; return; case Instruction::Add: + case Instruction::FAdd: case Instruction::Sub: + case Instruction::FSub: case Instruction::Mul: + case Instruction::FMul: case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv: @@ -1020,9 +1023,12 @@ void CWriter::printConstant(Constant *CPV, bool Static) { bool NeedsClosingParens = printConstExprCast(CE, Static); printConstantWithCast(CE->getOperand(0), CE->getOpcode()); switch (CE->getOpcode()) { - case Instruction::Add: Out << " + "; break; - case Instruction::Sub: Out << " - "; break; - case Instruction::Mul: Out << " * "; break; + case Instruction::Add: + case Instruction::FAdd: Out << " + "; break; + case Instruction::Sub: + case Instruction::FSub: Out << " - "; break; + case Instruction::Mul: + case Instruction::FMul: Out << " * "; break; case Instruction::URem: case Instruction::SRem: case Instruction::FRem: Out << " % "; break; @@ -1322,8 +1328,6 @@ bool CWriter::printConstExprCast(const ConstantExpr* CE, bool Static) { case Instruction::Mul: // We need to cast integer arithmetic so that it is always performed // as unsigned, to avoid undefined behavior on overflow. - if (!Ty->isIntOrIntVector()) break; - // FALL THROUGH case Instruction::LShr: case Instruction::URem: case Instruction::UDiv: NeedsExplicitCast = true; break; @@ -1387,8 +1391,6 @@ void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) { case Instruction::Mul: // We need to cast integer arithmetic so that it is always performed // as unsigned, to avoid undefined behavior on overflow. - if (!OpTy->isIntOrIntVector()) break; - // FALL THROUGH case Instruction::LShr: case Instruction::UDiv: case Instruction::URem: @@ -1505,8 +1507,6 @@ bool CWriter::writeInstructionCast(const Instruction &I) { case Instruction::Mul: // We need to cast integer arithmetic so that it is always performed // as unsigned, to avoid undefined behavior on overflow. - if (!Ty->isIntOrIntVector()) break; - // FALL THROUGH case Instruction::LShr: case Instruction::URem: case Instruction::UDiv: @@ -1552,8 +1552,6 @@ void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) { case Instruction::Mul: // We need to cast integer arithmetic so that it is always performed // as unsigned, to avoid undefined behavior on overflow. 
- if (!OpTy->isIntOrIntVector()) break; - // FALL THROUGH case Instruction::LShr: case Instruction::UDiv: case Instruction::URem: // Cast to unsigned first @@ -2606,6 +2604,10 @@ void CWriter::visitBinaryOperator(Instruction &I) { Out << "-("; writeOperand(BinaryOperator::getNegArgument(cast<BinaryOperator>(&I))); Out << ")"; + } else if (BinaryOperator::isFNeg(&I)) { + Out << "-("; + writeOperand(BinaryOperator::getFNegArgument(cast<BinaryOperator>(&I))); + Out << ")"; } else if (I.getOpcode() == Instruction::FRem) { // Output a call to fmod/fmodf instead of emitting a%b if (I.getType() == Type::FloatTy) @@ -2630,9 +2632,12 @@ void CWriter::visitBinaryOperator(Instruction &I) { writeOperandWithCast(I.getOperand(0), I.getOpcode()); switch (I.getOpcode()) { - case Instruction::Add: Out << " + "; break; - case Instruction::Sub: Out << " - "; break; - case Instruction::Mul: Out << " * "; break; + case Instruction::Add: + case Instruction::FAdd: Out << " + "; break; + case Instruction::Sub: + case Instruction::FSub: Out << " - "; break; + case Instruction::Mul: + case Instruction::FMul: Out << " * "; break; case Instruction::URem: case Instruction::SRem: case Instruction::FRem: Out << " % "; break; diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index 4082989..04a6829 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -865,8 +865,11 @@ namespace { Out << "Constant* " << constName << " = ConstantExpr::"; switch (CE->getOpcode()) { case Instruction::Add: Out << "getAdd("; break; + case Instruction::FAdd: Out << "getFAdd("; break; case Instruction::Sub: Out << "getSub("; break; + case Instruction::FSub: Out << "getFSub("; break; case Instruction::Mul: Out << "getMul("; break; + case Instruction::FMul: Out << "getFMul("; break; case Instruction::UDiv: Out << "getUDiv("; break; case Instruction::SDiv: Out << "getSDiv("; break; case Instruction::FDiv: Out << "getFDiv("; break; @@ -1159,8 +1162,11 @@ namespace { break; } case Instruction::Add: + case Instruction::FAdd: case Instruction::Sub: + case Instruction::FSub: case Instruction::Mul: + case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: @@ -1176,8 +1182,11 @@ namespace { Out << "BinaryOperator* " << iName << " = BinaryOperator::Create("; switch (I->getOpcode()) { case Instruction::Add: Out << "Instruction::Add"; break; + case Instruction::FAdd: Out << "Instruction::FAdd"; break; case Instruction::Sub: Out << "Instruction::Sub"; break; + case Instruction::FSub: Out << "Instruction::FSub"; break; case Instruction::Mul: Out << "Instruction::Mul"; break; + case Instruction::FMul: Out << "Instruction::FMul"; break; case Instruction::UDiv:Out << "Instruction::UDiv"; break; case Instruction::SDiv:Out << "Instruction::SDiv"; break; case Instruction::FDiv:Out << "Instruction::FDiv"; break; diff --git a/lib/Target/IA64/IA64MachineFunctionInfo.h b/lib/Target/IA64/IA64MachineFunctionInfo.h index fb93056..e6254d6 100644 --- a/lib/Target/IA64/IA64MachineFunctionInfo.h +++ b/lib/Target/IA64/IA64MachineFunctionInfo.h @@ -24,7 +24,7 @@ public: // by this machinefunction? 
(used to compute the appropriate // entry in the 'alloc' instruction at the top of the // machinefunction) - IA64FunctionInfo(MachineFunction& MF) { outRegsUsed=0; }; + explicit IA64FunctionInfo(MachineFunction& MF) { outRegsUsed=0; }; }; diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp index ada851d..37e5b1e 100644 --- a/lib/Target/MSIL/MSILWriter.cpp +++ b/lib/Target/MSIL/MSILWriter.cpp @@ -1060,12 +1060,15 @@ void MSILWriter::printInstruction(const Instruction* Inst) { break; // Binary case Instruction::Add: + case Instruction::FAdd: printBinaryInstruction("add",Left,Right); break; case Instruction::Sub: + case Instruction::FSub: printBinaryInstruction("sub",Left,Right); break; - case Instruction::Mul: + case Instruction::Mul: + case Instruction::FMul: printBinaryInstruction("mul",Left,Right); break; case Instruction::UDiv: @@ -1322,12 +1325,15 @@ void MSILWriter::printConstantExpr(const ConstantExpr* CE) { printSelectInstruction(CE->getOperand(0),CE->getOperand(1),CE->getOperand(2)); break; case Instruction::Add: + case Instruction::FAdd: printBinaryInstruction("add",left,right); break; case Instruction::Sub: + case Instruction::FSub: printBinaryInstruction("sub",left,right); break; case Instruction::Mul: + case Instruction::FMul: printBinaryInstruction("mul",left,right); break; case Instruction::UDiv: diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/lib/Target/MSP430/MSP430MachineFunctionInfo.h index b94d7e4..1d26ae3 100644 --- a/lib/Target/MSP430/MSP430MachineFunctionInfo.h +++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.h @@ -28,7 +28,8 @@ class MSP430MachineFunctionInfo : public MachineFunctionInfo { public: MSP430MachineFunctionInfo() : CalleeSavedFrameSize(0) {} - MSP430MachineFunctionInfo(MachineFunction &MF) : CalleeSavedFrameSize(0) {} + explicit MSP430MachineFunctionInfo(MachineFunction &MF) + : CalleeSavedFrameSize(0) {} unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } diff --git a/lib/Target/PIC16/PIC16ISelLowering.cpp b/lib/Target/PIC16/PIC16ISelLowering.cpp index 0f83fd2..ac9a143 100644 --- a/lib/Target/PIC16/PIC16ISelLowering.cpp +++ b/lib/Target/PIC16/PIC16ISelLowering.cpp @@ -46,6 +46,16 @@ static const char *getIntrinsicName(unsigned opcode) { case PIC16ISD::MUL_I8: Basename = "mul.i8"; break; case RTLIB::MUL_I16: Basename = "mul.i16"; break; case RTLIB::MUL_I32: Basename = "mul.i32"; break; + + case RTLIB::SDIV_I16: Basename = "sdiv.i16"; break; + case RTLIB::SDIV_I32: Basename = "sdiv.i32"; break; + case RTLIB::UDIV_I16: Basename = "udiv.i16"; break; + case RTLIB::UDIV_I32: Basename = "udiv.i32"; break; + + case RTLIB::SREM_I16: Basename = "srem.i16"; break; + case RTLIB::SREM_I32: Basename = "srem.i32"; break; + case RTLIB::UREM_I16: Basename = "urem.i16"; break; + case RTLIB::UREM_I32: Basename = "urem.i32"; break; } std::string prefix = PAN::getTagName(PAN::PREFIX_SYMBOL); @@ -90,6 +100,20 @@ PIC16TargetLowering::PIC16TargetLowering(PIC16TargetMachine &TM) setLibcallName(RTLIB::MUL_I16, getIntrinsicName(RTLIB::MUL_I16)); setLibcallName(RTLIB::MUL_I32, getIntrinsicName(RTLIB::MUL_I32)); + // Signed division lib call names + setLibcallName(RTLIB::SDIV_I16, getIntrinsicName(RTLIB::SDIV_I16)); + setLibcallName(RTLIB::SDIV_I32, getIntrinsicName(RTLIB::SDIV_I32)); + // Unsigned division lib call names + setLibcallName(RTLIB::UDIV_I16, getIntrinsicName(RTLIB::UDIV_I16)); + setLibcallName(RTLIB::UDIV_I32, 
getIntrinsicName(RTLIB::UDIV_I32)); + + // Signed remainder lib call names + setLibcallName(RTLIB::SREM_I16, getIntrinsicName(RTLIB::SREM_I16)); + setLibcallName(RTLIB::SREM_I32, getIntrinsicName(RTLIB::SREM_I32)); + // Unsigned remainder lib call names + setLibcallName(RTLIB::UREM_I16, getIntrinsicName(RTLIB::UREM_I16)); + setLibcallName(RTLIB::UREM_I32, getIntrinsicName(RTLIB::UREM_I32)); + setOperationAction(ISD::GlobalAddress, MVT::i16, Custom); setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom); @@ -105,6 +129,7 @@ PIC16TargetLowering::PIC16TargetLowering(PIC16TargetMachine &TM) setOperationAction(ISD::ADDC, MVT::i8, Custom); setOperationAction(ISD::SUBE, MVT::i8, Custom); setOperationAction(ISD::SUBC, MVT::i8, Custom); + setOperationAction(ISD::SUB, MVT::i8, Custom); setOperationAction(ISD::ADD, MVT::i8, Custom); setOperationAction(ISD::ADD, MVT::i16, Custom); @@ -354,21 +379,11 @@ SDValue PIC16TargetLowering::ExpandFrameIndex(SDNode *N, SelectionDAG &DAG) { FrameIndexSDNode *FR = dyn_cast<FrameIndexSDNode>(SDValue(N,0)); // FIXME there isn't really debug info here DebugLoc dl = FR->getDebugLoc(); - // FIXME: Not used. - // int Index = FR->getIndex(); // Expand FrameIndex like GlobalAddress and ExternalSymbol // Also use Offset field for lo and hi parts. The default // offset is zero. - /* - SDValue Offset = DAG.getConstant(0, MVT::i8); - SDValue FI = DAG.getTargetFrameIndex(Index, MVT::i8); - SDValue Lo = DAG.getNode(PIC16ISD::Lo, dl, MVT::i8, FI, Offset); - SDValue Hi = DAG.getNode(PIC16ISD::Hi, dl, MVT::i8, FI, Offset); - return DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0), Lo, Hi); - */ - SDValue ES; int FrameOffset; SDValue FI = SDValue(N,0); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index a7744b8..87f8fb0b4 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -227,15 +227,14 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); + // This is just the low 32 bits of a (signed) fp->i64 conversion. + // We cannot do this with Promote because i64 is not a legal type. + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); // FIXME: disable this lowered code. This generates 64-bit register values, // and we don't model the fact that the top part is clobbered by calls. We // need to flag these together so that the value isn't live across a call. //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); - - // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); } else { // PowerPC does not have FP_TO_UINT on 32-bit implementations. setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); @@ -2858,7 +2857,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) { } // FIXME: Split this code up when LegalizeDAGTypes lands. 
-SDValue PPCTargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
+SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
                                            DebugLoc dl) {
   assert(Op.getOperand(0).getValueType().isFloatingPoint());
   SDValue Src = Op.getOperand(0);
@@ -2867,9 +2866,11 @@ SDValue PPCTargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
   SDValue Tmp;
   switch (Op.getValueType().getSimpleVT()) {
-  default: assert(0 && "Unhandled FP_TO_SINT type in custom expander!");
+  default: assert(0 && "Unhandled FP_TO_INT type in custom expander!");
   case MVT::i32:
-    Tmp = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Src);
+    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
+                                                        PPCISD::FCTIDZ,
+                      dl, MVT::f64, Src);
     break;
   case MVT::i64:
     Tmp = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Src);
@@ -3740,7 +3741,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
     return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget);
   case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
-  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG,
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:         return LowerFP_TO_INT(Op, DAG,
                                                        Op.getDebugLoc());
   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
@@ -3834,7 +3836,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
     return;
   }
   case ISD::FP_TO_SINT:
-    Results.push_back(LowerFP_TO_SINT(SDValue(N, 0), DAG, dl));
+    Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
     return;
   }
 }
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 7946474..b6d046f 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -377,7 +377,7 @@ namespace llvm {
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
                                     const PPCSubtarget &Subtarget);
     SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
-    SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG, DebugLoc dl);
+    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, DebugLoc dl);
     SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG);
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG);
     SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG);
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 42883d7..b359dd3 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -59,7 +59,7 @@ private:
   bool HasFastCall;
 
 public:
-  PPCFunctionInfo(MachineFunction &MF)
+  explicit PPCFunctionInfo(MachineFunction &MF)
     : FramePointerSaveIndex(0),
       ReturnAddrSaveIndex(0),
       SpillsCR(false),
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 5d5beeb..cb31506 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -908,6 +908,7 @@ void PPCRegisterInfo::determineFrameLayout(MachineFunction &MF) const {
   // If we are a leaf function, and use up to 224 bytes of stack space,
   // don't have a frame pointer, calls, or dynamic alloca, then we do not
   // need to adjust the stack pointer (we fit in the Red Zone).
+  bool DisableRedZone = MF.getFunction()->hasFnAttr(Attribute::NoRedZone);
   if (!DisableRedZone &&
       FrameSize <= 224 &&                          // Fits in red zone.
       !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
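The red-zone gate added here reads more clearly as one predicate over the conditions the comment names; a sketch with hypothetical parameters standing in for the MachineFrameInfo queries:

    #include <cstdint>

    // A function can skip stack-pointer adjustment when its whole frame
    // fits in the 224-byte red zone and nothing else forces a real frame.
    static bool fitsInRedZone(bool DisableRedZone, uint64_t FrameSize,
                              bool HasVarSizedObjects, bool HasCalls,
                              bool HasFramePointer) {
      return !DisableRedZone &&
             FrameSize <= 224 &&      // fits in the red zone
             !HasVarSizedObjects &&   // no dynamic alloca
             !HasCalls &&             // leaf function
             !HasFramePointer;        // no frame pointer
    }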
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 1b042dd..dea293b 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -41,7 +41,6 @@ namespace llvm { bool RealignStack; bool DisableJumpTables; bool StrongPHIElim; - bool DisableRedZone; bool AsmVerbosityDefault(false); } @@ -86,11 +85,6 @@ GenerateSoftFloatCalls("soft-float", cl::location(UseSoftFloat), cl::init(false)); static cl::opt<bool, true> -GenerateNoImplicitFloats("no-implicit-float", - cl::desc("Don't generate implicit floating point instructions (x86-only)"), - cl::location(NoImplicitFloat), - cl::init(false)); -static cl::opt<bool, true> DontPlaceZerosInBSS("nozero-initialized-in-bss", cl::desc("Don't place zero-initialized symbols into bss section"), cl::location(NoZerosInBSS), @@ -163,11 +157,6 @@ EnableStrongPHIElim(cl::Hidden, "strong-phi-elim", cl::desc("Use strong PHI elimination."), cl::location(StrongPHIElim), cl::init(false)); -static cl::opt<bool, true> -DisableRedZoneOption("disable-red-zone", - cl::desc("Do not emit code that uses the red zone."), - cl::location(DisableRedZone), - cl::init(false)); //--------------------------------------------------------------------------- // TargetMachine Class diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp index 4c3cc82..2604741 100644 --- a/lib/Target/X86/X86ELFWriterInfo.cpp +++ b/lib/Target/X86/X86ELFWriterInfo.cpp @@ -14,5 +14,6 @@ #include "X86ELFWriterInfo.h" using namespace llvm; -X86ELFWriterInfo::X86ELFWriterInfo() : TargetELFWriterInfo(EM_386) {} +X86ELFWriterInfo::X86ELFWriterInfo(bool is64Bit) : + TargetELFWriterInfo(is64Bit ? EM_X86_64 : EM_386) {} X86ELFWriterInfo::~X86ELFWriterInfo() {} diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h index 06e051a..acfa501 100644 --- a/lib/Target/X86/X86ELFWriterInfo.h +++ b/lib/Target/X86/X86ELFWriterInfo.h @@ -20,7 +20,7 @@ namespace llvm { class X86ELFWriterInfo : public TargetELFWriterInfo { public: - X86ELFWriterInfo(); + X86ELFWriterInfo(bool is64Bit); virtual ~X86ELFWriterInfo(); }; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 1f507c3..ef60ff5 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -126,7 +126,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); - if (!UseSoftFloat && !NoImplicitFloat) { + if (!UseSoftFloat) { // SSE has no i16 to fp conversion, only i32 if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); @@ -550,6 +550,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); } // FIXME: In order to prevent SSE instructions being expanded to MMX ones @@ -734,6 +738,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); + 
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + if (!DisableMMX && Subtarget->hasMMX()) { + setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + } } if (Subtarget->hasSSE41()) { @@ -868,11 +878,14 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { /// determining it. MVT X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr) const { + bool isSrcConst, bool isSrcStr, + SelectionDAG &DAG) const { // FIXME: This turns off use of xmm stores for memset/memcpy on targets like // linux. This is because the stack realignment code can't handle certain // cases like PR2962. This should be removed when PR2962 is fixed. - if (!NoImplicitFloat && Subtarget->getStackAlignment() >= 16) { + const Function *F = DAG.getMachineFunction().getFunction(); + bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); + if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) { if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16) return MVT::v4i32; if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16) @@ -1404,11 +1417,12 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, TotalNumXMMRegs); + bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); - assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloat) && + assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); - if (UseSoftFloat || NoImplicitFloat || !Subtarget->hasSSE1()) + if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) // Kernel mode asks for SSE to be disabled, so don't push them // on the stack. TotalNumXMMRegs = 0; @@ -2414,9 +2428,10 @@ bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { /// specifies a shuffle of elements that is suitable for input to MOVSS, /// MOVSD, and MOVD, i.e. setting the lowest element. static bool isMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT) { - int NumElts = VT.getVectorNumElements(); - if (NumElts != 2 && NumElts != 4) + if (VT.getVectorElementType().getSizeInBits() < 32) return false; + + int NumElts = VT.getVectorNumElements(); if (!isUndefOrEqual(Mask[0], NumElts)) return false; @@ -3068,7 +3083,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { } // Special case for single non-zero, non-undef, element. - if (NumNonZero == 1 && NumElems <= 4) { + if (NumNonZero == 1) { unsigned Idx = CountTrailingZeros_32(NonZeros); SDValue Item = Op.getOperand(Idx); @@ -3109,15 +3124,24 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { // If we have a constant or non-constant insertion into the low element of // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into // the rest of the elements. This will be matched as movd/movq/movss/movsd - // depending on what the source datatype is. Because we can only get here - // when NumElems <= 4, this only needs to handle i32/f32/i64/f64. - if (Idx == 0 && - // Don't do this for i64 values on x86-32. - (EVT != MVT::i64 || Subtarget->is64Bit())) { - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 
- return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, - Subtarget->hasSSE2(), DAG); + // depending on what the source datatype is. + if (Idx == 0) { + if (NumZero == 0) { + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + } else if (EVT == MVT::i32 || EVT == MVT::f32 || EVT == MVT::f64 || + (EVT == MVT::i64 && Subtarget->is64Bit())) { + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. + return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), + DAG); + } else if (EVT == MVT::i16 || EVT == MVT::i8) { + Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); + MVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, + Subtarget->hasSSE2(), DAG); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); + } } // Is it a vector logical left shift? @@ -4248,7 +4272,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); - if (EVT.getSizeInBits() == 16) { + if (EVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { // Transform it so it match pinsrw which expects a 16-bit value in a GR32 // as its second argument. if (N1.getValueType() != MVT::i32) @@ -4554,6 +4578,14 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { MVT SrcVT = Op.getOperand(0).getValueType(); + + if (SrcVT.isVector()) { + if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { + return Op; + } + return SDValue(); + } + assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); @@ -4845,6 +4877,14 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { } SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { + if (Op.getValueType().isVector()) { + if (Op.getValueType() == MVT::v2i32 && + Op.getOperand(0).getValueType() == MVT::v2f64) { + return Op; + } + return SDValue(); + } + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. @@ -7675,8 +7715,9 @@ static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, if (Elt.getOpcode() == ISD::UNDEF) continue; - if (!TLI.isConsecutiveLoad(Elt.getNode(), Base, - EVT.getSizeInBits()/8, i, MFI)) + LoadSDNode *LD = cast<LoadSDNode>(Elt); + LoadSDNode *LDBase = cast<LoadSDNode>(Base); + if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI)) return false; } return true; @@ -7751,44 +7792,82 @@ static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG, MVT VT = N->getValueType(0); MVT EVT = VT.getVectorElementType(); - if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit()) - // We are looking for load i64 and zero extend. We want to transform - // it before legalizer has a chance to expand it. Also look for i64 - // BUILD_PAIR bit casted to f64. - return SDValue(); - // This must be an insertion into a zero vector. - SDValue HighElt = N->getOperand(1); - if (!isZeroNode(HighElt)) - return SDValue(); + + // Before or during type legalization, we want to try and convert a + // build_vector of an i64 load and a zero value into vzext_movl before the + // legalizer can break it up. 
+ // FIXME: does the case below remove the need to do this? + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) { + if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit()) + return SDValue(); + + // This must be an insertion into a zero vector. + SDValue HighElt = N->getOperand(1); + if (!isZeroNode(HighElt)) + return SDValue(); + + // Value must be a load. + SDNode *Base = N->getOperand(0).getNode(); + if (!isa<LoadSDNode>(Base)) { + if (Base->getOpcode() != ISD::BIT_CONVERT) + return SDValue(); + Base = Base->getOperand(0).getNode(); + if (!isa<LoadSDNode>(Base)) + return SDValue(); + } + + // Transform it into VZEXT_LOAD addr. + LoadSDNode *LD = cast<LoadSDNode>(Base); + + // Load must not be an extload. + if (LD->getExtensionType() != ISD::NON_EXTLOAD) + return SDValue(); + + // Load type should be a legal type so we don't have to legalize it. + if (!TLI.isTypeLegal(VT)) + return SDValue(); + + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; + SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); + TargetLowering::TargetLoweringOpt TLO(DAG); + TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1)); + DCI.CommitTargetLoweringOpt(TLO); + return ResNode; + } + + // The type legalizer will have broken apart the v2i64 build_vector created + // during widening before the code which handles that case is run. Look for a + // build_vector of (load, load + 4, 0/undef, 0/undef). + if (VT == MVT::v4i32 || VT == MVT::v4f32) { + LoadSDNode *LD0 = dyn_cast<LoadSDNode>(N->getOperand(0)); + LoadSDNode *LD1 = dyn_cast<LoadSDNode>(N->getOperand(1)); + if (!LD0 || !LD1) + return SDValue(); + if (LD0->getExtensionType() != ISD::NON_EXTLOAD || + LD1->getExtensionType() != ISD::NON_EXTLOAD) + return SDValue(); + // Make sure the second elt is a consecutive load. + if (!TLI.isConsecutiveLoad(LD1, LD0, EVT.getSizeInBits()/8, 1, + DAG.getMachineFunction().getFrameInfo())) + return SDValue(); - // Value must be a load. - SDNode *Base = N->getOperand(0).getNode(); - if (!isa<LoadSDNode>(Base)) { - if (Base->getOpcode() != ISD::BIT_CONVERT) + SDValue N2 = N->getOperand(2); + SDValue N3 = N->getOperand(3); + if (!isZeroNode(N2) && N2.getOpcode() != ISD::UNDEF) return SDValue(); - Base = Base->getOperand(0).getNode(); - if (!isa<LoadSDNode>(Base)) + if (!isZeroNode(N3) && N3.getOpcode() != ISD::UNDEF) return SDValue(); + + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LD0->getChain(), LD0->getBasePtr() }; + SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); + TargetLowering::TargetLoweringOpt TLO(DAG); + TLO.CombineTo(SDValue(LD0, 1), ResNode.getValue(1)); + DCI.CommitTargetLoweringOpt(TLO); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); } - - // Transform it into VZEXT_LOAD addr. - LoadSDNode *LD = cast<LoadSDNode>(Base); - - // Load must not be an extload. - if (LD->getExtensionType() != ISD::NON_EXTLOAD) - return SDValue(); - - // Load type should legal type so we don't have to legalize it. - if (!TLI.isTypeLegal(VT)) - return SDValue(); - - SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; - SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); - TargetLowering::TargetLoweringOpt TLO(DAG); - TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1)); - DCI.CommitTargetLoweringOpt(TLO); - return ResNode; + return SDValue(); } /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
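A scalar analogue may make the new second case of PerformBuildVectorCombine easier to read: after type legalization has widened a 64-bit vector operation, the zero-extended 64-bit load reappears as a four-element build_vector, and the combine folds it back into a single 8-byte load whose upper lanes are zero (X86ISD::VZEXT_LOAD). The following is an illustrative C++ sketch only, not code from the commit, and assumes a little-endian x86 target:

#include <cstring>

struct V4i32 { int e[4]; };

// The shape the combine matches: build_vector (load, load + 4, 0, 0),
// i.e. two consecutive 32-bit loads plus zero/undef upper elements.
V4i32 beforeCombine(const int *p) {
  V4i32 v = { { p[0], p[1], 0, 0 } };
  return v;
}

// The shape it produces: one 64-bit load with the upper 64 bits of the
// vector implicitly zero, matching a movq-style vzext_load.
V4i32 afterCombine(const int *p) {
  long long lo;
  std::memcpy(&lo, p, 8);                       // one load instead of two
  V4i32 v = { { (int)lo, (int)(lo >> 32), 0, 0 } };
  return v;
}

Note how the chain result of the original load (SDValue(LD0, 1)) is redirected to the new node via CombineTo, so later users of the load's side effects keep a valid dependency.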
@@ -8242,7 +8321,10 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, if (VT.getSizeInBits() != 64) return SDValue(); - bool F64IsLegal = !UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE2(); + const Function *F = DAG.getMachineFunction().getFunction(); + bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); + bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps + && Subtarget->hasSSE2(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && isa<LoadSDNode>(St->getValue()) && diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 550f8bd..fb4eb68 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -378,7 +378,8 @@ namespace llvm { /// determining it. virtual MVT getOptimalMemOpType(uint64_t Size, unsigned Align, - bool isSrcConst, bool isSrcStr) const; + bool isSrcConst, bool isSrcStr, + SelectionDAG &DAG) const; /// LowerOperation - Provide custom lowering hooks for some operations. /// diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 2cd3733..8a9b7c9 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -2009,16 +2009,24 @@ bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - X86FI->setCalleeSavedFrameSize(CSI.size() * SlotSize); + unsigned CalleeFrameSize = 0; unsigned Opc = is64Bit ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); + const TargetRegisterClass *RegClass = CSI[i-1].getRegClass(); // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); - BuildMI(MBB, MI, DL, get(Opc)) - .addReg(Reg, RegState::Kill); + if (RegClass != &X86::VR128RegClass) { + CalleeFrameSize += SlotSize; + BuildMI(MBB, MI, DL, get(Opc)) + .addReg(Reg, RegState::Kill); + } else { + storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), RegClass); + } } + + X86FI->setCalleeSavedFrameSize(CalleeFrameSize); return true; } @@ -2036,7 +2044,12 @@ bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, unsigned Opc = is64Bit ? 
X86::POP64r : X86::POP32r; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - BuildMI(MBB, MI, DL, get(Opc), Reg); + const TargetRegisterClass *RegClass = CSI[i].getRegClass(); + if (RegClass != &X86::VR128RegClass) { + BuildMI(MBB, MI, DL, get(Opc), Reg); + } else { + loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RegClass); + } } return true; } diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 8f287e1..43fadc2 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -577,41 +577,17 @@ def : Pat<(f64 (bitconvert (v4i16 VR64:$src))), def : Pat<(f64 (bitconvert (v8i8 VR64:$src))), (MMX_MOVQ2FR64rr VR64:$src)>; -// Move scalar to MMX zero-extended -// movd to MMX register zero-extends -let AddedComplexity = 15 in { - def : Pat<(v8i8 (X86vzmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))), - (MMX_MOVZDI2PDIrr GR32:$src)>; - def : Pat<(v4i16 (X86vzmovl (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))))), - (MMX_MOVZDI2PDIrr GR32:$src)>; -} - let AddedComplexity = 20 in { - def : Pat<(v8i8 (X86vzmovl (bc_v8i8 (load_mmx addr:$src)))), - (MMX_MOVZDI2PDIrm addr:$src)>; - def : Pat<(v4i16 (X86vzmovl (bc_v4i16 (load_mmx addr:$src)))), - (MMX_MOVZDI2PDIrm addr:$src)>; def : Pat<(v2i32 (X86vzmovl (bc_v2i32 (load_mmx addr:$src)))), (MMX_MOVZDI2PDIrm addr:$src)>; } // Clear top half. let AddedComplexity = 15 in { - def : Pat<(v8i8 (X86vzmovl VR64:$src)), - (MMX_PUNPCKLDQrr VR64:$src, (MMX_V_SET0))>; - def : Pat<(v4i16 (X86vzmovl VR64:$src)), - (MMX_PUNPCKLDQrr VR64:$src, (MMX_V_SET0))>; def : Pat<(v2i32 (X86vzmovl VR64:$src)), (MMX_PUNPCKLDQrr VR64:$src, (MMX_V_SET0))>; } -// Scalar to v4i16 / v8i8. The source may be a GR32, but only the lower -// 8 or 16-bits matter. -def : Pat<(bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))), - (MMX_MOVD64rr GR32:$src)>; -def : Pat<(bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))), - (MMX_MOVD64rr GR32:$src)>; - // Patterns to perform canonical versions of vector shuffling. 
let AddedComplexity = 10 in { def : Pat<(v8i8 (mmx_unpckl_undef VR64:$src, (undef))), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 1fafa46..b44c7a6 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3447,7 +3447,7 @@ multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { } defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>; -defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovsxbq", int_x86_sse41_pmovzxbq>; +defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>; // Common patterns involving scalar load def : Pat<(int_x86_sse41_pmovsxbq diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index 8a5ac2c..fafcf7e 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -73,14 +73,15 @@ public: SRetReturnReg(0), GlobalBaseReg(0) {} - X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false), - CalleeSavedFrameSize(0), - BytesToPopOnReturn(0), - DecorationStyle(None), - ReturnAddrIndex(0), - TailCallReturnAddrDelta(0), - SRetReturnReg(0), - GlobalBaseReg(0) {} + explicit X86MachineFunctionInfo(MachineFunction &MF) + : ForceFramePointer(false), + CalleeSavedFrameSize(0), + BytesToPopOnReturn(0), + DecorationStyle(None), + ReturnAddrIndex(0), + TailCallReturnAddrDelta(0), + SRetReturnReg(0), + GlobalBaseReg(0) {} bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 5af1fb1..c733f26 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -751,10 +751,12 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). + bool DisableRedZone = Fn->hasFnAttr(Attribute::NoRedZone); if (Is64Bit && !DisableRedZone && !needsStackRealignment(MF) && !MFI->hasVarSizedObjects() && // No dynamic alloca. - !MFI->hasCalls()) { // No calls. + !MFI->hasCalls() && // No calls. + !Subtarget->isTargetWin64()) { // Win64 has no Red Zone uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (hasFP(MF)) MinSize += SlotSize; StackSize = std::max(MinSize, @@ -820,13 +822,6 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } - unsigned ReadyLabelId = 0; - if (needsFrameMoves) { - // Mark effective beginning of when frame pointer is ready. - ReadyLabelId = MMI->NextLabelID(); - BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(ReadyLabelId); - } - // Skip the callee-saved push instructions. while (MBBI != MBB.end() && (MBBI->getOpcode() == X86::PUSH32r || @@ -836,20 +831,20 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); - if (NumBytes) { // adjust stack pointer: ESP -= numbytes + if (NumBytes) { // Adjust stack pointer: ESP -= numbytes. if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) { - // Check, whether EAX is livein for this function + // Check whether EAX is live-in for this function.
bool isEAXAlive = false; for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), EE = MF.getRegInfo().livein_end(); (II != EE) && !isEAXAlive; ++II) { unsigned Reg = II->first; isEAXAlive = (Reg == X86::EAX || Reg == X86::AX || - Reg == X86::AH || Reg == X86::AL); + Reg == X86::AH || Reg == X86::AL); } - // Function prologue calls _alloca to probe the stack when allocating - // more than 4k bytes in one go. Touching the stack at 4K increments is + // Function prologue calls _alloca to probe the stack when allocating more + // than 4k bytes in one go. Touching the stack at 4K increments is // necessary to ensure that the guard pages used by the OS virtual memory // manager are allocated in correct sequence. if (!isEAXAlive) { @@ -861,12 +856,14 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { // Save EAX BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) .addReg(X86::EAX, RegState::Kill); + // Allocate NumBytes-4 bytes on stack. We'll also use 4 already // allocated bytes for EAX. - BuildMI(MBB, MBBI, DL, - TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes-4); + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(NumBytes-4); BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) .addExternalSymbol("_alloca"); + // Restore EAX MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), @@ -878,6 +875,7 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { // merge the two. This can be the case when tail call elimination is // enabled and the callee has more arguments than the caller. NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true); + // If there is an ADD32ri or SUB32ri of ESP immediately after this // instruction, merge the two instructions. mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes); @@ -887,8 +885,13 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { } } - if (needsFrameMoves) + if (needsFrameMoves) { + // Mark effective beginning of when frame pointer is ready. + unsigned ReadyLabelId = 0; + ReadyLabelId = MMI->NextLabelID(); + BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(ReadyLabelId); emitFrameMoves(MF, FrameLabelId, ReadyLabelId); + } } void X86RegisterInfo::emitEpilogue(MachineFunction &MF, diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8264462..88ab247 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -133,7 +133,8 @@ X86TargetMachine::X86TargetMachine(const Module &M, const std::string &FS, DataLayout(Subtarget.getDataLayout()), FrameInfo(TargetFrameInfo::StackGrowsDown, Subtarget.getStackAlignment(), Subtarget.is64Bit() ? -8 : -4), - InstrInfo(*this), JITInfo(*this), TLInfo(*this) { + InstrInfo(*this), JITInfo(*this), TLInfo(*this), + ELFWriterInfo(Subtarget.is64Bit()) { DefRelocModel = getRelocationModel(); // FIXME: Correctly select PIC model for Win64 stuff if (getRelocationModel() == Reloc::Default) { @@ -213,6 +214,13 @@ bool X86TargetMachine::addAssemblyEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, bool Verbose, raw_ostream &Out) { + // FIXME: Move this somewhere else! + // On Darwin, override 64-bit static relocation to pic_ since the + // assembler doesn't support it.
+ if (DefRelocModel == Reloc::Static && + Subtarget.isTargetDarwin() && Subtarget.is64Bit()) + setRelocationModel(Reloc::PIC_); + assert(AsmPrinterCtor && "AsmPrinter was not linked in"); if (AsmPrinterCtor) PM.add(AsmPrinterCtor(Out, *this, OptLevel, Verbose)); diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.h b/lib/Target/XCore/XCoreMachineFunctionInfo.h index 43adb0f..124a011 100644 --- a/lib/Target/XCore/XCoreMachineFunctionInfo.h +++ b/lib/Target/XCore/XCoreMachineFunctionInfo.h @@ -40,7 +40,7 @@ public: FPSpillSlot(0), VarArgsFrameIndex(0) {} - XCoreFunctionInfo(MachineFunction &MF) : + explicit XCoreFunctionInfo(MachineFunction &MF) : UsesLR(false), LRSpillSlot(0), FPSpillSlot(0), diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 733dfa9..673d38b 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -59,7 +59,8 @@ cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true)); /// two values. namespace { struct VISIBILITY_HIDDEN Expression { - enum ExpressionOpcode { ADD, SUB, MUL, UDIV, SDIV, FDIV, UREM, SREM, + enum ExpressionOpcode { ADD, FADD, SUB, FSUB, MUL, FMUL, + UDIV, SDIV, FDIV, UREM, SREM, FREM, SHL, LSHR, ASHR, AND, OR, XOR, ICMPEQ, ICMPNE, ICMPUGT, ICMPUGE, ICMPULT, ICMPULE, ICMPSGT, ICMPSGE, ICMPSLT, ICMPSLE, FCMPOEQ, @@ -200,8 +201,11 @@ Expression::ExpressionOpcode ValueTable::getOpcode(BinaryOperator* BO) { default: // THIS SHOULD NEVER HAPPEN assert(0 && "Binary operator with unknown opcode?"); case Instruction::Add: return Expression::ADD; + case Instruction::FAdd: return Expression::FADD; case Instruction::Sub: return Expression::SUB; + case Instruction::FSub: return Expression::FSUB; case Instruction::Mul: return Expression::MUL; + case Instruction::FMul: return Expression::FMUL; case Instruction::UDiv: return Expression::UDIV; case Instruction::SDiv: return Expression::SDIV; case Instruction::FDiv: return Expression::FDIV; diff --git a/lib/Transforms/Scalar/GVNPRE.cpp b/lib/Transforms/Scalar/GVNPRE.cpp index e3b0937..0f3153f 100644 --- a/lib/Transforms/Scalar/GVNPRE.cpp +++ b/lib/Transforms/Scalar/GVNPRE.cpp @@ -55,7 +55,8 @@ namespace { /// two values. 
struct Expression { - enum ExpressionOpcode { ADD, SUB, MUL, UDIV, SDIV, FDIV, UREM, SREM, + enum ExpressionOpcode { ADD, FADD, SUB, FSUB, MUL, FMUL, + UDIV, SDIV, FDIV, UREM, SREM, FREM, SHL, LSHR, ASHR, AND, OR, XOR, ICMPEQ, ICMPNE, ICMPUGT, ICMPUGE, ICMPULT, ICMPULE, ICMPSGT, ICMPSGE, ICMPSLT, ICMPSLE, FCMPOEQ, @@ -202,10 +203,16 @@ Expression::ExpressionOpcode switch(BO->getOpcode()) { case Instruction::Add: return Expression::ADD; + case Instruction::FAdd: + return Expression::FADD; case Instruction::Sub: return Expression::SUB; + case Instruction::FSub: + return Expression::FSUB; case Instruction::Mul: return Expression::MUL; + case Instruction::FMul: + return Expression::FMUL; case Instruction::UDiv: return Expression::UDIV; case Instruction::SDiv: diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index af61eae..83503fd 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -754,7 +754,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PH) { BinaryOperator *Incr = dyn_cast<BinaryOperator>(PH->getIncomingValue(BackEdge)); if (!Incr) return; - if (Incr->getOpcode() != Instruction::Add) return; + if (Incr->getOpcode() != Instruction::FAdd) return; ConstantFP *IncrValue = NULL; unsigned IncrVIndex = 1; if (Incr->getOperand(1) == PH) diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp index e6f854f..97bd34c 100644 --- a/lib/Transforms/Scalar/InstructionCombining.cpp +++ b/lib/Transforms/Scalar/InstructionCombining.cpp @@ -167,8 +167,11 @@ namespace { // otherwise - Change was made, replace I with returned instruction // Instruction *visitAdd(BinaryOperator &I); + Instruction *visitFAdd(BinaryOperator &I); Instruction *visitSub(BinaryOperator &I); + Instruction *visitFSub(BinaryOperator &I); Instruction *visitMul(BinaryOperator &I); + Instruction *visitFMul(BinaryOperator &I); Instruction *visitURem(BinaryOperator &I); Instruction *visitSRem(BinaryOperator &I); Instruction *visitFRem(BinaryOperator &I); @@ -403,7 +406,8 @@ X("instcombine", "Combine redundant instructions"); // 0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst static unsigned getComplexity(Value *V) { if (isa<Instruction>(V)) { - if (BinaryOperator::isNeg(V) || BinaryOperator::isNot(V)) + if (BinaryOperator::isNeg(V) || BinaryOperator::isFNeg(V) || + BinaryOperator::isNot(V)) return 3; return 4; } @@ -576,6 +580,25 @@ static inline Value *dyn_castNegVal(Value *V) { return 0; } +// dyn_castFNegVal - Given a 'fsub' instruction, return the RHS of the +// instruction if the LHS is a constant negative zero (which is the 'negate' +// form). +// +static inline Value *dyn_castFNegVal(Value *V) { + if (BinaryOperator::isFNeg(V)) + return BinaryOperator::getFNegArgument(V); + + // Constants can be considered to be negated values if they can be folded. 
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V)) + return ConstantExpr::getFNeg(C); + + if (ConstantVector *C = dyn_cast<ConstantVector>(V)) + if (C->getType()->getElementType()->isFloatingPoint()) + return ConstantExpr::getFNeg(C); + + return 0; +} + static inline Value *dyn_castNotVal(Value *V) { if (BinaryOperator::isNot(V)) return BinaryOperator::getNotArgument(V); @@ -1733,12 +1756,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, default: assert(0 && "Case stmts out of sync!"); case Intrinsic::x86_sse_sub_ss: case Intrinsic::x86_sse2_sub_sd: - TmpV = InsertNewInstBefore(BinaryOperator::CreateSub(LHS, RHS, + TmpV = InsertNewInstBefore(BinaryOperator::CreateFSub(LHS, RHS, II->getName()), *II); break; case Intrinsic::x86_sse_mul_ss: case Intrinsic::x86_sse2_mul_sd: - TmpV = InsertNewInstBefore(BinaryOperator::CreateMul(LHS, RHS, + TmpV = InsertNewInstBefore(BinaryOperator::CreateFMul(LHS, RHS, II->getName()), *II); break; } @@ -2052,14 +2075,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return ReplaceInstUsesWith(I, RHS); // X + 0 --> X - if (!I.getType()->isFPOrFPVector()) { // NOTE: -0 + +0 = +0. - if (RHSC->isNullValue()) - return ReplaceInstUsesWith(I, LHS); - } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) { - if (CFP->isExactlyValue(ConstantFP::getNegativeZero - (I.getType())->getValueAPF())) - return ReplaceInstUsesWith(I, LHS); - } + if (RHSC->isNullValue()) + return ReplaceInstUsesWith(I, LHS); if (ConstantInt *CI = dyn_cast<ConstantInt>(RHSC)) { // X + (signbit) --> X ^ signbit @@ -2317,11 +2334,6 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return SelectInst::Create(SI->getCondition(), A, N); } } - - // Check for X+0.0. Simplify it to X if we know X is not -0.0. - if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) - if (CFP->getValueAPF().isPosZero() && CannotBeNegativeZero(LHS)) - return ReplaceInstUsesWith(I, LHS); // Check for (add (sext x), y), see if we can merge this into an // integer add followed by a sext. @@ -2359,7 +2371,42 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { } } } - + + return Changed ? &I : 0; +} + +Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { + bool Changed = SimplifyCommutative(I); + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + + if (Constant *RHSC = dyn_cast<Constant>(RHS)) { + // X + 0 --> X + if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) { + if (CFP->isExactlyValue(ConstantFP::getNegativeZero + (I.getType())->getValueAPF())) + return ReplaceInstUsesWith(I, LHS); + } + + if (isa<PHINode>(LHS)) + if (Instruction *NV = FoldOpIntoPhi(I)) + return NV; + } + + // -A + B --> B - A + // -A + -B --> -(A + B) + if (Value *LHSV = dyn_castFNegVal(LHS)) + return BinaryOperator::CreateFSub(RHS, LHSV); + + // A + -B --> A - B + if (!isa<Constant>(RHS)) + if (Value *V = dyn_castFNegVal(RHS)) + return BinaryOperator::CreateFSub(LHS, V); + + // Check for X+0.0. Simplify it to X if we know X is not -0.0. + if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) + if (CFP->getValueAPF().isPosZero() && CannotBeNegativeZero(LHS)) + return ReplaceInstUsesWith(I, LHS); + // Check for (add double (sitofp x), y), see if we can merge this into an // integer add followed by a promotion. 
if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) { @@ -2407,8 +2454,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { Instruction *InstCombiner::visitSub(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - if (Op0 == Op1 && // sub X, X -> 0 - !I.getType()->isFPOrFPVector()) + if (Op0 == Op1) // sub X, X -> 0 return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); // If this is a 'B = x-(-A)', change to B = x+A... @@ -2469,8 +2515,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateXor(Op0, Op1); if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1)) { - if (Op1I->getOpcode() == Instruction::Add && - !Op0->getType()->isFPOrFPVector()) { + if (Op1I->getOpcode() == Instruction::Add) { if (Op1I->getOperand(0) == Op0) // X-(X+Y) == -Y return BinaryOperator::CreateNeg(Op1I->getOperand(1), I.getName()); else if (Op1I->getOperand(1) == Op0) // X-(Y+X) == -Y @@ -2487,8 +2532,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { // Replace (x - (y - z)) with (x + (z - y)) if the (y - z) subexpression // is not used by anyone else... // - if (Op1I->getOpcode() == Instruction::Sub && - !Op1I->getType()->isFPOrFPVector()) { + if (Op1I->getOpcode() == Instruction::Sub) { // Swap the two operands of the subexpr... Value *IIOp0 = Op1I->getOperand(0), *IIOp1 = Op1I->getOperand(1); Op1I->setOperand(0, IIOp1); @@ -2526,18 +2570,17 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { } } - if (!Op0->getType()->isFPOrFPVector()) - if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) { - if (Op0I->getOpcode() == Instruction::Add) { - if (Op0I->getOperand(0) == Op1) // (Y+X)-Y == X - return ReplaceInstUsesWith(I, Op0I->getOperand(1)); - else if (Op0I->getOperand(1) == Op1) // (X+Y)-Y == X - return ReplaceInstUsesWith(I, Op0I->getOperand(0)); - } else if (Op0I->getOpcode() == Instruction::Sub) { - if (Op0I->getOperand(0) == Op1) // (X-Y)-X == -Y - return BinaryOperator::CreateNeg(Op0I->getOperand(1), I.getName()); - } + if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) { + if (Op0I->getOpcode() == Instruction::Add) { + if (Op0I->getOperand(0) == Op1) // (Y+X)-Y == X + return ReplaceInstUsesWith(I, Op0I->getOperand(1)); + else if (Op0I->getOperand(1) == Op1) // (X+Y)-Y == X + return ReplaceInstUsesWith(I, Op0I->getOperand(0)); + } else if (Op0I->getOpcode() == Instruction::Sub) { + if (Op0I->getOperand(0) == Op1) // (X-Y)-X == -Y + return BinaryOperator::CreateNeg(Op0I->getOperand(1), I.getName()); } + } ConstantInt *C1; if (Value *X = dyn_castFoldableMul(Op0, C1)) { @@ -2551,6 +2594,40 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return 0; } +Instruction *InstCombiner::visitFSub(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + // If this is a 'B = x-(-A)', change to B = x+A... + if (Value *V = dyn_castFNegVal(Op1)) + return BinaryOperator::CreateFAdd(Op0, V); + + if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1)) { + if (Op1I->getOpcode() == Instruction::FAdd) { + if (Op1I->getOperand(0) == Op0) // X-(X+Y) == -Y + return BinaryOperator::CreateFNeg(Op1I->getOperand(1), I.getName()); + else if (Op1I->getOperand(1) == Op0) // X-(Y+X) == -Y + return BinaryOperator::CreateFNeg(Op1I->getOperand(0), I.getName()); + } + + if (Op1I->hasOneUse()) { + // Replace (x - (y - z)) with (x + (z - y)) if the (y - z) subexpression + // is not used by anyone else... 
+ // + if (Op1I->getOpcode() == Instruction::FSub) { + // Swap the two operands of the subexpr... + Value *IIOp0 = Op1I->getOperand(0), *IIOp1 = Op1I->getOperand(1); + Op1I->setOperand(0, IIOp1); + Op1I->setOperand(1, IIOp0); + + // Create the new top level fadd instruction... + return BinaryOperator::CreateFAdd(Op0, Op1); + } + } + } + + return 0; +} + /// isSignBitCheck - Given an exploded icmp instruction, return true if the /// comparison only checks the sign bit. If it only checks the sign bit, set /// TrueIfSigned if the result of the comparison is true when the input value is @@ -2585,7 +2662,9 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { bool Changed = SimplifyCommutative(I); Value *Op0 = I.getOperand(0); - if (isa<UndefValue>(I.getOperand(1))) // undef * X -> 0 + // TODO: If Op1 is undef and Op0 is finite, return zero. + if (!I.getType()->isFPOrFPVector() && + isa<UndefValue>(I.getOperand(1))) // undef * X -> 0 return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); // Simplify mul instructions with a constant RHS... @@ -2611,17 +2690,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { return BinaryOperator::CreateShl(Op0, ConstantInt::get(Op0->getType(), Val.logBase2())); } - } else if (ConstantFP *Op1F = dyn_cast<ConstantFP>(Op1)) { - if (Op1F->isNullValue()) - return ReplaceInstUsesWith(I, Op1); - - // "In IEEE floating point, x*1 is not equivalent to x for nans. However, - // ANSI says we can drop signals, so we can do this anyway." (from GCC) - if (Op1F->isExactlyValue(1.0)) - return ReplaceInstUsesWith(I, Op0); // Eliminate 'mul double %X, 1.0' } else if (isa<VectorType>(Op1->getType())) { - if (isa<ConstantAggregateZero>(Op1)) - return ReplaceInstUsesWith(I, Op1); + // TODO: If Op1 is all zeros and Op0 is all finite, return all zeros. if (ConstantVector *Op1V = dyn_cast<ConstantVector>(Op1)) { if (Op1V->isAllOnesValue()) // X * -1 == 0 - X @@ -2629,9 +2699,6 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { // As above, vector X*splat(1.0) -> X in all defined cases. if (Constant *Splat = Op1V->getSplatValue()) { - if (ConstantFP *F = dyn_cast<ConstantFP>(Splat)) - if (F->isExactlyValue(1.0)) - return ReplaceInstUsesWith(I, Op0); if (ConstantInt *CI = dyn_cast<ConstantInt>(Splat)) if (CI->equalsInt(1)) return ReplaceInstUsesWith(I, Op0); @@ -2755,6 +2822,45 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { return Changed ? &I : 0; } +Instruction *InstCombiner::visitFMul(BinaryOperator &I) { + bool Changed = SimplifyCommutative(I); + Value *Op0 = I.getOperand(0); + + // Simplify mul instructions with a constant RHS... + if (Constant *Op1 = dyn_cast<Constant>(I.getOperand(1))) { + if (ConstantFP *Op1F = dyn_cast<ConstantFP>(Op1)) { + // "In IEEE floating point, x*1 is not equivalent to x for nans. However, + // ANSI says we can drop signals, so we can do this anyway." (from GCC) + if (Op1F->isExactlyValue(1.0)) + return ReplaceInstUsesWith(I, Op0); // Eliminate 'mul double %X, 1.0' + } else if (isa<VectorType>(Op1->getType())) { + if (ConstantVector *Op1V = dyn_cast<ConstantVector>(Op1)) { + // As above, vector X*splat(1.0) -> X in all defined cases. + if (Constant *Splat = Op1V->getSplatValue()) { + if (ConstantFP *F = dyn_cast<ConstantFP>(Splat)) + if (F->isExactlyValue(1.0)) + return ReplaceInstUsesWith(I, Op0); + } + } + } + + // Try to fold constant mul into select arguments. 
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) + if (Instruction *R = FoldOpIntoSelect(I, SI, this)) + return R; + + if (isa<PHINode>(Op0)) + if (Instruction *NV = FoldOpIntoPhi(I)) + return NV; + } + + if (Value *Op0v = dyn_castFNegVal(Op0)) // -X * -Y = X*Y + if (Value *Op1v = dyn_castFNegVal(I.getOperand(1))) + return BinaryOperator::CreateFMul(Op0v, Op1v); + + return Changed ? &I : 0; +} + /// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select /// instruction. bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { @@ -8562,17 +8668,17 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { if (Instruction *I = commonCastTransforms(CI)) return I; - // If we have fptrunc(add (fpextend x), (fpextend y)), where x and y are + // If we have fptrunc(fadd (fpextend x), (fpextend y)), where x and y are // smaller than the destination type, we can eliminate the truncate by doing - // the add as the smaller type. This applies to add/sub/mul/div as well as + // the add as the smaller type. This applies to fadd/fsub/fmul/fdiv as well as // many builtins (sqrt, etc). BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0)); if (OpI && OpI->hasOneUse()) { switch (OpI->getOpcode()) { default: break; - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: case Instruction::FDiv: case Instruction::FRem: const Type *SrcTy = OpI->getType(); @@ -9322,11 +9428,15 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // Turn select C, (X+Y), (X-Y) --> (X+(select C, Y, (-Y))). This is // even legal for FP. - if (TI->getOpcode() == Instruction::Sub && - FI->getOpcode() == Instruction::Add) { + if ((TI->getOpcode() == Instruction::Sub && + FI->getOpcode() == Instruction::Add) || + (TI->getOpcode() == Instruction::FSub && + FI->getOpcode() == Instruction::FAdd)) { AddOp = FI; SubOp = TI; - } else if (FI->getOpcode() == Instruction::Sub && - TI->getOpcode() == Instruction::Add) { + } else if ((FI->getOpcode() == Instruction::Sub && + TI->getOpcode() == Instruction::Add) || + (FI->getOpcode() == Instruction::FSub && + TI->getOpcode() == Instruction::FAdd)) { AddOp = TI; SubOp = FI; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 92270b5..944f409 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -2268,7 +2268,8 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) { /* create new increment. '++d' in above example. */ ConstantFP *CFP = ConstantFP::get(DestTy, C->getZExtValue()); BinaryOperator *NewIncr = - BinaryOperator::Create(Incr->getOpcode(), + BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ? + Instruction::FAdd : Instruction::FSub, NewPH, CFP, "IV.S.next.", Incr); NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); @@ -2424,24 +2425,14 @@ void LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) { // Get the terminating condition for the loop if possible (this isn't // necessarily in the latch, or a block that's a predecessor of the header). - SmallVector<BasicBlock*, 8> ExitBlocks; - L->getExitBlocks(ExitBlocks); - if (ExitBlocks.size() != 1) return; + if (!L->getExitBlock()) + return; // More than one loop exit blocks. // Okay, there is one exit block. Try to find the condition that causes the // loop to be exited. 
- BasicBlock *ExitBlock = ExitBlocks[0]; - - BasicBlock *ExitingBlock = 0; - for (pred_iterator PI = pred_begin(ExitBlock), E = pred_end(ExitBlock); - PI != E; ++PI) - if (L->contains(*PI)) { - if (ExitingBlock == 0) - ExitingBlock = *PI; - else - return; // More than one block exiting! - } - assert(ExitingBlock && "No exits from loop, something is broken!"); + BasicBlock *ExitingBlock = L->getExitingBlock(); + if (!ExitingBlock) + return; // More than one block exiting! // Okay, we've computed the exiting block. See what condition causes us to // exit. diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 4b00640..59989c9 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -1009,7 +1009,7 @@ struct VISIBILITY_HIDDEN PowOpt : public LibCallOptimization { if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x return Op1; if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x - return B.CreateMul(Op1, Op1, "pow2"); + return B.CreateFMul(Op1, Op1, "pow2"); if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip"); return 0; diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 2cde765..bcc6b81 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -419,9 +419,6 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, case Instruction::LShr: case Instruction::AShr: case Instruction::ICmp: - case Instruction::FCmp: - if (I->getOperand(0)->getType()->isFPOrFPVector()) - return false; // FP arithmetic might trap. break; // These are all cheap and non-trapping instructions. } @@ -1012,9 +1009,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) { default: return false; // Not safe / profitable to hoist. case Instruction::Add: case Instruction::Sub: - // FP arithmetic might trap. Not worth doing for vector ops. - if (HInst->getType()->isFloatingPoint() - || isa<VectorType>(HInst->getType())) + // Not worth doing for vector ops. 
+ if (isa<VectorType>(HInst->getType())) return false; break; case Instruction::And: diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp index 5a8fad9..8dfbd1d 100644 --- a/lib/VMCore/Attributes.cpp +++ b/lib/VMCore/Attributes.cpp @@ -59,6 +59,10 @@ std::string Attribute::getAsString(Attributes Attrs) { Result += "ssp "; if (Attrs & Attribute::StackProtectReq) Result += "sspreq "; + if (Attrs & Attribute::NoRedZone) + Result += "noredzone "; + if (Attrs & Attribute::NoImplicitFloat) + Result += "noimplicitfloat "; if (Attrs & Attribute::Alignment) { Result += "align "; Result += utostr(Attribute::getAlignmentFromAttrs(Attrs)); diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp index 7e4902f..1d293cc 100644 --- a/lib/VMCore/ConstantFold.cpp +++ b/lib/VMCore/ConstantFold.cpp @@ -602,10 +602,8 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, return Constant::getNullValue(C1->getType()); case Instruction::UDiv: case Instruction::SDiv: - case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: - case Instruction::FRem: if (!isa<UndefValue>(C2)) // undef / X -> 0 return Constant::getNullValue(C1->getType()); return const_cast<Constant*>(C2); // X / undef -> undef @@ -783,13 +781,13 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, switch (Opcode) { default: break; - case Instruction::Add: + case Instruction::FAdd: (void)C3V.add(C2V, APFloat::rmNearestTiesToEven); return ConstantFP::get(C3V); - case Instruction::Sub: + case Instruction::FSub: (void)C3V.subtract(C2V, APFloat::rmNearestTiesToEven); return ConstantFP::get(C3V); - case Instruction::Mul: + case Instruction::FMul: (void)C3V.multiply(C2V, APFloat::rmNearestTiesToEven); return ConstantFP::get(C3V); case Instruction::FDiv: @@ -808,12 +806,18 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, switch (Opcode) { default: break; - case Instruction::Add: + case Instruction::Add: return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getAdd); - case Instruction::Sub: + case Instruction::FAdd: + return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getFAdd); + case Instruction::Sub: return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getSub); - case Instruction::Mul: + case Instruction::FSub: + return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getFSub); + case Instruction::Mul: return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getMul); + case Instruction::FMul: + return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getFMul); case Instruction::UDiv: return EvalVectorOp(CP1, CP2, VTy, ConstantExpr::getUDiv); case Instruction::SDiv: @@ -851,7 +855,9 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, // other way if possible. switch (Opcode) { case Instruction::Add: + case Instruction::FAdd: case Instruction::Mul: + case Instruction::FMul: case Instruction::And: case Instruction::Or: case Instruction::Xor: @@ -862,6 +868,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, case Instruction::LShr: case Instruction::AShr: case Instruction::Sub: + case Instruction::FSub: case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv: diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp index 97f3ac9..69c503d 100644 --- a/lib/VMCore/Constants.cpp +++ b/lib/VMCore/Constants.cpp @@ -775,26 +775,46 @@ const SmallVector<unsigned, 4> &ConstantExpr::getIndices() const { /// specify the full Instruction::OPCODE identifier. 
/// Constant *ConstantExpr::getNeg(Constant *C) { + // API compatibility: Adjust integer opcodes to floating-point opcodes. + if (C->getType()->isFPOrFPVector()) + return getFNeg(C); + assert(C->getType()->isIntOrIntVector() && + "Cannot NEG a nonintegral value!"); return get(Instruction::Sub, ConstantExpr::getZeroValueForNegationExpr(C->getType()), C); } +Constant *ConstantExpr::getFNeg(Constant *C) { + assert(C->getType()->isFPOrFPVector() && + "Cannot FNEG a non-floating-point value!"); + return get(Instruction::FSub, + ConstantExpr::getZeroValueForNegationExpr(C->getType()), + C); +} Constant *ConstantExpr::getNot(Constant *C) { - assert((isa<IntegerType>(C->getType()) || - cast<VectorType>(C->getType())->getElementType()->isInteger()) && - "Cannot NOT a nonintegral value!"); + assert(C->getType()->isIntOrIntVector() && + "Cannot NOT a nonintegral value!"); return get(Instruction::Xor, C, Constant::getAllOnesValue(C->getType())); } Constant *ConstantExpr::getAdd(Constant *C1, Constant *C2) { return get(Instruction::Add, C1, C2); } +Constant *ConstantExpr::getFAdd(Constant *C1, Constant *C2) { + return get(Instruction::FAdd, C1, C2); +} Constant *ConstantExpr::getSub(Constant *C1, Constant *C2) { return get(Instruction::Sub, C1, C2); } +Constant *ConstantExpr::getFSub(Constant *C1, Constant *C2) { + return get(Instruction::FSub, C1, C2); +} Constant *ConstantExpr::getMul(Constant *C1, Constant *C2) { return get(Instruction::Mul, C1, C2); } +Constant *ConstantExpr::getFMul(Constant *C1, Constant *C2) { + return get(Instruction::FMul, C1, C2); +} Constant *ConstantExpr::getUDiv(Constant *C1, Constant *C2) { return get(Instruction::UDiv, C1, C2); } @@ -2142,15 +2162,28 @@ Constant *ConstantExpr::getCompareTy(unsigned short predicate, } Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2) { + // API compatibility: Adjust integer opcodes to floating-point opcodes. + if (C1->getType()->isFPOrFPVector()) { + if (Opcode == Instruction::Add) Opcode = Instruction::FAdd; + else if (Opcode == Instruction::Sub) Opcode = Instruction::FSub; + else if (Opcode == Instruction::Mul) Opcode = Instruction::FMul; + } #ifndef NDEBUG switch (Opcode) { - case Instruction::Add: + case Instruction::Add: case Instruction::Sub: - case Instruction::Mul: + case Instruction::Mul: assert(C1->getType() == C2->getType() && "Op types should be identical!"); - assert((C1->getType()->isInteger() || C1->getType()->isFloatingPoint() || - isa<VectorType>(C1->getType())) && - "Tried to create an arithmetic operation on a non-arithmetic type!"); + assert(C1->getType()->isIntOrIntVector() && + "Tried to create an integer operation on a non-integer type!"); + break; + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + assert(C1->getType() == C2->getType() && "Op types should be identical!"); + assert(C1->getType()->isFPOrFPVector() && + "Tried to create a floating-point operation on a " + "non-floating-point type!"); break; case Instruction::UDiv: case Instruction::SDiv: diff --git a/lib/VMCore/Instruction.cpp b/lib/VMCore/Instruction.cpp index 9e030b7..7556b8e 100644 --- a/lib/VMCore/Instruction.cpp +++ b/lib/VMCore/Instruction.cpp @@ -101,8 +101,11 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { // Standard binary operators... 
case Add: return "add"; + case FAdd: return "fadd"; case Sub: return "sub"; + case FSub: return "fsub"; case Mul: return "mul"; + case FMul: return "fmul"; case UDiv: return "udiv"; case SDiv: return "sdiv"; case FDiv: return "fdiv"; @@ -330,19 +333,13 @@ bool Instruction::mayThrow() const { /// isAssociative - Return true if the instruction is associative: /// -/// Associative operators satisfy: x op (y op z) === (x op y) op z) +/// Associative operators satisfy: x op (y op z) === (x op y) op z /// -/// In LLVM, the Add, Mul, And, Or, and Xor operators are associative, when not -/// applied to floating point types. +/// In LLVM, the Add, Mul, And, Or, and Xor operators are associative. /// bool Instruction::isAssociative(unsigned Opcode, const Type *Ty) { - if (Opcode == And || Opcode == Or || Opcode == Xor) - return true; - - // Add/Mul reassociate unless they are FP or FP vectors. - if (Opcode == Add || Opcode == Mul) - return !Ty->isFPOrFPVector(); - return 0; + return Opcode == And || Opcode == Or || Opcode == Xor || + Opcode == Add || Opcode == Mul; } /// isCommutative - Return true if the instruction is commutative: @@ -355,7 +352,9 @@ bool Instruction::isAssociative(unsigned Opcode, const Type *Ty) { bool Instruction::isCommutative(unsigned op) { switch (op) { case Add: + case FAdd: case Mul: + case FMul: case And: case Or: case Xor: diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp index fe30271..4c228fe 100644 --- a/lib/VMCore/Instructions.cpp +++ b/lib/VMCore/Instructions.cpp @@ -1502,29 +1502,43 @@ const Type* ExtractValueInst::getIndexedType(const Type *Agg, // BinaryOperator Class //===----------------------------------------------------------------------===// +/// AdjustIType - Map Add, Sub, and Mul to FAdd, FSub, and FMul when the +/// type is floating-point, to help provide compatibility with an older API. +/// +static BinaryOperator::BinaryOps AdjustIType(BinaryOperator::BinaryOps iType, + const Type *Ty) { + // API compatibility: Adjust integer opcodes to floating-point opcodes. 
+ if (Ty->isFPOrFPVector()) { + if (iType == BinaryOperator::Add) iType = BinaryOperator::FAdd; + else if (iType == BinaryOperator::Sub) iType = BinaryOperator::FSub; + else if (iType == BinaryOperator::Mul) iType = BinaryOperator::FMul; + } + return iType; +} + BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, const Type *Ty, const std::string &Name, Instruction *InsertBefore) - : Instruction(Ty, iType, + : Instruction(Ty, AdjustIType(iType, Ty), OperandTraits<BinaryOperator>::op_begin(this), OperandTraits<BinaryOperator>::operands(this), InsertBefore) { Op<0>() = S1; Op<1>() = S2; - init(iType); + init(AdjustIType(iType, Ty)); setName(Name); } BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, const Type *Ty, const std::string &Name, BasicBlock *InsertAtEnd) - : Instruction(Ty, iType, + : Instruction(Ty, AdjustIType(iType, Ty), OperandTraits<BinaryOperator>::op_begin(this), OperandTraits<BinaryOperator>::operands(this), InsertAtEnd) { Op<0>() = S1; Op<1>() = S2; - init(iType); + init(AdjustIType(iType, Ty)); setName(Name); } @@ -1537,12 +1551,19 @@ void BinaryOperator::init(BinaryOps iType) { #ifndef NDEBUG switch (iType) { case Add: case Sub: - case Mul: + case Mul: + assert(getType() == LHS->getType() && + "Arithmetic operation should return same type as operands!"); + assert(getType()->isIntOrIntVector() && + "Tried to create an integer operation on a non-integer type!"); + break; + case FAdd: case FSub: + case FMul: assert(getType() == LHS->getType() && "Arithmetic operation should return same type as operands!"); - assert((getType()->isInteger() || getType()->isFloatingPoint() || - isa<VectorType>(getType())) && - "Tried to create an arithmetic operation on a non-arithmetic type!"); + assert(getType()->isFPOrFPVector() && + "Tried to create a floating-point operation on a " + "non-floating-point type!"); break; case UDiv: case SDiv: @@ -1631,6 +1652,22 @@ BinaryOperator *BinaryOperator::CreateNeg(Value *Op, const std::string &Name, Op->getType(), Name, InsertAtEnd); } +BinaryOperator *BinaryOperator::CreateFNeg(Value *Op, const std::string &Name, + Instruction *InsertBefore) { + Value *zero = ConstantExpr::getZeroValueForNegationExpr(Op->getType()); + return new BinaryOperator(Instruction::FSub, + zero, Op, + Op->getType(), Name, InsertBefore); +} + +BinaryOperator *BinaryOperator::CreateFNeg(Value *Op, const std::string &Name, + BasicBlock *InsertAtEnd) { + Value *zero = ConstantExpr::getZeroValueForNegationExpr(Op->getType()); + return new BinaryOperator(Instruction::FSub, + zero, Op, + Op->getType(), Name, InsertAtEnd); +} + BinaryOperator *BinaryOperator::CreateNot(Value *Op, const std::string &Name, Instruction *InsertBefore) { Constant *C; @@ -1679,6 +1716,14 @@ bool BinaryOperator::isNeg(const Value *V) { return false; } +bool BinaryOperator::isFNeg(const Value *V) { + if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V)) + if (Bop->getOpcode() == Instruction::FSub) + return Bop->getOperand(0) == + ConstantExpr::getZeroValueForNegationExpr(Bop->getType()); + return false; +} + bool BinaryOperator::isNot(const Value *V) { if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V)) return (Bop->getOpcode() == Instruction::Xor && @@ -1696,6 +1741,15 @@ const Value *BinaryOperator::getNegArgument(const Value *BinOp) { return getNegArgument(const_cast<Value*>(BinOp)); } +Value *BinaryOperator::getFNegArgument(Value *BinOp) { + assert(isFNeg(BinOp) && "getFNegArgument from non-'fneg' instruction!"); + return 
cast<BinaryOperator>(BinOp)->getOperand(1); } + +const Value *BinaryOperator::getFNegArgument(const Value *BinOp) { + return getFNegArgument(const_cast<Value*>(BinOp)); +} + Value *BinaryOperator::getNotArgument(Value *BinOp) { assert(isNot(BinOp) && "getNotArgument on non-'not' instruction!"); BinaryOperator *BO = cast<BinaryOperator>(BinOp); diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp index 59ec3be..b047d0c 100644 --- a/lib/VMCore/Verifier.cpp +++ b/lib/VMCore/Verifier.cpp @@ -1069,13 +1069,40 @@ void Verifier::visitBinaryOperator(BinaryOperator &B) { "Both operands to a binary operator are not of the same type!", &B); switch (B.getOpcode()) { + // Check that integer arithmetic operators are only used with + // integral operands. + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + Assert1(B.getType()->isIntOrIntVector(), + "Integer arithmetic operators only work with integral types!", &B); + Assert1(B.getType() == B.getOperand(0)->getType(), + "Integer arithmetic operators must have same type " + "for operands and result!", &B); + break; + // Check that floating-point arithmetic operators are only used with + // floating-point operands. + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + Assert1(B.getType()->isFPOrFPVector(), + "Floating-point arithmetic operators only work with " + "floating-point types!", &B); + Assert1(B.getType() == B.getOperand(0)->getType(), + "Floating-point arithmetic operators must have same type " + "for operands and result!", &B); + break; // Check that logical operators are only used with integral operands. case Instruction::And: case Instruction::Or: case Instruction::Xor: - Assert1(B.getType()->isInteger() || - (isa<VectorType>(B.getType()) && - cast<VectorType>(B.getType())->getElementType()->isInteger()), + Assert1(B.getType()->isIntOrIntVector(), "Logical operators only work with integral types!", &B); Assert1(B.getType() == B.getOperand(0)->getType(), "Logical operators must have same type for operands and result!", @@ -1084,22 +1111,13 @@ void Verifier::visitBinaryOperator(BinaryOperator &B) { case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - Assert1(B.getType()->isInteger() || - (isa<VectorType>(B.getType()) && - cast<VectorType>(B.getType())->getElementType()->isInteger()), + Assert1(B.getType()->isIntOrIntVector(), "Shifts only work with integral types!", &B); Assert1(B.getType() == B.getOperand(0)->getType(), "Shift return type must be same as operands!", &B); - /* FALL THROUGH */ - default: - // Arithmetic operators only work on integer or fp values - Assert1(B.getType() == B.getOperand(0)->getType(), - "Arithmetic operators must have same type for operands and result!", - &B); - Assert1(B.getType()->isInteger() || B.getType()->isFloatingPoint() || - isa<VectorType>(B.getType()), - "Arithmetic operators must have integer, fp, or vector type!", &B); break; + default: + assert(0 && "Unknown BinaryOperator opcode!"); } visitInstruction(B);
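The thread running through the GVN, GVNPRE, InstCombine, LoopStrengthReduce, ConstantFold, Instruction, and Verifier hunks is the split of the overloaded Add/Sub/Mul opcodes into separate integer and floating-point forms. Folds such as sub X, X -> 0, undef * X -> 0, and free reassociation, which the old code had to guard with isFPOrFPVector() checks, are unsound under IEEE arithmetic; with FAdd/FSub/FMul as distinct opcodes the guards disappear and the Verifier can reject mixed usage outright. A standalone C++ illustration of the failure modes (ordinary host arithmetic, not code from the commit):

#include <cstdio>
#include <limits>

int main() {
  double nan = std::numeric_limits<double>::quiet_NaN();
  double big = 1e308;

  // 'sub X, X -> 0' is invalid for FP: NaN - NaN is NaN, not 0.
  std::printf("%g\n", nan - nan);                 // nan

  // FP add is not associative: grouping changes the result.
  std::printf("%g\n", (big + big) + -big);        // inf (big + big overflows)
  std::printf("%g\n", big + (big + -big));        // 1e308

  // FP negation needs -0.0 as its identity, which is why isFNeg above
  // matches 'fsub -0.0, X': subtracting from +0.0 fails to negate zero.
  std::printf("%g %g\n", 0.0 - 0.0, -0.0 - 0.0);  // 0 -0
  return 0;
}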